@jinn-network/client 0.1.8 → 0.1.9-canary.050a41b1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/adapters/mech/adapter.d.ts +21 -1
- package/dist/adapters/mech/adapter.js +77 -10
- package/dist/adapters/mech/adapter.js.map +1 -1
- package/dist/adapters/mech/contracts.js +62 -28
- package/dist/adapters/mech/contracts.js.map +1 -1
- package/dist/adapters/mech/safe-revert.d.ts +4 -0
- package/dist/adapters/mech/safe-revert.js +5 -1
- package/dist/adapters/mech/safe-revert.js.map +1 -1
- package/dist/adapters/mech/safe.js +5 -1
- package/dist/adapters/mech/safe.js.map +1 -1
- package/dist/adapters/mech/verdict-code.js +1 -1
- package/dist/adapters/mech/verdict-code.js.map +1 -1
- package/dist/api/bootstrap-endpoint.d.ts +1 -0
- package/dist/api/bootstrap-endpoint.js +1 -0
- package/dist/api/bootstrap-endpoint.js.map +1 -1
- package/dist/api/discovery-endpoint.d.ts +1 -0
- package/dist/api/discovery-endpoint.js +24 -0
- package/dist/api/discovery-endpoint.js.map +1 -1
- package/dist/api/fleet-build.d.ts +1 -7
- package/dist/api/fleet-build.js +0 -7
- package/dist/api/fleet-build.js.map +1 -1
- package/dist/api/gather-status.d.ts +8 -2
- package/dist/api/gather-status.js +29 -117
- package/dist/api/gather-status.js.map +1 -1
- package/dist/api/loop-completion-build.d.ts +79 -0
- package/dist/api/loop-completion-build.js +155 -0
- package/dist/api/loop-completion-build.js.map +1 -0
- package/dist/api/peers.js +2 -0
- package/dist/api/peers.js.map +1 -1
- package/dist/api/setup-endpoints.d.ts +32 -0
- package/dist/api/setup-endpoints.js +93 -23
- package/dist/api/setup-endpoints.js.map +1 -1
- package/dist/api/solvernets-endpoints.js +3 -0
- package/dist/api/solvernets-endpoints.js.map +1 -1
- package/dist/api/status-build.d.ts +43 -33
- package/dist/api/status-build.js +3 -26
- package/dist/api/status-build.js.map +1 -1
- package/dist/api/status-rollup-build.d.ts +0 -4
- package/dist/api/status-rollup-build.js +0 -4
- package/dist/api/status-rollup-build.js.map +1 -1
- package/dist/build-info.json +4 -4
- package/dist/build-meta.json +1 -1
- package/dist/cli/commands/codedigest-revert-check.js +6 -2
- package/dist/cli/commands/codedigest-revert-check.js.map +1 -1
- package/dist/cli/commands/doctor.d.ts +3 -0
- package/dist/cli/commands/doctor.js +37 -2
- package/dist/cli/commands/doctor.js.map +1 -1
- package/dist/cli/commands/eval.d.ts +76 -0
- package/dist/cli/commands/eval.js +401 -0
- package/dist/cli/commands/eval.js.map +1 -0
- package/dist/cli/commands/rewards.d.ts +2 -0
- package/dist/cli/commands/rewards.js +30 -3
- package/dist/cli/commands/rewards.js.map +1 -1
- package/dist/cli/commands/solver-nets.js +68 -0
- package/dist/cli/commands/solver-nets.js.map +1 -1
- package/dist/cli/commands/status.js +0 -1
- package/dist/cli/commands/status.js.map +1 -1
- package/dist/cli/index.js +2 -0
- package/dist/cli/index.js.map +1 -1
- package/dist/config.d.ts +58 -7
- package/dist/config.js +96 -7
- package/dist/config.js.map +1 -1
- package/dist/daemon/ai-units-gate.d.ts +6 -6
- package/dist/daemon/ai-units-gate.js +11 -10
- package/dist/daemon/ai-units-gate.js.map +1 -1
- package/dist/daemon/balance-topup-loop.js +3 -0
- package/dist/daemon/balance-topup-loop.js.map +1 -1
- package/dist/daemon/creator.js +2 -0
- package/dist/daemon/creator.js.map +1 -1
- package/dist/daemon/daemon.d.ts +15 -0
- package/dist/daemon/daemon.js +78 -22
- package/dist/daemon/daemon.js.map +1 -1
- package/dist/daemon/eviction-loop.d.ts +7 -0
- package/dist/daemon/eviction-loop.js +16 -0
- package/dist/daemon/eviction-loop.js.map +1 -1
- package/dist/daemon/jinn-claim-loop.js +3 -0
- package/dist/daemon/jinn-claim-loop.js.map +1 -1
- package/dist/daemon/join-applier.d.ts +35 -0
- package/dist/daemon/join-applier.js +49 -0
- package/dist/daemon/join-applier.js.map +1 -0
- package/dist/daemon/loop-heartbeat.d.ts +34 -0
- package/dist/daemon/loop-heartbeat.js +39 -0
- package/dist/daemon/loop-heartbeat.js.map +1 -0
- package/dist/daemon/reward-claim-loop.js +3 -0
- package/dist/daemon/reward-claim-loop.js.map +1 -1
- package/dist/daemon/watchdog-loop.d.ts +84 -0
- package/dist/daemon/watchdog-loop.js +91 -0
- package/dist/daemon/watchdog-loop.js.map +1 -0
- package/dist/dashboard/assets/index-8tAiMbUV.css +1 -0
- package/dist/dashboard/assets/index-CSFVwGFh.js +167 -0
- package/dist/dashboard/index.html +2 -2
- package/dist/discovery/http.d.ts +7 -0
- package/dist/discovery/http.js +241 -25
- package/dist/discovery/http.js.map +1 -1
- package/dist/discovery/onchain.js +155 -1
- package/dist/discovery/onchain.js.map +1 -1
- package/dist/discovery/types.d.ts +106 -0
- package/dist/discovery/types.js +40 -0
- package/dist/discovery/types.js.map +1 -1
- package/dist/discovery/with-fallback.js +14 -0
- package/dist/discovery/with-fallback.js.map +1 -1
- package/dist/earning/bootstrap.d.ts +23 -0
- package/dist/earning/bootstrap.js +76 -27
- package/dist/earning/bootstrap.js.map +1 -1
- package/dist/earning/faucet.d.ts +1 -1
- package/dist/earning/faucet.js +2 -2
- package/dist/earning/faucet.js.map +1 -1
- package/dist/earning/safe-adapter.js +11 -0
- package/dist/earning/safe-adapter.js.map +1 -1
- package/dist/eval/eval-harness-run.d.ts +63 -0
- package/dist/eval/eval-harness-run.js +123 -0
- package/dist/eval/eval-harness-run.js.map +1 -0
- package/dist/eval/orchestrator.d.ts +163 -0
- package/dist/eval/orchestrator.js +232 -0
- package/dist/eval/orchestrator.js.map +1 -0
- package/dist/eval/paired.d.ts +68 -0
- package/dist/eval/paired.js +93 -0
- package/dist/eval/paired.js.map +1 -0
- package/dist/eval/resolve-slate-tasks.d.ts +35 -0
- package/dist/eval/resolve-slate-tasks.js +56 -0
- package/dist/eval/resolve-slate-tasks.js.map +1 -0
- package/dist/eval/screen-discovery.d.ts +22 -0
- package/dist/eval/screen-discovery.js +71 -0
- package/dist/eval/screen-discovery.js.map +1 -0
- package/dist/eval/screen-progress.d.ts +41 -0
- package/dist/eval/screen-progress.js +60 -0
- package/dist/eval/screen-progress.js.map +1 -0
- package/dist/eval/screen-runner.d.ts +30 -0
- package/dist/eval/screen-runner.js +289 -0
- package/dist/eval/screen-runner.js.map +1 -0
- package/dist/eval/screen.d.ts +107 -0
- package/dist/eval/screen.js +159 -0
- package/dist/eval/screen.js.map +1 -0
- package/dist/eval/slope.d.ts +29 -0
- package/dist/eval/slope.js +46 -0
- package/dist/eval/slope.js.map +1 -0
- package/dist/eval/train-sequence.d.ts +35 -0
- package/dist/eval/train-sequence.js +59 -0
- package/dist/eval/train-sequence.js.map +1 -0
- package/dist/eval/wilson.d.ts +45 -0
- package/dist/eval/wilson.js +48 -0
- package/dist/eval/wilson.js.map +1 -0
- package/dist/harnesses/engine/canonical-json.js +5 -3
- package/dist/harnesses/engine/canonical-json.js.map +1 -1
- package/dist/harnesses/engine/engine.d.ts +24 -0
- package/dist/harnesses/engine/engine.js +72 -9
- package/dist/harnesses/engine/engine.js.map +1 -1
- package/dist/harnesses/engine/persistence.d.ts +17 -0
- package/dist/harnesses/engine/persistence.js +28 -0
- package/dist/harnesses/engine/persistence.js.map +1 -1
- package/dist/harnesses/impls/hermes-agent/adapter.d.ts +2 -0
- package/dist/harnesses/impls/hermes-agent/adapter.js +8 -5
- package/dist/harnesses/impls/hermes-agent/adapter.js.map +1 -1
- package/dist/harnesses/impls/hermes-agent/bootstrap.d.ts +1 -0
- package/dist/harnesses/impls/hermes-agent/bootstrap.js +6 -1
- package/dist/harnesses/impls/hermes-agent/bootstrap.js.map +1 -1
- package/dist/harnesses/impls/hermes-agent/harness.d.ts +17 -3
- package/dist/harnesses/impls/hermes-agent/harness.js +68 -5
- package/dist/harnesses/impls/hermes-agent/harness.js.map +1 -1
- package/dist/harnesses/impls/index.d.ts +2 -0
- package/dist/harnesses/impls/index.js +2 -0
- package/dist/harnesses/impls/index.js.map +1 -1
- package/dist/harnesses/impls/learner/adapters/claude-code.js +5 -0
- package/dist/harnesses/impls/learner/adapters/claude-code.js.map +1 -1
- package/dist/harnesses/impls/learner/harness.d.ts +17 -1
- package/dist/harnesses/impls/learner/harness.js +51 -1
- package/dist/harnesses/impls/learner/harness.js.map +1 -1
- package/dist/harnesses/impls/learner/harvest.d.ts +2 -0
- package/dist/harnesses/impls/learner/harvest.js +7 -1
- package/dist/harnesses/impls/learner/harvest.js.map +1 -1
- package/dist/harnesses/impls/learner/plugin-path.js +1 -0
- package/dist/harnesses/impls/learner/plugin-path.js.map +1 -1
- package/dist/harnesses/impls/swe-rebench-v2-evaluator/harness.js +3 -1
- package/dist/harnesses/impls/swe-rebench-v2-evaluator/harness.js.map +1 -1
- package/dist/harnesses/impls/swe-rebench-v2-evaluator/index.d.ts +2 -2
- package/dist/harnesses/impls/swe-rebench-v2-evaluator/index.js +3 -1
- package/dist/harnesses/impls/swe-rebench-v2-evaluator/index.js.map +1 -1
- package/dist/harnesses/readiness-registry.d.ts +10 -0
- package/dist/harnesses/readiness-registry.js +13 -0
- package/dist/harnesses/readiness-registry.js.map +1 -1
- package/dist/harnesses/types.d.ts +14 -0
- package/dist/learner/revert-decision.d.ts +16 -1
- package/dist/learner/revert-decision.js +38 -18
- package/dist/learner/revert-decision.js.map +1 -1
- package/dist/learner/revert-stats.d.ts +14 -0
- package/dist/learner/revert-stats.js +42 -0
- package/dist/learner/revert-stats.js.map +1 -1
- package/dist/local-provider-url.d.ts +3 -0
- package/dist/local-provider-url.js +28 -0
- package/dist/local-provider-url.js.map +1 -0
- package/dist/main.js +94 -25
- package/dist/main.js.map +1 -1
- package/dist/plugins/learner/.claude-plugin/plugin.json +1 -1
- package/dist/plugins/learner/.codex-plugin/plugin.json +1 -1
- package/dist/plugins/learner/hooks/session-start +30 -1
- package/dist/plugins/learner/skills/learn/consolidator-prompt.md +4 -0
- package/dist/preflight/deployment-readiness.d.ts +147 -0
- package/dist/preflight/deployment-readiness.js +366 -0
- package/dist/preflight/deployment-readiness.js.map +1 -0
- package/dist/preflight/pidfile-liveness.d.ts +7 -1
- package/dist/preflight/pidfile-liveness.js +14 -0
- package/dist/preflight/pidfile-liveness.js.map +1 -1
- package/dist/rpc/transport.d.ts +36 -0
- package/dist/rpc/transport.js +123 -24
- package/dist/rpc/transport.js.map +1 -1
- package/dist/scripts/swe-rebench-v2-seed-pool.json +2 -1
- package/dist/solver-nets/registry.d.ts +19 -0
- package/dist/solver-nets/registry.js +92 -66
- package/dist/solver-nets/registry.js.map +1 -1
- package/dist/solver-types/_swe-rebench-v2-held-out-slate.d.ts +76 -0
- package/dist/solver-types/_swe-rebench-v2-held-out-slate.js +156 -0
- package/dist/solver-types/_swe-rebench-v2-held-out-slate.js.map +1 -0
- package/dist/solver-types/_swe-rebench-v2-pool-recovery.d.ts +81 -0
- package/dist/solver-types/_swe-rebench-v2-pool-recovery.js +116 -0
- package/dist/solver-types/_swe-rebench-v2-pool-recovery.js.map +1 -0
- package/dist/solver-types/_swe-rebench-v2-state.d.ts +9 -0
- package/dist/solver-types/_swe-rebench-v2-state.js +14 -0
- package/dist/solver-types/_swe-rebench-v2-state.js.map +1 -1
- package/dist/solver-types/_swe-rebench-v2-validated-pool.d.ts +30 -0
- package/dist/solver-types/_swe-rebench-v2-validated-pool.js +40 -0
- package/dist/solver-types/_swe-rebench-v2-validated-pool.js.map +1 -1
- package/dist/solver-types/slates/held-out-slate.swe-rebench-v2.v1.json +20 -0
- package/dist/solver-types/slates/held-out-slate.swe-rebench-v2.v2.json +19 -0
- package/dist/solver-types/slates/held-out-slate.swe-rebench-v2.v2.screening-report.json +628 -0
- package/dist/solver-types/solver-type.d.ts +8 -0
- package/dist/solver-types/swe-rebench-v2.d.ts +2 -0
- package/dist/solver-types/swe-rebench-v2.js +115 -10
- package/dist/solver-types/swe-rebench-v2.js.map +1 -1
- package/dist/solvernets/launched-record-dispatcher.d.ts +3 -0
- package/dist/solvernets/launched-record-dispatcher.js.map +1 -1
- package/dist/solvernets/registry-client-erc8004.js +29 -37
- package/dist/solvernets/registry-client-erc8004.js.map +1 -1
- package/dist/solvernets/registry-client.d.ts +6 -0
- package/dist/solvernets/store.js +7 -2
- package/dist/solvernets/store.js.map +1 -1
- package/dist/spend/ai-units-config.d.ts +10 -0
- package/dist/spend/ai-units-config.js +7 -1
- package/dist/spend/ai-units-config.js.map +1 -1
- package/dist/spend/ai-units.d.ts +51 -0
- package/dist/spend/ai-units.js +73 -0
- package/dist/spend/ai-units.js.map +1 -1
- package/dist/spend/record.js +12 -5
- package/dist/spend/record.js.map +1 -1
- package/dist/store/store.d.ts +91 -5
- package/dist/store/store.js +170 -7
- package/dist/store/store.js.map +1 -1
- package/dist/vendor/@jinn-network/sdk/dist/payloads/swe-rebench-v2.d.ts +108 -1
- package/dist/vendor/@jinn-network/sdk/dist/payloads/swe-rebench-v2.js +25 -1
- package/dist/vendor/@jinn-network/sdk/dist/solvernets/swe-rebench-v2-held-out-slate.d.ts +65 -0
- package/dist/vendor/@jinn-network/sdk/dist/solvernets/swe-rebench-v2-held-out-slate.js +123 -0
- package/dist/vendor/@jinn-network/sdk/dist/solvernets/swe-rebench-v2.d.ts +2 -2
- package/dist/vendor/@jinn-network/sdk/dist/solvernets/swe-rebench-v2.js +1 -1
- package/dist/vendor/@jinn-network/sdk/package.json +4 -0
- package/docker-compose.yml +3 -2
- package/package.json +22 -18
- package/plugins/learner/.claude-plugin/plugin.json +1 -1
- package/plugins/learner/.codex-plugin/plugin.json +1 -1
- package/plugins/learner/hooks/session-start +30 -1
- package/plugins/learner/skills/learn/consolidator-prompt.md +4 -0
- package/plugins/swe-rebench-v2-runtime/hooks/hooks.json +16 -0
- package/plugins/swe-rebench-v2-runtime/hooks/session-start +74 -0
- package/dist/dashboard/assets/index-CzKxvMcU.css +0 -32
- package/dist/dashboard/assets/index-yVemxHot.js +0 -351
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Paired (matched-design) comparison for the held-out exam.
|
|
3
|
+
*
|
|
4
|
+
* The exam scores the SAME slate against the before- and after-checkpoints, so
|
|
5
|
+
* the two result sets are MATCHED, not independent. The marginal test in
|
|
6
|
+
* `wilson.ts` (`compareRates`) compares two INDEPENDENT Wilson intervals and
|
|
7
|
+
* calls a delta trustworthy only when they are disjoint. On a matched design
|
|
8
|
+
* that is statistically wasteful: it carries the full between-instance
|
|
9
|
+
* difficulty variance ("this bug is just hard") in BOTH intervals, which swamps
|
|
10
|
+
* a real, consistent within-instance improvement. Example: a learner that flips
|
|
11
|
+
* 3 hard instances fail→pass and regresses none moves 4/10→7/10 — two Wilson
|
|
12
|
+
* intervals ([17,69]% vs [40,89]%) overlap → "within-noise", even though every
|
|
13
|
+
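* observed change was an improvement. (McNemar still needs enough flips to
* reach significance: 3 improvements vs 0 regressions gives an exact two-sided
* p of 0.25; at alpha = 0.05 it takes at least 6 flips, all in one direction.)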
|
|
14
|
+
*
|
|
15
|
+
* McNemar's test is the textbook test for matched binary data: it looks ONLY at
|
|
16
|
+
* the discordant pairs (instances that flipped) and asks whether the flips are
|
|
17
|
+
* asymmetric beyond chance. This is NOT a weaker bar — it is the CORRECT test
|
|
18
|
+
* for the design, and it still gates the claim on significance (exact two-sided
|
|
19
|
+
* p < alpha) AND on the improvement direction (more fail→pass than pass→fail).
|
|
20
|
+
* It is reported ALONGSIDE the conservative marginal verdict, never replacing it
|
|
21
|
+
* — so the exam is strengthened, never weakened (DR-2026-06-02-b §2a).
|
|
22
|
+
*
|
|
23
|
+
* R=1 (one run per instance) ships here: McNemar exact on per-instance flips.
|
|
24
|
+
* R>1 (per-instance pass RATES → Wilcoxon signed-rank / paired bootstrap) layers
|
|
25
|
+
* on once `eval_results` supports appended runs (DR-2026-06-02-b §2b).
|
|
26
|
+
*/
|
|
27
|
+
/**
 * One instance's outcome in a single arm of the comparison — the minimal
 * subset of PerTaskResult / EvalResultRow that the paired test reads.
 */
export interface PairedInput {
    /** Key used to match the same instance across the before and after arms. */
    instance_id: string;
    /** Pass/fail outcome; null when unscorable. */
    passed: boolean | null;
    /** Marks the row as unscorable; `comparePaired` never pairs such a row. */
    unscorable: boolean;
}
|
|
34
|
+
/**
 * Label produced by the paired test: `'trustworthy'` for a significant
 * improvement, `'within-noise'` for everything else.
 */
export type PairedVerdict = 'trustworthy' | 'within-noise';
|
|
35
|
+
/** Result of a paired (McNemar) comparison between two matched result sets. */
export interface PairedComparison {
    /** Matched pairs: instances scorable (passed !== null) in BOTH arms. */
    pairs: number;
    /** Discordant count b: failed before, passed after (improvements). */
    improved: number;
    /** Discordant count c: passed before, failed after (regressions). */
    regressed: number;
    /** Pairs that passed in both arms. */
    concordantPass: number;
    /** Pairs that failed in both arms. */
    concordantFail: number;
    /** Instances unscorable or absent in either arm; left out of the paired test. */
    excluded: number;
    /** Exact two-sided McNemar p-value computed over the discordant pairs only. */
    pValue: number;
    /** 'trustworthy' iff pValue < alpha AND improved > regressed; otherwise 'within-noise'. */
    verdict: PairedVerdict;
}
|
|
51
|
+
/**
 * Exact two-sided McNemar p-value for the discordant counts `b` (improvements)
 * and `c` (regressions).
 *
 * Under H0 each discordant pair is an independent fair coin, so the count in
 * one direction is Binomial(n = b + c, 0.5) and the two-sided p-value is
 * `2 * P(X <= min(b, c))`, capped at 1. With no discordant pairs
 * (`b == c == 0`) the result is 1 (no evidence) and nothing is divided.
 *
 * The sum is built iteratively from the ratio of consecutive binomial terms,
 * so no factorial is ever formed.
 *
 * NOTE(review): a starting term of `0.5^n` underflows to 0 in double precision
 * once n exceeds ~1074 — confirm the implementation's behaviour for very large
 * discordant counts before relying on it there.
 *
 * @param b - number of fail→pass flips.
 * @param c - number of pass→fail flips.
 * @returns the p-value, at most 1.
 */
export declare function mcnemarExact(b: number, c: number): number;
|
|
60
|
+
/**
 * Paired McNemar comparison of two matched per-instance result sets.
 *
 * Rows are matched on `instance_id`. An instance forms a pair ONLY when it is
 * scorable (passed !== null, not unscorable) in BOTH arms; an unscorable or
 * missing side is counted as excluded, so a disk-skip or harness failure is
 * never coerced into a flip.
 *
 * @param before - per-instance results for the before-checkpoint.
 * @param after - per-instance results for the after-checkpoint.
 * @param opts - `alpha` is the significance threshold; defaults to 0.05.
 * @returns the pair counts, the exact McNemar p-value and the verdict.
 */
export declare function comparePaired(before: readonly PairedInput[], after: readonly PairedInput[], opts?: {
    alpha?: number;
}): PairedComparison;
|
|
@@ -0,0 +1,93 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Paired (matched-design) comparison for the held-out exam.
|
|
3
|
+
*
|
|
4
|
+
* The exam scores the SAME slate against the before- and after-checkpoints, so
|
|
5
|
+
* the two result sets are MATCHED, not independent. The marginal test in
|
|
6
|
+
* `wilson.ts` (`compareRates`) compares two INDEPENDENT Wilson intervals and
|
|
7
|
+
* calls a delta trustworthy only when they are disjoint. On a matched design
|
|
8
|
+
* that is statistically wasteful: it carries the full between-instance
|
|
9
|
+
* difficulty variance ("this bug is just hard") in BOTH intervals, which swamps
|
|
10
|
+
* a real, consistent within-instance improvement. Example: a learner that flips
|
|
11
|
+
* 3 hard instances fail→pass and regresses none moves 4/10→7/10 — two Wilson
|
|
12
|
+
* intervals ([17,69]% vs [40,89]%) overlap → "within-noise", even though every
|
|
13
|
+
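* observed change was an improvement. (McNemar still needs enough flips to
* reach significance: 3 improvements vs 0 regressions gives an exact two-sided
* p of 0.25; at alpha = 0.05 it takes at least 6 flips, all in one direction.)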
|
|
14
|
+
*
|
|
15
|
+
* McNemar's test is the textbook test for matched binary data: it looks ONLY at
|
|
16
|
+
* the discordant pairs (instances that flipped) and asks whether the flips are
|
|
17
|
+
* asymmetric beyond chance. This is NOT a weaker bar — it is the CORRECT test
|
|
18
|
+
* for the design, and it still gates the claim on significance (exact two-sided
|
|
19
|
+
* p < alpha) AND on the improvement direction (more fail→pass than pass→fail).
|
|
20
|
+
* It is reported ALONGSIDE the conservative marginal verdict, never replacing it
|
|
21
|
+
* — so the exam is strengthened, never weakened (DR-2026-06-02-b §2a).
|
|
22
|
+
*
|
|
23
|
+
* R=1 (one run per instance) ships here: McNemar exact on per-instance flips.
|
|
24
|
+
* R>1 (per-instance pass RATES → Wilcoxon signed-rank / paired bootstrap) layers
|
|
25
|
+
* on once `eval_results` supports appended runs (DR-2026-06-02-b §2b).
|
|
26
|
+
*/
|
|
27
|
+
/**
 * Exact two-sided McNemar p-value for discordant counts `b` (improvements) and
 * `c` (regressions).
 *
 * Under H0 each discordant pair is an independent fair coin, so the count of
 * one direction is Binomial(n = b + c, 0.5); the two-sided p is
 * `2 * P(X <= min(b, c))`, capped at 1. With no discordant pairs
 * (`b == c == 0`) there is no evidence either way and the result is 1.
 *
 * The binomial sum is accumulated term by term via the ratio
 * `C(n, i) / C(n, i - 1) = (n - i + 1) / i`, so no factorial is formed.
 * Two regimes keep the result correct for any slate size:
 *  - n <= 1022: every `0.5^n` is an exact, normal double, so the sum is
 *    accumulated directly.
 *  - n > 1022: `0.5^n` would be subnormal (and exactly 0 beyond n = 1074),
 *    which would zero every later term and report p = 0 — a false
 *    "significant" result. The terms are tracked in log-space instead.
 *
 * @param {number} b count of fail→pass flips (non-negative integer).
 * @param {number} c count of pass→fail flips (non-negative integer).
 * @returns {number} the p-value in [0, 1].
 */
export function mcnemarExact(b, c) {
    const n = b + c;
    if (n === 0)
        return 1;
    const k = Math.min(b, c);
    // 2^-1022 is the smallest normal double; up to there the direct product is safe.
    if (n <= 1022) {
        // sum_{i=0}^{k} C(n,i) * 0.5^n, via prob_0 = 0.5^n, prob_i = prob_{i-1}*(n-i+1)/i.
        let term = Math.pow(0.5, n); // i = 0
        let cdf = term;
        for (let i = 1; i <= k; i++) {
            term = (term * (n - i + 1)) / i;
            cdf += term;
        }
        return Math.min(1, 2 * cdf);
    }
    // Large n: carry log(prob_i) so the tiny leading terms cannot wipe out the
    // sum. A term that is still below the double range contributes exp(...) = 0,
    // which is harmless; the terms near i = k are the ones that matter.
    let logTerm = -n * Math.LN2; // log(0.5^n), i = 0
    let cdf = Math.exp(logTerm);
    for (let i = 1; i <= k; i++) {
        logTerm += Math.log((n - i + 1) / i);
        cdf += Math.exp(logTerm);
    }
    return Math.min(1, 2 * cdf);
}
|
|
49
|
+
/**
|
|
50
|
+
* Paired McNemar comparison of two matched per-instance result sets. An instance
|
|
51
|
+
* contributes a pair ONLY when it is scorable (passed !== null, not unscorable)
|
|
52
|
+
* in BOTH arms — an unscorable/missing side is excluded (honest: a disk-skip or
|
|
53
|
+
* harness failure is never coerced into a flip). `alpha` defaults to 0.05.
|
|
54
|
+
*/
|
|
55
|
+
export function comparePaired(before, after, opts = {}) {
|
|
56
|
+
const alpha = opts.alpha ?? 0.05;
|
|
57
|
+
const scorable = (r) => !r.unscorable && r.passed !== null;
|
|
58
|
+
const beforeById = new Map(before.map((r) => [r.instance_id, r]));
|
|
59
|
+
let improved = 0;
|
|
60
|
+
let regressed = 0;
|
|
61
|
+
let concordantPass = 0;
|
|
62
|
+
let concordantFail = 0;
|
|
63
|
+
let pairs = 0;
|
|
64
|
+
let excluded = 0;
|
|
65
|
+
const seen = new Set();
|
|
66
|
+
for (const a of after) {
|
|
67
|
+
seen.add(a.instance_id);
|
|
68
|
+
const b = beforeById.get(a.instance_id);
|
|
69
|
+
if (!b || !scorable(b) || !scorable(a)) {
|
|
70
|
+
excluded++;
|
|
71
|
+
continue;
|
|
72
|
+
}
|
|
73
|
+
pairs++;
|
|
74
|
+
const wasPass = b.passed === true;
|
|
75
|
+
const nowPass = a.passed === true;
|
|
76
|
+
if (!wasPass && nowPass)
|
|
77
|
+
improved++;
|
|
78
|
+
else if (wasPass && !nowPass)
|
|
79
|
+
regressed++;
|
|
80
|
+
else if (wasPass && nowPass)
|
|
81
|
+
concordantPass++;
|
|
82
|
+
else
|
|
83
|
+
concordantFail++;
|
|
84
|
+
}
|
|
85
|
+
// before-only instances (absent from after) are excluded too.
|
|
86
|
+
for (const b of before)
|
|
87
|
+
if (!seen.has(b.instance_id))
|
|
88
|
+
excluded++;
|
|
89
|
+
const pValue = mcnemarExact(improved, regressed);
|
|
90
|
+
const verdict = pValue < alpha && improved > regressed ? 'trustworthy' : 'within-noise';
|
|
91
|
+
return { pairs, improved, regressed, concordantPass, concordantFail, excluded, pValue, verdict };
|
|
92
|
+
}
|
|
93
|
+
//# sourceMappingURL=paired.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"paired.js","sourceRoot":"","sources":["../../src/eval/paired.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;GAyBG;AA6BH;;;;;;;GAOG;AACH,MAAM,UAAU,YAAY,CAAC,CAAS,EAAE,CAAS;IAC/C,MAAM,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC;IAChB,IAAI,CAAC,KAAK,CAAC;QAAE,OAAO,CAAC,CAAC;IACtB,MAAM,CAAC,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC;IACzB,mFAAmF;IACnF,IAAI,IAAI,GAAG,IAAI,CAAC,GAAG,CAAC,GAAG,EAAE,CAAC,CAAC,CAAC,CAAC,QAAQ;IACrC,IAAI,GAAG,GAAG,IAAI,CAAC;IACf,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,IAAI,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC;QAC5B,IAAI,GAAG,CAAC,IAAI,GAAG,CAAC,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC;QAChC,GAAG,IAAI,IAAI,CAAC;IACd,CAAC;IACD,OAAO,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,CAAC,GAAG,GAAG,CAAC,CAAC;AAC9B,CAAC;AAED;;;;;GAKG;AACH,MAAM,UAAU,aAAa,CAC3B,MAA8B,EAC9B,KAA6B,EAC7B,OAA2B,EAAE;IAE7B,MAAM,KAAK,GAAG,IAAI,CAAC,KAAK,IAAI,IAAI,CAAC;IACjC,MAAM,QAAQ,GAAG,CAAC,CAAc,EAAW,EAAE,CAAC,CAAC,CAAC,CAAC,UAAU,IAAI,CAAC,CAAC,MAAM,KAAK,IAAI,CAAC;IACjF,MAAM,UAAU,GAAG,IAAI,GAAG,CAAC,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,WAAW,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC;IAElE,IAAI,QAAQ,GAAG,CAAC,CAAC;IACjB,IAAI,SAAS,GAAG,CAAC,CAAC;IAClB,IAAI,cAAc,GAAG,CAAC,CAAC;IACvB,IAAI,cAAc,GAAG,CAAC,CAAC;IACvB,IAAI,KAAK,GAAG,CAAC,CAAC;IACd,IAAI,QAAQ,GAAG,CAAC,CAAC;IAEjB,MAAM,IAAI,GAAG,IAAI,GAAG,EAAU,CAAC;IAC/B,KAAK,MAAM,CAAC,IAAI,KAAK,EAAE,CAAC;QACtB,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC,WAAW,CAAC,CAAC;QACxB,MAAM,CAAC,GAAG,UAAU,CAAC,GAAG,CAAC,CAAC,CAAC,WAAW,CAAC,CAAC;QACxC,IAAI,CAAC,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC,CAAC,EAAE,CAAC;YACvC,QAAQ,EAAE,CAAC;YACX,SAAS;QACX,CAAC;QACD,KAAK,EAAE,CAAC;QACR,MAAM,OAAO,GAAG,CAAC,CAAC,MAAM,KAAK,IAAI,CAAC;QAClC,MAAM,OAAO,GAAG,CAAC,CAAC,MAAM,KAAK,IAAI,CAAC;QAClC,IAAI,CAAC,OAAO,IAAI,OAAO;YAAE,QAAQ,EAAE,CAAC;aAC/B,IAAI,OAAO,IAAI,CAAC,OAAO;YAAE,SAAS,EAAE,CAAC;aACrC,IAAI,OAAO,IAAI,OAAO;YAAE,cAAc,EAAE,CAAC;;YACzC,cAAc,EAAE,CAAC;IACxB,CAAC;IACD,8DAA8D;IAC9D,KAAK,MAAM,CAAC,IAAI,MAAM;QAAE,IAAI,CAAC,IAAI,CAAC,GAAG,CAA
C,CAAC,CAAC,WAAW,CAAC;YAAE,QAAQ,EAAE,CAAC;IAEjE,MAAM,MAAM,GAAG,YAAY,CAAC,QAAQ,EAAE,SAAS,CAAC,CAAC;IACjD,MAAM,OAAO,GACX,MAAM,GAAG,KAAK,IAAI,QAAQ,GAAG,SAAS,CAAC,CAAC,CAAC,aAAa,CAAC,CAAC,CAAC,cAAc,CAAC;IAE1E,OAAO,EAAE,KAAK,EAAE,QAAQ,EAAE,SAAS,EAAE,cAAc,EAAE,cAAc,EAAE,QAAQ,EAAE,MAAM,EAAE,OAAO,EAAE,CAAC;AACnG,CAAC"}
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Resolve a held-out slate's `instance_id`s into `SweRebenchV2Task` objects
|
|
3
|
+
* for the `jinn eval` orchestrator (issue #818, AC#1).
|
|
4
|
+
*
|
|
5
|
+
* A slate stores only `instance_id`s (`_swe-rebench-v2-held-out-slate.ts`). The
|
|
6
|
+
* evaluator grades against the full HuggingFace row, fetched by
|
|
7
|
+
* `(hf_dataset, hf_split, instance_id)`. The `HfRow` does NOT echo
|
|
8
|
+
* `hf_dataset`/`hf_split` (see `swe-rebench-v2-evaluator/index.ts`), so the
|
|
9
|
+
* caller supplies the slate-level dataset+split as args; the resolver verifies
|
|
10
|
+
* each id exists by fetching its row and returns the `{ task, row }` pair so the
|
|
11
|
+
* orchestrator reuses the row at grade time (avoids a second fetch).
|
|
12
|
+
*
|
|
13
|
+
* No retry logic here — `HttpHfFetcher` owns retries. A fetcher throw for any id
|
|
14
|
+
* propagates loudly (a missing slate instance is a hard error, never a silent
|
|
15
|
+
* drop).
|
|
16
|
+
*/
|
|
17
|
+
import { type SweRebenchV2Task } from '../vendor/@jinn-network/sdk/dist/solvernets/swe-rebench-v2.js';
|
|
18
|
+
import type { HfFetcher, HfRow } from '../harnesses/impls/swe-rebench-v2-evaluator/index.js';
|
|
19
|
+
import type { PoolTask } from '../solver-types/_swe-rebench-v2-pool.js';
|
|
20
|
+
/**
 * A slate instance resolved for the eval orchestrator: the task object built
 * for it, together with the dataset row that was fetched while resolving it.
 */
export interface ResolvedSlateTask {
    /** Task built from the pool task, the slate-level dataset/split and the row. */
    task: SweRebenchV2Task;
    /** Row fetched for this instance; returned so grading can reuse it. */
    row: HfRow;
}
|
|
24
|
+
/**
 * Resolve the pool tasks of a slate group into `{ task, row }` pairs by
 * fetching each instance's row from the given dataset and split.
 *
 * @returns one entry per pool task, ordered by `instance_id`.
 */
export declare function resolveSlateTasks(args: {
    /**
     * Pool tasks for this slate group. The slate itself stores only ids; the
     * pool supplies the real `problem_statement` / `base_commit` / `language`
     * that the agent run needs to solve the instance.
     */
    poolTasks: PoolTask[];
    /** Dataset half of the row-fetch key, shared by the whole slate. */
    hf_dataset: string;
    /** Split half of the row-fetch key, shared by the whole slate. */
    hf_split: string;
    /** Fetches one row per instance; a throw from it is not caught here. */
    fetcher: HfFetcher;
}): Promise<ResolvedSlateTask[]>;
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Resolve a held-out slate's `instance_id`s into `SweRebenchV2Task` objects
|
|
3
|
+
* for the `jinn eval` orchestrator (issue #818, AC#1).
|
|
4
|
+
*
|
|
5
|
+
* A slate stores only `instance_id`s (`_swe-rebench-v2-held-out-slate.ts`). The
|
|
6
|
+
* evaluator grades against the full HuggingFace row, fetched by
|
|
7
|
+
* `(hf_dataset, hf_split, instance_id)`. The `HfRow` does NOT echo
|
|
8
|
+
* `hf_dataset`/`hf_split` (see `swe-rebench-v2-evaluator/index.ts`), so the
|
|
9
|
+
* caller supplies the slate-level dataset+split as args; the resolver verifies
|
|
10
|
+
* each id exists by fetching its row and returns the `{ task, row }` pair so the
|
|
11
|
+
* orchestrator reuses the row at grade time (avoids a second fetch).
|
|
12
|
+
*
|
|
13
|
+
* No retry logic here — `HttpHfFetcher` owns retries. A fetcher throw for any id
|
|
14
|
+
* propagates loudly (a missing slate instance is a hard error, never a silent
|
|
15
|
+
* drop).
|
|
16
|
+
*/
|
|
17
|
+
import { SweRebenchV2TaskSchema } from '../vendor/@jinn-network/sdk/dist/solvernets/swe-rebench-v2.js';
|
|
18
|
+
export async function resolveSlateTasks(args) {
|
|
19
|
+
const tasks = [...args.poolTasks].sort((a, b) => a.instance_id.localeCompare(b.instance_id));
|
|
20
|
+
const out = [];
|
|
21
|
+
for (const poolTask of tasks) {
|
|
22
|
+
const row = await args.fetcher.fetchTaskRow({
|
|
23
|
+
hf_dataset: args.hf_dataset,
|
|
24
|
+
hf_split: args.hf_split,
|
|
25
|
+
instance_id: poolTask.instance_id,
|
|
26
|
+
});
|
|
27
|
+
out.push({ task: buildTask(poolTask, args.hf_dataset, args.hf_split, row), row });
|
|
28
|
+
}
|
|
29
|
+
return out;
|
|
30
|
+
}
|
|
31
|
+
/**
|
|
32
|
+
* Construct the `SweRebenchV2Task` the orchestrator hands to the harness +
|
|
33
|
+
* evaluator. The agent run needs the real `problem_statement` (what to solve)
|
|
34
|
+
* and `base_commit` (the repo state to check out) — these are threaded through
|
|
35
|
+
* from the pool task (the generator's mapping in `swe-rebench-v2.ts`).
|
|
36
|
+
* `hf_dataset`/`hf_split` are the evaluator's row-fetch key; `repo` comes from
|
|
37
|
+
* the fetched row. A pool task missing an optional field falls back the same
|
|
38
|
+
* way the generator does, so the object stays schema-valid.
|
|
39
|
+
*/
|
|
40
|
+
function buildTask(poolTask, hf_dataset, hf_split, row) {
|
|
41
|
+
const language = SweRebenchV2TaskSchema.shape.language.safeParse(poolTask.language);
|
|
42
|
+
return {
|
|
43
|
+
schemaVersion: 'swe-rebench-v2.v1',
|
|
44
|
+
instance_id: poolTask.instance_id,
|
|
45
|
+
repo: row.repo,
|
|
46
|
+
base_commit: poolTask.base_commit ?? '0'.repeat(40),
|
|
47
|
+
language: language.success ? language.data : 'python',
|
|
48
|
+
problem_statement: poolTask.problem_statement ?? '',
|
|
49
|
+
interface: poolTask.interface ?? '',
|
|
50
|
+
hf_dataset,
|
|
51
|
+
hf_split,
|
|
52
|
+
deadline_unix: 1,
|
|
53
|
+
round_month: hf_split.replace('_', '-'),
|
|
54
|
+
};
|
|
55
|
+
}
|
|
56
|
+
//# sourceMappingURL=resolve-slate-tasks.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"resolve-slate-tasks.js","sourceRoot":"","sources":["../../src/eval/resolve-slate-tasks.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;GAeG;AAEH,OAAO,EAAE,sBAAsB,EAAyB,MAAM,6CAA6C,CAAC;AAS5G,MAAM,CAAC,KAAK,UAAU,iBAAiB,CAAC,IAWvC;IACC,MAAM,KAAK,GAAG,CAAC,GAAG,IAAI,CAAC,SAAS,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,WAAW,CAAC,aAAa,CAAC,CAAC,CAAC,WAAW,CAAC,CAAC,CAAC;IAC7F,MAAM,GAAG,GAAwB,EAAE,CAAC;IACpC,KAAK,MAAM,QAAQ,IAAI,KAAK,EAAE,CAAC;QAC7B,MAAM,GAAG,GAAG,MAAM,IAAI,CAAC,OAAO,CAAC,YAAY,CAAC;YAC1C,UAAU,EAAE,IAAI,CAAC,UAAU;YAC3B,QAAQ,EAAE,IAAI,CAAC,QAAQ;YACvB,WAAW,EAAE,QAAQ,CAAC,WAAW;SAClC,CAAC,CAAC;QACH,GAAG,CAAC,IAAI,CAAC,EAAE,IAAI,EAAE,SAAS,CAAC,QAAQ,EAAE,IAAI,CAAC,UAAU,EAAE,IAAI,CAAC,QAAQ,EAAE,GAAG,CAAC,EAAE,GAAG,EAAE,CAAC,CAAC;IACpF,CAAC;IACD,OAAO,GAAG,CAAC;AACb,CAAC;AAED;;;;;;;;GAQG;AACH,SAAS,SAAS,CAChB,QAAkB,EAClB,UAAkB,EAClB,QAAgB,EAChB,GAAU;IAEV,MAAM,QAAQ,GAAG,sBAAsB,CAAC,KAAK,CAAC,QAAQ,CAAC,SAAS,CAAC,QAAQ,CAAC,QAAQ,CAAC,CAAC;IACpF,OAAO;QACL,aAAa,EAAE,mBAAmB;QAClC,WAAW,EAAE,QAAQ,CAAC,WAAW;QACjC,IAAI,EAAE,GAAG,CAAC,IAAI;QACd,WAAW,EAAE,QAAQ,CAAC,WAAW,IAAI,GAAG,CAAC,MAAM,CAAC,EAAE,CAAC;QACnD,QAAQ,EAAE,QAAQ,CAAC,OAAO,CAAC,CAAC,CAAC,QAAQ,CAAC,IAAI,CAAC,CAAC,CAAC,QAAQ;QACrD,iBAAiB,EAAE,QAAQ,CAAC,iBAAiB,IAAI,EAAE;QACnD,SAAS,EAAE,QAAQ,CAAC,SAAS,IAAI,EAAE;QACnC,UAAU;QACV,QAAQ;QACR,aAAa,EAAE,CAAC;QAChB,WAAW,EAAE,QAAQ,CAAC,OAAO,CAAC,GAAG,EAAE,GAAG,CAAC;KACxC,CAAC;AACJ,CAAC"}
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Held-out screening's authoritative "already trained-on" exclusion source (#986).
|
|
3
|
+
*
|
|
4
|
+
* Returns the set of swe-rebench-v2 `instance_id`s that have been ATTEMPTED on
|
|
5
|
+
* the network for a SolverNet — any verdict envelope, passed OR failed,
|
|
6
|
+
* cross-operator. An attempted instance was executed by an operator, so the
|
|
7
|
+
* learner trained on it; holding it out later would make a trained-checkpoint
|
|
8
|
+
* pass count as memorization, not generalization.
|
|
9
|
+
*
|
|
10
|
+
* Why the indexer and not the local generator-state: the local ledger only
|
|
11
|
+
* reflects THIS box's posting and can be stale (a different active generator —
|
|
12
|
+
* e.g. a hosted operator — posts to its own ledger). The indexer's
|
|
13
|
+
* `verdictEnvelopeMeta` is the cross-operator, current record. (The on-chain
|
|
14
|
+
* task/attempt tables carry no instance_id; only the indexer's IPFS enrichment
|
|
15
|
+
* resolves it — same backing as `DiscoveryAPI.getInstanceSuccessCounts`, minus
|
|
16
|
+
* the `actualPassed: true` filter so failed attempts count too.)
|
|
17
|
+
*
|
|
18
|
+
* Throws on indexer failure — callers MUST abort rather than screen against an
|
|
19
|
+
* unknown attempted set, because a missing exclusion can silently contaminate
|
|
20
|
+
* the exam (the whole point of held-out discipline).
|
|
21
|
+
*/
|
|
22
|
+
export declare function fetchAttemptedInstanceIds(discoveryUrl: string, manifestCid: string, fetchImpl?: typeof fetch): Promise<Set<string>>;
|
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Held-out screening's authoritative "already trained-on" exclusion source (#986).
|
|
3
|
+
*
|
|
4
|
+
* Returns the set of swe-rebench-v2 `instance_id`s that have been ATTEMPTED on
|
|
5
|
+
* the network for a SolverNet — any verdict envelope, passed OR failed,
|
|
6
|
+
* cross-operator. An attempted instance was executed by an operator, so the
|
|
7
|
+
* learner trained on it; holding it out later would make a trained-checkpoint
|
|
8
|
+
* pass count as memorization, not generalization.
|
|
9
|
+
*
|
|
10
|
+
* Why the indexer and not the local generator-state: the local ledger only
|
|
11
|
+
* reflects THIS box's posting and can be stale (a different active generator —
|
|
12
|
+
* e.g. a hosted operator — posts to its own ledger). The indexer's
|
|
13
|
+
* `verdictEnvelopeMeta` is the cross-operator, current record. (The on-chain
|
|
14
|
+
* task/attempt tables carry no instance_id; only the indexer's IPFS enrichment
|
|
15
|
+
* resolves it — same backing as `DiscoveryAPI.getInstanceSuccessCounts`, minus
|
|
16
|
+
* the `actualPassed: true` filter so failed attempts count too.)
|
|
17
|
+
*
|
|
18
|
+
* Throws on indexer failure — callers MUST abort rather than screen against an
|
|
19
|
+
* unknown attempted set, because a missing exclusion can silently contaminate
|
|
20
|
+
* the exam (the whole point of held-out discipline).
|
|
21
|
+
*/
|
|
22
|
+
/**
 * GraphQL query listing the `instanceId` of every successfully enriched
 * swe-rebench-v2 verdict envelope for one SolverNet manifest, one cursor page
 * at a time. There is deliberately no pass/fail filter: failed attempts count.
 */
const ATTEMPTED_QUERY = `
  query InstanceAttempted($cid: String!, $limit: Int!, $after: String) {
    verdictEnvelopeMetas(
      where: {
        solverNetManifestCid: $cid,
        solverType_starts_with: "swe-rebench-v2",
        enrichmentStatus: "ok",
        instanceId_not: ""
      },
      limit: $limit,
      after: $after,
      orderBy: "enrichedAtBlock",
      orderDirection: "asc"
    ) {
      items { instanceId }
      pageInfo { hasNextPage endCursor }
    }
  }`;

/**
 * Collect the set of instance ids that have a verdict envelope on the indexer
 * for the given SolverNet manifest.
 *
 * The function is fail-closed: every situation in which the returned set could
 * be incomplete raises instead of returning a partial result, because a missing
 * exclusion would silently contaminate the held-out slate.
 *
 * @param discoveryUrl Indexer base URL, with or without a trailing `/graphql`
 *   and with or without trailing slashes.
 * @param manifestCid SolverNet manifest CID passed as the `$cid` query variable.
 * @param fetchImpl `fetch`-compatible function; injectable for tests.
 * @returns Set of non-empty `instanceId` strings across all pages.
 * @throws Error on a non-2xx HTTP status, on GraphQL `errors`, when the
 *   response carries no `verdictEnvelopeMetas.items` array, when the server
 *   reports another page without a cursor, or when more pages remain after the
 *   page cap has been exhausted.
 */
export async function fetchAttemptedInstanceIds(discoveryUrl, manifestCid, fetchImpl = fetch) {
    // Strip every trailing slash first so ".../graphql/" is recognised as already
    // pointing at the endpoint instead of becoming ".../graphql/graphql".
    const baseUrl = discoveryUrl.replace(/\/+$/, '');
    const gqlUrl = baseUrl.endsWith('/graphql') ? baseUrl : `${baseUrl}/graphql`;
    const ids = new Set();
    let cursor = null;
    const MAX_PAGES = 20;
    const PAGE_LIMIT = 1000;
    for (let page = 0; page < MAX_PAGES; page++) {
        const res = await fetchImpl(gqlUrl, {
            method: 'POST',
            headers: { 'content-type': 'application/json' },
            body: JSON.stringify({ query: ATTEMPTED_QUERY, variables: { cid: manifestCid, limit: PAGE_LIMIT, after: cursor } }),
        });
        if (!res.ok) {
            throw new Error(`held-out screening: indexer attempted-ids query failed (HTTP ${res.status}) at ${gqlUrl}`);
        }
        const json = (await res.json());
        if (json?.errors) {
            throw new Error(`held-out screening: indexer attempted-ids query errors: ${JSON.stringify(json.errors).slice(0, 300)}`);
        }
        const conn = json?.data?.verdictEnvelopeMetas;
        // A response without the connection means we learned nothing about the
        // attempted set; treating it as "no attempts" would defeat the exclusion.
        if (!conn || !Array.isArray(conn.items)) {
            throw new Error(`held-out screening: indexer attempted-ids response has no verdictEnvelopeMetas.items at ${gqlUrl}`);
        }
        for (const item of conn.items) {
            if (item?.instanceId)
                ids.add(item.instanceId);
        }
        const pageInfo = conn.pageInfo;
        if (!pageInfo?.hasNextPage)
            return ids;
        // More rows exist but we cannot ask for them: the set would be truncated.
        if (!pageInfo.endCursor) {
            throw new Error(`held-out screening: indexer attempted-ids response reports hasNextPage without an endCursor at ${gqlUrl}`);
        }
        cursor = pageInfo.endCursor;
    }
    // The loop only falls through when the last page still advertised a successor.
    throw new Error(`held-out screening: indexer attempted-ids query still has more pages after ${MAX_PAGES} pages of ${PAGE_LIMIT}; refusing to return a truncated set`);
}
|
|
71
|
+
//# sourceMappingURL=screen-discovery.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"screen-discovery.js","sourceRoot":"","sources":["../../src/eval/screen-discovery.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;GAoBG;AAEH,MAAM,eAAe,GAAG;;;;;;;;;;;;;;;;;EAiBtB,CAAC;AAYH,MAAM,CAAC,KAAK,UAAU,yBAAyB,CAC7C,YAAoB,EACpB,WAAmB,EACnB,YAA0B,KAAK;IAE/B,MAAM,MAAM,GAAG,YAAY,CAAC,QAAQ,CAAC,UAAU,CAAC,CAAC,CAAC,CAAC,YAAY,CAAC,CAAC,CAAC,GAAG,YAAY,CAAC,OAAO,CAAC,KAAK,EAAE,EAAE,CAAC,UAAU,CAAC;IAC/G,MAAM,GAAG,GAAG,IAAI,GAAG,EAAU,CAAC;IAC9B,IAAI,MAAM,GAAkB,IAAI,CAAC;IACjC,MAAM,SAAS,GAAG,EAAE,CAAC;IACrB,MAAM,UAAU,GAAG,IAAI,CAAC;IAExB,KAAK,IAAI,IAAI,GAAG,CAAC,EAAE,IAAI,GAAG,SAAS,EAAE,IAAI,EAAE,EAAE,CAAC;QAC5C,MAAM,GAAG,GAAG,MAAM,SAAS,CAAC,MAAM,EAAE;YAClC,MAAM,EAAE,MAAM;YACd,OAAO,EAAE,EAAE,cAAc,EAAE,kBAAkB,EAAE;YAC/C,IAAI,EAAE,IAAI,CAAC,SAAS,CAAC,EAAE,KAAK,EAAE,eAAe,EAAE,SAAS,EAAE,EAAE,GAAG,EAAE,WAAW,EAAE,KAAK,EAAE,UAAU,EAAE,KAAK,EAAE,MAAM,EAAE,EAAE,CAAC;SACpH,CAAC,CAAC;QACH,IAAI,CAAC,GAAG,CAAC,EAAE,EAAE,CAAC;YACZ,MAAM,IAAI,KAAK,CAAC,gEAAgE,GAAG,CAAC,MAAM,QAAQ,MAAM,EAAE,CAAC,CAAC;QAC9G,CAAC;QACD,MAAM,IAAI,GAAG,CAAC,MAAM,GAAG,CAAC,IAAI,EAAE,CAAkB,CAAC;QACjD,IAAI,IAAI,CAAC,MAAM,EAAE,CAAC;YAChB,MAAM,IAAI,KAAK,CAAC,2DAA2D,IAAI,CAAC,SAAS,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC,KAAK,CAAC,CAAC,EAAE,GAAG,CAAC,EAAE,CAAC,CAAC;QAC1H,CAAC;QACD,MAAM,IAAI,GAAG,IAAI,CAAC,IAAI,EAAE,oBAAoB,CAAC;QAC7C,KAAK,MAAM,IAAI,IAAI,IAAI,EAAE,KAAK,IAAI,EAAE,EAAE,CAAC;YACrC,IAAI,IAAI,CAAC,UAAU;gBAAE,GAAG,CAAC,GAAG,CAAC,IAAI,CAAC,UAAU,CAAC,CAAC;QAChD,CAAC;QACD,MAAM,QAAQ,GAAG,IAAI,EAAE,QAAQ,CAAC;QAChC,IAAI,CAAC,QAAQ,EAAE,WAAW,IAAI,CAAC,QAAQ,CAAC,SAAS;YAAE,MAAM;QACzD,MAAM,GAAG,QAAQ,CAAC,SAAS,CAAC;IAC9B,CAAC;IAED,OAAO,GAAG,CAAC;AACb,CAAC"}
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Resumability store for held-out screening (#986).
|
|
3
|
+
*
|
|
4
|
+
* A real exam cut runs base R-runs + a prover over dozens of candidates — many
|
|
5
|
+
* hours of inference. This store persists each candidate's {@link ScreenMeasurement}
|
|
6
|
+
* so an interrupted run (rate limit, crash, disk) resumes instead of restarting:
|
|
7
|
+
* re-running the same command replays cached candidates for free and the
|
|
8
|
+
* `maxCandidates` budget bounds only the NEW work, so a long screen proceeds in
|
|
9
|
+
* budget-sized chunks.
|
|
10
|
+
*
|
|
11
|
+
* Keyed by a `signature` of the measurement-determining config (base model,
|
|
12
|
+
* prover, R, eval-semantics version). A signature mismatch on load → fresh start:
|
|
13
|
+
* cached measurements are only valid for the exact config that produced them
|
|
14
|
+
* (e.g. changing R changes the 0/R determination; a stronger base model changes
|
|
15
|
+
* pass/fail). Stored at `<stateDir>/held-out-screen-progress.json`.
|
|
16
|
+
*/
|
|
17
|
+
import type { ScreenMeasurement } from './screen.js';
|
|
18
|
+
export declare class ScreenProgressStore {
|
|
19
|
+
private readonly file;
|
|
20
|
+
private readonly signature;
|
|
21
|
+
private data;
|
|
22
|
+
constructor(opts: {
|
|
23
|
+
stateDir: string;
|
|
24
|
+
signature: string;
|
|
25
|
+
});
|
|
26
|
+
private load;
|
|
27
|
+
/** Cached measurement for this instance under the current signature, or undefined. */
|
|
28
|
+
get(instance_id: string): ScreenMeasurement | undefined;
|
|
29
|
+
/** Persist a freshly-measured candidate (atomic-enough: whole-file rewrite). */
|
|
30
|
+
record(instance_id: string, m: ScreenMeasurement): void;
|
|
31
|
+
/** Number of candidates already measured under the current signature. */
|
|
32
|
+
get size(): number;
|
|
33
|
+
}
|
|
34
|
+
/** Build the cache signature from the measurement-determining config. */
|
|
35
|
+
export declare function screenSignature(args: {
|
|
36
|
+
baseModel: string;
|
|
37
|
+
proverHarness: string;
|
|
38
|
+
proverModel: string;
|
|
39
|
+
R: number;
|
|
40
|
+
evalSemanticsVersion: string;
|
|
41
|
+
}): string;
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Resumability store for held-out screening (#986).
|
|
3
|
+
*
|
|
4
|
+
* A real exam cut runs base R-runs + a prover over dozens of candidates — many
|
|
5
|
+
* hours of inference. This store persists each candidate's {@link ScreenMeasurement}
|
|
6
|
+
* so an interrupted run (rate limit, crash, disk) resumes instead of restarting:
|
|
7
|
+
* re-running the same command replays cached candidates for free and the
|
|
8
|
+
* `maxCandidates` budget bounds only the NEW work, so a long screen proceeds in
|
|
9
|
+
* budget-sized chunks.
|
|
10
|
+
*
|
|
11
|
+
* Keyed by a `signature` of the measurement-determining config (base model,
|
|
12
|
+
* prover, R, eval-semantics version). A signature mismatch on load → fresh start:
|
|
13
|
+
* cached measurements are only valid for the exact config that produced them
|
|
14
|
+
* (e.g. changing R changes the 0/R determination; a stronger base model changes
|
|
15
|
+
* pass/fail). Stored at `<stateDir>/held-out-screen-progress.json`.
|
|
16
|
+
*/
|
|
17
|
+
import { mkdirSync, readFileSync, renameSync, writeFileSync } from 'node:fs';
import { dirname, join } from 'node:path';
|
|
19
|
+
/** Schema tag written into, and required from, the progress file. */
const SCHEMA_VERSION = 'held-out-screen-progress.v1';

/**
 * File-backed cache of per-candidate screening measurements.
 *
 * The file lives at `<stateDir>/held-out-screen-progress.json` and is only
 * reused when both its schema tag and its `signature` match the ones this
 * instance was constructed with; anything else starts from an empty cache.
 */
export class ScreenProgressStore {
    file;
    signature;
    data;
    /**
     * @param opts `stateDir` is the directory holding the progress file;
     *   `signature` identifies the config the cached measurements belong to.
     */
    constructor(opts) {
        this.file = join(opts.stateDir, 'held-out-screen-progress.json');
        this.signature = opts.signature;
        this.data = this.load();
    }
    /**
     * Read the progress file, returning its contents when they are usable for
     * the current signature and an empty record otherwise.
     */
    load() {
        try {
            const raw = JSON.parse(readFileSync(this.file, 'utf8'));
            const cached = raw?.measurements;
            // `measurements` must be a plain keyed object: an array would accept
            // string-keyed writes in memory but drop them on JSON.stringify.
            const usable = cached !== null && typeof cached === 'object' && !Array.isArray(cached);
            if (raw?.schemaVersion === SCHEMA_VERSION && raw.signature === this.signature && usable) {
                return raw;
            }
        }
        catch {
            /* absent or corrupt → fresh */
        }
        return { schemaVersion: SCHEMA_VERSION, signature: this.signature, measurements: {} };
    }
    /** Cached measurement for this instance under the current signature, or undefined. */
    get(instance_id) {
        const cached = this.data.measurements;
        // Own-property check so ids such as "constructor" do not resolve to
        // members inherited from Object.prototype.
        return Object.prototype.hasOwnProperty.call(cached, instance_id) ? cached[instance_id] : undefined;
    }
    /**
     * Persist a freshly-measured candidate.
     *
     * The whole file is written to a sibling temp file and then renamed over the
     * target, so an interruption mid-write leaves the previous file intact rather
     * than a truncated one that `load()` would discard as corrupt.
     */
    record(instance_id, m) {
        this.data.measurements[instance_id] = m;
        mkdirSync(dirname(this.file), { recursive: true });
        const staging = `${this.file}.tmp`;
        writeFileSync(staging, `${JSON.stringify(this.data, null, 2)}\n`);
        renameSync(staging, this.file);
    }
    /** Number of candidates already measured under the current signature. */
    get size() {
        return Object.keys(this.data.measurements).length;
    }
}
|
|
56
|
+
/**
 * Derive the progress-cache signature string from the settings that determine
 * a measurement: base model, prover harness and model, run count `R`, and the
 * eval-semantics version.
 */
export function screenSignature(args) {
    const { baseModel, proverHarness, proverModel, R, evalSemanticsVersion } = args;
    const segments = [
        `base=${baseModel}`,
        `prover=${proverHarness}:${proverModel}`,
        `R=${R}`,
        `sem=${evalSemanticsVersion}`,
    ];
    return segments.join('|');
}
|
|
60
|
+
//# sourceMappingURL=screen-progress.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"screen-progress.js","sourceRoot":"","sources":["../../src/eval/screen-progress.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;GAeG;AAEH,OAAO,EAAE,YAAY,EAAE,aAAa,EAAE,SAAS,EAAE,MAAM,SAAS,CAAC;AACjE,OAAO,EAAE,OAAO,EAAE,IAAI,EAAE,MAAM,WAAW,CAAC;AAG1C,MAAM,cAAc,GAAG,6BAAsC,CAAC;AAS9D,MAAM,OAAO,mBAAmB;IACb,IAAI,CAAS;IACb,SAAS,CAAS;IAC3B,IAAI,CAAe;IAE3B,YAAY,IAA6C;QACvD,IAAI,CAAC,IAAI,GAAG,IAAI,CAAC,IAAI,CAAC,QAAQ,EAAE,+BAA+B,CAAC,CAAC;QACjE,IAAI,CAAC,SAAS,GAAG,IAAI,CAAC,SAAS,CAAC;QAChC,IAAI,CAAC,IAAI,GAAG,IAAI,CAAC,IAAI,EAAE,CAAC;IAC1B,CAAC;IAEO,IAAI;QACV,IAAI,CAAC;YACH,MAAM,GAAG,GAAG,IAAI,CAAC,KAAK,CAAC,YAAY,CAAC,IAAI,CAAC,IAAI,EAAE,MAAM,CAAC,CAAiB,CAAC;YACxE,IAAI,GAAG,EAAE,aAAa,KAAK,cAAc,IAAI,GAAG,CAAC,SAAS,KAAK,IAAI,CAAC,SAAS,IAAI,GAAG,CAAC,YAAY,EAAE,CAAC;gBAClG,OAAO,GAAG,CAAC;YACb,CAAC;QACH,CAAC;QAAC,MAAM,CAAC;YACP,+BAA+B;QACjC,CAAC;QACD,OAAO,EAAE,aAAa,EAAE,cAAc,EAAE,SAAS,EAAE,IAAI,CAAC,SAAS,EAAE,YAAY,EAAE,EAAE,EAAE,CAAC;IACxF,CAAC;IAED,sFAAsF;IACtF,GAAG,CAAC,WAAmB;QACrB,OAAO,IAAI,CAAC,IAAI,CAAC,YAAY,CAAC,WAAW,CAAC,CAAC;IAC7C,CAAC;IAED,gFAAgF;IAChF,MAAM,CAAC,WAAmB,EAAE,CAAoB;QAC9C,IAAI,CAAC,IAAI,CAAC,YAAY,CAAC,WAAW,CAAC,GAAG,CAAC,CAAC;QACxC,SAAS,CAAC,OAAO,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC;QACnD,aAAa,CAAC,IAAI,CAAC,IAAI,EAAE,GAAG,IAAI,CAAC,SAAS,CAAC,IAAI,CAAC,IAAI,EAAE,IAAI,EAAE,CAAC,CAAC,IAAI,CAAC,CAAC;IACtE,CAAC;IAED,yEAAyE;IACzE,IAAI,IAAI;QACN,OAAO,MAAM,CAAC,IAAI,CAAC,IAAI,CAAC,IAAI,CAAC,YAAY,CAAC,CAAC,MAAM,CAAC;IACpD,CAAC;CACF;AAED,yEAAyE;AACzE,MAAM,UAAU,eAAe,CAAC,IAM/B;IACC,OAAO,QAAQ,IAAI,CAAC,SAAS,WAAW,IAAI,CAAC,aAAa,IAAI,IAAI,CAAC,WAAW,MAAM,IAAI,CAAC,CAAC,QAAQ,IAAI,CAAC,oBAAoB,EAAE,CAAC;AAChI,CAAC"}
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
import { type ScreenResult } from './screen.js';
|
|
2
|
+
/** Options accepted by `runScreenHeldOut`. Notes marked NOTE(review) are
 * inferred from field names only and should be confirmed against the
 * implementation. */
export interface ScreenRunOptions {
|
|
3
|
+
/** NOTE(review): presumably the number of base-model runs per candidate — confirm. */
R: number;
|
|
4
|
+
/** NOTE(review): presumably the desired number of held-out instances — confirm. */
heldOutCount: number;
|
|
5
|
+
/** NOTE(review): presumably the budget of candidates to measure in one run — confirm. */
maxCandidates: number;
|
|
6
|
+
/** NOTE(review): presumably limits how many selected instances one repo may contribute — confirm. */
perRepoCap: number;
|
|
7
|
+
/** Prover agent harness: `codex` (default) or `claude-code` (e.g. an Opus prover
|
|
8
|
+
* via the working Claude auth — useful when codex is rate-limited). */
|
|
9
|
+
proverHarness?: 'codex' | 'claude-code';
|
|
10
|
+
/** Prover model: `codexModel` for the codex harness, `claudeModel` for the
|
|
11
|
+
* claude-code harness (defaults to `opus` there). */
|
|
12
|
+
proverModel?: string;
|
|
13
|
+
/** Restrict candidates to these instance ids (else whole gradeable pool). */
|
|
14
|
+
instanceIds?: string[];
|
|
15
|
+
/** Restrict candidates to one repo (org prefix), e.g. `tobymao`. */
|
|
16
|
+
repo?: string;
|
|
17
|
+
/** NOTE(review): presumably a path to a configuration file; which one is not visible here — confirm. */
configPath?: string;
|
|
18
|
+
/** Optional callback that receives a message string. */
log?: (msg: string) => void;
|
|
19
|
+
}
|
|
20
|
+
/** Value resolved by `runScreenHeldOut`. Notes marked NOTE(review) are
 * inferred from field names only and should be confirmed against the
 * implementation. */
export interface ScreenRunSummary {
|
|
21
|
+
/** The `ScreenResult` imported from `./screen.js`. */
result: ScreenResult;
|
|
22
|
+
/** NOTE(review): presumably a digest identifying the base code that was screened — confirm. */
baseCodeDigest: string;
|
|
23
|
+
/** NOTE(review): presumably where the slate file was written — confirm. */
slatePath: string;
|
|
24
|
+
/** NOTE(review): presumably where the report file was written — confirm. */
reportPath: string;
|
|
25
|
+
/** NOTE(review): presumably the number of held-out instances in this summary — confirm. */
heldOutCount: number;
|
|
26
|
+
/** Base-failing candidates whose prover run returned no gradeable result
|
|
27
|
+
* (excluded as no-headroom) — a signal the prover may be unavailable. */
|
|
28
|
+
proverUnscorable: number;
|
|
29
|
+
}
|
|
30
|
+
export declare function runScreenHeldOut(opts: ScreenRunOptions): Promise<ScreenRunSummary>;
|