@jinn-network/client 0.1.8 → 0.1.9-canary.144d87d2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +6 -0
- package/dist/adapters/mech/adapter.d.ts +21 -1
- package/dist/adapters/mech/adapter.js +77 -10
- package/dist/adapters/mech/adapter.js.map +1 -1
- package/dist/adapters/mech/contracts.js +62 -28
- package/dist/adapters/mech/contracts.js.map +1 -1
- package/dist/adapters/mech/safe-revert.d.ts +4 -0
- package/dist/adapters/mech/safe-revert.js +5 -1
- package/dist/adapters/mech/safe-revert.js.map +1 -1
- package/dist/adapters/mech/safe.js +5 -1
- package/dist/adapters/mech/safe.js.map +1 -1
- package/dist/adapters/mech/verdict-code.js +1 -1
- package/dist/adapters/mech/verdict-code.js.map +1 -1
- package/dist/api/bootstrap-endpoint.d.ts +1 -0
- package/dist/api/bootstrap-endpoint.js +1 -0
- package/dist/api/bootstrap-endpoint.js.map +1 -1
- package/dist/api/discovery-endpoint.d.ts +1 -0
- package/dist/api/discovery-endpoint.js +24 -0
- package/dist/api/discovery-endpoint.js.map +1 -1
- package/dist/api/fleet-build.d.ts +1 -7
- package/dist/api/fleet-build.js +0 -7
- package/dist/api/fleet-build.js.map +1 -1
- package/dist/api/gather-status.d.ts +8 -2
- package/dist/api/gather-status.js +29 -117
- package/dist/api/gather-status.js.map +1 -1
- package/dist/api/loop-completion-build.d.ts +79 -0
- package/dist/api/loop-completion-build.js +155 -0
- package/dist/api/loop-completion-build.js.map +1 -0
- package/dist/api/operator-artifacts-endpoint.js +1 -1
- package/dist/api/operator-artifacts-endpoint.js.map +1 -1
- package/dist/api/peers.js +2 -0
- package/dist/api/peers.js.map +1 -1
- package/dist/api/setup-endpoints.d.ts +32 -0
- package/dist/api/setup-endpoints.js +94 -24
- package/dist/api/setup-endpoints.js.map +1 -1
- package/dist/api/solvernets-endpoints.js +4 -1
- package/dist/api/solvernets-endpoints.js.map +1 -1
- package/dist/api/status-build.d.ts +43 -33
- package/dist/api/status-build.js +3 -26
- package/dist/api/status-build.js.map +1 -1
- package/dist/api/status-rollup-build.d.ts +0 -4
- package/dist/api/status-rollup-build.js +0 -4
- package/dist/api/status-rollup-build.js.map +1 -1
- package/dist/api/stop-hook.d.ts +1 -1
- package/dist/api/stop-hook.js +1 -1
- package/dist/api/stop-hook.js.map +1 -1
- package/dist/build-info.json +4 -4
- package/dist/build-meta.json +1 -1
- package/dist/cli/commands/codedigest-revert-check.js +6 -2
- package/dist/cli/commands/codedigest-revert-check.js.map +1 -1
- package/dist/cli/commands/doctor.d.ts +3 -0
- package/dist/cli/commands/doctor.js +37 -2
- package/dist/cli/commands/doctor.js.map +1 -1
- package/dist/cli/commands/eval.d.ts +87 -0
- package/dist/cli/commands/eval.js +481 -0
- package/dist/cli/commands/eval.js.map +1 -0
- package/dist/cli/commands/rewards.d.ts +2 -0
- package/dist/cli/commands/rewards.js +30 -3
- package/dist/cli/commands/rewards.js.map +1 -1
- package/dist/cli/commands/solver-nets.js +68 -0
- package/dist/cli/commands/solver-nets.js.map +1 -1
- package/dist/cli/commands/status.js +0 -1
- package/dist/cli/commands/status.js.map +1 -1
- package/dist/cli/index.js +2 -0
- package/dist/cli/index.js.map +1 -1
- package/dist/config.d.ts +102 -15
- package/dist/config.js +166 -19
- package/dist/config.js.map +1 -1
- package/dist/daemon/ai-units-gate.d.ts +6 -6
- package/dist/daemon/ai-units-gate.js +11 -10
- package/dist/daemon/ai-units-gate.js.map +1 -1
- package/dist/daemon/balance-topup-loop.js +3 -0
- package/dist/daemon/balance-topup-loop.js.map +1 -1
- package/dist/daemon/checkpoint-loop.js +2 -2
- package/dist/daemon/creator.d.ts +1 -0
- package/dist/daemon/creator.js +26 -14
- package/dist/daemon/creator.js.map +1 -1
- package/dist/daemon/daemon.d.ts +15 -0
- package/dist/daemon/daemon.js +78 -22
- package/dist/daemon/daemon.js.map +1 -1
- package/dist/daemon/eviction-loop.d.ts +7 -0
- package/dist/daemon/eviction-loop.js +19 -3
- package/dist/daemon/eviction-loop.js.map +1 -1
- package/dist/daemon/jinn-claim-loop.js +3 -0
- package/dist/daemon/jinn-claim-loop.js.map +1 -1
- package/dist/daemon/join-applier.d.ts +35 -0
- package/dist/daemon/join-applier.js +49 -0
- package/dist/daemon/join-applier.js.map +1 -0
- package/dist/daemon/loop-heartbeat.d.ts +34 -0
- package/dist/daemon/loop-heartbeat.js +39 -0
- package/dist/daemon/loop-heartbeat.js.map +1 -0
- package/dist/daemon/reward-claim-loop.js +4 -1
- package/dist/daemon/reward-claim-loop.js.map +1 -1
- package/dist/daemon/watchdog-loop.d.ts +84 -0
- package/dist/daemon/watchdog-loop.js +91 -0
- package/dist/daemon/watchdog-loop.js.map +1 -0
- package/dist/dashboard/assets/index-8tAiMbUV.css +1 -0
- package/dist/dashboard/assets/index-D6a-DfaM.js +171 -0
- package/dist/dashboard/index.html +2 -2
- package/dist/discovery/http.d.ts +17 -0
- package/dist/discovery/http.js +295 -25
- package/dist/discovery/http.js.map +1 -1
- package/dist/discovery/onchain.js +155 -1
- package/dist/discovery/onchain.js.map +1 -1
- package/dist/discovery/types.d.ts +106 -0
- package/dist/discovery/types.js +40 -0
- package/dist/discovery/types.js.map +1 -1
- package/dist/discovery/with-fallback.js +14 -0
- package/dist/discovery/with-fallback.js.map +1 -1
- package/dist/earning/bootstrap.d.ts +25 -0
- package/dist/earning/bootstrap.js +79 -28
- package/dist/earning/bootstrap.js.map +1 -1
- package/dist/earning/faucet.d.ts +1 -1
- package/dist/earning/faucet.js +2 -2
- package/dist/earning/faucet.js.map +1 -1
- package/dist/earning/safe-adapter.js +11 -0
- package/dist/earning/safe-adapter.js.map +1 -1
- package/dist/earning/stolas-claim.js +5 -5
- package/dist/earning/types.d.ts +1 -1
- package/dist/earning/types.js +1 -1
- package/dist/earning/types.js.map +1 -1
- package/dist/eval/eval-harness-run.d.ts +63 -0
- package/dist/eval/eval-harness-run.js +123 -0
- package/dist/eval/eval-harness-run.js.map +1 -0
- package/dist/eval/orchestrator.d.ts +224 -0
- package/dist/eval/orchestrator.js +250 -0
- package/dist/eval/orchestrator.js.map +1 -0
- package/dist/eval/paired.d.ts +68 -0
- package/dist/eval/paired.js +93 -0
- package/dist/eval/paired.js.map +1 -0
- package/dist/eval/resolve-slate-tasks.d.ts +35 -0
- package/dist/eval/resolve-slate-tasks.js +56 -0
- package/dist/eval/resolve-slate-tasks.js.map +1 -0
- package/dist/eval/screen-discovery.d.ts +22 -0
- package/dist/eval/screen-discovery.js +71 -0
- package/dist/eval/screen-discovery.js.map +1 -0
- package/dist/eval/screen-progress.d.ts +41 -0
- package/dist/eval/screen-progress.js +60 -0
- package/dist/eval/screen-progress.js.map +1 -0
- package/dist/eval/screen-runner.d.ts +30 -0
- package/dist/eval/screen-runner.js +289 -0
- package/dist/eval/screen-runner.js.map +1 -0
- package/dist/eval/screen.d.ts +107 -0
- package/dist/eval/screen.js +159 -0
- package/dist/eval/screen.js.map +1 -0
- package/dist/eval/slope.d.ts +29 -0
- package/dist/eval/slope.js +46 -0
- package/dist/eval/slope.js.map +1 -0
- package/dist/eval/train-sequence.d.ts +35 -0
- package/dist/eval/train-sequence.js +59 -0
- package/dist/eval/train-sequence.js.map +1 -0
- package/dist/eval/wilson.d.ts +45 -0
- package/dist/eval/wilson.js +48 -0
- package/dist/eval/wilson.js.map +1 -0
- package/dist/events/types.d.ts +1 -1
- package/dist/events/types.js +1 -1
- package/dist/events/types.js.map +1 -1
- package/dist/harnesses/engine/canonical-json.js +5 -3
- package/dist/harnesses/engine/canonical-json.js.map +1 -1
- package/dist/harnesses/engine/engine.d.ts +24 -0
- package/dist/harnesses/engine/engine.js +72 -9
- package/dist/harnesses/engine/engine.js.map +1 -1
- package/dist/harnesses/engine/packaging.js +1 -1
- package/dist/harnesses/engine/packaging.js.map +1 -1
- package/dist/harnesses/engine/persistence.d.ts +17 -0
- package/dist/harnesses/engine/persistence.js +28 -0
- package/dist/harnesses/engine/persistence.js.map +1 -1
- package/dist/harnesses/impls/claude-mcp-hyperliquid/mcp-tools.d.ts +1 -1
- package/dist/harnesses/impls/claude-mcp-hyperliquid/mcp-tools.js +1 -1
- package/dist/harnesses/impls/claude-mcp-hyperliquid/mcp-tools.js.map +1 -1
- package/dist/harnesses/impls/hermes-agent/adapter.d.ts +2 -0
- package/dist/harnesses/impls/hermes-agent/adapter.js +8 -5
- package/dist/harnesses/impls/hermes-agent/adapter.js.map +1 -1
- package/dist/harnesses/impls/hermes-agent/bootstrap.d.ts +1 -0
- package/dist/harnesses/impls/hermes-agent/bootstrap.js +6 -1
- package/dist/harnesses/impls/hermes-agent/bootstrap.js.map +1 -1
- package/dist/harnesses/impls/hermes-agent/harness.d.ts +17 -3
- package/dist/harnesses/impls/hermes-agent/harness.js +68 -5
- package/dist/harnesses/impls/hermes-agent/harness.js.map +1 -1
- package/dist/harnesses/impls/index.d.ts +2 -0
- package/dist/harnesses/impls/index.js +9 -0
- package/dist/harnesses/impls/index.js.map +1 -1
- package/dist/harnesses/impls/jinn-repo-evaluator/eval-runner.d.ts +34 -0
- package/dist/harnesses/impls/jinn-repo-evaluator/eval-runner.js +111 -0
- package/dist/harnesses/impls/jinn-repo-evaluator/eval-runner.js.map +1 -0
- package/dist/harnesses/impls/jinn-repo-evaluator/evaluator.d.ts +24 -0
- package/dist/harnesses/impls/jinn-repo-evaluator/evaluator.js +19 -0
- package/dist/harnesses/impls/jinn-repo-evaluator/evaluator.js.map +1 -0
- package/dist/harnesses/impls/jinn-repo-evaluator/harness.d.ts +64 -0
- package/dist/harnesses/impls/jinn-repo-evaluator/harness.js +125 -0
- package/dist/harnesses/impls/jinn-repo-evaluator/harness.js.map +1 -0
- package/dist/harnesses/impls/jinn-repo-evaluator/repro.d.ts +32 -0
- package/dist/harnesses/impls/jinn-repo-evaluator/repro.js +73 -0
- package/dist/harnesses/impls/jinn-repo-evaluator/repro.js.map +1 -0
- package/dist/harnesses/impls/learner/adapters/claude-code.js +5 -0
- package/dist/harnesses/impls/learner/adapters/claude-code.js.map +1 -1
- package/dist/harnesses/impls/learner/harness.d.ts +17 -1
- package/dist/harnesses/impls/learner/harness.js +51 -1
- package/dist/harnesses/impls/learner/harness.js.map +1 -1
- package/dist/harnesses/impls/learner/harvest.d.ts +2 -0
- package/dist/harnesses/impls/learner/harvest.js +51 -1
- package/dist/harnesses/impls/learner/harvest.js.map +1 -1
- package/dist/harnesses/impls/learner/plugin-path.js +1 -0
- package/dist/harnesses/impls/learner/plugin-path.js.map +1 -1
- package/dist/harnesses/impls/swe-rebench-v2-evaluator/harness.js +3 -1
- package/dist/harnesses/impls/swe-rebench-v2-evaluator/harness.js.map +1 -1
- package/dist/harnesses/impls/swe-rebench-v2-evaluator/index.d.ts +2 -2
- package/dist/harnesses/impls/swe-rebench-v2-evaluator/index.js +3 -1
- package/dist/harnesses/impls/swe-rebench-v2-evaluator/index.js.map +1 -1
- package/dist/harnesses/readiness-registry.d.ts +10 -0
- package/dist/harnesses/readiness-registry.js +13 -0
- package/dist/harnesses/readiness-registry.js.map +1 -1
- package/dist/harnesses/types.d.ts +14 -0
- package/dist/learner/revert-decision.d.ts +16 -1
- package/dist/learner/revert-decision.js +38 -18
- package/dist/learner/revert-decision.js.map +1 -1
- package/dist/learner/revert-stats.d.ts +14 -0
- package/dist/learner/revert-stats.js +42 -0
- package/dist/learner/revert-stats.js.map +1 -1
- package/dist/local-provider-url.d.ts +3 -0
- package/dist/local-provider-url.js +28 -0
- package/dist/local-provider-url.js.map +1 -0
- package/dist/main.js +94 -25
- package/dist/main.js.map +1 -1
- package/dist/mcp/operator-server.js +1 -1
- package/dist/mcp/operator-server.js.map +1 -1
- package/dist/mcp/server.js +1 -1
- package/dist/mcp/server.js.map +1 -1
- package/dist/plugins/learner/.claude-plugin/plugin.json +1 -1
- package/dist/plugins/learner/.codex-plugin/plugin.json +1 -1
- package/dist/plugins/learner/hooks/session-start +30 -1
- package/dist/plugins/learner/skills/learn/consolidator-prompt.md +4 -0
- package/dist/preflight/deployment-readiness.d.ts +147 -0
- package/dist/preflight/deployment-readiness.js +366 -0
- package/dist/preflight/deployment-readiness.js.map +1 -0
- package/dist/preflight/pidfile-liveness.d.ts +7 -1
- package/dist/preflight/pidfile-liveness.js +14 -0
- package/dist/preflight/pidfile-liveness.js.map +1 -1
- package/dist/rpc/transport.d.ts +43 -5
- package/dist/rpc/transport.js +131 -30
- package/dist/rpc/transport.js.map +1 -1
- package/dist/scripts/swe-rebench-v2-seed-pool.json +2 -1
- package/dist/solver-nets/registry.d.ts +19 -0
- package/dist/solver-nets/registry.js +95 -66
- package/dist/solver-nets/registry.js.map +1 -1
- package/dist/solver-types/_jinn-repo-pool.d.ts +27 -0
- package/dist/solver-types/_jinn-repo-pool.js +27 -0
- package/dist/solver-types/_jinn-repo-pool.js.map +1 -0
- package/dist/solver-types/_swe-rebench-v2-held-out-slate.d.ts +76 -0
- package/dist/solver-types/_swe-rebench-v2-held-out-slate.js +156 -0
- package/dist/solver-types/_swe-rebench-v2-held-out-slate.js.map +1 -0
- package/dist/solver-types/_swe-rebench-v2-pool-recovery.d.ts +81 -0
- package/dist/solver-types/_swe-rebench-v2-pool-recovery.js +116 -0
- package/dist/solver-types/_swe-rebench-v2-pool-recovery.js.map +1 -0
- package/dist/solver-types/_swe-rebench-v2-state.d.ts +9 -0
- package/dist/solver-types/_swe-rebench-v2-state.js +14 -0
- package/dist/solver-types/_swe-rebench-v2-state.js.map +1 -1
- package/dist/solver-types/_swe-rebench-v2-validated-pool.d.ts +30 -0
- package/dist/solver-types/_swe-rebench-v2-validated-pool.js +40 -0
- package/dist/solver-types/_swe-rebench-v2-validated-pool.js.map +1 -1
- package/dist/solver-types/index.js +2 -0
- package/dist/solver-types/index.js.map +1 -1
- package/dist/solver-types/jinn-repo-admit.d.ts +17 -0
- package/dist/solver-types/jinn-repo-admit.js +16 -0
- package/dist/solver-types/jinn-repo-admit.js.map +1 -0
- package/dist/solver-types/jinn-repo-auto.d.ts +60 -0
- package/dist/solver-types/jinn-repo-auto.js +163 -0
- package/dist/solver-types/jinn-repo-auto.js.map +1 -0
- package/dist/solver-types/jinn-repo-definition.d.ts +15 -0
- package/dist/solver-types/jinn-repo-definition.js +34 -0
- package/dist/solver-types/jinn-repo-definition.js.map +1 -0
- package/dist/solver-types/jinn-repo-extract.d.ts +16 -0
- package/dist/solver-types/jinn-repo-extract.js +32 -0
- package/dist/solver-types/jinn-repo-extract.js.map +1 -0
- package/dist/solver-types/jinn-repo.d.ts +21 -0
- package/dist/solver-types/jinn-repo.js +23 -0
- package/dist/solver-types/jinn-repo.js.map +1 -0
- package/dist/solver-types/learner-loop-test.js +1 -1
- package/dist/solver-types/learner-loop-test.js.map +1 -1
- package/dist/solver-types/slates/held-out-slate.swe-rebench-v2.v1.json +20 -0
- package/dist/solver-types/slates/held-out-slate.swe-rebench-v2.v2.json +19 -0
- package/dist/solver-types/slates/held-out-slate.swe-rebench-v2.v2.screening-report.json +628 -0
- package/dist/solver-types/solver-type.d.ts +8 -0
- package/dist/solver-types/swe-rebench-v2.d.ts +2 -0
- package/dist/solver-types/swe-rebench-v2.js +115 -10
- package/dist/solver-types/swe-rebench-v2.js.map +1 -1
- package/dist/solvernets/launched-record-dispatcher.d.ts +5 -0
- package/dist/solvernets/launched-record-dispatcher.js +8 -1
- package/dist/solvernets/launched-record-dispatcher.js.map +1 -1
- package/dist/solvernets/registry-client-erc8004.js +29 -37
- package/dist/solvernets/registry-client-erc8004.js.map +1 -1
- package/dist/solvernets/registry-client.d.ts +6 -0
- package/dist/solvernets/store.d.ts +1 -1
- package/dist/solvernets/store.js +8 -3
- package/dist/solvernets/store.js.map +1 -1
- package/dist/spend/ai-units-config.d.ts +10 -0
- package/dist/spend/ai-units-config.js +7 -1
- package/dist/spend/ai-units-config.js.map +1 -1
- package/dist/spend/ai-units.d.ts +51 -0
- package/dist/spend/ai-units.js +73 -0
- package/dist/spend/ai-units.js.map +1 -1
- package/dist/spend/record.js +12 -5
- package/dist/spend/record.js.map +1 -1
- package/dist/store/store.d.ts +91 -5
- package/dist/store/store.js +170 -7
- package/dist/store/store.js.map +1 -1
- package/dist/trajectory/harness-bundle-schema.d.ts +1 -1
- package/dist/trajectory/harness-bundle-schema.js +1 -1
- package/dist/trajectory/harness-bundle-schema.js.map +1 -1
- package/dist/trajectory/schema.d.ts +1 -1
- package/dist/trajectory/schema.js +1 -1
- package/dist/trajectory/schema.js.map +1 -1
- package/dist/trajectory/transcript-parsers/types.d.ts +1 -1
- package/dist/trajectory/transcript-parsers/types.js +1 -1
- package/dist/trajectory/transcript-parsers/types.js.map +1 -1
- package/dist/types/envelope.d.ts +1 -1
- package/dist/types/envelope.js +1 -1
- package/dist/types/envelope.js.map +1 -1
- package/dist/types/payloads/index.d.ts +1 -1
- package/dist/types/payloads/index.js +7 -1
- package/dist/types/payloads/index.js.map +1 -1
- package/dist/types/payloads/portfolio-v0.d.ts +1 -1
- package/dist/types/payloads/portfolio-v0.js +1 -1
- package/dist/types/payloads/portfolio-v0.js.map +1 -1
- package/dist/types/payloads/prediction-apy-v0.d.ts +1 -1
- package/dist/types/payloads/prediction-apy-v0.js +1 -1
- package/dist/types/payloads/prediction-apy-v0.js.map +1 -1
- package/dist/types/payloads/prediction-v0.d.ts +1 -1
- package/dist/types/payloads/prediction-v0.js +1 -1
- package/dist/types/payloads/prediction-v0.js.map +1 -1
- package/dist/types/portfolio.d.ts +1 -1
- package/dist/types/portfolio.js +1 -1
- package/dist/types/portfolio.js.map +1 -1
- package/dist/types/prediction-apy.d.ts +1 -1
- package/dist/types/prediction-apy.js +1 -1
- package/dist/types/prediction-apy.js.map +1 -1
- package/dist/types/prediction.d.ts +1 -1
- package/dist/types/prediction.js +1 -1
- package/dist/types/prediction.js.map +1 -1
- package/dist/types/session-provenance.d.ts +1 -1
- package/dist/types/session-provenance.js +1 -1
- package/dist/types/session-provenance.js.map +1 -1
- package/dist/types/task-document.d.ts +1 -1
- package/dist/types/task-document.js +1 -1
- package/dist/types/task-document.js.map +1 -1
- package/dist/types/task.d.ts +1 -1
- package/dist/types/task.js +1 -1
- package/dist/types/task.js.map +1 -1
- package/dist/types/window.d.ts +1 -1
- package/dist/types/window.js +1 -1
- package/dist/types/window.js.map +1 -1
- package/dist/vendor/@jinn-network/sdk/dist/checkpoint.d.ts +1 -1
- package/dist/vendor/@jinn-network/sdk/dist/checkpoint.js +1 -1
- package/dist/vendor/@jinn-network/sdk/dist/contracts.d.ts +3 -2
- package/dist/vendor/@jinn-network/sdk/dist/contracts.js +49 -0
- package/dist/vendor/@jinn-network/sdk/dist/jinn-repo.d.ts +44 -0
- package/dist/vendor/@jinn-network/sdk/dist/jinn-repo.js +25 -0
- package/dist/vendor/@jinn-network/sdk/dist/json-schema.d.ts +1 -1
- package/dist/vendor/@jinn-network/sdk/dist/json-schema.js +1 -1
- package/dist/vendor/@jinn-network/sdk/dist/payloads/jinn-repo.d.ts +38 -0
- package/dist/vendor/@jinn-network/sdk/dist/payloads/jinn-repo.js +22 -0
- package/dist/vendor/@jinn-network/sdk/dist/payloads/prediction-v1.d.ts +1 -1
- package/dist/vendor/@jinn-network/sdk/dist/payloads/prediction-v1.js +1 -1
- package/dist/vendor/@jinn-network/sdk/dist/payloads/session-derived.d.ts +1 -1
- package/dist/vendor/@jinn-network/sdk/dist/payloads/session-derived.js +1 -1
- package/dist/vendor/@jinn-network/sdk/dist/payloads/swe-rebench-v2.d.ts +109 -2
- package/dist/vendor/@jinn-network/sdk/dist/payloads/swe-rebench-v2.js +26 -2
- package/dist/vendor/@jinn-network/sdk/dist/prediction-v1.d.ts +1 -1
- package/dist/vendor/@jinn-network/sdk/dist/prediction-v1.js +1 -1
- package/dist/vendor/@jinn-network/sdk/dist/solvernets/jinn-repo.d.ts +4 -0
- package/dist/vendor/@jinn-network/sdk/dist/solvernets/jinn-repo.js +2 -0
- package/dist/vendor/@jinn-network/sdk/dist/solvernets/manifest-schema.d.ts +1 -1
- package/dist/vendor/@jinn-network/sdk/dist/solvernets/manifest-schema.js +1 -1
- package/dist/vendor/@jinn-network/sdk/dist/solvernets/swe-rebench-v2-held-out-slate.d.ts +65 -0
- package/dist/vendor/@jinn-network/sdk/dist/solvernets/swe-rebench-v2-held-out-slate.js +123 -0
- package/dist/vendor/@jinn-network/sdk/dist/solvernets/swe-rebench-v2.d.ts +2 -2
- package/dist/vendor/@jinn-network/sdk/dist/solvernets/swe-rebench-v2.js +1 -1
- package/dist/vendor/@jinn-network/sdk/dist/swe-rebench-v2.d.ts +1 -1
- package/dist/vendor/@jinn-network/sdk/dist/swe-rebench-v2.js +1 -1
- package/dist/vendor/@jinn-network/sdk/package.json +9 -1
- package/docker-compose.yml +3 -2
- package/package.json +23 -20
- package/plugins/jinn-repo-runtime/.claude-plugin/plugin.json +5 -0
- package/plugins/jinn-repo-runtime/.codex-plugin/plugin.json +39 -0
- package/plugins/jinn-repo-runtime/README.md +27 -0
- package/plugins/jinn-repo-runtime/hooks/hooks.json +16 -0
- package/plugins/jinn-repo-runtime/hooks/session-start +73 -0
- package/plugins/jinn-repo-runtime/jinn.plugin.json +11 -0
- package/plugins/jinn-repo-runtime/skills/task/SKILL.md +92 -0
- package/plugins/learner/.claude-plugin/plugin.json +1 -1
- package/plugins/learner/.codex-plugin/plugin.json +1 -1
- package/plugins/learner/hooks/session-start +30 -1
- package/plugins/learner/skills/learn/consolidator-prompt.md +4 -0
- package/plugins/swe-rebench-v2-runtime/hooks/hooks.json +16 -0
- package/plugins/swe-rebench-v2-runtime/hooks/session-start +74 -0
- package/dist/dashboard/assets/index-CzKxvMcU.css +0 -32
- package/dist/dashboard/assets/index-yVemxHot.js +0 -351
|
@@ -0,0 +1,159 @@
|
|
|
1
|
+
import { HELD_OUT_SLATE_SCHEMA_VERSION, hashHeldOutSlateArtifact, } from '../solver-types/_swe-rebench-v2-held-out-slate.js';
|
|
2
|
+
/**
 * Stratification / diversity key for a task: the org prefix of its
 * `instance_id`, i.e. everything before the first `__`
 * (`tobymao__sqlglot-4661` → `tobymao`). It is computed from the id alone,
 * so it is derivable without an HF fetch.
 *
 * @param task Object exposing a string `instance_id`.
 * @returns The text preceding the first `__`, or the whole `instance_id`
 *   when it contains no `__`.
 */
export function repoOf(task) {
    const { instance_id: id } = task;
    // Limit the split to one piece: only the part before the first `__` matters.
    const [prefix] = id.split('__', 1);
    return prefix;
}
|
|
8
|
+
/**
|
|
9
|
+
* Order candidates round-robin across repos so the first N base-fails span
|
|
10
|
+
* repos rather than clumping in alphabetically-early ones. Deterministic:
|
|
11
|
+
* instances sort by instance_id within each repo group; repo groups iterate in
|
|
12
|
+
* sorted repo order.
|
|
13
|
+
*/
|
|
14
|
+
export function stratifyByRepo(pool) {
|
|
15
|
+
const groups = new Map();
|
|
16
|
+
for (const task of pool) {
|
|
17
|
+
const repo = repoOf(task);
|
|
18
|
+
(groups.get(repo) ?? groups.set(repo, []).get(repo)).push(task);
|
|
19
|
+
}
|
|
20
|
+
const repos = [...groups.keys()].sort((a, b) => a.localeCompare(b));
|
|
21
|
+
for (const repo of repos) {
|
|
22
|
+
groups.get(repo).sort((a, b) => a.instance_id.localeCompare(b.instance_id));
|
|
23
|
+
}
|
|
24
|
+
const out = [];
|
|
25
|
+
let added = true;
|
|
26
|
+
for (let i = 0; added; i++) {
|
|
27
|
+
added = false;
|
|
28
|
+
for (const repo of repos) {
|
|
29
|
+
const g = groups.get(repo);
|
|
30
|
+
if (i < g.length) {
|
|
31
|
+
out.push(g[i]);
|
|
32
|
+
added = true;
|
|
33
|
+
}
|
|
34
|
+
}
|
|
35
|
+
}
|
|
36
|
+
return out;
|
|
37
|
+
}
|
|
38
|
+
/**
|
|
39
|
+
* The expensive part: gradeable → base R-runs (early-stop on first pass) →
|
|
40
|
+
* prover (only if base is 0/R). Pure measurement; no selection/caps. This is what
|
|
41
|
+
* gets cached for resumability.
|
|
42
|
+
*/
|
|
43
|
+
async function measureCandidate(task, deps, R) {
|
|
44
|
+
const none = { gradeable: false, basePasses: 0, baseRuns: 0, baseUnscorable: false, proverRan: false, proverPassed: null };
|
|
45
|
+
if (!(await deps.ensureGradeable(task)))
|
|
46
|
+
return none;
|
|
47
|
+
let basePasses = 0;
|
|
48
|
+
let baseUnscorable = false;
|
|
49
|
+
let baseUnscorableReason;
|
|
50
|
+
let r = 0;
|
|
51
|
+
for (; r < R; r++) {
|
|
52
|
+
const run = await deps.runBaseFrozen(task);
|
|
53
|
+
if (run.passed === null) {
|
|
54
|
+
baseUnscorable = true;
|
|
55
|
+
baseUnscorableReason = run.unscorableReason;
|
|
56
|
+
break;
|
|
57
|
+
}
|
|
58
|
+
if (run.passed) {
|
|
59
|
+
basePasses++;
|
|
60
|
+
break;
|
|
61
|
+
}
|
|
62
|
+
}
|
|
63
|
+
const baseRuns = r + (baseUnscorable || basePasses > 0 ? 1 : 0);
|
|
64
|
+
if (baseUnscorable) {
|
|
65
|
+
return { gradeable: true, basePasses: 0, baseRuns, baseUnscorable: true, ...(baseUnscorableReason ? { baseUnscorableReason } : {}), proverRan: false, proverPassed: null };
|
|
66
|
+
}
|
|
67
|
+
if (basePasses > 0) {
|
|
68
|
+
return { gradeable: true, basePasses, baseRuns, baseUnscorable: false, proverRan: false, proverPassed: null };
|
|
69
|
+
}
|
|
70
|
+
// Base reliably fails (0/R) → layer 3: prover (existence proof of headroom).
|
|
71
|
+
const prover = await deps.runProverFrozen(task);
|
|
72
|
+
return {
|
|
73
|
+
gradeable: true, basePasses: 0, baseRuns, baseUnscorable: false, proverRan: true, proverPassed: prover.passed,
|
|
74
|
+
...(prover.passed === null && prover.unscorableReason ? { proverUnscorableReason: prover.unscorableReason } : {}),
|
|
75
|
+
};
|
|
76
|
+
}
|
|
77
|
+
/**
|
|
78
|
+
* Partition a candidate stream into the held-out exam vs the rest, applying the
|
|
79
|
+
* three filter layers cheapest-first. `candidates` MUST already be ordered (use
|
|
80
|
+
* {@link stratifyByRepo}); selection order is the iteration order and is frozen.
|
|
81
|
+
*
|
|
82
|
+
* Resumable: if `deps.getCachedMeasurement` is provided, an already-measured
|
|
83
|
+
* candidate replays from cache (no inference, no budget cost), so re-running the
|
|
84
|
+
* same command resumes — the `maxCandidates` budget bounds only NEW measurements
|
|
85
|
+
* per invocation, letting a long screen proceed in budget-sized chunks. The
|
|
86
|
+
* selection decision (caps, held-out) is always recomputed fresh, so the cached
|
|
87
|
+
* measurements stay valid even if `heldOutCount`/`perRepoCap` change.
|
|
88
|
+
*/
|
|
89
|
+
export async function screenBaseFailures(candidates, deps, opts) {
|
|
90
|
+
const log = deps.log ?? (() => { });
|
|
91
|
+
const heldOut = [];
|
|
92
|
+
const screened = [];
|
|
93
|
+
const perRepo = new Map();
|
|
94
|
+
let liveMeasured = 0;
|
|
95
|
+
for (const task of candidates) {
|
|
96
|
+
if (heldOut.length >= opts.heldOutCount)
|
|
97
|
+
break;
|
|
98
|
+
const repo = repoOf(task);
|
|
99
|
+
const base = { instance_id: task.instance_id, repo, basePasses: 0, proverPassed: null };
|
|
100
|
+
// Measure (from cache, or live — bounded by the per-invocation budget).
|
|
101
|
+
let m = deps.getCachedMeasurement?.(task.instance_id);
|
|
102
|
+
if (!m) {
|
|
103
|
+
if (liveMeasured >= opts.maxCandidates)
|
|
104
|
+
break; // budget bounds NEW (inference-spending) measurements
|
|
105
|
+
m = await measureCandidate(task, deps, opts.R);
|
|
106
|
+
liveMeasured += 1;
|
|
107
|
+
deps.recordMeasurement?.(task.instance_id, m);
|
|
108
|
+
}
|
|
109
|
+
// Decide from the measurement (always fresh; cap/diversity not cached).
|
|
110
|
+
if (!m.gradeable) {
|
|
111
|
+
screened.push({ ...base, baseRuns: 0, gradeable: false, heldOut: false, reason: 'not-gradeable' });
|
|
112
|
+
continue;
|
|
113
|
+
}
|
|
114
|
+
if (m.baseUnscorable) {
|
|
115
|
+
if (m.baseUnscorableReason)
|
|
116
|
+
log(`[screen] ${task.instance_id} base-unscorable: ${m.baseUnscorableReason}`);
|
|
117
|
+
screened.push({ ...base, baseRuns: m.baseRuns, gradeable: true, heldOut: false, reason: 'base-unscorable', ...(m.baseUnscorableReason ? { unscorableReason: m.baseUnscorableReason } : {}) });
|
|
118
|
+
continue;
|
|
119
|
+
}
|
|
120
|
+
if (m.basePasses > 0) {
|
|
121
|
+
screened.push({ ...base, baseRuns: m.baseRuns, basePasses: m.basePasses, gradeable: true, heldOut: false, reason: 'base-passes' });
|
|
122
|
+
continue;
|
|
123
|
+
}
|
|
124
|
+
// Base 0/R → prover outcome (layer 3).
|
|
125
|
+
if (m.proverPassed !== true) {
|
|
126
|
+
if (m.proverPassed === null && m.proverUnscorableReason)
|
|
127
|
+
log(`[screen] ${task.instance_id} prover-unscorable: ${m.proverUnscorableReason}`);
|
|
128
|
+
screened.push({
|
|
129
|
+
...base, baseRuns: m.baseRuns, gradeable: true, proverPassed: m.proverPassed, heldOut: false, reason: 'no-headroom',
|
|
130
|
+
...(m.proverPassed === null && m.proverUnscorableReason ? { unscorableReason: m.proverUnscorableReason } : {}),
|
|
131
|
+
});
|
|
132
|
+
continue;
|
|
133
|
+
}
|
|
134
|
+
if ((perRepo.get(repo) ?? 0) >= opts.perRepoCap) {
|
|
135
|
+
screened.push({ ...base, baseRuns: m.baseRuns, gradeable: true, proverPassed: true, heldOut: false, reason: 'per-repo-cap' });
|
|
136
|
+
continue;
|
|
137
|
+
}
|
|
138
|
+
perRepo.set(repo, (perRepo.get(repo) ?? 0) + 1);
|
|
139
|
+
heldOut.push({ instance_id: task.instance_id, repo, baseRuns: m.baseRuns });
|
|
140
|
+
screened.push({ ...base, baseRuns: m.baseRuns, gradeable: true, proverPassed: true, heldOut: true, reason: 'held-out' });
|
|
141
|
+
log(`[screen] held out ${task.instance_id} (${heldOut.length}/${opts.heldOutCount})`);
|
|
142
|
+
}
|
|
143
|
+
return { heldOut, screened };
|
|
144
|
+
}
|
|
145
|
+
const V2_SLATE_COMMENT = 'BASELINE-FAILURE REGRESSION BENCHMARK (issue #986). Screened: gradeable at the current ' +
|
|
146
|
+
'evalSemanticsVersion AND base claude-code/Haiku frozen fails 0/R (R≥3) AND a stronger Codex/GPT-5.5 ' +
|
|
147
|
+
'prover passes ≥1 (proven headroom). Baseline 0% by construction. Held out from the generator train ' +
|
|
148
|
+
'stream via the active-slate-version union. Content-addressed; scores comparable WITHIN this version only.';
|
|
149
|
+
export function buildV2SlateFile(instanceIds, generatedAt) {
|
|
150
|
+
const artifact = {
|
|
151
|
+
schemaVersion: HELD_OUT_SLATE_SCHEMA_VERSION,
|
|
152
|
+
solverType: 'swe-rebench-v2.v1',
|
|
153
|
+
version: 'v2',
|
|
154
|
+
generatedAt,
|
|
155
|
+
instanceIds: [...instanceIds].sort((a, b) => a.localeCompare(b)),
|
|
156
|
+
};
|
|
157
|
+
return { comment: V2_SLATE_COMMENT, ...artifact, hash: hashHeldOutSlateArtifact(artifact) };
|
|
158
|
+
}
|
|
159
|
+
//# sourceMappingURL=screen.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"screen.js","sourceRoot":"","sources":["../../src/eval/screen.ts"],"names":[],"mappings":"AACA,OAAO,EACL,6BAA6B,EAC7B,wBAAwB,GAEzB,MAAM,mDAAmD,CAAC;AAE3D;4EAC4E;AAC5E,MAAM,UAAU,MAAM,CAAC,IAAc;IACnC,MAAM,GAAG,GAAG,IAAI,CAAC,WAAW,CAAC,OAAO,CAAC,IAAI,CAAC,CAAC;IAC3C,OAAO,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,WAAW,CAAC,CAAC,CAAC,IAAI,CAAC,WAAW,CAAC,KAAK,CAAC,CAAC,EAAE,GAAG,CAAC,CAAC;AACxE,CAAC;AAED;;;;;GAKG;AACH,MAAM,UAAU,cAAc,CAAC,IAAgB;IAC7C,MAAM,MAAM,GAAG,IAAI,GAAG,EAAsB,CAAC;IAC7C,KAAK,MAAM,IAAI,IAAI,IAAI,EAAE,CAAC;QACxB,MAAM,IAAI,GAAG,MAAM,CAAC,IAAI,CAAC,CAAC;QAC1B,CAAC,MAAM,CAAC,GAAG,CAAC,IAAI,CAAC,IAAI,MAAM,CAAC,GAAG,CAAC,IAAI,EAAE,EAAE,CAAC,CAAC,GAAG,CAAC,IAAI,CAAE,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;IACnE,CAAC;IACD,MAAM,KAAK,GAAG,CAAC,GAAG,MAAM,CAAC,IAAI,EAAE,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,aAAa,CAAC,CAAC,CAAC,CAAC,CAAC;IACpE,KAAK,MAAM,IAAI,IAAI,KAAK,EAAE,CAAC;QACzB,MAAM,CAAC,GAAG,CAAC,IAAI,CAAE,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,WAAW,CAAC,aAAa,CAAC,CAAC,CAAC,WAAW,CAAC,CAAC,CAAC;IAC/E,CAAC;IACD,MAAM,GAAG,GAAe,EAAE,CAAC;IAC3B,IAAI,KAAK,GAAG,IAAI,CAAC;IACjB,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,KAAK,EAAE,CAAC,EAAE,EAAE,CAAC;QAC3B,KAAK,GAAG,KAAK,CAAC;QACd,KAAK,MAAM,IAAI,IAAI,KAAK,EAAE,CAAC;YACzB,MAAM,CAAC,GAAG,MAAM,CAAC,GAAG,CAAC,IAAI,CAAE,CAAC;YAC5B,IAAI,CAAC,GAAG,CAAC,CAAC,MAAM,EAAE,CAAC;gBACjB,GAAG,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,CAAE,CAAC,CAAC;gBAChB,KAAK,GAAG,IAAI,CAAC;YACf,CAAC;QACH,CAAC;IACH,CAAC;IACD,OAAO,GAAG,CAAC;AACb,CAAC;AAgFD;;;;GAIG;AACH,KAAK,UAAU,gBAAgB,CAAC,IAAc,EAAE,IAAgB,EAAE,CAAS;IACzE,MAAM,IAAI,GAAsB,EAAE,SAAS,EAAE,KAAK,EAAE,UAAU,EAAE,CAAC,EAAE,QAAQ,EAAE,CAAC,EAAE,cAAc,EAAE,KAAK,EAAE,SAAS,EAAE,KAAK,EAAE,YAAY,EAAE,IAAI,EAAE,CAAC;IAC9I,IAAI,CAAC,CAAC,MAAM,IAAI,CAAC,eAAe,CAAC,IAAI,CAAC,CAAC;QAAE,OAAO,IAAI,CAAC;IAErD,IAAI,UAAU,GAAG,CAAC,CAAC;IACnB,IAAI,cAAc,GAAG,KAAK,CAAC;IAC3B,IAAI,oBAAwC,CAAC;IAC7C,IAAI,CAAC,GAAG,CAAC,CAAC;IACV,OAAO,CAAC,GAAG,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC;Q
AClB,MAAM,GAAG,GAAG,MAAM,IAAI,CAAC,aAAa,CAAC,IAAI,CAAC,CAAC;QAC3C,IAAI,GAAG,CAAC,MAAM,KAAK,IAAI,EAAE,CAAC;YAAC,cAAc,GAAG,IAAI,CAAC;YAAC,oBAAoB,GAAG,GAAG,CAAC,gBAAgB,CAAC;YAAC,MAAM;QAAC,CAAC;QACvG,IAAI,GAAG,CAAC,MAAM,EAAE,CAAC;YAAC,UAAU,EAAE,CAAC;YAAC,MAAM;QAAC,CAAC;IAC1C,CAAC;IACD,MAAM,QAAQ,GAAG,CAAC,GAAG,CAAC,cAAc,IAAI,UAAU,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;IAChE,IAAI,cAAc,EAAE,CAAC;QACnB,OAAO,EAAE,SAAS,EAAE,IAAI,EAAE,UAAU,EAAE,CAAC,EAAE,QAAQ,EAAE,cAAc,EAAE,IAAI,EAAE,GAAG,CAAC,oBAAoB,CAAC,CAAC,CAAC,EAAE,oBAAoB,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,SAAS,EAAE,KAAK,EAAE,YAAY,EAAE,IAAI,EAAE,CAAC;IAC7K,CAAC;IACD,IAAI,UAAU,GAAG,CAAC,EAAE,CAAC;QACnB,OAAO,EAAE,SAAS,EAAE,IAAI,EAAE,UAAU,EAAE,QAAQ,EAAE,cAAc,EAAE,KAAK,EAAE,SAAS,EAAE,KAAK,EAAE,YAAY,EAAE,IAAI,EAAE,CAAC;IAChH,CAAC;IACD,6EAA6E;IAC7E,MAAM,MAAM,GAAG,MAAM,IAAI,CAAC,eAAe,CAAC,IAAI,CAAC,CAAC;IAChD,OAAO;QACL,SAAS,EAAE,IAAI,EAAE,UAAU,EAAE,CAAC,EAAE,QAAQ,EAAE,cAAc,EAAE,KAAK,EAAE,SAAS,EAAE,IAAI,EAAE,YAAY,EAAE,MAAM,CAAC,MAAM;QAC7G,GAAG,CAAC,MAAM,CAAC,MAAM,KAAK,IAAI,IAAI,MAAM,CAAC,gBAAgB,CAAC,CAAC,CAAC,EAAE,sBAAsB,EAAE,MAAM,CAAC,gBAAgB,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;KAClH,CAAC;AACJ,CAAC;AAED;;;;;;;;;;;GAWG;AACH,MAAM,CAAC,KAAK,UAAU,kBAAkB,CACtC,UAAsB,EACtB,IAAgB,EAChB,IAAgB;IAEhB,MAAM,GAAG,GAAG,IAAI,CAAC,GAAG,IAAI,CAAC,GAAG,EAAE,GAAE,CAAC,CAAC,CAAC;IACnC,MAAM,OAAO,GAA4B,EAAE,CAAC;IAC5C,MAAM,QAAQ,GAAwB,EAAE,CAAC;IACzC,MAAM,OAAO,GAAG,IAAI,GAAG,EAAkB,CAAC;IAC1C,IAAI,YAAY,GAAG,CAAC,CAAC;IAErB,KAAK,MAAM,IAAI,IAAI,UAAU,EAAE,CAAC;QAC9B,IAAI,OAAO,CAAC,MAAM,IAAI,IAAI,CAAC,YAAY;YAAE,MAAM;QAC/C,MAAM,IAAI,GAAG,MAAM,CAAC,IAAI,CAAC,CAAC;QAC1B,MAAM,IAAI,GAAG,EAAE,WAAW,EAAE,IAAI,CAAC,WAAW,EAAE,IAAI,EAAE,UAAU,EAAE,CAAC,EAAE,YAAY,EAAE,IAAsB,EAAE,CAAC;QAE1G,wEAAwE;QACxE,IAAI,CAAC,GAAG,IAAI,CAAC,oBAAoB,EAAE,CAAC,IAAI,CAAC,WAAW,CAAC,CAAC;QACtD,IAAI,CAAC,CAAC,EAAE,CAAC;YACP,IAAI,YAAY,IAAI,IAAI,CAAC,aAAa;gBAAE,MAAM,CAAC,sDAAsD;YACrG,CAAC,GAAG,MAAM,gBAAgB,CAAC,IAAI,EAAE,IAAI,EAAE,IAAI,CAAC,CAAC,CAAC,CAAC;YAC/C,YAAY,IAAI,CAAC,CA
AC;YAClB,IAAI,CAAC,iBAAiB,EAAE,CAAC,IAAI,CAAC,WAAW,EAAE,CAAC,CAAC,CAAC;QAChD,CAAC;QAED,wEAAwE;QACxE,IAAI,CAAC,CAAC,CAAC,SAAS,EAAE,CAAC;YACjB,QAAQ,CAAC,IAAI,CAAC,EAAE,GAAG,IAAI,EAAE,QAAQ,EAAE,CAAC,EAAE,SAAS,EAAE,KAAK,EAAE,OAAO,EAAE,KAAK,EAAE,MAAM,EAAE,eAAe,EAAE,CAAC,CAAC;YACnG,SAAS;QACX,CAAC;QACD,IAAI,CAAC,CAAC,cAAc,EAAE,CAAC;YACrB,IAAI,CAAC,CAAC,oBAAoB;gBAAE,GAAG,CAAC,YAAY,IAAI,CAAC,WAAW,qBAAqB,CAAC,CAAC,oBAAoB,EAAE,CAAC,CAAC;YAC3G,QAAQ,CAAC,IAAI,CAAC,EAAE,GAAG,IAAI,EAAE,QAAQ,EAAE,CAAC,CAAC,QAAQ,EAAE,SAAS,EAAE,IAAI,EAAE,OAAO,EAAE,KAAK,EAAE,MAAM,EAAE,iBAAiB,EAAE,GAAG,CAAC,CAAC,CAAC,oBAAoB,CAAC,CAAC,CAAC,EAAE,gBAAgB,EAAE,CAAC,CAAC,oBAAoB,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,CAAC,CAAC;YAC9L,SAAS;QACX,CAAC;QACD,IAAI,CAAC,CAAC,UAAU,GAAG,CAAC,EAAE,CAAC;YACrB,QAAQ,CAAC,IAAI,CAAC,EAAE,GAAG,IAAI,EAAE,QAAQ,EAAE,CAAC,CAAC,QAAQ,EAAE,UAAU,EAAE,CAAC,CAAC,UAAU,EAAE,SAAS,EAAE,IAAI,EAAE,OAAO,EAAE,KAAK,EAAE,MAAM,EAAE,aAAa,EAAE,CAAC,CAAC;YACnI,SAAS;QACX,CAAC;QACD,uCAAuC;QACvC,IAAI,CAAC,CAAC,YAAY,KAAK,IAAI,EAAE,CAAC;YAC5B,IAAI,CAAC,CAAC,YAAY,KAAK,IAAI,IAAI,CAAC,CAAC,sBAAsB;gBAAE,GAAG,CAAC,YAAY,IAAI,CAAC,WAAW,uBAAuB,CAAC,CAAC,sBAAsB,EAAE,CAAC,CAAC;YAC5I,QAAQ,CAAC,IAAI,CAAC;gBACZ,GAAG,IAAI,EAAE,QAAQ,EAAE,CAAC,CAAC,QAAQ,EAAE,SAAS,EAAE,IAAI,EAAE,YAAY,EAAE,CAAC,CAAC,YAAY,EAAE,OAAO,EAAE,KAAK,EAAE,MAAM,EAAE,aAAa;gBACnH,GAAG,CAAC,CAAC,CAAC,YAAY,KAAK,IAAI,IAAI,CAAC,CAAC,sBAAsB,CAAC,CAAC,CAAC,EAAE,gBAAgB,EAAE,CAAC,CAAC,sBAAsB,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;aAC/G,CAAC,CAAC;YACH,SAAS;QACX,CAAC;QACD,IAAI,CAAC,OAAO,CAAC,GAAG,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,IAAI,IAAI,CAAC,UAAU,EAAE,CAAC;YAChD,QAAQ,CAAC,IAAI,CAAC,EAAE,GAAG,IAAI,EAAE,QAAQ,EAAE,CAAC,CAAC,QAAQ,EAAE,SAAS,EAAE,IAAI,EAAE,YAAY,EAAE,IAAI,EAAE,OAAO,EAAE,KAAK,EAAE,MAAM,EAAE,cAAc,EAAE,CAAC,CAAC;YAC9H,SAAS;QACX,CAAC;QAED,OAAO,CAAC,GAAG,CAAC,IAAI,EAAE,CAAC,OAAO,CAAC,GAAG,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC;QAChD,OAAO,CAAC,IAAI,CAAC,EAAE,WAAW,EAAE,IAAI,CAAC,WAAW,EAAE,IAAI,EAAE,QAAQ,EAAE,CAAC,CAAC,QAAQ,EAAE,CAAC,CAAC;QAC5E,QAAQ,CAAC,IAAI,CA
AC,EAAE,GAAG,IAAI,EAAE,QAAQ,EAAE,CAAC,CAAC,QAAQ,EAAE,SAAS,EAAE,IAAI,EAAE,YAAY,EAAE,IAAI,EAAE,OAAO,EAAE,IAAI,EAAE,MAAM,EAAE,UAAU,EAAE,CAAC,CAAC;QACzH,GAAG,CAAC,qBAAqB,IAAI,CAAC,WAAW,KAAK,OAAO,CAAC,MAAM,IAAI,IAAI,CAAC,YAAY,GAAG,CAAC,CAAC;IACxF,CAAC;IAED,OAAO,EAAE,OAAO,EAAE,QAAQ,EAAE,CAAC;AAC/B,CAAC;AAUD,MAAM,gBAAgB,GACpB,yFAAyF;IACzF,sGAAsG;IACtG,qGAAqG;IACrG,2GAA2G,CAAC;AAE9G,MAAM,UAAU,gBAAgB,CAAC,WAAqB,EAAE,WAAmB;IACzE,MAAM,QAAQ,GAAyB;QACrC,aAAa,EAAE,6BAA6B;QAC5C,UAAU,EAAE,mBAAmB;QAC/B,OAAO,EAAE,IAAI;QACb,WAAW;QACX,WAAW,EAAE,CAAC,GAAG,WAAW,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,aAAa,CAAC,CAAC,CAAC,CAAC;KACjE,CAAC;IACF,OAAO,EAAE,OAAO,EAAE,gBAAgB,EAAE,GAAG,QAAQ,EAAE,IAAI,EAAE,wBAAwB,CAAC,QAAQ,CAAC,EAAE,CAAC;AAC9F,CAAC"}
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Ordinary least-squares slope of resolved-rate vs cycle index for the
|
|
3
|
+
* train-arm slope measurement (issue #822, AC#1).
|
|
4
|
+
*
|
|
5
|
+
* The train-arm e2e evaluates a checkpoint against the held-out slate (#817)
|
|
6
|
+
* at intervals via the eval orchestrator (#818), collecting one
|
|
7
|
+
* `{ cycleIndex, rate }` point per interval (`rate` = passed / scorable, the
|
|
8
|
+
* Wilson point estimate). The slope of the least-squares fit is the headline
|
|
9
|
+
* "is the learner improving across the training sequence" number.
|
|
10
|
+
*
|
|
11
|
+
* It is deliberately a thin helper over the closed-form OLS slope
|
|
12
|
+
* (`cov(x,y) / var(x)`); the per-point confidence intervals come from
|
|
13
|
+
* `wilson.ts` — this module does NOT reimplement them. The slope sign alone is
|
|
14
|
+
* never a verdict at small N: a flat or slightly negative slope is "within
|
|
15
|
+
* noise", which the e2e surfaces via the §4.1 honesty caveat.
|
|
16
|
+
*/
|
|
17
|
+
export interface RatePoint {
|
|
18
|
+
/** Training cycle index the eval ran at (0 = baseline, before any training). */
|
|
19
|
+
cycleIndex: number;
|
|
20
|
+
/** Observed resolved rate at that interval (passed / scorable, in [0, 1]). */
|
|
21
|
+
rate: number;
|
|
22
|
+
}
|
|
23
|
+
/**
|
|
24
|
+
* Least-squares slope of `rate` regressed on `cycleIndex`. Returns 0 for fewer
|
|
25
|
+
* than two points (no line to fit) and for a degenerate fit where every x is
|
|
26
|
+
* identical (zero variance — division would be NaN). A flat sequence yields
|
|
27
|
+
* exactly 0.
|
|
28
|
+
*/
|
|
29
|
+
export declare function leastSquaresSlope(points: RatePoint[]): number;
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Ordinary least-squares slope of resolved-rate vs cycle index for the
|
|
3
|
+
* train-arm slope measurement (issue #822, AC#1).
|
|
4
|
+
*
|
|
5
|
+
* The train-arm e2e evaluates a checkpoint against the held-out slate (#817)
|
|
6
|
+
* at intervals via the eval orchestrator (#818), collecting one
|
|
7
|
+
* `{ cycleIndex, rate }` point per interval (`rate` = passed / scorable, the
|
|
8
|
+
* Wilson point estimate). The slope of the least-squares fit is the headline
|
|
9
|
+
* "is the learner improving across the training sequence" number.
|
|
10
|
+
*
|
|
11
|
+
* It is deliberately a thin helper over the closed-form OLS slope
|
|
12
|
+
* (`cov(x,y) / var(x)`); the per-point confidence intervals come from
|
|
13
|
+
* `wilson.ts` — this module does NOT reimplement them. The slope sign alone is
|
|
14
|
+
* never a verdict at small N: a flat or slightly negative slope is "within
|
|
15
|
+
* noise", which the e2e surfaces via the §4.1 honesty caveat.
|
|
16
|
+
*/
|
|
17
|
+
/**
 * Closed-form ordinary least-squares slope (`cov(x, y) / var(x)`) of `rate`
 * against `cycleIndex`.
 *
 * The result is 0 when fewer than two points are supplied, and when every
 * `cycleIndex` is the same (the x-variance is zero, so there is nothing to
 * divide by). A sequence of equal rates fits a slope of 0, up to
 * floating-point rounding of the mean.
 *
 * @param points - `{ cycleIndex, rate }` samples to fit.
 * @returns The fitted slope, or 0 in the degenerate cases described above.
 */
export function leastSquaresSlope(points) {
    const count = points.length;
    if (count < 2) {
        return 0;
    }
    // Pass 1: coordinate means.
    let xTotal = 0;
    let yTotal = 0;
    for (let i = 0; i < count; i++) {
        xTotal += points[i].cycleIndex;
        yTotal += points[i].rate;
    }
    const xMean = xTotal / count;
    const yMean = yTotal / count;
    // Pass 2: centred sums. The shared 1/n factor cancels in the ratio, so the
    // raw sums are divided directly.
    let covariance = 0;
    let xVariance = 0;
    for (let i = 0; i < count; i++) {
        const xOffset = points[i].cycleIndex - xMean;
        covariance += xOffset * (points[i].rate - yMean);
        xVariance += xOffset * xOffset;
    }
    return xVariance === 0 ? 0 : covariance / xVariance;
}
|
|
46
|
+
//# sourceMappingURL=slope.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"slope.js","sourceRoot":"","sources":["../../src/eval/slope.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;GAeG;AASH;;;;;GAKG;AACH,MAAM,UAAU,iBAAiB,CAAC,MAAmB;IACnD,MAAM,CAAC,GAAG,MAAM,CAAC,MAAM,CAAC;IACxB,IAAI,CAAC,GAAG,CAAC;QAAE,OAAO,CAAC,CAAC;IACpB,IAAI,IAAI,GAAG,CAAC,CAAC;IACb,IAAI,IAAI,GAAG,CAAC,CAAC;IACb,KAAK,MAAM,EAAE,UAAU,EAAE,IAAI,EAAE,IAAI,MAAM,EAAE,CAAC;QAC1C,IAAI,IAAI,UAAU,CAAC;QACnB,IAAI,IAAI,IAAI,CAAC;IACf,CAAC;IACD,MAAM,KAAK,GAAG,IAAI,GAAG,CAAC,CAAC;IACvB,MAAM,KAAK,GAAG,IAAI,GAAG,CAAC,CAAC;IACvB,IAAI,GAAG,GAAG,CAAC,CAAC;IACZ,IAAI,IAAI,GAAG,CAAC,CAAC;IACb,KAAK,MAAM,EAAE,UAAU,EAAE,IAAI,EAAE,IAAI,MAAM,EAAE,CAAC;QAC1C,MAAM,EAAE,GAAG,UAAU,GAAG,KAAK,CAAC;QAC9B,GAAG,IAAI,EAAE,GAAG,CAAC,IAAI,GAAG,KAAK,CAAC,CAAC;QAC3B,IAAI,IAAI,EAAE,GAAG,EAAE,CAAC;IAClB,CAAC;IACD,IAAI,IAAI,KAAK,CAAC;QAAE,OAAO,CAAC,CAAC;IACzB,OAAO,GAAG,GAAG,IAAI,CAAC;AACpB,CAAC"}
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Train-sequence builder with the no-train/test-overlap guard for the
|
|
3
|
+
* train-arm slope e2e (issue #822, AC#2).
|
|
4
|
+
*
|
|
5
|
+
* The learner-full-cycle e2e drives `runCycle` DIRECTLY (it does not post tasks
|
|
6
|
+
* through the generator), so the generator's `excludeHeldOutSlate` train-stream
|
|
7
|
+
* chokepoint (#817) is BYPASSED. This builder is therefore the load-bearing
|
|
8
|
+
* AC#2 mechanism for the e2e: it selects the N distinct training instances from
|
|
9
|
+
* the pool with the held-out slate excluded, and asserts the resulting
|
|
10
|
+
* sequence is disjoint from the slate (fail-loud, never a silent drop).
|
|
11
|
+
*
|
|
12
|
+
* It reuses `excludeHeldOutSlate` from the #817 primitive rather than
|
|
13
|
+
* reimplementing the exclusion. Selection is deterministic (instance-id sorted)
|
|
14
|
+
* so a given pool yields a stable sequence across runs.
|
|
15
|
+
*/
|
|
16
|
+
import type { PoolTask } from '../solver-types/_swe-rebench-v2-pool.js';
|
|
17
|
+
/** Thrown when a chosen training sequence intersects the held-out slate (AC#2). */
|
|
18
|
+
export declare class TrainTestOverlapError extends Error {
|
|
19
|
+
readonly overlap: string[];
|
|
20
|
+
constructor(overlap: string[]);
|
|
21
|
+
}
|
|
22
|
+
/** Assert a set of training ids is disjoint from the slate, else throw loud. */
|
|
23
|
+
export declare function assertNoOverlap(trainIds: string[], slateIds: Set<string>): void;
|
|
24
|
+
export declare function buildTrainSequence(args: {
|
|
25
|
+
pool: PoolTask[];
|
|
26
|
+
slateIds: Set<string>;
|
|
27
|
+
/** Number of distinct training tasks (= N training cycles). */
|
|
28
|
+
count: number;
|
|
29
|
+
/**
|
|
30
|
+
* Optional explicit, hand-picked instance_ids (in order). When set, the
|
|
31
|
+
* builder still runs the no-overlap guard and resolves each id against the
|
|
32
|
+
* pool — used to fail-loud on a hand-edited sequence that overlaps the slate.
|
|
33
|
+
*/
|
|
34
|
+
explicitIds?: string[];
|
|
35
|
+
}): PoolTask[];
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Train-sequence builder with the no-train/test-overlap guard for the
|
|
3
|
+
* train-arm slope e2e (issue #822, AC#2).
|
|
4
|
+
*
|
|
5
|
+
* The learner-full-cycle e2e drives `runCycle` DIRECTLY (it does not post tasks
|
|
6
|
+
* through the generator), so the generator's `excludeHeldOutSlate` train-stream
|
|
7
|
+
* chokepoint (#817) is BYPASSED. This builder is therefore the load-bearing
|
|
8
|
+
* AC#2 mechanism for the e2e: it selects the N distinct training instances from
|
|
9
|
+
* the pool with the held-out slate excluded, and asserts the resulting
|
|
10
|
+
* sequence is disjoint from the slate (fail-loud, never a silent drop).
|
|
11
|
+
*
|
|
12
|
+
* It reuses `excludeHeldOutSlate` from the #817 primitive rather than
|
|
13
|
+
* reimplementing the exclusion. Selection is deterministic (instance-id sorted)
|
|
14
|
+
* so a given pool yields a stable sequence across runs.
|
|
15
|
+
*/
|
|
16
|
+
import { excludeHeldOutSlate } from '../solver-types/_swe-rebench-v2-held-out-slate.js';
|
|
17
|
+
/**
 * Error raised when a training sequence shares instance ids with the held-out
 * slate (AC#2). The offending ids are kept on `overlap` and listed in the
 * message.
 */
export class TrainTestOverlapError extends Error {
    /** The ids found in both the training sequence and the slate. */
    overlap;
    /**
     * @param overlap - Instance ids present in both the training sequence and
     *   the held-out slate; joined with ", " into the error message.
     */
    constructor(overlap) {
        super([
            'train/test overlap: training sequence includes held-out slate instance(s) ',
            `${overlap.join(', ')} — refusing to train on the eval slate (AC#2). `,
            'The slate must stay out-of-sample for the slope to mean anything.',
        ].join(''));
        this.overlap = overlap;
        this.name = 'TrainTestOverlapError';
    }
}
|
|
28
|
+
/**
 * Throw a `TrainTestOverlapError` when any training id is also a slate id.
 *
 * @param trainIds - Candidate training instance ids, checked in order.
 * @param slateIds - Held-out slate ids (anything exposing `has(id)`).
 * @throws TrainTestOverlapError carrying every shared id, in `trainIds` order.
 */
export function assertNoOverlap(trainIds, slateIds) {
    const shared = [];
    for (const id of trainIds) {
        if (slateIds.has(id)) {
            shared.push(id);
        }
    }
    if (shared.length === 0) {
        return;
    }
    throw new TrainTestOverlapError(shared);
}
|
|
34
|
+
/**
 * Build the training sequence for the train-arm e2e, guarded against overlap
 * with the held-out slate.
 *
 * The pool is first passed through `excludeHeldOutSlate` (per the module
 * header, this removes the slate instances — its implementation lives in
 * another module). Then one of two paths runs:
 *
 * - `explicitIds` given: the ids are checked against the slate first, so an
 *   overlapping id surfaces as a `TrainTestOverlapError` rather than a generic
 *   "not in the eligible pool" error. Each id is then resolved, in the given
 *   order, against the eligible pool. `count` is not consulted on this path.
 * - otherwise: the eligible tasks are sorted by `instance_id` and the first
 *   `count` are taken, so a given pool yields the same sequence every run.
 *
 * @param args.pool - Candidate tasks, each with an `instance_id`.
 * @param args.slateIds - Held-out slate instance ids.
 * @param args.count - Number of tasks to pick on the deterministic path; must
 *   be a non-negative integer.
 * @param args.explicitIds - Optional hand-picked instance ids, in order.
 * @returns The selected tasks.
 * @throws TrainTestOverlapError if the chosen ids intersect the slate.
 * @throws RangeError if `count` is not a non-negative integer (deterministic
 *   path only).
 * @throws Error if an explicit id is not in the eligible pool, or fewer than
 *   `count` tasks are eligible.
 */
export function buildTrainSequence(args) {
    const eligible = excludeHeldOutSlate(args.pool, args.slateIds);
    if (args.explicitIds) {
        // Guard the hand-picked sequence against the slate BEFORE resolving, so a
        // slate-overlapping id is a TrainTestOverlapError, not a "not eligible".
        assertNoOverlap(args.explicitIds, args.slateIds);
        // The lookup table is only needed on this path.
        const byId = new Map(eligible.map((t) => [t.instance_id, t]));
        return args.explicitIds.map((id) => {
            const task = byId.get(id);
            if (!task) {
                throw new Error(`explicit training instance ${id} not in the eligible pool`);
            }
            return task;
        });
    }
    // Fail loud on a malformed count: a negative value would make `slice` drop
    // tasks from the END of the list, a fractional value would be truncated, and
    // NaN would yield an empty sequence — all silent.
    if (!Number.isInteger(args.count) || args.count < 0) {
        throw new RangeError(`train sequence count must be a non-negative integer, got ${args.count}`);
    }
    // Deterministic selection: instance-id sorted, first `count`.
    const sorted = [...eligible].sort((a, b) => a.instance_id.localeCompare(b.instance_id));
    if (sorted.length < args.count) {
        throw new Error(`train sequence needs ${args.count} distinct tasks but only ${sorted.length} eligible ` +
            `(pool size ${args.pool.length} minus ${args.slateIds.size} held-out slate instance(s))`);
    }
    const picked = sorted.slice(0, args.count);
    // Belt-and-braces: re-assert disjointness on the final selection.
    assertNoOverlap(picked.map((t) => t.instance_id), args.slateIds);
    return picked;
}
|
|
59
|
+
//# sourceMappingURL=train-sequence.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"train-sequence.js","sourceRoot":"","sources":["../../src/eval/train-sequence.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;GAcG;AAGH,OAAO,EAAE,mBAAmB,EAAE,MAAM,mDAAmD,CAAC;AAExF,mFAAmF;AACnF,MAAM,OAAO,qBAAsB,SAAQ,KAAK;IAClB;IAA5B,YAA4B,OAAiB;QAC3C,KAAK,CACH,4EAA4E;YAC1E,GAAG,OAAO,CAAC,IAAI,CAAC,IAAI,CAAC,iDAAiD;YACtE,mEAAmE,CACtE,CAAC;QALwB,YAAO,GAAP,OAAO,CAAU;QAM3C,IAAI,CAAC,IAAI,GAAG,uBAAuB,CAAC;IACtC,CAAC;CACF;AAED,gFAAgF;AAChF,MAAM,UAAU,eAAe,CAAC,QAAkB,EAAE,QAAqB;IACvE,MAAM,OAAO,GAAG,QAAQ,CAAC,MAAM,CAAC,CAAC,EAAE,EAAE,EAAE,CAAC,QAAQ,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC,CAAC;IAC1D,IAAI,OAAO,CAAC,MAAM,GAAG,CAAC;QAAE,MAAM,IAAI,qBAAqB,CAAC,OAAO,CAAC,CAAC;AACnE,CAAC;AAED,MAAM,UAAU,kBAAkB,CAAC,IAWlC;IACC,MAAM,QAAQ,GAAG,mBAAmB,CAAC,IAAI,CAAC,IAAI,EAAE,IAAI,CAAC,QAAQ,CAAC,CAAC;IAC/D,MAAM,IAAI,GAAG,IAAI,GAAG,CAAC,QAAQ,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,WAAW,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC;IAE9D,IAAI,IAAI,CAAC,WAAW,EAAE,CAAC;QACrB,0EAA0E;QAC1E,yEAAyE;QACzE,eAAe,CAAC,IAAI,CAAC,WAAW,EAAE,IAAI,CAAC,QAAQ,CAAC,CAAC;QACjD,OAAO,IAAI,CAAC,WAAW,CAAC,GAAG,CAAC,CAAC,EAAE,EAAE,EAAE;YACjC,MAAM,IAAI,GAAG,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC;YAC1B,IAAI,CAAC,IAAI,EAAE,CAAC;gBACV,MAAM,IAAI,KAAK,CAAC,8BAA8B,EAAE,2BAA2B,CAAC,CAAC;YAC/E,CAAC;YACD,OAAO,IAAI,CAAC;QACd,CAAC,CAAC,CAAC;IACL,CAAC;IAED,8DAA8D;IAC9D,MAAM,MAAM,GAAG,CAAC,GAAG,QAAQ,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,WAAW,CAAC,aAAa,CAAC,CAAC,CAAC,WAAW,CAAC,CAAC,CAAC;IACxF,IAAI,MAAM,CAAC,MAAM,GAAG,IAAI,CAAC,KAAK,EAAE,CAAC;QAC/B,MAAM,IAAI,KAAK,CACb,wBAAwB,IAAI,CAAC,KAAK,4BAA4B,MAAM,CAAC,MAAM,YAAY;YACrF,cAAc,IAAI,CAAC,IAAI,CAAC,MAAM,UAAU,IAAI,CAAC,QAAQ,CAAC,IAAI,8BAA8B,CAC3F,CAAC;IACJ,CAAC;IACD,MAAM,MAAM,GAAG,MAAM,CAAC,KAAK,CAAC,CAAC,EAAE,IAAI,CAAC,KAAK,CAAC,CAAC;IAC3C,eAAe,CAAC,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,WAAW,CAAC,EAAE,IAAI,CAAC,QAAQ,CAAC,CAAC;IACjE,OAAO,MAAM,CAAC;AAChB,CAAC"}
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Wilson score interval + resolved-rate comparison for the `jinn eval`
|
|
3
|
+
* held-out checkpoint orchestrator (issue #818, AC#2).
|
|
4
|
+
*
|
|
5
|
+
* The Wilson score interval is a binomial proportion confidence interval that
|
|
6
|
+
* behaves well at the extremes (p=0, p=1) and for small n — unlike the naive
|
|
7
|
+
* normal-approximation interval. We write it small (no stats dependency, per
|
|
8
|
+
* repo convention): the formula is ~10 lines.
|
|
9
|
+
*
|
|
10
|
+
* Per log/decisions/2026-05-28-rl-eval-measurement.md §4: this is v1-simple.
|
|
11
|
+
* Only *large* deltas are trustworthy — we encode that as "the child and
|
|
12
|
+
* parent intervals do not overlap." No seed control, no multi-run averaging.
|
|
13
|
+
*/
|
|
14
|
+
export interface Interval {
|
|
15
|
+
/** Observed point estimate, passed / scorable (0 when scorable=0). */
|
|
16
|
+
p: number;
|
|
17
|
+
/** Lower bound, clamped to [0, 1]. */
|
|
18
|
+
lo: number;
|
|
19
|
+
/** Upper bound, clamped to [0, 1]. */
|
|
20
|
+
hi: number;
|
|
21
|
+
}
|
|
22
|
+
/**
|
|
23
|
+
* Wilson score interval for `passed` successes out of `scorable` trials.
|
|
24
|
+
* `scorable === 0` returns a degenerate `{ p: 0, lo: 0, hi: 0 }` (no NaN).
|
|
25
|
+
*/
|
|
26
|
+
export declare function wilsonInterval(passed: number, scorable: number, z?: number): Interval;
|
|
27
|
+
export type RateVerdict = 'trustworthy' | 'within-noise';
|
|
28
|
+
export interface RateComparison {
|
|
29
|
+
child: Interval;
|
|
30
|
+
parent: Interval;
|
|
31
|
+
/** child.p − parent.p (point-estimate difference, can be negative). */
|
|
32
|
+
delta: number;
|
|
33
|
+
/**
|
|
34
|
+
* 'trustworthy' iff the two Wilson intervals do NOT overlap; otherwise
|
|
35
|
+
* 'within-noise'. v1-simple: only disjoint intervals justify a claim.
|
|
36
|
+
*/
|
|
37
|
+
verdict: RateVerdict;
|
|
38
|
+
}
|
|
39
|
+
export declare function compareRates(child: {
|
|
40
|
+
passed: number;
|
|
41
|
+
scorable: number;
|
|
42
|
+
}, parent: {
|
|
43
|
+
passed: number;
|
|
44
|
+
scorable: number;
|
|
45
|
+
}): RateComparison;
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Wilson score interval + resolved-rate comparison for the `jinn eval`
|
|
3
|
+
* held-out checkpoint orchestrator (issue #818, AC#2).
|
|
4
|
+
*
|
|
5
|
+
* The Wilson score interval is a binomial proportion confidence interval that
|
|
6
|
+
* behaves well at the extremes (p=0, p=1) and for small n — unlike the naive
|
|
7
|
+
* normal-approximation interval. We write it small (no stats dependency, per
|
|
8
|
+
* repo convention): the formula is ~10 lines.
|
|
9
|
+
*
|
|
10
|
+
* Per log/decisions/2026-05-28-rl-eval-measurement.md §4: this is v1-simple.
|
|
11
|
+
* Only *large* deltas are trustworthy — we encode that as "the child and
|
|
12
|
+
* parent intervals do not overlap." No seed control, no multi-run averaging.
|
|
13
|
+
*/
|
|
14
|
+
/** Two-sided z for a 95% interval (1.96 ≈ Φ⁻¹(0.975)). */
const DEFAULT_Z = 1.96;
/**
 * Wilson score interval for `passed` successes out of `scorable` trials.
 *
 * Degenerate input returns `{ p: 0, lo: 0, hi: 0 }` instead of NaN: this covers
 * `scorable === 0` as well as a negative or non-finite `scorable` and a
 * non-finite `passed`. The point estimate is clamped into [0, 1], so a
 * `passed` outside `[0, scorable]` cannot make the term under the square root
 * negative (which would turn both bounds into NaN — `Math.max(0, NaN)` is NaN,
 * so the final clamp alone does not protect against it).
 *
 * @param passed - Number of successes.
 * @param scorable - Number of trials.
 * @param z - Two-sided z-score; defaults to the 95% value.
 * @returns `{ p, lo, hi }` with `lo` and `hi` clamped to [0, 1].
 */
export function wilsonInterval(passed, scorable, z = DEFAULT_Z) {
    if (!Number.isFinite(scorable) || scorable <= 0 || !Number.isFinite(passed)) {
        return { p: 0, lo: 0, hi: 0 };
    }
    const n = scorable;
    // Clamp keeps p * (1 - p) non-negative; a no-op for in-range counts.
    const p = Math.min(1, Math.max(0, passed / n));
    const z2 = z * z;
    const denom = 1 + z2 / n;
    const centre = p + z2 / (2 * n);
    const margin = z * Math.sqrt((p * (1 - p)) / n + z2 / (4 * n * n));
    const lo = (centre - margin) / denom;
    const hi = (centre + margin) / denom;
    return {
        p,
        // Rounding can push the bounds a hair outside [0, 1] at p = 0 / p = 1.
        lo: Math.max(0, lo),
        hi: Math.min(1, hi),
    };
}
|
|
37
|
+
/**
 * Compare a child and a parent resolved rate via their Wilson intervals.
 *
 * @param child - `{ passed, scorable }` counts for the child.
 * @param parent - `{ passed, scorable }` counts for the parent.
 * @returns Both intervals, the point-estimate difference `child.p - parent.p`,
 *   and a verdict: 'trustworthy' only when the two intervals are disjoint,
 *   otherwise 'within-noise'.
 */
export function compareRates(child, parent) {
    const childInterval = wilsonInterval(child.passed, child.scorable);
    const parentInterval = wilsonInterval(parent.passed, parent.scorable);
    let verdict = 'within-noise';
    // Disjoint means one interval sits strictly above the other.
    if (childInterval.lo > parentInterval.hi || parentInterval.lo > childInterval.hi) {
        verdict = 'trustworthy';
    }
    return {
        child: childInterval,
        parent: parentInterval,
        delta: childInterval.p - parentInterval.p,
        verdict,
    };
}
|
|
48
|
+
//# sourceMappingURL=wilson.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"wilson.js","sourceRoot":"","sources":["../../src/eval/wilson.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;GAYG;AAEH,0DAA0D;AAC1D,MAAM,SAAS,GAAG,IAAI,CAAC;AAWvB;;;GAGG;AACH,MAAM,UAAU,cAAc,CAAC,MAAc,EAAE,QAAgB,EAAE,IAAY,SAAS;IACpF,IAAI,QAAQ,KAAK,CAAC;QAAE,OAAO,EAAE,CAAC,EAAE,CAAC,EAAE,EAAE,EAAE,CAAC,EAAE,EAAE,EAAE,CAAC,EAAE,CAAC;IAClD,MAAM,CAAC,GAAG,QAAQ,CAAC;IACnB,MAAM,CAAC,GAAG,MAAM,GAAG,CAAC,CAAC;IACrB,MAAM,EAAE,GAAG,CAAC,GAAG,CAAC,CAAC;IACjB,MAAM,KAAK,GAAG,CAAC,GAAG,EAAE,GAAG,CAAC,CAAC;IACzB,MAAM,MAAM,GAAG,CAAC,GAAG,EAAE,GAAG,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC;IAChC,MAAM,MAAM,GAAG,CAAC,GAAG,IAAI,CAAC,IAAI,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,GAAG,CAAC,GAAG,EAAE,GAAG,CAAC,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC;IACnE,MAAM,EAAE,GAAG,CAAC,MAAM,GAAG,MAAM,CAAC,GAAG,KAAK,CAAC;IACrC,MAAM,EAAE,GAAG,CAAC,MAAM,GAAG,MAAM,CAAC,GAAG,KAAK,CAAC;IACrC,OAAO;QACL,CAAC;QACD,EAAE,EAAE,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,EAAE,CAAC;QACnB,EAAE,EAAE,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,EAAE,CAAC;KACpB,CAAC;AACJ,CAAC;AAgBD,MAAM,UAAU,YAAY,CAC1B,KAA2C,EAC3C,MAA4C;IAE5C,MAAM,CAAC,GAAG,cAAc,CAAC,KAAK,CAAC,MAAM,EAAE,KAAK,CAAC,QAAQ,CAAC,CAAC;IACvD,MAAM,CAAC,GAAG,cAAc,CAAC,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,QAAQ,CAAC,CAAC;IACzD,MAAM,QAAQ,GAAG,CAAC,CAAC,EAAE,GAAG,CAAC,CAAC,EAAE,IAAI,CAAC,CAAC,EAAE,GAAG,CAAC,CAAC,EAAE,CAAC;IAC5C,OAAO;QACL,KAAK,EAAE,CAAC;QACR,MAAM,EAAE,CAAC;QACT,KAAK,EAAE,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC;QAChB,OAAO,EAAE,QAAQ,CAAC,CAAC,CAAC,aAAa,CAAC,CAAC,CAAC,cAAc;KACnD,CAAC;AACJ,CAAC"}
|
package/dist/events/types.d.ts
CHANGED
|
@@ -5,7 +5,7 @@
|
|
|
5
5
|
* markers) and consumed by /v1/events SSE + /v1/events/recent JSON endpoints
|
|
6
6
|
* served to the operator SPA. See docs/superpowers/specs/2026-05-01-operator-local-app-design.md.
|
|
7
7
|
*/
|
|
8
|
-
import { z } from 'zod';
|
|
8
|
+
import { z } from 'zod/v3';
|
|
9
9
|
export declare const StructuredEventKindSchema: z.ZodEnum<["intent", "reward", "fleet", "system", "error", "log"]>;
|
|
10
10
|
export type StructuredEventKind = z.infer<typeof StructuredEventKindSchema>;
|
|
11
11
|
export declare const StructuredEventSchema: z.ZodObject<{
|
package/dist/events/types.js
CHANGED
|
@@ -5,7 +5,7 @@
|
|
|
5
5
|
* markers) and consumed by /v1/events SSE + /v1/events/recent JSON endpoints
|
|
6
6
|
* served to the operator SPA. See docs/superpowers/specs/2026-05-01-operator-local-app-design.md.
|
|
7
7
|
*/
|
|
8
|
-
import { z } from 'zod';
|
|
8
|
+
import { z } from 'zod/v3';
|
|
9
9
|
export const StructuredEventKindSchema = z.enum([
|
|
10
10
|
'intent',
|
|
11
11
|
'reward',
|
package/dist/events/types.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"types.js","sourceRoot":"","sources":["../../src/events/types.ts"],"names":[],"mappings":"AAAA;;;;;;GAMG;AACH,OAAO,EAAE,CAAC,EAAE,MAAM,
|
|
1
|
+
{"version":3,"file":"types.js","sourceRoot":"","sources":["../../src/events/types.ts"],"names":[],"mappings":"AAAA;;;;;;GAMG;AACH,OAAO,EAAE,CAAC,EAAE,MAAM,QAAQ,CAAC;AAE3B,MAAM,CAAC,MAAM,yBAAyB,GAAG,CAAC,CAAC,IAAI,CAAC;IAC9C,QAAQ;IACR,QAAQ;IACR,OAAO;IACP,QAAQ;IACR,OAAO;IACP,KAAK;CACN,CAAC,CAAC;AAGH,MAAM,CAAC,MAAM,qBAAqB,GAAG,CAAC,CAAC,MAAM,CAAC;IAC5C,aAAa,EAAE,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC;IAC3B,EAAE,EAAE,CAAC,CAAC,MAAM,EAAE;IACd,EAAE,EAAE,CAAC,CAAC,MAAM,EAAE;IACd,IAAI,EAAE,yBAAyB;IAC/B,OAAO,EAAE,CAAC,CAAC,MAAM,EAAE;IACnB,SAAS,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE;IAChC,MAAM,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE;IAC7B,SAAS,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE;IAChC,OAAO,EAAE,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,OAAO,EAAE,CAAC,CAAC,QAAQ,EAAE;CAC1C,CAAC,CAAC"}
|
|
@@ -22,9 +22,11 @@
|
|
|
22
22
|
* Used for manifest signing: produce a deterministic byte string that two
|
|
23
23
|
* independent parties can reproduce from the same object graph.
|
|
24
24
|
*/
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
25
|
+
// canonicalize v3 is ESM-only (no CJS entry); the client is `"type": "module"`
|
|
26
|
+
// so a native ESM default import is the correct interop (replaces the v2-era
|
|
27
|
+
// createRequire shim). v3 is packaging-only vs v2 — its RFC 8785 output is
|
|
28
|
+
// byte-identical, so existing content hashes / manifest signatures are stable.
|
|
29
|
+
import canonicalize from 'canonicalize';
|
|
28
30
|
/**
|
|
29
31
|
* Recursively replace NaN / ±Infinity with null so that canonicalize does not
|
|
30
32
|
* throw — matching the JSON.stringify behaviour that the rest of the codebase
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"canonical-json.js","sourceRoot":"","sources":["../../../src/harnesses/engine/canonical-json.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;GAuBG;AACH,
|
|
1
|
+
{"version":3,"file":"canonical-json.js","sourceRoot":"","sources":["../../../src/harnesses/engine/canonical-json.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;GAuBG;AACH,+EAA+E;AAC/E,6EAA6E;AAC7E,2EAA2E;AAC3E,+EAA+E;AAC/E,OAAO,YAAY,MAAM,cAAc,CAAC;AAExC;;;;;GAKG;AACH,SAAS,eAAe,CAAC,KAAc;IACrC,IAAI,OAAO,KAAK,KAAK,QAAQ,IAAI,CAAC,QAAQ,CAAC,KAAK,CAAC;QAAE,OAAO,IAAI,CAAC;IAC/D,IAAI,KAAK,CAAC,OAAO,CAAC,KAAK,CAAC;QAAE,OAAO,KAAK,CAAC,GAAG,CAAC,eAAe,CAAC,CAAC;IAC5D,IAAI,KAAK,KAAK,IAAI,IAAI,OAAO,KAAK,KAAK,QAAQ,EAAE,CAAC;QAChD,mEAAmE;QACnE,qEAAqE;QACrE,sEAAsE;QACtE,WAAW;QACX,MAAM,MAAM,GAAI,KAAoC,CAAC,MAAM,CAAC;QAC5D,IAAI,OAAO,MAAM,KAAK,UAAU,EAAE,CAAC;YACjC,OAAO,eAAe,CAAC,MAAM,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC;QAC7C,CAAC;QACD,MAAM,GAAG,GAA4B,EAAE,CAAC;QACxC,KAAK,MAAM,CAAC,CAAC,EAAE,CAAC,CAAC,IAAI,MAAM,CAAC,OAAO,CAAC,KAAK,CAAC;YAAE,GAAG,CAAC,CAAC,CAAC,GAAG,eAAe,CAAC,CAAC,CAAC,CAAC;QACxE,OAAO,GAAG,CAAC;IACb,CAAC;IACD,OAAO,KAAK,CAAC;AACf,CAAC;AAED,MAAM,UAAU,aAAa,CAAC,KAAc;IAC1C,MAAM,OAAO,GAAG,eAAe,CAAC,KAAK,CAAC,CAAC;IAEvC,MAAM,MAAM,GAAG,YAAY,CAAC,OAAO,CAAC,CAAC;IACrC,IAAI,MAAM,KAAK,SAAS,EAAE,CAAC;QACzB,wEAAwE;QACxE,uEAAuE;QACvE,kCAAkC;QAClC,OAAO,MAAM,CAAC;IAChB,CAAC;IACD,OAAO,MAAM,CAAC;AAChB,CAAC"}
|
|
@@ -70,6 +70,10 @@ export interface JoinedSolverNetsView {
|
|
|
70
70
|
} | undefined;
|
|
71
71
|
/** Enumerate all joined manifest CIDs (used for digest-based filtering). */
|
|
72
72
|
manifestCids(): string[];
|
|
73
|
+
/** Add/replace one joined entry live (used by the hot-apply join applier, #1037). */
|
|
74
|
+
set(manifestCid: string, entry: {
|
|
75
|
+
roles: Array<'solver' | 'evaluator'>;
|
|
76
|
+
}): void;
|
|
73
77
|
}
|
|
74
78
|
/**
|
|
75
79
|
* Build a `JoinedSolverNetsView` from the raw operator-config block.
|
|
@@ -82,6 +86,17 @@ export declare function joinedSolverNetsViewFromConfig(joined: Record<string, {
|
|
|
82
86
|
manifestCid: string;
|
|
83
87
|
roles: Array<'solver' | 'evaluator'>;
|
|
84
88
|
}> | undefined): JoinedSolverNetsView | undefined;
|
|
89
|
+
/**
|
|
90
|
+
* Mutable `JoinedSolverNetsView` for the running daemon. Unlike
|
|
91
|
+
* `joinedSolverNetsViewFromConfig` (boot snapshot), the applier
|
|
92
|
+
* (`daemon/join-applier.ts`, #1037) keeps a handle and calls `set()` when a
|
|
93
|
+
* join is hot-applied, so the engine's per-task eligibility check sees the new
|
|
94
|
+
* cid on its next call without a restart.
|
|
95
|
+
*/
|
|
96
|
+
export declare function createMutableJoinedSolverNetsView(initial: Record<string, {
|
|
97
|
+
manifestCid: string;
|
|
98
|
+
roles: Array<'solver' | 'evaluator'>;
|
|
99
|
+
}> | undefined): JoinedSolverNetsView;
|
|
85
100
|
/**
|
|
86
101
|
* Resolves a launched SolverNet manifest by IPFS CID.
|
|
87
102
|
*
|
|
@@ -576,6 +591,15 @@ export declare class TaskEngine {
|
|
|
576
591
|
* TCAttemptAlreadyFinalized, …). We mark the row RACE_LOST and emit a
|
|
577
592
|
* `kind=race_lost` activity event so operators can audit prunes
|
|
578
593
|
* without inflating the FAILED counter (#896).
|
|
594
|
+
* - A transport-transient error (e.g. `AllRpcsFailedError` — every provider
|
|
595
|
+
* in the L2 fallback chain failed at once) on a task whose delivery window
|
|
596
|
+
* is still open: leave the row in its current in-flight state so the next
|
|
597
|
+
* tick re-drives it once the RPCs recover, and emit a `tick_error` (warn)
|
|
598
|
+
* event instead of inflating the FAILED counter. Without this the daemon
|
|
599
|
+
* stamped the row FAILED, dropping it from `getInFlight()` permanently, so
|
|
600
|
+
* L2 work went silent until a manual restart (#912). Past-window transient
|
|
601
|
+
* errors still terminalize to avoid churning on work that can no longer
|
|
602
|
+
* settle.
|
|
579
603
|
* - Everything else: existing markFailed behaviour. When invoked from
|
|
580
604
|
* recovery, `contextLabel === 'recovery'` so the failure_reason
|
|
581
605
|
* carries the `recovery:` prefix the original code path used.
|