@jinn-network/client 0.1.8 → 0.1.9-canary.144d87d2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (397) hide show
  1. package/README.md +6 -0
  2. package/dist/adapters/mech/adapter.d.ts +21 -1
  3. package/dist/adapters/mech/adapter.js +77 -10
  4. package/dist/adapters/mech/adapter.js.map +1 -1
  5. package/dist/adapters/mech/contracts.js +62 -28
  6. package/dist/adapters/mech/contracts.js.map +1 -1
  7. package/dist/adapters/mech/safe-revert.d.ts +4 -0
  8. package/dist/adapters/mech/safe-revert.js +5 -1
  9. package/dist/adapters/mech/safe-revert.js.map +1 -1
  10. package/dist/adapters/mech/safe.js +5 -1
  11. package/dist/adapters/mech/safe.js.map +1 -1
  12. package/dist/adapters/mech/verdict-code.js +1 -1
  13. package/dist/adapters/mech/verdict-code.js.map +1 -1
  14. package/dist/api/bootstrap-endpoint.d.ts +1 -0
  15. package/dist/api/bootstrap-endpoint.js +1 -0
  16. package/dist/api/bootstrap-endpoint.js.map +1 -1
  17. package/dist/api/discovery-endpoint.d.ts +1 -0
  18. package/dist/api/discovery-endpoint.js +24 -0
  19. package/dist/api/discovery-endpoint.js.map +1 -1
  20. package/dist/api/fleet-build.d.ts +1 -7
  21. package/dist/api/fleet-build.js +0 -7
  22. package/dist/api/fleet-build.js.map +1 -1
  23. package/dist/api/gather-status.d.ts +8 -2
  24. package/dist/api/gather-status.js +29 -117
  25. package/dist/api/gather-status.js.map +1 -1
  26. package/dist/api/loop-completion-build.d.ts +79 -0
  27. package/dist/api/loop-completion-build.js +155 -0
  28. package/dist/api/loop-completion-build.js.map +1 -0
  29. package/dist/api/operator-artifacts-endpoint.js +1 -1
  30. package/dist/api/operator-artifacts-endpoint.js.map +1 -1
  31. package/dist/api/peers.js +2 -0
  32. package/dist/api/peers.js.map +1 -1
  33. package/dist/api/setup-endpoints.d.ts +32 -0
  34. package/dist/api/setup-endpoints.js +94 -24
  35. package/dist/api/setup-endpoints.js.map +1 -1
  36. package/dist/api/solvernets-endpoints.js +4 -1
  37. package/dist/api/solvernets-endpoints.js.map +1 -1
  38. package/dist/api/status-build.d.ts +43 -33
  39. package/dist/api/status-build.js +3 -26
  40. package/dist/api/status-build.js.map +1 -1
  41. package/dist/api/status-rollup-build.d.ts +0 -4
  42. package/dist/api/status-rollup-build.js +0 -4
  43. package/dist/api/status-rollup-build.js.map +1 -1
  44. package/dist/api/stop-hook.d.ts +1 -1
  45. package/dist/api/stop-hook.js +1 -1
  46. package/dist/api/stop-hook.js.map +1 -1
  47. package/dist/build-info.json +4 -4
  48. package/dist/build-meta.json +1 -1
  49. package/dist/cli/commands/codedigest-revert-check.js +6 -2
  50. package/dist/cli/commands/codedigest-revert-check.js.map +1 -1
  51. package/dist/cli/commands/doctor.d.ts +3 -0
  52. package/dist/cli/commands/doctor.js +37 -2
  53. package/dist/cli/commands/doctor.js.map +1 -1
  54. package/dist/cli/commands/eval.d.ts +87 -0
  55. package/dist/cli/commands/eval.js +481 -0
  56. package/dist/cli/commands/eval.js.map +1 -0
  57. package/dist/cli/commands/rewards.d.ts +2 -0
  58. package/dist/cli/commands/rewards.js +30 -3
  59. package/dist/cli/commands/rewards.js.map +1 -1
  60. package/dist/cli/commands/solver-nets.js +68 -0
  61. package/dist/cli/commands/solver-nets.js.map +1 -1
  62. package/dist/cli/commands/status.js +0 -1
  63. package/dist/cli/commands/status.js.map +1 -1
  64. package/dist/cli/index.js +2 -0
  65. package/dist/cli/index.js.map +1 -1
  66. package/dist/config.d.ts +102 -15
  67. package/dist/config.js +166 -19
  68. package/dist/config.js.map +1 -1
  69. package/dist/daemon/ai-units-gate.d.ts +6 -6
  70. package/dist/daemon/ai-units-gate.js +11 -10
  71. package/dist/daemon/ai-units-gate.js.map +1 -1
  72. package/dist/daemon/balance-topup-loop.js +3 -0
  73. package/dist/daemon/balance-topup-loop.js.map +1 -1
  74. package/dist/daemon/checkpoint-loop.js +2 -2
  75. package/dist/daemon/creator.d.ts +1 -0
  76. package/dist/daemon/creator.js +26 -14
  77. package/dist/daemon/creator.js.map +1 -1
  78. package/dist/daemon/daemon.d.ts +15 -0
  79. package/dist/daemon/daemon.js +78 -22
  80. package/dist/daemon/daemon.js.map +1 -1
  81. package/dist/daemon/eviction-loop.d.ts +7 -0
  82. package/dist/daemon/eviction-loop.js +19 -3
  83. package/dist/daemon/eviction-loop.js.map +1 -1
  84. package/dist/daemon/jinn-claim-loop.js +3 -0
  85. package/dist/daemon/jinn-claim-loop.js.map +1 -1
  86. package/dist/daemon/join-applier.d.ts +35 -0
  87. package/dist/daemon/join-applier.js +49 -0
  88. package/dist/daemon/join-applier.js.map +1 -0
  89. package/dist/daemon/loop-heartbeat.d.ts +34 -0
  90. package/dist/daemon/loop-heartbeat.js +39 -0
  91. package/dist/daemon/loop-heartbeat.js.map +1 -0
  92. package/dist/daemon/reward-claim-loop.js +4 -1
  93. package/dist/daemon/reward-claim-loop.js.map +1 -1
  94. package/dist/daemon/watchdog-loop.d.ts +84 -0
  95. package/dist/daemon/watchdog-loop.js +91 -0
  96. package/dist/daemon/watchdog-loop.js.map +1 -0
  97. package/dist/dashboard/assets/index-8tAiMbUV.css +1 -0
  98. package/dist/dashboard/assets/index-D6a-DfaM.js +171 -0
  99. package/dist/dashboard/index.html +2 -2
  100. package/dist/discovery/http.d.ts +17 -0
  101. package/dist/discovery/http.js +295 -25
  102. package/dist/discovery/http.js.map +1 -1
  103. package/dist/discovery/onchain.js +155 -1
  104. package/dist/discovery/onchain.js.map +1 -1
  105. package/dist/discovery/types.d.ts +106 -0
  106. package/dist/discovery/types.js +40 -0
  107. package/dist/discovery/types.js.map +1 -1
  108. package/dist/discovery/with-fallback.js +14 -0
  109. package/dist/discovery/with-fallback.js.map +1 -1
  110. package/dist/earning/bootstrap.d.ts +25 -0
  111. package/dist/earning/bootstrap.js +79 -28
  112. package/dist/earning/bootstrap.js.map +1 -1
  113. package/dist/earning/faucet.d.ts +1 -1
  114. package/dist/earning/faucet.js +2 -2
  115. package/dist/earning/faucet.js.map +1 -1
  116. package/dist/earning/safe-adapter.js +11 -0
  117. package/dist/earning/safe-adapter.js.map +1 -1
  118. package/dist/earning/stolas-claim.js +5 -5
  119. package/dist/earning/types.d.ts +1 -1
  120. package/dist/earning/types.js +1 -1
  121. package/dist/earning/types.js.map +1 -1
  122. package/dist/eval/eval-harness-run.d.ts +63 -0
  123. package/dist/eval/eval-harness-run.js +123 -0
  124. package/dist/eval/eval-harness-run.js.map +1 -0
  125. package/dist/eval/orchestrator.d.ts +224 -0
  126. package/dist/eval/orchestrator.js +250 -0
  127. package/dist/eval/orchestrator.js.map +1 -0
  128. package/dist/eval/paired.d.ts +68 -0
  129. package/dist/eval/paired.js +93 -0
  130. package/dist/eval/paired.js.map +1 -0
  131. package/dist/eval/resolve-slate-tasks.d.ts +35 -0
  132. package/dist/eval/resolve-slate-tasks.js +56 -0
  133. package/dist/eval/resolve-slate-tasks.js.map +1 -0
  134. package/dist/eval/screen-discovery.d.ts +22 -0
  135. package/dist/eval/screen-discovery.js +71 -0
  136. package/dist/eval/screen-discovery.js.map +1 -0
  137. package/dist/eval/screen-progress.d.ts +41 -0
  138. package/dist/eval/screen-progress.js +60 -0
  139. package/dist/eval/screen-progress.js.map +1 -0
  140. package/dist/eval/screen-runner.d.ts +30 -0
  141. package/dist/eval/screen-runner.js +289 -0
  142. package/dist/eval/screen-runner.js.map +1 -0
  143. package/dist/eval/screen.d.ts +107 -0
  144. package/dist/eval/screen.js +159 -0
  145. package/dist/eval/screen.js.map +1 -0
  146. package/dist/eval/slope.d.ts +29 -0
  147. package/dist/eval/slope.js +46 -0
  148. package/dist/eval/slope.js.map +1 -0
  149. package/dist/eval/train-sequence.d.ts +35 -0
  150. package/dist/eval/train-sequence.js +59 -0
  151. package/dist/eval/train-sequence.js.map +1 -0
  152. package/dist/eval/wilson.d.ts +45 -0
  153. package/dist/eval/wilson.js +48 -0
  154. package/dist/eval/wilson.js.map +1 -0
  155. package/dist/events/types.d.ts +1 -1
  156. package/dist/events/types.js +1 -1
  157. package/dist/events/types.js.map +1 -1
  158. package/dist/harnesses/engine/canonical-json.js +5 -3
  159. package/dist/harnesses/engine/canonical-json.js.map +1 -1
  160. package/dist/harnesses/engine/engine.d.ts +24 -0
  161. package/dist/harnesses/engine/engine.js +72 -9
  162. package/dist/harnesses/engine/engine.js.map +1 -1
  163. package/dist/harnesses/engine/packaging.js +1 -1
  164. package/dist/harnesses/engine/packaging.js.map +1 -1
  165. package/dist/harnesses/engine/persistence.d.ts +17 -0
  166. package/dist/harnesses/engine/persistence.js +28 -0
  167. package/dist/harnesses/engine/persistence.js.map +1 -1
  168. package/dist/harnesses/impls/claude-mcp-hyperliquid/mcp-tools.d.ts +1 -1
  169. package/dist/harnesses/impls/claude-mcp-hyperliquid/mcp-tools.js +1 -1
  170. package/dist/harnesses/impls/claude-mcp-hyperliquid/mcp-tools.js.map +1 -1
  171. package/dist/harnesses/impls/hermes-agent/adapter.d.ts +2 -0
  172. package/dist/harnesses/impls/hermes-agent/adapter.js +8 -5
  173. package/dist/harnesses/impls/hermes-agent/adapter.js.map +1 -1
  174. package/dist/harnesses/impls/hermes-agent/bootstrap.d.ts +1 -0
  175. package/dist/harnesses/impls/hermes-agent/bootstrap.js +6 -1
  176. package/dist/harnesses/impls/hermes-agent/bootstrap.js.map +1 -1
  177. package/dist/harnesses/impls/hermes-agent/harness.d.ts +17 -3
  178. package/dist/harnesses/impls/hermes-agent/harness.js +68 -5
  179. package/dist/harnesses/impls/hermes-agent/harness.js.map +1 -1
  180. package/dist/harnesses/impls/index.d.ts +2 -0
  181. package/dist/harnesses/impls/index.js +9 -0
  182. package/dist/harnesses/impls/index.js.map +1 -1
  183. package/dist/harnesses/impls/jinn-repo-evaluator/eval-runner.d.ts +34 -0
  184. package/dist/harnesses/impls/jinn-repo-evaluator/eval-runner.js +111 -0
  185. package/dist/harnesses/impls/jinn-repo-evaluator/eval-runner.js.map +1 -0
  186. package/dist/harnesses/impls/jinn-repo-evaluator/evaluator.d.ts +24 -0
  187. package/dist/harnesses/impls/jinn-repo-evaluator/evaluator.js +19 -0
  188. package/dist/harnesses/impls/jinn-repo-evaluator/evaluator.js.map +1 -0
  189. package/dist/harnesses/impls/jinn-repo-evaluator/harness.d.ts +64 -0
  190. package/dist/harnesses/impls/jinn-repo-evaluator/harness.js +125 -0
  191. package/dist/harnesses/impls/jinn-repo-evaluator/harness.js.map +1 -0
  192. package/dist/harnesses/impls/jinn-repo-evaluator/repro.d.ts +32 -0
  193. package/dist/harnesses/impls/jinn-repo-evaluator/repro.js +73 -0
  194. package/dist/harnesses/impls/jinn-repo-evaluator/repro.js.map +1 -0
  195. package/dist/harnesses/impls/learner/adapters/claude-code.js +5 -0
  196. package/dist/harnesses/impls/learner/adapters/claude-code.js.map +1 -1
  197. package/dist/harnesses/impls/learner/harness.d.ts +17 -1
  198. package/dist/harnesses/impls/learner/harness.js +51 -1
  199. package/dist/harnesses/impls/learner/harness.js.map +1 -1
  200. package/dist/harnesses/impls/learner/harvest.d.ts +2 -0
  201. package/dist/harnesses/impls/learner/harvest.js +51 -1
  202. package/dist/harnesses/impls/learner/harvest.js.map +1 -1
  203. package/dist/harnesses/impls/learner/plugin-path.js +1 -0
  204. package/dist/harnesses/impls/learner/plugin-path.js.map +1 -1
  205. package/dist/harnesses/impls/swe-rebench-v2-evaluator/harness.js +3 -1
  206. package/dist/harnesses/impls/swe-rebench-v2-evaluator/harness.js.map +1 -1
  207. package/dist/harnesses/impls/swe-rebench-v2-evaluator/index.d.ts +2 -2
  208. package/dist/harnesses/impls/swe-rebench-v2-evaluator/index.js +3 -1
  209. package/dist/harnesses/impls/swe-rebench-v2-evaluator/index.js.map +1 -1
  210. package/dist/harnesses/readiness-registry.d.ts +10 -0
  211. package/dist/harnesses/readiness-registry.js +13 -0
  212. package/dist/harnesses/readiness-registry.js.map +1 -1
  213. package/dist/harnesses/types.d.ts +14 -0
  214. package/dist/learner/revert-decision.d.ts +16 -1
  215. package/dist/learner/revert-decision.js +38 -18
  216. package/dist/learner/revert-decision.js.map +1 -1
  217. package/dist/learner/revert-stats.d.ts +14 -0
  218. package/dist/learner/revert-stats.js +42 -0
  219. package/dist/learner/revert-stats.js.map +1 -1
  220. package/dist/local-provider-url.d.ts +3 -0
  221. package/dist/local-provider-url.js +28 -0
  222. package/dist/local-provider-url.js.map +1 -0
  223. package/dist/main.js +94 -25
  224. package/dist/main.js.map +1 -1
  225. package/dist/mcp/operator-server.js +1 -1
  226. package/dist/mcp/operator-server.js.map +1 -1
  227. package/dist/mcp/server.js +1 -1
  228. package/dist/mcp/server.js.map +1 -1
  229. package/dist/plugins/learner/.claude-plugin/plugin.json +1 -1
  230. package/dist/plugins/learner/.codex-plugin/plugin.json +1 -1
  231. package/dist/plugins/learner/hooks/session-start +30 -1
  232. package/dist/plugins/learner/skills/learn/consolidator-prompt.md +4 -0
  233. package/dist/preflight/deployment-readiness.d.ts +147 -0
  234. package/dist/preflight/deployment-readiness.js +366 -0
  235. package/dist/preflight/deployment-readiness.js.map +1 -0
  236. package/dist/preflight/pidfile-liveness.d.ts +7 -1
  237. package/dist/preflight/pidfile-liveness.js +14 -0
  238. package/dist/preflight/pidfile-liveness.js.map +1 -1
  239. package/dist/rpc/transport.d.ts +43 -5
  240. package/dist/rpc/transport.js +131 -30
  241. package/dist/rpc/transport.js.map +1 -1
  242. package/dist/scripts/swe-rebench-v2-seed-pool.json +2 -1
  243. package/dist/solver-nets/registry.d.ts +19 -0
  244. package/dist/solver-nets/registry.js +95 -66
  245. package/dist/solver-nets/registry.js.map +1 -1
  246. package/dist/solver-types/_jinn-repo-pool.d.ts +27 -0
  247. package/dist/solver-types/_jinn-repo-pool.js +27 -0
  248. package/dist/solver-types/_jinn-repo-pool.js.map +1 -0
  249. package/dist/solver-types/_swe-rebench-v2-held-out-slate.d.ts +76 -0
  250. package/dist/solver-types/_swe-rebench-v2-held-out-slate.js +156 -0
  251. package/dist/solver-types/_swe-rebench-v2-held-out-slate.js.map +1 -0
  252. package/dist/solver-types/_swe-rebench-v2-pool-recovery.d.ts +81 -0
  253. package/dist/solver-types/_swe-rebench-v2-pool-recovery.js +116 -0
  254. package/dist/solver-types/_swe-rebench-v2-pool-recovery.js.map +1 -0
  255. package/dist/solver-types/_swe-rebench-v2-state.d.ts +9 -0
  256. package/dist/solver-types/_swe-rebench-v2-state.js +14 -0
  257. package/dist/solver-types/_swe-rebench-v2-state.js.map +1 -1
  258. package/dist/solver-types/_swe-rebench-v2-validated-pool.d.ts +30 -0
  259. package/dist/solver-types/_swe-rebench-v2-validated-pool.js +40 -0
  260. package/dist/solver-types/_swe-rebench-v2-validated-pool.js.map +1 -1
  261. package/dist/solver-types/index.js +2 -0
  262. package/dist/solver-types/index.js.map +1 -1
  263. package/dist/solver-types/jinn-repo-admit.d.ts +17 -0
  264. package/dist/solver-types/jinn-repo-admit.js +16 -0
  265. package/dist/solver-types/jinn-repo-admit.js.map +1 -0
  266. package/dist/solver-types/jinn-repo-auto.d.ts +60 -0
  267. package/dist/solver-types/jinn-repo-auto.js +163 -0
  268. package/dist/solver-types/jinn-repo-auto.js.map +1 -0
  269. package/dist/solver-types/jinn-repo-definition.d.ts +15 -0
  270. package/dist/solver-types/jinn-repo-definition.js +34 -0
  271. package/dist/solver-types/jinn-repo-definition.js.map +1 -0
  272. package/dist/solver-types/jinn-repo-extract.d.ts +16 -0
  273. package/dist/solver-types/jinn-repo-extract.js +32 -0
  274. package/dist/solver-types/jinn-repo-extract.js.map +1 -0
  275. package/dist/solver-types/jinn-repo.d.ts +21 -0
  276. package/dist/solver-types/jinn-repo.js +23 -0
  277. package/dist/solver-types/jinn-repo.js.map +1 -0
  278. package/dist/solver-types/learner-loop-test.js +1 -1
  279. package/dist/solver-types/learner-loop-test.js.map +1 -1
  280. package/dist/solver-types/slates/held-out-slate.swe-rebench-v2.v1.json +20 -0
  281. package/dist/solver-types/slates/held-out-slate.swe-rebench-v2.v2.json +19 -0
  282. package/dist/solver-types/slates/held-out-slate.swe-rebench-v2.v2.screening-report.json +628 -0
  283. package/dist/solver-types/solver-type.d.ts +8 -0
  284. package/dist/solver-types/swe-rebench-v2.d.ts +2 -0
  285. package/dist/solver-types/swe-rebench-v2.js +115 -10
  286. package/dist/solver-types/swe-rebench-v2.js.map +1 -1
  287. package/dist/solvernets/launched-record-dispatcher.d.ts +5 -0
  288. package/dist/solvernets/launched-record-dispatcher.js +8 -1
  289. package/dist/solvernets/launched-record-dispatcher.js.map +1 -1
  290. package/dist/solvernets/registry-client-erc8004.js +29 -37
  291. package/dist/solvernets/registry-client-erc8004.js.map +1 -1
  292. package/dist/solvernets/registry-client.d.ts +6 -0
  293. package/dist/solvernets/store.d.ts +1 -1
  294. package/dist/solvernets/store.js +8 -3
  295. package/dist/solvernets/store.js.map +1 -1
  296. package/dist/spend/ai-units-config.d.ts +10 -0
  297. package/dist/spend/ai-units-config.js +7 -1
  298. package/dist/spend/ai-units-config.js.map +1 -1
  299. package/dist/spend/ai-units.d.ts +51 -0
  300. package/dist/spend/ai-units.js +73 -0
  301. package/dist/spend/ai-units.js.map +1 -1
  302. package/dist/spend/record.js +12 -5
  303. package/dist/spend/record.js.map +1 -1
  304. package/dist/store/store.d.ts +91 -5
  305. package/dist/store/store.js +170 -7
  306. package/dist/store/store.js.map +1 -1
  307. package/dist/trajectory/harness-bundle-schema.d.ts +1 -1
  308. package/dist/trajectory/harness-bundle-schema.js +1 -1
  309. package/dist/trajectory/harness-bundle-schema.js.map +1 -1
  310. package/dist/trajectory/schema.d.ts +1 -1
  311. package/dist/trajectory/schema.js +1 -1
  312. package/dist/trajectory/schema.js.map +1 -1
  313. package/dist/trajectory/transcript-parsers/types.d.ts +1 -1
  314. package/dist/trajectory/transcript-parsers/types.js +1 -1
  315. package/dist/trajectory/transcript-parsers/types.js.map +1 -1
  316. package/dist/types/envelope.d.ts +1 -1
  317. package/dist/types/envelope.js +1 -1
  318. package/dist/types/envelope.js.map +1 -1
  319. package/dist/types/payloads/index.d.ts +1 -1
  320. package/dist/types/payloads/index.js +7 -1
  321. package/dist/types/payloads/index.js.map +1 -1
  322. package/dist/types/payloads/portfolio-v0.d.ts +1 -1
  323. package/dist/types/payloads/portfolio-v0.js +1 -1
  324. package/dist/types/payloads/portfolio-v0.js.map +1 -1
  325. package/dist/types/payloads/prediction-apy-v0.d.ts +1 -1
  326. package/dist/types/payloads/prediction-apy-v0.js +1 -1
  327. package/dist/types/payloads/prediction-apy-v0.js.map +1 -1
  328. package/dist/types/payloads/prediction-v0.d.ts +1 -1
  329. package/dist/types/payloads/prediction-v0.js +1 -1
  330. package/dist/types/payloads/prediction-v0.js.map +1 -1
  331. package/dist/types/portfolio.d.ts +1 -1
  332. package/dist/types/portfolio.js +1 -1
  333. package/dist/types/portfolio.js.map +1 -1
  334. package/dist/types/prediction-apy.d.ts +1 -1
  335. package/dist/types/prediction-apy.js +1 -1
  336. package/dist/types/prediction-apy.js.map +1 -1
  337. package/dist/types/prediction.d.ts +1 -1
  338. package/dist/types/prediction.js +1 -1
  339. package/dist/types/prediction.js.map +1 -1
  340. package/dist/types/session-provenance.d.ts +1 -1
  341. package/dist/types/session-provenance.js +1 -1
  342. package/dist/types/session-provenance.js.map +1 -1
  343. package/dist/types/task-document.d.ts +1 -1
  344. package/dist/types/task-document.js +1 -1
  345. package/dist/types/task-document.js.map +1 -1
  346. package/dist/types/task.d.ts +1 -1
  347. package/dist/types/task.js +1 -1
  348. package/dist/types/task.js.map +1 -1
  349. package/dist/types/window.d.ts +1 -1
  350. package/dist/types/window.js +1 -1
  351. package/dist/types/window.js.map +1 -1
  352. package/dist/vendor/@jinn-network/sdk/dist/checkpoint.d.ts +1 -1
  353. package/dist/vendor/@jinn-network/sdk/dist/checkpoint.js +1 -1
  354. package/dist/vendor/@jinn-network/sdk/dist/contracts.d.ts +3 -2
  355. package/dist/vendor/@jinn-network/sdk/dist/contracts.js +49 -0
  356. package/dist/vendor/@jinn-network/sdk/dist/jinn-repo.d.ts +44 -0
  357. package/dist/vendor/@jinn-network/sdk/dist/jinn-repo.js +25 -0
  358. package/dist/vendor/@jinn-network/sdk/dist/json-schema.d.ts +1 -1
  359. package/dist/vendor/@jinn-network/sdk/dist/json-schema.js +1 -1
  360. package/dist/vendor/@jinn-network/sdk/dist/payloads/jinn-repo.d.ts +38 -0
  361. package/dist/vendor/@jinn-network/sdk/dist/payloads/jinn-repo.js +22 -0
  362. package/dist/vendor/@jinn-network/sdk/dist/payloads/prediction-v1.d.ts +1 -1
  363. package/dist/vendor/@jinn-network/sdk/dist/payloads/prediction-v1.js +1 -1
  364. package/dist/vendor/@jinn-network/sdk/dist/payloads/session-derived.d.ts +1 -1
  365. package/dist/vendor/@jinn-network/sdk/dist/payloads/session-derived.js +1 -1
  366. package/dist/vendor/@jinn-network/sdk/dist/payloads/swe-rebench-v2.d.ts +109 -2
  367. package/dist/vendor/@jinn-network/sdk/dist/payloads/swe-rebench-v2.js +26 -2
  368. package/dist/vendor/@jinn-network/sdk/dist/prediction-v1.d.ts +1 -1
  369. package/dist/vendor/@jinn-network/sdk/dist/prediction-v1.js +1 -1
  370. package/dist/vendor/@jinn-network/sdk/dist/solvernets/jinn-repo.d.ts +4 -0
  371. package/dist/vendor/@jinn-network/sdk/dist/solvernets/jinn-repo.js +2 -0
  372. package/dist/vendor/@jinn-network/sdk/dist/solvernets/manifest-schema.d.ts +1 -1
  373. package/dist/vendor/@jinn-network/sdk/dist/solvernets/manifest-schema.js +1 -1
  374. package/dist/vendor/@jinn-network/sdk/dist/solvernets/swe-rebench-v2-held-out-slate.d.ts +65 -0
  375. package/dist/vendor/@jinn-network/sdk/dist/solvernets/swe-rebench-v2-held-out-slate.js +123 -0
  376. package/dist/vendor/@jinn-network/sdk/dist/solvernets/swe-rebench-v2.d.ts +2 -2
  377. package/dist/vendor/@jinn-network/sdk/dist/solvernets/swe-rebench-v2.js +1 -1
  378. package/dist/vendor/@jinn-network/sdk/dist/swe-rebench-v2.d.ts +1 -1
  379. package/dist/vendor/@jinn-network/sdk/dist/swe-rebench-v2.js +1 -1
  380. package/dist/vendor/@jinn-network/sdk/package.json +9 -1
  381. package/docker-compose.yml +3 -2
  382. package/package.json +23 -20
  383. package/plugins/jinn-repo-runtime/.claude-plugin/plugin.json +5 -0
  384. package/plugins/jinn-repo-runtime/.codex-plugin/plugin.json +39 -0
  385. package/plugins/jinn-repo-runtime/README.md +27 -0
  386. package/plugins/jinn-repo-runtime/hooks/hooks.json +16 -0
  387. package/plugins/jinn-repo-runtime/hooks/session-start +73 -0
  388. package/plugins/jinn-repo-runtime/jinn.plugin.json +11 -0
  389. package/plugins/jinn-repo-runtime/skills/task/SKILL.md +92 -0
  390. package/plugins/learner/.claude-plugin/plugin.json +1 -1
  391. package/plugins/learner/.codex-plugin/plugin.json +1 -1
  392. package/plugins/learner/hooks/session-start +30 -1
  393. package/plugins/learner/skills/learn/consolidator-prompt.md +4 -0
  394. package/plugins/swe-rebench-v2-runtime/hooks/hooks.json +16 -0
  395. package/plugins/swe-rebench-v2-runtime/hooks/session-start +74 -0
  396. package/dist/dashboard/assets/index-CzKxvMcU.css +0 -32
  397. package/dist/dashboard/assets/index-yVemxHot.js +0 -351
@@ -0,0 +1,250 @@
1
+ /**
2
+ * `jinn eval` held-out checkpoint orchestrator (issue #818).
3
+ *
4
+ * Runs a held-out slate against a checkpoint in FROZEN mode, persists per-task
5
+ * pass/fail, and emits a Wilson-CI resolved-rate comparison vs the parent
6
+ * checkpoint. Every external boundary is constructor-injected — this is the
7
+ * thin slice #819 drives a deterministic slate against with NO Docker/IPFS.
8
+ *
9
+ * Acceptance criteria:
10
+ * AC#1 — run the slate frozen, write per-task pass/fail.
11
+ * AC#2 — emit a resolved-rate comparison vs the parent with a CI.
12
+ * AC#3 — the freeze-fence holds: a `runHarnessOnce` `{ violation }` throws
13
+ * LOUD and the instance is NOT recorded (no implStateDir mutation
14
+ * slips through; enforcement lives in `runHarnessWithFreezeFence`).
15
+ *
16
+ * Per log/decisions/2026-05-28-rl-eval-measurement.md §4: v1-simple. Only large
17
+ * deltas are trustworthy (disjoint Wilson intervals). No seed control, no
18
+ * multi-run averaging.
19
+ */
20
+ import { compareRates } from './wilson.js';
21
+ import { comparePaired } from './paired.js';
22
/**
 * Raised when `runHarnessOnce` reports a freeze-fence violation (AC#3).
 *
 * The id of the offending slate instance is kept on `instanceId` so callers
 * can tell which run tainted the eval.
 */
export class FreezeFenceViolationError extends Error {
    instanceId;
    /**
     * @param instanceId id of the slate instance whose run violated the fence.
     */
    constructor(instanceId) {
        // Message fragments are joined with a single space.
        const message = [
            `freeze-fence violation on instance ${instanceId}: the harness mutated implStateDir`,
            'during a frozen-mode eval run — refusing to record a tainted result',
        ].join(' ');
        super(message);
        Object.assign(this, { instanceId, name: 'FreezeFenceViolationError' });
    }
}
32
/**
 * Raised when the codeDigest of the locally evaluated impl-state differs from
 * the codeDigest in the named checkpoint's manifest (C1).
 *
 * Prevents results from being persisted and compared under a checkpoint name
 * while the operator's local impl-state has drifted away from that checkpoint.
 */
export class CheckpointStateMismatchError extends Error {
    checkpointCid;
    manifestCodeDigest;
    evaluatedCodeDigest;
    /**
     * @param checkpointCid checkpoint the eval was run under.
     * @param manifestCodeDigest codeDigest declared by the checkpoint manifest.
     * @param evaluatedCodeDigest codeDigest the evaluated impl-state hashed to.
     */
    constructor(checkpointCid, manifestCodeDigest, evaluatedCodeDigest) {
        // Message fragments are joined with a single space.
        const message = [
            `local impl-state does not match checkpoint ${checkpointCid}:`,
            `manifest codeDigest=${manifestCodeDigest} but the evaluated impl-state hashes to`,
            `${evaluatedCodeDigest}. Check out the impl-state for this checkpoint or pass the`,
            'correct --impl-state-dir; refusing to record results under a codeDigest that is a lie',
        ].join(' ');
        super(message);
        Object.assign(this, {
            checkpointCid,
            manifestCodeDigest,
            evaluatedCodeDigest,
            name: 'CheckpointStateMismatchError',
        });
    }
}
53
/**
 * Raised when the parent checkpoint has no aggregate recorded for the slate
 * version being compared (AC#2).
 */
export class ParentNotEvaluatedError extends Error {
    parentCheckpointCid;
    slateVersion;
    /**
     * @param parentCheckpointCid parent checkpoint that lacks eval results.
     * @param slateVersion slate version the comparison was requested for.
     */
    constructor(parentCheckpointCid, slateVersion) {
        // Message fragments are joined with a single space.
        const message = [
            `parent checkpoint ${parentCheckpointCid} has no eval results for slate ${slateVersion} —`,
            'eval the parent checkpoint first (scores are only comparable within a slate version)',
        ].join(' ');
        super(message);
        Object.assign(this, {
            parentCheckpointCid,
            slateVersion,
            name: 'ParentNotEvaluatedError',
        });
    }
}
65
/**
 * Raised when either compared checkpoint carries results scored on different
 * slate *content* under the same version label (DR-2026-05-28 §2 — confounder
 * control).
 *
 * Scores are comparable only within identical slate content, yet the store
 * keys aggregates by version. A content edit that skipped the version bump,
 * or stale rows surviving `recordEvalResult`'s by-instance upsert, would
 * otherwise let two checkpoints scored on different task sets be compared
 * silently (confounder #1, task-selection). The comparison is refused loudly
 * instead.
 */
export class SlateHashMismatchError extends Error {
    slateVersion;
    currentSlateHash;
    drifted;
    /**
     * @param slateVersion version label shared by the conflicting results.
     * @param currentSlateHash hash of the slate being evaluated now.
     * @param drifted entries of `{ checkpointCid, hashes }` listing, per
     *   checkpoint, the recorded slate hashes that differ from the current one.
     */
    constructor(slateVersion, currentSlateHash, drifted) {
        // One "<cid> carries [<hashes>]" fragment per drifted checkpoint.
        const fragments = [];
        for (const { checkpointCid, hashes } of drifted) {
            fragments.push(`${checkpointCid} carries [${hashes.join(', ')}]`);
        }
        const detail = fragments.join('; ');
        // Message fragments are joined with a single space.
        const message = [
            'a checkpoint was evaluated against a different slate content under the same version label',
            `${slateVersion}: current slate hash is ${currentSlateHash} but ${detail}. Same version must`,
            'mean the same exam — a slate content change is a measurement discontinuity that must bump the',
            'version. Bump the slate version and re-evaluate; refusing to report a delta across two',
            'different task sets (confounder #1, task-selection)',
        ].join(' ');
        super(message);
        Object.assign(this, {
            slateVersion,
            currentSlateHash,
            drifted,
            name: 'SlateHashMismatchError',
        });
    }
}
95
/**
 * Default swe-rebench-v2 mapping from a resolved slate entry to the instance
 * shape the eval loop consumes. Keeps the exact harness `task` envelope and
 * grade inputs the orchestrator used before the backend seam existed.
 *
 * @param rt resolved slate entry; `rt.task` is the SweRebenchV2Task and
 *   `rt.row` is passed through unchanged (it may be undefined).
 * @returns an object with `instance_id`, `harnessTask`, `gradeTask`,
 *   `solutionSchemaVersion` and `row`.
 */
function sweRebenchSlateInstance(rt) {
    const { task, row } = rt;
    const ONE_HOUR_MS = 3_600_000;
    // The full task travels in `spec`, together with the `solverType`
    // dispatch key, so the harness can restore the repo (clone/checkout from
    // spec.repo/base_commit); id/description/role alone are insufficient.
    const harnessTask = {
        id: task.instance_id,
        description: task.problem_statement,
        role: 'restoration',
        solverType: 'swe-rebench-v2.v1',
        spec: task,
        // Window starts at 0 and closes one hour after the current time.
        window: { startTs: 0, endTs: Date.now() + ONE_HOUR_MS },
    };
    return {
        instance_id: task.instance_id,
        harnessTask,
        gradeTask: task,
        solutionSchemaVersion: 'swe-rebench-v2-solution.v1',
        row,
    };
}
118
+ export async function runEval(args) {
119
+ const { deps, slate, checkpointCid, parentCheckpointCid } = args;
120
+ const slateInstances = args.slateInstances ?? (args.tasksWithRows ?? []).map(sweRebenchSlateInstance);
121
+ // Confounder guard (DR-2026-05-28 §2). The store keys aggregates by slate
122
+ // VERSION; if EITHER checkpoint has results recorded under a different slate
123
+ // CONTENT for that version — a content edit that skipped the version bump, or
124
+ // stale rows surviving recordEvalResult's by-instance upsert — comparing
125
+ // child-vs-parent reintroduces confounder #1 (task-selection: two scores over
126
+ // different task sets). Both arms' hashes are known up front, so detect drift
127
+ // in EITHER before the loop and refuse — never burn N× real spend producing a
128
+ // number we'd have to throw away.
129
+ const drifted = [parentCheckpointCid, checkpointCid]
130
+ .map((cid) => ({
131
+ checkpointCid: cid,
132
+ hashes: deps.store.getEvalSlateHashes(cid, slate.version).filter((hash) => hash !== slate.hash),
133
+ }))
134
+ .filter((arm) => arm.hashes.length > 0);
135
+ if (drifted.length > 0) {
136
+ throw new SlateHashMismatchError(slate.version, slate.hash, drifted);
137
+ }
138
+ // Hoist the impl-state-dir fetch outside the loop: every slate instance runs
139
+ // against the SAME checkpoint state.
140
+ const implStateDir = args.implStateDir ?? `/tmp/jinn-eval-${checkpointCid}`;
141
+ const localImplStateDir = await deps.fetchImplStateDirToLocal(args.checkpointManifest.implStateDirCid, implStateDir);
142
+ const perTask = [];
143
+ const runAtMs = Date.now();
144
+ let digestChecked = false;
145
+ // Real digest of the evaluated impl-state (from the freeze-fence), captured on
146
+ // the first run for provenance. Falls back to the manifest digest, which the
147
+ // C1 guard proves equal.
148
+ let evaluatedDigest = args.checkpointManifest.codeDigest;
149
+ for (const inst of slateInstances) {
150
+ const task = inst.gradeTask;
151
+ let passed;
152
+ let unscorable = false;
153
+ let logExcerpt = '';
154
+ try {
155
+ const run = await deps.runHarnessOnce({
156
+ harness: deps.harness,
157
+ implStateDir: localImplStateDir,
158
+ mode: 'frozen',
159
+ // The harness task is built by the BACKEND (eval.ts) and is already
160
+ // leak-controlled (jinn-repo hands the solver `solverView(item)` only —
161
+ // no gold tests / reference solution). swe-rebench-v2 carries the full
162
+ // task in `spec` so the harness can restore the repo.
163
+ task: inst.harnessTask,
164
+ });
165
+ // AC#3: a freeze violation is loud and terminal — do NOT record this
166
+ // instance (its implStateDir mutation taints the run). Thrown OUTSIDE the
167
+ // unscorable catch below so it propagates and aborts the whole eval.
168
+ if (run.violation) {
169
+ throw new FreezeFenceViolationError(inst.instance_id);
170
+ }
171
+ if (!run.solution) {
172
+ throw new Error(`harness produced no solution for instance ${inst.instance_id}`);
173
+ }
174
+ // C1: on the FIRST successful run, verify the real digest of the evaluated
175
+ // impl-state (computed by the freeze-fence) matches the named checkpoint's
176
+ // manifest. Fail fast — before grading any instance — to avoid Docker spend
177
+ // and to never persist a result under a codeDigest that is a lie.
178
+ if (!digestChecked) {
179
+ digestChecked = true;
180
+ const runDigest = run.envelope?.executor.codeDigest;
181
+ if (runDigest && runDigest !== args.checkpointManifest.codeDigest) {
182
+ throw new CheckpointStateMismatchError(checkpointCid, args.checkpointManifest.codeDigest, runDigest);
183
+ }
184
+ if (runDigest)
185
+ evaluatedDigest = runDigest;
186
+ }
187
+ const verdict = await deps.evaluator.grade({
188
+ task,
189
+ solutionPayload: { schemaVersion: inst.solutionSchemaVersion, patch: run.solution.patch },
190
+ ...(inst.row !== undefined ? { row: inst.row } : {}),
191
+ });
192
+ passed = verdict.passed_match;
193
+ logExcerpt = verdict.test_log.slice(0, 1000);
194
+ }
195
+ catch (err) {
196
+ // A freeze-fence violation taints the run and must stay a LOUD terminal
197
+ // abort (AC#3). Likewise the C1 digest mismatch — never record results
198
+ // under a codeDigest that is a lie. Both re-throw and abort the eval.
199
+ if (err instanceof FreezeFenceViolationError || err instanceof CheckpointStateMismatchError) {
200
+ throw err;
201
+ }
202
+ // Everything else is unscorable: excluded from the denominator, NEVER
203
+ // coerced to a fail (#476). This covers both grade-side failures
204
+ // (EvalCouldNotGradeError / InsufficientDiskError) and harness-run
205
+ // failures (Defect A — harvest missing-artifact, "produced no patch",
206
+ // clone/timeout, etc.); a harness/infra failure to produce a gradeable
207
+ // solution is environment-side, not an agent capability fail. The slate
208
+ // continues to the next instance.
209
+ passed = null;
210
+ unscorable = true;
211
+ logExcerpt = (err instanceof Error ? err.message : String(err)).slice(0, 1000);
212
+ }
213
+ deps.store.recordEvalResult({
214
+ checkpoint_cid: checkpointCid,
215
+ slate_hash: slate.hash,
216
+ slate_version: slate.version,
217
+ instance_id: inst.instance_id,
218
+ passed,
219
+ unscorable,
220
+ code_digest: args.checkpointManifest.codeDigest,
221
+ run_at_ms: runAtMs,
222
+ test_log_excerpt: logExcerpt,
223
+ });
224
+ perTask.push({ instance_id: inst.instance_id, passed, unscorable });
225
+ }
226
+ // AC#2: compare child vs parent at the SAME slate version. A parent with no
227
+ // rows is a hard error — no cross-version compare, eval the parent first.
228
+ const parentAgg = deps.store.getEvalAggregate(parentCheckpointCid, slate.version);
229
+ if (parentAgg.scorable === 0 && parentAgg.unscorable === 0) {
230
+ throw new ParentNotEvaluatedError(parentCheckpointCid, slate.version);
231
+ }
232
+ const childAgg = deps.store.getEvalAggregate(checkpointCid, slate.version);
233
+ const comparison = compareRates({ passed: childAgg.passed, scorable: childAgg.scorable }, { passed: parentAgg.passed, scorable: parentAgg.scorable });
234
+ // Paired (matched-design) verdict — same slate scored before & after, so the
235
+ // matched McNemar test is the correct, higher-power one. Additive: only when
236
+ // the store exposes per-instance results (DR-2026-06-02-b §2a).
237
+ let paired;
238
+ if (deps.store.getEvalResults) {
239
+ const parentResults = deps.store.getEvalResults(parentCheckpointCid, slate.version);
240
+ const childResults = deps.store.getEvalResults(checkpointCid, slate.version);
241
+ paired = comparePaired(parentResults, childResults);
242
+ }
243
+ return {
244
+ perTask,
245
+ comparison,
246
+ ...(paired ? { paired } : {}),
247
+ evaluated: { implStateDir: localImplStateDir, codeDigest: evaluatedDigest, matchedCheckpoint: true },
248
+ };
249
+ }
250
+ //# sourceMappingURL=orchestrator.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"orchestrator.js","sourceRoot":"","sources":["../../src/eval/orchestrator.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;GAkBG;AAOH,OAAO,EAAE,YAAY,EAAuB,MAAM,aAAa,CAAC;AAChE,OAAO,EAAE,aAAa,EAA2C,MAAM,aAAa,CAAC;AAErF,4EAA4E;AAC5E,MAAM,OAAO,yBAA0B,SAAQ,KAAK;IACtB;IAA5B,YAA4B,UAAkB;QAC5C,KAAK,CACH,sCAAsC,UAAU,qCAAqC;YACnF,qEAAqE,CACxE,CAAC;QAJwB,eAAU,GAAV,UAAU,CAAQ;QAK5C,IAAI,CAAC,IAAI,GAAG,2BAA2B,CAAC;IAC1C,CAAC;CACF;AAED;;;;;GAKG;AACH,MAAM,OAAO,4BAA6B,SAAQ,KAAK;IAEnC;IACA;IACA;IAHlB,YACkB,aAAqB,EACrB,kBAA0B,EAC1B,mBAA2B;QAE3C,KAAK,CACH,8CAA8C,aAAa,IAAI;YAC7D,uBAAuB,kBAAkB,0CAA0C;YACnF,GAAG,mBAAmB,6DAA6D;YACnF,uFAAuF,CAC1F,CAAC;QATc,kBAAa,GAAb,aAAa,CAAQ;QACrB,uBAAkB,GAAlB,kBAAkB,CAAQ;QAC1B,wBAAmB,GAAnB,mBAAmB,CAAQ;QAQ3C,IAAI,CAAC,IAAI,GAAG,8BAA8B,CAAC;IAC7C,CAAC;CACF;AAED,wFAAwF;AACxF,MAAM,OAAO,uBAAwB,SAAQ,KAAK;IACpB;IAA6C;IAAzE,YAA4B,mBAA2B,EAAkB,YAAoB;QAC3F,KAAK,CACH,qBAAqB,mBAAmB,kCAAkC,YAAY,KAAK;YACzF,sFAAsF,CACzF,CAAC;QAJwB,wBAAmB,GAAnB,mBAAmB,CAAQ;QAAkB,iBAAY,GAAZ,YAAY,CAAQ;QAK3F,IAAI,CAAC,IAAI,GAAG,yBAAyB,CAAC;IACxC,CAAC;CACF;AASD;;;;;;;;;;GAUG;AACH,MAAM,OAAO,sBAAuB,SAAQ,KAAK;IAE7B;IACA;IACA;IAHlB,YACkB,YAAoB,EACpB,gBAAwB,EACxB,OAAsC;QAEtD,MAAM,MAAM,GAAG,OAAO;aACnB,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,GAAG,CAAC,CAAC,aAAa,aAAa,CAAC,CAAC,MAAM,CAAC,IAAI,CAAC,IAAI,CAAC,GAAG,CAAC;aACjE,IAAI,CAAC,IAAI,CAAC,CAAC;QACd,KAAK,CACH,4FAA4F;YAC1F,GAAG,YAAY,2BAA2B,gBAAgB,QAAQ,MAAM,sBAAsB;YAC9F,gGAAgG;YAChG,yFAAyF;YACzF,qDAAqD,CACxD,CAAC;QAbc,iBAAY,GAAZ,YAAY,CAAQ;QACpB,qBAAgB,GAAhB,gBAAgB,CAAQ;QACxB,YAAO,GAAP,OAAO,CAA+B;QAYtD,IAAI,CAAC,IAAI,GAAG,wBAAwB,CAAC;IACvC,CAAC;CACF;AAgID;;;GAGG;AACH,SAAS,uBAAuB,CAAC,EAAqB;IACpD,OAAO;QACL,WAAW,EAAE,EAAE,CAAC,IAAI,CAAC,WAAW;QAChC,4EAA4E;QAC5E,2DAA2D;QAC3D,sEAAsE;QACtE,WAAW,EAAE;YACX,EAAE,EAAE,EAAE,CAAC,IAAI,CAAC,WAAW;YACvB,WAAW,EAAE,EAAE,CAAC,IAAI,CAAC,iBAAiB;YACtC,IAAI,EAAE,aAAa;YACnB,UAAU,EAAE,mBAAmB;YAC/B,IAAI,EAAE,EAAE,CAAC,IAA0C;YACnD,MAAM,EAAE,EAAE,OAAO,EAAE,CAAC,EAAE,KAAK,EAAE,IAAI,CAAC,GAAG,EAAE,GAAG,SAAS,E
AAE;SACtD;QACD,SAAS,EAAE,EAAE,CAAC,IAAI;QAClB,qBAAqB,EAAE,4BAA4B;QACnD,GAAG,EAAE,EAAE,CAAC,GAAG;KACZ,CAAC;AACJ,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,OAAO,CAAC,IAiB7B;IACC,MAAM,EAAE,IAAI,EAAE,KAAK,EAAE,aAAa,EAAE,mBAAmB,EAAE,GAAG,IAAI,CAAC;IACjE,MAAM,cAAc,GAClB,IAAI,CAAC,cAAc,IAAI,CAAC,IAAI,CAAC,aAAa,IAAI,EAAE,CAAC,CAAC,GAAG,CAAC,uBAAuB,CAAC,CAAC;IAEjF,0EAA0E;IAC1E,6EAA6E;IAC7E,8EAA8E;IAC9E,yEAAyE;IACzE,8EAA8E;IAC9E,8EAA8E;IAC9E,8EAA8E;IAC9E,kCAAkC;IAClC,MAAM,OAAO,GAAG,CAAC,mBAAmB,EAAE,aAAa,CAAC;SACjD,GAAG,CAAC,CAAC,GAAG,EAAE,EAAE,CAAC,CAAC;QACb,aAAa,EAAE,GAAG;QAClB,MAAM,EAAE,IAAI,CAAC,KAAK,CAAC,kBAAkB,CAAC,GAAG,EAAE,KAAK,CAAC,OAAO,CAAC,CAAC,MAAM,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,IAAI,KAAK,KAAK,CAAC,IAAI,CAAC;KAChG,CAAC,CAAC;SACF,MAAM,CAAC,CAAC,GAAG,EAAE,EAAE,CAAC,GAAG,CAAC,MAAM,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC;IAC1C,IAAI,OAAO,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QACvB,MAAM,IAAI,sBAAsB,CAAC,KAAK,CAAC,OAAO,EAAE,KAAK,CAAC,IAAI,EAAE,OAAO,CAAC,CAAC;IACvE,CAAC;IAED,6EAA6E;IAC7E,qCAAqC;IACrC,MAAM,YAAY,GAAG,IAAI,CAAC,YAAY,IAAI,kBAAkB,aAAa,EAAE,CAAC;IAC5E,MAAM,iBAAiB,GAAG,MAAM,IAAI,CAAC,wBAAwB,CAC3D,IAAI,CAAC,kBAAkB,CAAC,eAAe,EACvC,YAAY,CACb,CAAC;IAEF,MAAM,OAAO,GAAoB,EAAE,CAAC;IACpC,MAAM,OAAO,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC;IAC3B,IAAI,aAAa,GAAG,KAAK,CAAC;IAC1B,+EAA+E;IAC/E,6EAA6E;IAC7E,yBAAyB;IACzB,IAAI,eAAe,GAAG,IAAI,CAAC,kBAAkB,CAAC,UAAU,CAAC;IAEzD,KAAK,MAAM,IAAI,IAAI,cAAc,EAAE,CAAC;QAClC,MAAM,IAAI,GAAG,IAAI,CAAC,SAAS,CAAC;QAC5B,IAAI,MAAsB,CAAC;QAC3B,IAAI,UAAU,GAAG,KAAK,CAAC;QACvB,IAAI,UAAU,GAAG,EAAE,CAAC;QACpB,IAAI,CAAC;YACH,MAAM,GAAG,GAAG,MAAM,IAAI,CAAC,cAAc,CAAC;gBACpC,OAAO,EAAE,IAAI,CAAC,OAAO;gBACrB,YAAY,EAAE,iBAAiB;gBAC/B,IAAI,EAAE,QAAQ;gBACd,oEAAoE;gBACpE,wEAAwE;gBACxE,uEAAuE;gBACvE,sDAAsD;gBACtD,IAAI,EAAE,IAAI,CAAC,WAAW;aACvB,CAAC,CAAC;YAEH,qEAAqE;YACrE,0EAA0E;YAC1E,qEAAqE;YACrE,IAAI,GAAG,CAAC,SAAS,EAAE,CAAC;gBAClB,MAAM,IAAI,yBAAyB,CAAC,IAAI,CAAC,WAAW,CAAC,CAAC;YACxD,CAAC;YACD,IAAI,CAAC,GAAG,CAAC,QAAQ,EAAE,CAAC;gBAClB,MAAM,IAAI,KAAK,CAAC,6CAA6C,IAAI,CAAC,WAAW,EAAE,CAAC,CAAC;YACnF,CAAC;YAED,
2EAA2E;YAC3E,2EAA2E;YAC3E,4EAA4E;YAC5E,kEAAkE;YAClE,IAAI,CAAC,aAAa,EAAE,CAAC;gBACnB,aAAa,GAAG,IAAI,CAAC;gBACrB,MAAM,SAAS,GAAG,GAAG,CAAC,QAAQ,EAAE,QAAQ,CAAC,UAAU,CAAC;gBACpD,IAAI,SAAS,IAAI,SAAS,KAAK,IAAI,CAAC,kBAAkB,CAAC,UAAU,EAAE,CAAC;oBAClE,MAAM,IAAI,4BAA4B,CACpC,aAAa,EACb,IAAI,CAAC,kBAAkB,CAAC,UAAU,EAClC,SAAS,CACV,CAAC;gBACJ,CAAC;gBACD,IAAI,SAAS;oBAAE,eAAe,GAAG,SAAS,CAAC;YAC7C,CAAC;YAED,MAAM,OAAO,GAAG,MAAM,IAAI,CAAC,SAAS,CAAC,KAAK,CAAC;gBACzC,IAAI;gBACJ,eAAe,EAAE,EAAE,aAAa,EAAE,IAAI,CAAC,qBAAqB,EAAE,KAAK,EAAE,GAAG,CAAC,QAAQ,CAAC,KAAK,EAAE;gBACzF,GAAG,CAAC,IAAI,CAAC,GAAG,KAAK,SAAS,CAAC,CAAC,CAAC,EAAE,GAAG,EAAE,IAAI,CAAC,GAAG,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;aACrD,CAAC,CAAC;YACH,MAAM,GAAG,OAAO,CAAC,YAAY,CAAC;YAC9B,UAAU,GAAG,OAAO,CAAC,QAAQ,CAAC,KAAK,CAAC,CAAC,EAAE,IAAI,CAAC,CAAC;QAC/C,CAAC;QAAC,OAAO,GAAG,EAAE,CAAC;YACb,wEAAwE;YACxE,uEAAuE;YACvE,sEAAsE;YACtE,IAAI,GAAG,YAAY,yBAAyB,IAAI,GAAG,YAAY,4BAA4B,EAAE,CAAC;gBAC5F,MAAM,GAAG,CAAC;YACZ,CAAC;YACD,sEAAsE;YACtE,iEAAiE;YACjE,mEAAmE;YACnE,sEAAsE;YACtE,uEAAuE;YACvE,wEAAwE;YACxE,kCAAkC;YAClC,MAAM,GAAG,IAAI,CAAC;YACd,UAAU,GAAG,IAAI,CAAC;YAClB,UAAU,GAAG,CAAC,GAAG,YAAY,KAAK,CAAC,CAAC,CAAC,GAAG,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,EAAE,IAAI,CAAC,CAAC;QACjF,CAAC;QAED,IAAI,CAAC,KAAK,CAAC,gBAAgB,CAAC;YAC1B,cAAc,EAAE,aAAa;YAC7B,UAAU,EAAE,KAAK,CAAC,IAAI;YACtB,aAAa,EAAE,KAAK,CAAC,OAAO;YAC5B,WAAW,EAAE,IAAI,CAAC,WAAW;YAC7B,MAAM;YACN,UAAU;YACV,WAAW,EAAE,IAAI,CAAC,kBAAkB,CAAC,UAAU;YAC/C,SAAS,EAAE,OAAO;YAClB,gBAAgB,EAAE,UAAU;SAC7B,CAAC,CAAC;QAEH,OAAO,CAAC,IAAI,CAAC,EAAE,WAAW,EAAE,IAAI,CAAC,WAAW,EAAE,MAAM,EAAE,UAAU,EAAE,CAAC,CAAC;IACtE,CAAC;IAED,4EAA4E;IAC5E,0EAA0E;IAC1E,MAAM,SAAS,GAAG,IAAI,CAAC,KAAK,CAAC,gBAAgB,CAAC,mBAAmB,EAAE,KAAK,CAAC,OAAO,CAAC,CAAC;IAClF,IAAI,SAAS,CAAC,QAAQ,KAAK,CAAC,IAAI,SAAS,CAAC,UAAU,KAAK,CAAC,EAAE,CAAC;QAC3D,MAAM,IAAI,uBAAuB,CAAC,mBAAmB,EAAE,KAAK,CAAC,OAAO,CAAC,CAAC;IACxE,CAAC;IACD,MAAM,QAAQ,GAAG,IAAI,CAAC,KAAK,CAAC,gBAAgB,CAAC,aAAa,EAAE,KAAK,CAAC,OAAO,CAAC,CAAC;IAE3E,MAAM,UAAU,GAAG,
YAAY,CAC7B,EAAE,MAAM,EAAE,QAAQ,CAAC,MAAM,EAAE,QAAQ,EAAE,QAAQ,CAAC,QAAQ,EAAE,EACxD,EAAE,MAAM,EAAE,SAAS,CAAC,MAAM,EAAE,QAAQ,EAAE,SAAS,CAAC,QAAQ,EAAE,CAC3D,CAAC;IAEF,6EAA6E;IAC7E,6EAA6E;IAC7E,gEAAgE;IAChE,IAAI,MAAoC,CAAC;IACzC,IAAI,IAAI,CAAC,KAAK,CAAC,cAAc,EAAE,CAAC;QAC9B,MAAM,aAAa,GAAG,IAAI,CAAC,KAAK,CAAC,cAAc,CAAC,mBAAmB,EAAE,KAAK,CAAC,OAAO,CAAC,CAAC;QACpF,MAAM,YAAY,GAAG,IAAI,CAAC,KAAK,CAAC,cAAc,CAAC,aAAa,EAAE,KAAK,CAAC,OAAO,CAAC,CAAC;QAC7E,MAAM,GAAG,aAAa,CAAC,aAAa,EAAE,YAAY,CAAC,CAAC;IACtD,CAAC;IAED,OAAO;QACL,OAAO;QACP,UAAU;QACV,GAAG,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,MAAM,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;QAC7B,SAAS,EAAE,EAAE,YAAY,EAAE,iBAAiB,EAAE,UAAU,EAAE,eAAe,EAAE,iBAAiB,EAAE,IAAI,EAAE;KACrG,CAAC;AACJ,CAAC"}
@@ -0,0 +1,68 @@
1
+ /**
2
+ * Paired (matched-design) comparison for the held-out exam.
3
+ *
4
+ * The exam scores the SAME slate against the before- and after-checkpoints, so
5
+ * the two result sets are MATCHED, not independent. The marginal test in
6
+ * `wilson.ts` (`compareRates`) compares two INDEPENDENT Wilson intervals and
7
+ * calls a delta trustworthy only when they are disjoint. On a matched design
8
+ * that is statistically wasteful: it carries the full between-instance
9
+ * difficulty variance ("this bug is just hard") in BOTH intervals, which swamps
10
+ * a real, consistent within-instance improvement. Example: a learner that flips
11
+ * 3 hard instances fail→pass and regresses none moves 4/10→7/10 — two Wilson
12
+ * intervals ([17,69]% vs [40,89]%) overlap → "within-noise", even though every
13
+ * observed change was an improvement.
14
+ *
15
+ * McNemar's test is the textbook test for matched binary data: it looks ONLY at
16
+ * the discordant pairs (instances that flipped) and asks whether the flips are
17
+ * asymmetric beyond chance. This is NOT a weaker bar — it is the CORRECT test
18
+ * for the design, and it still gates the claim on significance (exact two-sided
19
+ * p < alpha) AND on the improvement direction (more fail→pass than pass→fail).
20
+ * It is reported ALONGSIDE the conservative marginal verdict, never replacing it
21
+ * — so the exam is strengthened, never weakened (DR-2026-06-02-b §2a).
22
+ *
23
+ * R=1 (one run per instance) ships here: McNemar exact on per-instance flips.
24
+ * R>1 (per-instance pass RATES → Wilcoxon signed-rank / paired bootstrap) layers
25
+ * on once `eval_results` supports appended runs (DR-2026-06-02-b §2b).
26
+ */
27
+ /** Minimal per-instance result (subset of PerTaskResult / EvalResultRow). */
28
+ export interface PairedInput {
29
+ instance_id: string; // key used to match the same instance across the before/after result sets
30
+ /** null when unscorable. */
31
+ passed: boolean | null;
32
+ unscorable: boolean; // when true the instance never forms a pair (counted as excluded by comparePaired)
33
+ }
34
+ export type PairedVerdict = 'trustworthy' | 'within-noise';
35
+ export interface PairedComparison {
36
+ /** Instances scorable (passed !== null) in BOTH arms — the matched pairs. */
37
+ pairs: number;
38
+ /** b: before fail → after pass (improvements). */
39
+ improved: number;
40
+ /** c: before pass → after fail (regressions). */
41
+ regressed: number;
42
+ concordantPass: number; // matched pairs that passed in both arms (no flip)
43
+ concordantFail: number; // matched pairs that failed in both arms (no flip)
44
+ /** Instances unscorable/absent in either arm — excluded from the paired test. */
45
+ excluded: number;
46
+ /** Exact two-sided McNemar p-value over the discordant pairs. */
47
+ pValue: number;
48
+ /** 'trustworthy' iff pValue < alpha AND improved > regressed; else 'within-noise'. */
49
+ verdict: PairedVerdict;
50
+ }
51
+ /**
52
+ * Exact two-sided McNemar p-value for discordant counts `b` (improvements) and
53
+ * `c` (regressions). Under H0 each discordant pair is an independent fair coin,
54
+ * so the count of one direction is Binomial(n=b+c, 0.5); the two-sided p is
55
+ * `2 * P(X <= min(b,c))`, capped at 1. No `b==c==0` division (returns 1: no
56
+ * evidence). Computed iteratively (term ratio) so it is exact and overflow-free
57
+ * for any slate size.
58
+ */
59
+ export declare function mcnemarExact(b: number, c: number): number;
60
+ /**
61
+ * Paired McNemar comparison of two matched per-instance result sets. An instance
62
+ * contributes a pair ONLY when it is scorable (passed !== null, not unscorable)
63
+ * in BOTH arms — an unscorable/missing side is excluded (honest: a disk-skip or
64
+ * harness failure is never coerced into a flip). `alpha` defaults to 0.05.
65
+ */
66
+ export declare function comparePaired(before: readonly PairedInput[], after: readonly PairedInput[], opts?: {
67
+ alpha?: number;
68
+ }): PairedComparison;
@@ -0,0 +1,93 @@
1
+ /**
2
+ * Paired (matched-design) comparison for the held-out exam.
3
+ *
4
+ * The exam scores the SAME slate against the before- and after-checkpoints, so
5
+ * the two result sets are MATCHED, not independent. The marginal test in
6
+ * `wilson.ts` (`compareRates`) compares two INDEPENDENT Wilson intervals and
7
+ * calls a delta trustworthy only when they are disjoint. On a matched design
8
+ * that is statistically wasteful: it carries the full between-instance
9
+ * difficulty variance ("this bug is just hard") in BOTH intervals, which swamps
10
+ * a real, consistent within-instance improvement. Example: a learner that flips
11
+ * 3 hard instances fail→pass and regresses none moves 4/10→7/10 — two Wilson
12
+ * intervals ([17,69]% vs [40,89]%) overlap → "within-noise", even though every
13
+ * observed change was an improvement.
14
+ *
15
+ * McNemar's test is the textbook test for matched binary data: it looks ONLY at
16
+ * the discordant pairs (instances that flipped) and asks whether the flips are
17
+ * asymmetric beyond chance. This is NOT a weaker bar — it is the CORRECT test
18
+ * for the design, and it still gates the claim on significance (exact two-sided
19
+ * p < alpha) AND on the improvement direction (more fail→pass than pass→fail).
20
+ * It is reported ALONGSIDE the conservative marginal verdict, never replacing it
21
+ * — so the exam is strengthened, never weakened (DR-2026-06-02-b §2a).
22
+ *
23
+ * R=1 (one run per instance) ships here: McNemar exact on per-instance flips.
24
+ * R>1 (per-instance pass RATES → Wilcoxon signed-rank / paired bootstrap) layers
25
+ * on once `eval_results` supports appended runs (DR-2026-06-02-b §2b).
26
+ */
27
/**
 * Exact two-sided McNemar p-value for discordant counts `b` (improvements) and
 * `c` (regressions). Under H0 each discordant pair is an independent fair coin,
 * so the count of one direction is Binomial(n=b+c, 0.5); the two-sided p is
 * `2 * P(X <= min(b,c))`, capped at 1. No `b==c==0` division (returns 1: no
 * evidence).
 *
 * The binomial terms are built iteratively from `prob_0 = 0.5^n` via
 * `prob_i = prob_{i-1} * (n-i+1) / i`. For large slates `0.5^n` is not
 * representable as a normal double (it reaches exactly 0 once n > 1074), which
 * would zero out every later term and yield a bogus p-value of 0 — i.e. a
 * false "significant" result. Above that range the same recurrence is carried
 * in log-space instead, so terms that matter are recovered and only genuinely
 * negligible tail terms round to 0.
 *
 * @param {number} b - Count of fail→pass flips (non-negative integer).
 * @param {number} c - Count of pass→fail flips (non-negative integer).
 * @returns {number} Two-sided p-value in [0, 1].
 */
export function mcnemarExact(b, c) {
    const n = b + c;
    if (n === 0)
        return 1;
    const k = Math.min(b, c);
    // 2^-1022 is the smallest NORMAL double; up to here the direct recurrence
    // is exact enough and is kept bit-for-bit identical to the historic result.
    const MAX_DIRECT_N = 1022;
    let cdf;
    if (n <= MAX_DIRECT_N) {
        // sum_{i=0}^{k} C(n,i) * 0.5^n, via prob_0 = 0.5^n, prob_i = prob_{i-1}*(n-i+1)/i.
        let term = Math.pow(0.5, n); // i = 0
        cdf = term;
        for (let i = 1; i <= k; i++) {
            term = (term * (n - i + 1)) / i;
            cdf += term;
        }
    }
    else {
        // Same recurrence on log(prob_i): avoids the 0.5^n underflow that would
        // otherwise collapse the whole sum to 0.
        let logTerm = -n * Math.LN2; // log(0.5^n), i = 0
        cdf = Math.exp(logTerm);
        for (let i = 1; i <= k; i++) {
            logTerm += Math.log((n - i + 1) / i);
            cdf += Math.exp(logTerm);
        }
    }
    return Math.min(1, 2 * cdf);
}
49
/**
 * McNemar comparison over two matched per-instance result sets (the same slate
 * scored before and after).
 *
 * Results are matched by `instance_id`. A pair is formed only when the instance
 * is present in `before` and scorable (`passed !== null` and not `unscorable`)
 * on both sides; everything else — missing on either side or unscorable on
 * either side — is tallied in `excluded` and never treated as a flip.
 *
 * @param before - Per-instance results for the earlier checkpoint.
 * @param after - Per-instance results for the later checkpoint.
 * @param opts - Optional settings; `alpha` is the significance level (default 0.05).
 * @returns Pair counts, the exact two-sided p-value over the discordant pairs,
 *   and a verdict that is 'trustworthy' only when p < alpha AND improvements
 *   outnumber regressions.
 */
export function comparePaired(before, after, opts = {}) {
    const significance = opts.alpha ?? 0.05;
    const isScorable = (result) => result.passed !== null && !result.unscorable;
    // Index the "before" arm; a repeated id keeps its last occurrence.
    const priorById = new Map();
    for (const prior of before) {
        priorById.set(prior.instance_id, prior);
    }
    const tally = { improved: 0, regressed: 0, concordantPass: 0, concordantFail: 0 };
    const visitedIds = new Set();
    let pairs = 0;
    let excluded = 0;
    for (const current of after) {
        visitedIds.add(current.instance_id);
        const prior = priorById.get(current.instance_id);
        const matched = Boolean(prior) && isScorable(prior) && isScorable(current);
        if (!matched) {
            excluded += 1;
            continue;
        }
        pairs += 1;
        const passedBefore = prior.passed === true;
        const passedAfter = current.passed === true;
        if (passedBefore === passedAfter) {
            // No flip: both arms agree.
            tally[passedAfter ? 'concordantPass' : 'concordantFail'] += 1;
        }
        else {
            // Discordant pair: direction is decided by the "after" outcome.
            tally[passedAfter ? 'improved' : 'regressed'] += 1;
        }
    }
    // Instances that only exist in the "before" arm are excluded as well.
    excluded += before.filter((prior) => !visitedIds.has(prior.instance_id)).length;
    const { improved, regressed, concordantPass, concordantFail } = tally;
    const pValue = mcnemarExact(improved, regressed);
    const isTrustworthy = pValue < significance && improved > regressed;
    const verdict = isTrustworthy ? 'trustworthy' : 'within-noise';
    return { pairs, improved, regressed, concordantPass, concordantFail, excluded, pValue, verdict };
}
93
+ //# sourceMappingURL=paired.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"paired.js","sourceRoot":"","sources":["../../src/eval/paired.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;GAyBG;AA6BH;;;;;;;GAOG;AACH,MAAM,UAAU,YAAY,CAAC,CAAS,EAAE,CAAS;IAC/C,MAAM,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC;IAChB,IAAI,CAAC,KAAK,CAAC;QAAE,OAAO,CAAC,CAAC;IACtB,MAAM,CAAC,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC;IACzB,mFAAmF;IACnF,IAAI,IAAI,GAAG,IAAI,CAAC,GAAG,CAAC,GAAG,EAAE,CAAC,CAAC,CAAC,CAAC,QAAQ;IACrC,IAAI,GAAG,GAAG,IAAI,CAAC;IACf,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,IAAI,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC;QAC5B,IAAI,GAAG,CAAC,IAAI,GAAG,CAAC,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC;QAChC,GAAG,IAAI,IAAI,CAAC;IACd,CAAC;IACD,OAAO,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,CAAC,GAAG,GAAG,CAAC,CAAC;AAC9B,CAAC;AAED;;;;;GAKG;AACH,MAAM,UAAU,aAAa,CAC3B,MAA8B,EAC9B,KAA6B,EAC7B,OAA2B,EAAE;IAE7B,MAAM,KAAK,GAAG,IAAI,CAAC,KAAK,IAAI,IAAI,CAAC;IACjC,MAAM,QAAQ,GAAG,CAAC,CAAc,EAAW,EAAE,CAAC,CAAC,CAAC,CAAC,UAAU,IAAI,CAAC,CAAC,MAAM,KAAK,IAAI,CAAC;IACjF,MAAM,UAAU,GAAG,IAAI,GAAG,CAAC,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,WAAW,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC;IAElE,IAAI,QAAQ,GAAG,CAAC,CAAC;IACjB,IAAI,SAAS,GAAG,CAAC,CAAC;IAClB,IAAI,cAAc,GAAG,CAAC,CAAC;IACvB,IAAI,cAAc,GAAG,CAAC,CAAC;IACvB,IAAI,KAAK,GAAG,CAAC,CAAC;IACd,IAAI,QAAQ,GAAG,CAAC,CAAC;IAEjB,MAAM,IAAI,GAAG,IAAI,GAAG,EAAU,CAAC;IAC/B,KAAK,MAAM,CAAC,IAAI,KAAK,EAAE,CAAC;QACtB,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC,WAAW,CAAC,CAAC;QACxB,MAAM,CAAC,GAAG,UAAU,CAAC,GAAG,CAAC,CAAC,CAAC,WAAW,CAAC,CAAC;QACxC,IAAI,CAAC,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC,CAAC,EAAE,CAAC;YACvC,QAAQ,EAAE,CAAC;YACX,SAAS;QACX,CAAC;QACD,KAAK,EAAE,CAAC;QACR,MAAM,OAAO,GAAG,CAAC,CAAC,MAAM,KAAK,IAAI,CAAC;QAClC,MAAM,OAAO,GAAG,CAAC,CAAC,MAAM,KAAK,IAAI,CAAC;QAClC,IAAI,CAAC,OAAO,IAAI,OAAO;YAAE,QAAQ,EAAE,CAAC;aAC/B,IAAI,OAAO,IAAI,CAAC,OAAO;YAAE,SAAS,EAAE,CAAC;aACrC,IAAI,OAAO,IAAI,OAAO;YAAE,cAAc,EAAE,CAAC;;YACzC,cAAc,EAAE,CAAC;IACxB,CAAC;IACD,8DAA8D;IAC9D,KAAK,MAAM,CAAC,IAAI,MAAM;QAAE,IAAI,CAAC,IAAI,CAAC,GAAG,C
AAC,CAAC,CAAC,WAAW,CAAC;YAAE,QAAQ,EAAE,CAAC;IAEjE,MAAM,MAAM,GAAG,YAAY,CAAC,QAAQ,EAAE,SAAS,CAAC,CAAC;IACjD,MAAM,OAAO,GACX,MAAM,GAAG,KAAK,IAAI,QAAQ,GAAG,SAAS,CAAC,CAAC,CAAC,aAAa,CAAC,CAAC,CAAC,cAAc,CAAC;IAE1E,OAAO,EAAE,KAAK,EAAE,QAAQ,EAAE,SAAS,EAAE,cAAc,EAAE,cAAc,EAAE,QAAQ,EAAE,MAAM,EAAE,OAAO,EAAE,CAAC;AACnG,CAAC"}
@@ -0,0 +1,35 @@
1
+ /**
2
+ * Resolve a held-out slate's `instance_id`s into `SweRebenchV2Task` objects
3
+ * for the `jinn eval` orchestrator (issue #818, AC#1).
4
+ *
5
+ * A slate stores only `instance_id`s (`_swe-rebench-v2-held-out-slate.ts`). The
6
+ * evaluator grades against the full HuggingFace row, fetched by
7
+ * `(hf_dataset, hf_split, instance_id)`. The `HfRow` does NOT echo
8
+ * `hf_dataset`/`hf_split` (see `swe-rebench-v2-evaluator/index.ts`), so the
9
+ * caller supplies the slate-level dataset+split as args; the resolver verifies
10
+ * each id exists by fetching its row and returns the `{ task, row }` pair so the
11
+ * orchestrator reuses the row at grade time (avoids a second fetch).
12
+ *
13
+ * No retry logic here — `HttpHfFetcher` owns retries. A fetcher throw for any id
14
+ * propagates loudly (a missing slate instance is a hard error, never a silent
15
+ * drop).
16
+ */
17
+ import { type SweRebenchV2Task } from '../vendor/@jinn-network/sdk/dist/solvernets/swe-rebench-v2.js';
18
+ import type { HfFetcher, HfRow } from '../harnesses/impls/swe-rebench-v2-evaluator/index.js';
19
+ import type { PoolTask } from '../solver-types/_swe-rebench-v2-pool.js';
20
+ export interface ResolvedSlateTask {
21
+ task: SweRebenchV2Task; // task object handed to the harness + evaluator for this instance
22
+ row: HfRow; // the fetched HuggingFace row, returned so grading can reuse it without a second fetch
23
+ }
24
+ export declare function resolveSlateTasks(args: {
25
+ /**
26
+ * The pool tasks for this slate group, each carrying the real
27
+ * `problem_statement` / `base_commit` / `language` the agent run needs to
28
+ * solve the instance (the slate stores only ids; the pool is the source of
29
+ * these fields).
30
+ */
31
+ poolTasks: PoolTask[];
32
+ hf_dataset: string;
33
+ hf_split: string;
34
+ fetcher: HfFetcher;
35
+ }): Promise<ResolvedSlateTask[]>;
@@ -0,0 +1,56 @@
1
+ /**
2
+ * Resolve a held-out slate's `instance_id`s into `SweRebenchV2Task` objects
3
+ * for the `jinn eval` orchestrator (issue #818, AC#1).
4
+ *
5
+ * A slate stores only `instance_id`s (`_swe-rebench-v2-held-out-slate.ts`). The
6
+ * evaluator grades against the full HuggingFace row, fetched by
7
+ * `(hf_dataset, hf_split, instance_id)`. The `HfRow` does NOT echo
8
+ * `hf_dataset`/`hf_split` (see `swe-rebench-v2-evaluator/index.ts`), so the
9
+ * caller supplies the slate-level dataset+split as args; the resolver verifies
10
+ * each id exists by fetching its row and returns the `{ task, row }` pair so the
11
+ * orchestrator reuses the row at grade time (avoids a second fetch).
12
+ *
13
+ * No retry logic here — `HttpHfFetcher` owns retries. A fetcher throw for any id
14
+ * propagates loudly (a missing slate instance is a hard error, never a silent
15
+ * drop).
16
+ */
17
+ import { SweRebenchV2TaskSchema } from '../vendor/@jinn-network/sdk/dist/solvernets/swe-rebench-v2.js';
18
/**
 * Fetch the dataset row for every pool task and pair it with the task object
 * built from it.
 *
 * Pool tasks are processed on a sorted copy (by `instance_id`, via
 * `localeCompare`), so the caller's array is not reordered. Rows are fetched
 * strictly one after another; a rejection from the fetcher is not caught here
 * and therefore rejects the whole call.
 *
 * @param args - `{ poolTasks, hf_dataset, hf_split, fetcher }`.
 * @returns Promise of `{ task, row }` entries in sorted `instance_id` order.
 */
export async function resolveSlateTasks(args) {
    const byInstanceId = (left, right) => left.instance_id.localeCompare(right.instance_id);
    const ordered = Array.from(args.poolTasks).sort(byInstanceId);
    const resolved = [];
    for (let index = 0; index < ordered.length; index += 1) {
        const poolTask = ordered[index];
        const rowKey = {
            hf_dataset: args.hf_dataset,
            hf_split: args.hf_split,
            instance_id: poolTask.instance_id,
        };
        const row = await args.fetcher.fetchTaskRow(rowKey);
        const task = buildTask(poolTask, args.hf_dataset, args.hf_split, row);
        resolved.push({ task, row });
    }
    return resolved;
}
31
/**
 * Assemble the task object for one slate instance from its pool task, the
 * slate-level dataset/split, and the fetched row.
 *
 * Field sources, as implemented here:
 *  - `instance_id`, `base_commit`, `problem_statement`, `interface`, `language`
 *    come from the pool task;
 *  - `repo` comes from the fetched row;
 *  - `hf_dataset` / `hf_split` are passed through unchanged.
 *
 * Fallbacks for absent (null/undefined) pool fields: `base_commit` becomes 40
 * zero characters, `problem_statement` and `interface` become the empty string.
 * `language` falls back to 'python' whenever the schema's language validator
 * rejects the pool value. `deadline_unix` is the constant 1 and `round_month`
 * is `hf_split` with its FIRST underscore replaced by a hyphen.
 */
function buildTask(poolTask, hf_dataset, hf_split, row) {
    const languageCheck = SweRebenchV2TaskSchema.shape.language.safeParse(poolTask.language);
    let language = 'python';
    if (languageCheck.success) {
        language = languageCheck.data;
    }
    const { instance_id, base_commit, problem_statement } = poolTask;
    const roundMonth = hf_split.replace('_', '-');
    return {
        schemaVersion: 'swe-rebench-v2.v1',
        instance_id,
        repo: row.repo,
        base_commit: base_commit ?? '0'.repeat(40),
        language,
        problem_statement: problem_statement ?? '',
        // `interface` is a reserved word in strict mode, so it is read in place.
        interface: poolTask.interface ?? '',
        hf_dataset,
        hf_split,
        deadline_unix: 1,
        round_month: roundMonth,
    };
}
56
+ //# sourceMappingURL=resolve-slate-tasks.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"resolve-slate-tasks.js","sourceRoot":"","sources":["../../src/eval/resolve-slate-tasks.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;GAeG;AAEH,OAAO,EAAE,sBAAsB,EAAyB,MAAM,6CAA6C,CAAC;AAS5G,MAAM,CAAC,KAAK,UAAU,iBAAiB,CAAC,IAWvC;IACC,MAAM,KAAK,GAAG,CAAC,GAAG,IAAI,CAAC,SAAS,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,WAAW,CAAC,aAAa,CAAC,CAAC,CAAC,WAAW,CAAC,CAAC,CAAC;IAC7F,MAAM,GAAG,GAAwB,EAAE,CAAC;IACpC,KAAK,MAAM,QAAQ,IAAI,KAAK,EAAE,CAAC;QAC7B,MAAM,GAAG,GAAG,MAAM,IAAI,CAAC,OAAO,CAAC,YAAY,CAAC;YAC1C,UAAU,EAAE,IAAI,CAAC,UAAU;YAC3B,QAAQ,EAAE,IAAI,CAAC,QAAQ;YACvB,WAAW,EAAE,QAAQ,CAAC,WAAW;SAClC,CAAC,CAAC;QACH,GAAG,CAAC,IAAI,CAAC,EAAE,IAAI,EAAE,SAAS,CAAC,QAAQ,EAAE,IAAI,CAAC,UAAU,EAAE,IAAI,CAAC,QAAQ,EAAE,GAAG,CAAC,EAAE,GAAG,EAAE,CAAC,CAAC;IACpF,CAAC;IACD,OAAO,GAAG,CAAC;AACb,CAAC;AAED;;;;;;;;GAQG;AACH,SAAS,SAAS,CAChB,QAAkB,EAClB,UAAkB,EAClB,QAAgB,EAChB,GAAU;IAEV,MAAM,QAAQ,GAAG,sBAAsB,CAAC,KAAK,CAAC,QAAQ,CAAC,SAAS,CAAC,QAAQ,CAAC,QAAQ,CAAC,CAAC;IACpF,OAAO;QACL,aAAa,EAAE,mBAAmB;QAClC,WAAW,EAAE,QAAQ,CAAC,WAAW;QACjC,IAAI,EAAE,GAAG,CAAC,IAAI;QACd,WAAW,EAAE,QAAQ,CAAC,WAAW,IAAI,GAAG,CAAC,MAAM,CAAC,EAAE,CAAC;QACnD,QAAQ,EAAE,QAAQ,CAAC,OAAO,CAAC,CAAC,CAAC,QAAQ,CAAC,IAAI,CAAC,CAAC,CAAC,QAAQ;QACrD,iBAAiB,EAAE,QAAQ,CAAC,iBAAiB,IAAI,EAAE;QACnD,SAAS,EAAE,QAAQ,CAAC,SAAS,IAAI,EAAE;QACnC,UAAU;QACV,QAAQ;QACR,aAAa,EAAE,CAAC;QAChB,WAAW,EAAE,QAAQ,CAAC,OAAO,CAAC,GAAG,EAAE,GAAG,CAAC;KACxC,CAAC;AACJ,CAAC"}
@@ -0,0 +1,22 @@
1
+ /**
2
+ * Held-out screening's authoritative "already trained-on" exclusion source (#986).
3
+ *
4
+ * Returns the set of swe-rebench-v2 `instance_id`s that have been ATTEMPTED on
5
+ * the network for a SolverNet — any verdict envelope, passed OR failed,
6
+ * cross-operator. An attempted instance was executed by an operator, so the
7
+ * learner trained on it; holding it out later would make a trained-checkpoint
8
+ * pass count as memorization, not generalization.
9
+ *
10
+ * Why the indexer and not the local generator-state: the local ledger only
11
+ * reflects THIS box's posting and can be stale (a different active generator —
12
+ * e.g. a hosted operator — posts to its own ledger). The indexer's
13
+ * `verdictEnvelopeMeta` is the cross-operator, current record. (The on-chain
14
+ * task/attempt tables carry no instance_id; only the indexer's IPFS enrichment
15
+ * resolves it — same backing as `DiscoveryAPI.getInstanceSuccessCounts`, minus
16
+ * the `actualPassed: true` filter so failed attempts count too.)
17
+ *
18
+ * Throws on indexer failure — callers MUST abort rather than screen against an
19
+ * unknown attempted set, because a missing exclusion can silently contaminate
20
+ * the exam (the whole point of held-out discipline).
21
+ */
22
+ export declare function fetchAttemptedInstanceIds(discoveryUrl: string, manifestCid: string, fetchImpl?: typeof fetch): Promise<Set<string>>;
@@ -0,0 +1,71 @@
1
+ /**
2
+ * Held-out screening's authoritative "already trained-on" exclusion source (#986).
3
+ *
4
+ * Returns the set of swe-rebench-v2 `instance_id`s that have been ATTEMPTED on
5
+ * the network for a SolverNet — any verdict envelope, passed OR failed,
6
+ * cross-operator. An attempted instance was executed by an operator, so the
7
+ * learner trained on it; holding it out later would make a trained-checkpoint
8
+ * pass count as memorization, not generalization.
9
+ *
10
+ * Why the indexer and not the local generator-state: the local ledger only
11
+ * reflects THIS box's posting and can be stale (a different active generator —
12
+ * e.g. a hosted operator — posts to its own ledger). The indexer's
13
+ * `verdictEnvelopeMeta` is the cross-operator, current record. (The on-chain
14
+ * task/attempt tables carry no instance_id; only the indexer's IPFS enrichment
15
+ * resolves it — same backing as `DiscoveryAPI.getInstanceSuccessCounts`, minus
16
+ * the `actualPassed: true` filter so failed attempts count too.)
17
+ *
18
+ * Throws on indexer failure — callers MUST abort rather than screen against an
19
+ * unknown attempted set, because a missing exclusion can silently contaminate
20
+ * the exam (the whole point of held-out discipline).
21
+ */
22
+ const ATTEMPTED_QUERY = `
23
+ query InstanceAttempted($cid: String!, $limit: Int!, $after: String) {
24
+ verdictEnvelopeMetas(
25
+ where: {
26
+ solverNetManifestCid: $cid,
27
+ solverType_starts_with: "swe-rebench-v2",
28
+ enrichmentStatus: "ok",
29
+ instanceId_not: ""
30
+ },
31
+ limit: $limit,
32
+ after: $after,
33
+ orderBy: "enrichedAtBlock",
34
+ orderDirection: "asc"
35
+ ) {
36
+ items { instanceId }
37
+ pageInfo { hasNextPage endCursor }
38
+ }
39
+ }`;
40
/**
 * Page through the indexer's GraphQL endpoint and collect every non-empty
 * `instanceId` returned by `ATTEMPTED_QUERY` for the given manifest.
 *
 * The result is only ever returned when it is known to be COMPLETE. Any
 * situation that would leave the set partial or unknown throws instead, so a
 * caller can never screen against a silently truncated exclusion list:
 *  - non-OK HTTP status;
 *  - a GraphQL `errors` payload;
 *  - a response without the `verdictEnvelopeMetas` connection;
 *  - `hasNextPage` reported without an `endCursor` to continue from;
 *  - more pages remaining after the page cap has been reached.
 *
 * @param {string} discoveryUrl - Indexer base URL; `/graphql` is appended
 *   unless the URL already ends with it (one trailing slash is trimmed).
 * @param {string} manifestCid - Value bound to the query's `$cid` variable.
 * @param {typeof fetch} [fetchImpl=fetch] - Fetch implementation (injectable for tests).
 * @returns {Promise<Set<string>>} The de-duplicated set of instance ids.
 * @throws {Error} On any of the failure conditions listed above.
 */
export async function fetchAttemptedInstanceIds(discoveryUrl, manifestCid, fetchImpl = fetch) {
    const gqlUrl = discoveryUrl.endsWith('/graphql') ? discoveryUrl : `${discoveryUrl.replace(/\/$/, '')}/graphql`;
    const ids = new Set();
    let cursor = null;
    const MAX_PAGES = 20;
    const PAGE_LIMIT = 1000;
    for (let page = 0; page < MAX_PAGES; page++) {
        const res = await fetchImpl(gqlUrl, {
            method: 'POST',
            headers: { 'content-type': 'application/json' },
            body: JSON.stringify({ query: ATTEMPTED_QUERY, variables: { cid: manifestCid, limit: PAGE_LIMIT, after: cursor } }),
        });
        if (!res.ok) {
            throw new Error(`held-out screening: indexer attempted-ids query failed (HTTP ${res.status}) at ${gqlUrl}`);
        }
        const json = (await res.json());
        if (json.errors) {
            throw new Error(`held-out screening: indexer attempted-ids query errors: ${JSON.stringify(json.errors).slice(0, 300)}`);
        }
        const conn = json.data?.verdictEnvelopeMetas;
        if (!conn) {
            // No connection at all means the attempted set is unknown, not empty.
            throw new Error(`held-out screening: indexer attempted-ids response has no verdictEnvelopeMetas connection at ${gqlUrl}`);
        }
        for (const item of conn.items ?? []) {
            if (item.instanceId)
                ids.add(item.instanceId);
        }
        const pageInfo = conn.pageInfo;
        if (!pageInfo?.hasNextPage)
            return ids;
        if (!pageInfo.endCursor) {
            // More rows exist but there is no way to reach them.
            throw new Error(`held-out screening: indexer reported more attempted-ids pages but no endCursor at ${gqlUrl}`);
        }
        cursor = pageInfo.endCursor;
    }
    // Page cap hit with rows still outstanding: refuse to return a truncated set.
    throw new Error(`held-out screening: attempted-ids result truncated after ${MAX_PAGES} pages ` +
        `(${MAX_PAGES * PAGE_LIMIT} rows) at ${gqlUrl}; refusing to return an incomplete set`);
}
71
+ //# sourceMappingURL=screen-discovery.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"screen-discovery.js","sourceRoot":"","sources":["../../src/eval/screen-discovery.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;GAoBG;AAEH,MAAM,eAAe,GAAG;;;;;;;;;;;;;;;;;EAiBtB,CAAC;AAYH,MAAM,CAAC,KAAK,UAAU,yBAAyB,CAC7C,YAAoB,EACpB,WAAmB,EACnB,YAA0B,KAAK;IAE/B,MAAM,MAAM,GAAG,YAAY,CAAC,QAAQ,CAAC,UAAU,CAAC,CAAC,CAAC,CAAC,YAAY,CAAC,CAAC,CAAC,GAAG,YAAY,CAAC,OAAO,CAAC,KAAK,EAAE,EAAE,CAAC,UAAU,CAAC;IAC/G,MAAM,GAAG,GAAG,IAAI,GAAG,EAAU,CAAC;IAC9B,IAAI,MAAM,GAAkB,IAAI,CAAC;IACjC,MAAM,SAAS,GAAG,EAAE,CAAC;IACrB,MAAM,UAAU,GAAG,IAAI,CAAC;IAExB,KAAK,IAAI,IAAI,GAAG,CAAC,EAAE,IAAI,GAAG,SAAS,EAAE,IAAI,EAAE,EAAE,CAAC;QAC5C,MAAM,GAAG,GAAG,MAAM,SAAS,CAAC,MAAM,EAAE;YAClC,MAAM,EAAE,MAAM;YACd,OAAO,EAAE,EAAE,cAAc,EAAE,kBAAkB,EAAE;YAC/C,IAAI,EAAE,IAAI,CAAC,SAAS,CAAC,EAAE,KAAK,EAAE,eAAe,EAAE,SAAS,EAAE,EAAE,GAAG,EAAE,WAAW,EAAE,KAAK,EAAE,UAAU,EAAE,KAAK,EAAE,MAAM,EAAE,EAAE,CAAC;SACpH,CAAC,CAAC;QACH,IAAI,CAAC,GAAG,CAAC,EAAE,EAAE,CAAC;YACZ,MAAM,IAAI,KAAK,CAAC,gEAAgE,GAAG,CAAC,MAAM,QAAQ,MAAM,EAAE,CAAC,CAAC;QAC9G,CAAC;QACD,MAAM,IAAI,GAAG,CAAC,MAAM,GAAG,CAAC,IAAI,EAAE,CAAkB,CAAC;QACjD,IAAI,IAAI,CAAC,MAAM,EAAE,CAAC;YAChB,MAAM,IAAI,KAAK,CAAC,2DAA2D,IAAI,CAAC,SAAS,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC,KAAK,CAAC,CAAC,EAAE,GAAG,CAAC,EAAE,CAAC,CAAC;QAC1H,CAAC;QACD,MAAM,IAAI,GAAG,IAAI,CAAC,IAAI,EAAE,oBAAoB,CAAC;QAC7C,KAAK,MAAM,IAAI,IAAI,IAAI,EAAE,KAAK,IAAI,EAAE,EAAE,CAAC;YACrC,IAAI,IAAI,CAAC,UAAU;gBAAE,GAAG,CAAC,GAAG,CAAC,IAAI,CAAC,UAAU,CAAC,CAAC;QAChD,CAAC;QACD,MAAM,QAAQ,GAAG,IAAI,EAAE,QAAQ,CAAC;QAChC,IAAI,CAAC,QAAQ,EAAE,WAAW,IAAI,CAAC,QAAQ,CAAC,SAAS;YAAE,MAAM;QACzD,MAAM,GAAG,QAAQ,CAAC,SAAS,CAAC;IAC9B,CAAC;IAED,OAAO,GAAG,CAAC;AACb,CAAC"}