@jinn-network/client 0.1.7 → 0.1.8-canary.09a3b2f6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (438) hide show
  1. package/README.md +67 -1
  2. package/dist/adapters/mech/adapter.d.ts +39 -2
  3. package/dist/adapters/mech/adapter.js +178 -20
  4. package/dist/adapters/mech/adapter.js.map +1 -1
  5. package/dist/adapters/mech/contracts.d.ts +22 -1
  6. package/dist/adapters/mech/contracts.js +96 -52
  7. package/dist/adapters/mech/contracts.js.map +1 -1
  8. package/dist/adapters/mech/safe-revert.d.ts +4 -0
  9. package/dist/adapters/mech/safe-revert.js +5 -1
  10. package/dist/adapters/mech/safe-revert.js.map +1 -1
  11. package/dist/adapters/mech/safe.d.ts +1 -1
  12. package/dist/adapters/mech/safe.js +10 -4
  13. package/dist/adapters/mech/safe.js.map +1 -1
  14. package/dist/adapters/mech/types.d.ts +6 -1
  15. package/dist/adapters/mech/types.js.map +1 -1
  16. package/dist/agent/operator-claude.js +8 -0
  17. package/dist/agent/operator-claude.js.map +1 -1
  18. package/dist/api/activity-events-endpoint.d.ts +14 -0
  19. package/dist/api/activity-events-endpoint.js +59 -0
  20. package/dist/api/activity-events-endpoint.js.map +1 -0
  21. package/dist/api/bootstrap-endpoint.d.ts +1 -2
  22. package/dist/api/bootstrap-endpoint.js +42 -24
  23. package/dist/api/bootstrap-endpoint.js.map +1 -1
  24. package/dist/api/codex-doctor-endpoint.d.ts +22 -5
  25. package/dist/api/codex-doctor-endpoint.js +136 -17
  26. package/dist/api/codex-doctor-endpoint.js.map +1 -1
  27. package/dist/api/debug-report-endpoint.d.ts +27 -0
  28. package/dist/api/debug-report-endpoint.js +157 -0
  29. package/dist/api/debug-report-endpoint.js.map +1 -0
  30. package/dist/api/discovery-endpoint.d.ts +1 -0
  31. package/dist/api/discovery-endpoint.js +24 -0
  32. package/dist/api/discovery-endpoint.js.map +1 -1
  33. package/dist/api/fleet-build.d.ts +1 -7
  34. package/dist/api/fleet-build.js +0 -7
  35. package/dist/api/fleet-build.js.map +1 -1
  36. package/dist/api/gather-status.d.ts +39 -0
  37. package/dist/api/gather-status.js +181 -84
  38. package/dist/api/gather-status.js.map +1 -1
  39. package/dist/api/hermes-doctor-endpoint.d.ts +15 -7
  40. package/dist/api/hermes-doctor-endpoint.js +56 -19
  41. package/dist/api/hermes-doctor-endpoint.js.map +1 -1
  42. package/dist/api/launcher-status.d.ts +4 -2
  43. package/dist/api/launcher-status.js +11 -10
  44. package/dist/api/launcher-status.js.map +1 -1
  45. package/dist/api/launcher-tasks.d.ts +1 -1
  46. package/dist/api/launcher-tasks.js +12 -8
  47. package/dist/api/launcher-tasks.js.map +1 -1
  48. package/dist/api/loop-completion-build.d.ts +79 -0
  49. package/dist/api/loop-completion-build.js +155 -0
  50. package/dist/api/loop-completion-build.js.map +1 -0
  51. package/dist/api/operator-artifacts-endpoint.js +73 -6
  52. package/dist/api/operator-artifacts-endpoint.js.map +1 -1
  53. package/dist/api/portfolio-v0-build.d.ts +7 -1
  54. package/dist/api/portfolio-v0-build.js +6 -2
  55. package/dist/api/portfolio-v0-build.js.map +1 -1
  56. package/dist/api/prediction-v1-build.d.ts +6 -0
  57. package/dist/api/prediction-v1-build.js +3 -1
  58. package/dist/api/prediction-v1-build.js.map +1 -1
  59. package/dist/api/server.d.ts +17 -0
  60. package/dist/api/server.js +40 -1
  61. package/dist/api/server.js.map +1 -1
  62. package/dist/api/setup-endpoints.d.ts +13 -9
  63. package/dist/api/setup-endpoints.js +50 -173
  64. package/dist/api/setup-endpoints.js.map +1 -1
  65. package/dist/api/solvernets-endpoints.js +33 -63
  66. package/dist/api/solvernets-endpoints.js.map +1 -1
  67. package/dist/api/status-build.d.ts +140 -17
  68. package/dist/api/status-build.js +47 -34
  69. package/dist/api/status-build.js.map +1 -1
  70. package/dist/api/status-harness-rollup.d.ts +35 -0
  71. package/dist/api/status-harness-rollup.js +45 -0
  72. package/dist/api/status-harness-rollup.js.map +1 -0
  73. package/dist/api/status-rollup-build.d.ts +0 -4
  74. package/dist/api/status-rollup-build.js +0 -4
  75. package/dist/api/status-rollup-build.js.map +1 -1
  76. package/dist/api/task-runs-build.d.ts +8 -0
  77. package/dist/api/task-runs-build.js +5 -1
  78. package/dist/api/task-runs-build.js.map +1 -1
  79. package/dist/build-info.json +4 -4
  80. package/dist/build-meta.json +1 -1
  81. package/dist/captures/live-publisher.js +24 -4
  82. package/dist/captures/live-publisher.js.map +1 -1
  83. package/dist/captures/publish.d.ts +1 -1
  84. package/dist/chain-read-errors.d.ts +12 -0
  85. package/dist/chain-read-errors.js +26 -1
  86. package/dist/chain-read-errors.js.map +1 -1
  87. package/dist/cli/commands/codedigest-revert-check.d.ts +33 -0
  88. package/dist/cli/commands/codedigest-revert-check.js +253 -0
  89. package/dist/cli/commands/codedigest-revert-check.js.map +1 -0
  90. package/dist/cli/commands/doctor.d.ts +3 -0
  91. package/dist/cli/commands/doctor.js +35 -0
  92. package/dist/cli/commands/doctor.js.map +1 -1
  93. package/dist/cli/commands/eval.d.ts +76 -0
  94. package/dist/cli/commands/eval.js +401 -0
  95. package/dist/cli/commands/eval.js.map +1 -0
  96. package/dist/cli/commands/rewards.d.ts +2 -0
  97. package/dist/cli/commands/rewards.js +27 -0
  98. package/dist/cli/commands/rewards.js.map +1 -1
  99. package/dist/cli/commands/solver-nets.d.ts +1 -0
  100. package/dist/cli/commands/solver-nets.js +245 -22
  101. package/dist/cli/commands/solver-nets.js.map +1 -1
  102. package/dist/cli/commands/solver-plugins-block.d.ts +33 -0
  103. package/dist/cli/commands/solver-plugins-block.js +118 -0
  104. package/dist/cli/commands/solver-plugins-block.js.map +1 -0
  105. package/dist/cli/commands/solver-plugins-feedback.d.ts +72 -0
  106. package/dist/cli/commands/solver-plugins-feedback.js +262 -0
  107. package/dist/cli/commands/solver-plugins-feedback.js.map +1 -0
  108. package/dist/cli/commands/solver-plugins-read.d.ts +54 -0
  109. package/dist/cli/commands/solver-plugins-read.js +259 -0
  110. package/dist/cli/commands/solver-plugins-read.js.map +1 -0
  111. package/dist/cli/commands/solver-plugins.d.ts +35 -0
  112. package/dist/cli/commands/solver-plugins.js +399 -2
  113. package/dist/cli/commands/solver-plugins.js.map +1 -1
  114. package/dist/cli/commands/status.js +0 -1
  115. package/dist/cli/commands/status.js.map +1 -1
  116. package/dist/cli/commands/tasks.js +15 -2
  117. package/dist/cli/commands/tasks.js.map +1 -1
  118. package/dist/cli/index.js +4 -0
  119. package/dist/cli/index.js.map +1 -1
  120. package/dist/cli/task-native-readiness.d.ts +7 -0
  121. package/dist/cli/task-native-readiness.js +7 -5
  122. package/dist/cli/task-native-readiness.js.map +1 -1
  123. package/dist/config.d.ts +206 -232
  124. package/dist/config.js +289 -107
  125. package/dist/config.js.map +1 -1
  126. package/dist/daemon/ai-units-gate.d.ts +54 -0
  127. package/dist/daemon/ai-units-gate.js +83 -0
  128. package/dist/daemon/ai-units-gate.js.map +1 -0
  129. package/dist/daemon/creator.js +13 -0
  130. package/dist/daemon/creator.js.map +1 -1
  131. package/dist/daemon/daemon.d.ts +10 -0
  132. package/dist/daemon/daemon.js +205 -30
  133. package/dist/daemon/daemon.js.map +1 -1
  134. package/dist/daemon/eviction-loop.d.ts +7 -0
  135. package/dist/daemon/eviction-loop.js +16 -0
  136. package/dist/daemon/eviction-loop.js.map +1 -1
  137. package/dist/daemon/gate-logger.d.ts +9 -0
  138. package/dist/daemon/gate-logger.js +2 -0
  139. package/dist/daemon/gate-logger.js.map +1 -0
  140. package/dist/daemon/jinn-claim-loop.js +22 -4
  141. package/dist/daemon/jinn-claim-loop.js.map +1 -1
  142. package/dist/daemon/readiness-gate.d.ts +1 -4
  143. package/dist/daemon/readiness-gate.js.map +1 -1
  144. package/dist/daemon/spend-cap-gate.d.ts +40 -0
  145. package/dist/daemon/spend-cap-gate.js +46 -0
  146. package/dist/daemon/spend-cap-gate.js.map +1 -0
  147. package/dist/dashboard/assets/index-3quVQqik.js +167 -0
  148. package/dist/dashboard/assets/index-BVAWkLwY.css +1 -0
  149. package/dist/dashboard/index.html +2 -2
  150. package/dist/discovery/http.d.ts +7 -0
  151. package/dist/discovery/http.js +567 -24
  152. package/dist/discovery/http.js.map +1 -1
  153. package/dist/discovery/onchain.js +197 -5
  154. package/dist/discovery/onchain.js.map +1 -1
  155. package/dist/discovery/types.d.ts +235 -0
  156. package/dist/discovery/types.js +40 -0
  157. package/dist/discovery/types.js.map +1 -1
  158. package/dist/discovery/with-fallback.js +41 -0
  159. package/dist/discovery/with-fallback.js.map +1 -1
  160. package/dist/earning/bootstrap.d.ts +31 -3
  161. package/dist/earning/bootstrap.js +94 -22
  162. package/dist/earning/bootstrap.js.map +1 -1
  163. package/dist/earning/faucet.d.ts +1 -1
  164. package/dist/earning/faucet.js +2 -2
  165. package/dist/earning/faucet.js.map +1 -1
  166. package/dist/earning/safe-adapter.js +34 -11
  167. package/dist/earning/safe-adapter.js.map +1 -1
  168. package/dist/earning/types.d.ts +6 -6
  169. package/dist/earning/viem-clients.d.ts +11 -4
  170. package/dist/earning/viem-clients.js +14 -5
  171. package/dist/earning/viem-clients.js.map +1 -1
  172. package/dist/erc8004/identity.d.ts +19 -3
  173. package/dist/erc8004/identity.js +38 -11
  174. package/dist/erc8004/identity.js.map +1 -1
  175. package/dist/erc8004/index.d.ts +1 -1
  176. package/dist/erc8004/index.js.map +1 -1
  177. package/dist/eval/eval-harness-run.d.ts +63 -0
  178. package/dist/eval/eval-harness-run.js +123 -0
  179. package/dist/eval/eval-harness-run.js.map +1 -0
  180. package/dist/eval/orchestrator.d.ts +163 -0
  181. package/dist/eval/orchestrator.js +232 -0
  182. package/dist/eval/orchestrator.js.map +1 -0
  183. package/dist/eval/paired.d.ts +68 -0
  184. package/dist/eval/paired.js +93 -0
  185. package/dist/eval/paired.js.map +1 -0
  186. package/dist/eval/resolve-slate-tasks.d.ts +35 -0
  187. package/dist/eval/resolve-slate-tasks.js +56 -0
  188. package/dist/eval/resolve-slate-tasks.js.map +1 -0
  189. package/dist/eval/screen-discovery.d.ts +22 -0
  190. package/dist/eval/screen-discovery.js +71 -0
  191. package/dist/eval/screen-discovery.js.map +1 -0
  192. package/dist/eval/screen-progress.d.ts +41 -0
  193. package/dist/eval/screen-progress.js +60 -0
  194. package/dist/eval/screen-progress.js.map +1 -0
  195. package/dist/eval/screen-runner.d.ts +30 -0
  196. package/dist/eval/screen-runner.js +289 -0
  197. package/dist/eval/screen-runner.js.map +1 -0
  198. package/dist/eval/screen.d.ts +107 -0
  199. package/dist/eval/screen.js +159 -0
  200. package/dist/eval/screen.js.map +1 -0
  201. package/dist/eval/slope.d.ts +29 -0
  202. package/dist/eval/slope.js +46 -0
  203. package/dist/eval/slope.js.map +1 -0
  204. package/dist/eval/train-sequence.d.ts +35 -0
  205. package/dist/eval/train-sequence.js +59 -0
  206. package/dist/eval/train-sequence.js.map +1 -0
  207. package/dist/eval/wilson.d.ts +45 -0
  208. package/dist/eval/wilson.js +48 -0
  209. package/dist/eval/wilson.js.map +1 -0
  210. package/dist/events/types.d.ts +2 -2
  211. package/dist/harnesses/cost-estimates.d.ts +10 -31
  212. package/dist/harnesses/cost-estimates.js +11 -43
  213. package/dist/harnesses/cost-estimates.js.map +1 -1
  214. package/dist/harnesses/engine/canonical-json.js +5 -3
  215. package/dist/harnesses/engine/canonical-json.js.map +1 -1
  216. package/dist/harnesses/engine/engine.d.ts +37 -4
  217. package/dist/harnesses/engine/engine.js +151 -20
  218. package/dist/harnesses/engine/engine.js.map +1 -1
  219. package/dist/harnesses/engine/persistence.d.ts +38 -4
  220. package/dist/harnesses/engine/persistence.js +71 -6
  221. package/dist/harnesses/engine/persistence.js.map +1 -1
  222. package/dist/harnesses/engine/state.d.ts +9 -0
  223. package/dist/harnesses/engine/state.js +23 -10
  224. package/dist/harnesses/engine/state.js.map +1 -1
  225. package/dist/harnesses/impls/hermes-agent/adapter.d.ts +2 -0
  226. package/dist/harnesses/impls/hermes-agent/adapter.js +8 -5
  227. package/dist/harnesses/impls/hermes-agent/adapter.js.map +1 -1
  228. package/dist/harnesses/impls/hermes-agent/bootstrap.d.ts +1 -0
  229. package/dist/harnesses/impls/hermes-agent/bootstrap.js +10 -3
  230. package/dist/harnesses/impls/hermes-agent/bootstrap.js.map +1 -1
  231. package/dist/harnesses/impls/hermes-agent/config-builder.d.ts +1 -1
  232. package/dist/harnesses/impls/hermes-agent/config-builder.js +4 -2
  233. package/dist/harnesses/impls/hermes-agent/config-builder.js.map +1 -1
  234. package/dist/harnesses/impls/hermes-agent/harness.d.ts +31 -3
  235. package/dist/harnesses/impls/hermes-agent/harness.js +84 -7
  236. package/dist/harnesses/impls/hermes-agent/harness.js.map +1 -1
  237. package/dist/harnesses/impls/hermes-agent/prompt.d.ts +6 -6
  238. package/dist/harnesses/impls/hermes-agent/prompt.js +6 -6
  239. package/dist/harnesses/impls/index.d.ts +2 -0
  240. package/dist/harnesses/impls/index.js +2 -0
  241. package/dist/harnesses/impls/index.js.map +1 -1
  242. package/dist/harnesses/impls/learner/adapters/claude-code.d.ts +17 -0
  243. package/dist/harnesses/impls/learner/adapters/claude-code.js +118 -14
  244. package/dist/harnesses/impls/learner/adapters/claude-code.js.map +1 -1
  245. package/dist/harnesses/impls/learner/adapters/codex-code.d.ts +9 -0
  246. package/dist/harnesses/impls/learner/adapters/codex-code.js +30 -8
  247. package/dist/harnesses/impls/learner/adapters/codex-code.js.map +1 -1
  248. package/dist/harnesses/impls/learner/harness.d.ts +41 -1
  249. package/dist/harnesses/impls/learner/harness.js +78 -4
  250. package/dist/harnesses/impls/learner/harness.js.map +1 -1
  251. package/dist/harnesses/impls/learner/harvest.d.ts +3 -1
  252. package/dist/harnesses/impls/learner/harvest.js +30 -6
  253. package/dist/harnesses/impls/learner/harvest.js.map +1 -1
  254. package/dist/harnesses/impls/learner/plugin-path.js +1 -0
  255. package/dist/harnesses/impls/learner/plugin-path.js.map +1 -1
  256. package/dist/harnesses/impls/learner/restoration-patch.d.ts +2 -2
  257. package/dist/harnesses/impls/learner/restoration-patch.js +25 -6
  258. package/dist/harnesses/impls/learner/restoration-patch.js.map +1 -1
  259. package/dist/harnesses/impls/swe-rebench-v2-evaluator/eval-runner.js +21 -1
  260. package/dist/harnesses/impls/swe-rebench-v2-evaluator/eval-runner.js.map +1 -1
  261. package/dist/harnesses/impls/swe-rebench-v2-evaluator/harness.js +3 -1
  262. package/dist/harnesses/impls/swe-rebench-v2-evaluator/harness.js.map +1 -1
  263. package/dist/harnesses/impls/swe-rebench-v2-evaluator/hf-fetcher.d.ts +74 -5
  264. package/dist/harnesses/impls/swe-rebench-v2-evaluator/hf-fetcher.js +103 -32
  265. package/dist/harnesses/impls/swe-rebench-v2-evaluator/hf-fetcher.js.map +1 -1
  266. package/dist/harnesses/impls/swe-rebench-v2-evaluator/index.d.ts +2 -2
  267. package/dist/harnesses/impls/swe-rebench-v2-evaluator/index.js +3 -1
  268. package/dist/harnesses/impls/swe-rebench-v2-evaluator/index.js.map +1 -1
  269. package/dist/harnesses/readiness-registry.d.ts +7 -0
  270. package/dist/harnesses/readiness-registry.js +9 -0
  271. package/dist/harnesses/readiness-registry.js.map +1 -1
  272. package/dist/harnesses/types.d.ts +14 -0
  273. package/dist/learner/revert-decision.d.ts +74 -0
  274. package/dist/learner/revert-decision.js +73 -0
  275. package/dist/learner/revert-decision.js.map +1 -0
  276. package/dist/learner/revert-stats.d.ts +38 -0
  277. package/dist/learner/revert-stats.js +86 -0
  278. package/dist/learner/revert-stats.js.map +1 -0
  279. package/dist/local-provider-url.d.ts +3 -0
  280. package/dist/local-provider-url.js +28 -0
  281. package/dist/local-provider-url.js.map +1 -0
  282. package/dist/main.js +199 -104
  283. package/dist/main.js.map +1 -1
  284. package/dist/mcp/get-codedigest-reward.d.ts +13 -0
  285. package/dist/mcp/get-codedigest-reward.js +23 -0
  286. package/dist/mcp/get-codedigest-reward.js.map +1 -0
  287. package/dist/mcp/server.js +23 -0
  288. package/dist/mcp/server.js.map +1 -1
  289. package/dist/observability/debug-report-assemble.d.ts +43 -0
  290. package/dist/observability/debug-report-assemble.js +80 -0
  291. package/dist/observability/debug-report-assemble.js.map +1 -0
  292. package/dist/observability/emit-event.d.ts +9 -2
  293. package/dist/observability/emit-event.js +36 -2
  294. package/dist/observability/emit-event.js.map +1 -1
  295. package/dist/observability/file-logger.d.ts +69 -0
  296. package/dist/observability/file-logger.js +177 -0
  297. package/dist/observability/file-logger.js.map +1 -0
  298. package/dist/observability/redact-secrets.d.ts +65 -0
  299. package/dist/observability/redact-secrets.js +300 -0
  300. package/dist/observability/redact-secrets.js.map +1 -0
  301. package/dist/observability/tar.d.ts +30 -0
  302. package/dist/observability/tar.js +102 -0
  303. package/dist/observability/tar.js.map +1 -0
  304. package/dist/plugins/learner/.claude-plugin/plugin.json +1 -1
  305. package/dist/plugins/learner/.codex-plugin/plugin.json +1 -1
  306. package/dist/plugins/learner/hooks/session-start +30 -1
  307. package/dist/plugins/learner/skills/learn/consolidator-prompt.md +22 -1
  308. package/dist/plugins/learner/skills/learn/promoter-prompt.md +72 -1
  309. package/dist/preflight/deployment-readiness.d.ts +147 -0
  310. package/dist/preflight/deployment-readiness.js +366 -0
  311. package/dist/preflight/deployment-readiness.js.map +1 -0
  312. package/dist/preflight/pidfile-liveness.d.ts +50 -0
  313. package/dist/preflight/pidfile-liveness.js +117 -0
  314. package/dist/preflight/pidfile-liveness.js.map +1 -0
  315. package/dist/preflight/rpc-network.d.ts +40 -0
  316. package/dist/preflight/rpc-network.js +67 -1
  317. package/dist/preflight/rpc-network.js.map +1 -1
  318. package/dist/rpc/transport.d.ts +145 -0
  319. package/dist/rpc/transport.js +319 -0
  320. package/dist/rpc/transport.js.map +1 -0
  321. package/dist/scripts/donation-consumption-acceptance.js +7 -28
  322. package/dist/scripts/donation-consumption-acceptance.js.map +1 -1
  323. package/dist/scripts/swe-rebench-v2-pytest-missing.json +16 -0
  324. package/dist/solver-nets/prediction-operator-ux.d.ts +1 -2
  325. package/dist/solver-nets/prediction-operator-ux.js +56 -53
  326. package/dist/solver-nets/prediction-operator-ux.js.map +1 -1
  327. package/dist/solver-nets/registry.d.ts +19 -1
  328. package/dist/solver-nets/registry.js +37 -24
  329. package/dist/solver-nets/registry.js.map +1 -1
  330. package/dist/solver-types/_swe-rebench-v2-held-out-slate.d.ts +76 -0
  331. package/dist/solver-types/_swe-rebench-v2-held-out-slate.js +156 -0
  332. package/dist/solver-types/_swe-rebench-v2-held-out-slate.js.map +1 -0
  333. package/dist/solver-types/_swe-rebench-v2-pool-recovery.d.ts +81 -0
  334. package/dist/solver-types/_swe-rebench-v2-pool-recovery.js +116 -0
  335. package/dist/solver-types/_swe-rebench-v2-pool-recovery.js.map +1 -0
  336. package/dist/solver-types/_swe-rebench-v2-pool.d.ts +9 -2
  337. package/dist/solver-types/_swe-rebench-v2-pool.js +15 -20
  338. package/dist/solver-types/_swe-rebench-v2-pool.js.map +1 -1
  339. package/dist/solver-types/_swe-rebench-v2-state.d.ts +24 -0
  340. package/dist/solver-types/_swe-rebench-v2-state.js +33 -0
  341. package/dist/solver-types/_swe-rebench-v2-state.js.map +1 -1
  342. package/dist/solver-types/_swe-rebench-v2-validated-pool.d.ts +116 -2
  343. package/dist/solver-types/_swe-rebench-v2-validated-pool.js +296 -21
  344. package/dist/solver-types/_swe-rebench-v2-validated-pool.js.map +1 -1
  345. package/dist/solver-types/slates/held-out-slate.swe-rebench-v2.v1.json +20 -0
  346. package/dist/solver-types/slates/held-out-slate.swe-rebench-v2.v2.json +19 -0
  347. package/dist/solver-types/slates/held-out-slate.swe-rebench-v2.v2.screening-report.json +628 -0
  348. package/dist/solver-types/solver-type.d.ts +8 -0
  349. package/dist/solver-types/swe-rebench-v2-auto.d.ts +20 -11
  350. package/dist/solver-types/swe-rebench-v2-auto.js +64 -19
  351. package/dist/solver-types/swe-rebench-v2-auto.js.map +1 -1
  352. package/dist/solver-types/swe-rebench-v2.d.ts +10 -2
  353. package/dist/solver-types/swe-rebench-v2.js +233 -13
  354. package/dist/solver-types/swe-rebench-v2.js.map +1 -1
  355. package/dist/solvernets/daemon-init.d.ts +1 -1
  356. package/dist/solvernets/daemon-init.js +19 -4
  357. package/dist/solvernets/daemon-init.js.map +1 -1
  358. package/dist/solvernets/launched-record-dispatcher.d.ts +7 -0
  359. package/dist/solvernets/launched-record-dispatcher.js +10 -4
  360. package/dist/solvernets/launched-record-dispatcher.js.map +1 -1
  361. package/dist/solvernets/registry-client-erc8004.js +40 -37
  362. package/dist/solvernets/registry-client-erc8004.js.map +1 -1
  363. package/dist/solvernets/registry-client.d.ts +6 -0
  364. package/dist/solvernets/store.d.ts +2 -2
  365. package/dist/solvernets/store.js +7 -2
  366. package/dist/solvernets/store.js.map +1 -1
  367. package/dist/spend/ai-units-config.d.ts +49 -0
  368. package/dist/spend/ai-units-config.js +34 -0
  369. package/dist/spend/ai-units-config.js.map +1 -0
  370. package/dist/spend/ai-units.d.ts +140 -0
  371. package/dist/spend/ai-units.js +229 -0
  372. package/dist/spend/ai-units.js.map +1 -0
  373. package/dist/spend/cost-surface-status.d.ts +12 -0
  374. package/dist/spend/cost-surface-status.js +24 -0
  375. package/dist/spend/cost-surface-status.js.map +1 -0
  376. package/dist/spend/credential.d.ts +39 -0
  377. package/dist/spend/credential.js +71 -0
  378. package/dist/spend/credential.js.map +1 -0
  379. package/dist/spend/daemon-config.d.ts +13 -0
  380. package/dist/spend/daemon-config.js +24 -0
  381. package/dist/spend/daemon-config.js.map +1 -0
  382. package/dist/spend/pricing.d.ts +16 -0
  383. package/dist/spend/pricing.js +26 -0
  384. package/dist/spend/pricing.js.map +1 -0
  385. package/dist/spend/record.d.ts +13 -0
  386. package/dist/spend/record.js +43 -0
  387. package/dist/spend/record.js.map +1 -0
  388. package/dist/spend/usage.d.ts +27 -0
  389. package/dist/spend/usage.js +113 -0
  390. package/dist/spend/usage.js.map +1 -0
  391. package/dist/store/store.d.ts +187 -0
  392. package/dist/store/store.js +467 -4
  393. package/dist/store/store.js.map +1 -1
  394. package/dist/trajectory/transcript-parsers/codex-session.d.ts +12 -6
  395. package/dist/trajectory/transcript-parsers/codex-session.js +114 -13
  396. package/dist/trajectory/transcript-parsers/codex-session.js.map +1 -1
  397. package/dist/trajectory/transcript-parsers/types.d.ts +8 -8
  398. package/dist/trajectory/transcript-session-dirs.d.ts +18 -0
  399. package/dist/trajectory/transcript-session-dirs.js +85 -0
  400. package/dist/trajectory/transcript-session-dirs.js.map +1 -0
  401. package/dist/trajectory/transcript-watcher.d.ts +20 -1
  402. package/dist/trajectory/transcript-watcher.js +108 -32
  403. package/dist/trajectory/transcript-watcher.js.map +1 -1
  404. package/dist/tx-retry.d.ts +25 -0
  405. package/dist/tx-retry.js +95 -7
  406. package/dist/tx-retry.js.map +1 -1
  407. package/dist/types/payloads/portfolio-v0.d.ts +3 -3
  408. package/dist/types/payloads/prediction-apy-v0.d.ts +3 -3
  409. package/dist/types/payloads/prediction-v0.d.ts +12 -12
  410. package/dist/vendor/@jinn-network/sdk/dist/payloads/swe-rebench-v2.d.ts +108 -1
  411. package/dist/vendor/@jinn-network/sdk/dist/payloads/swe-rebench-v2.js +25 -1
  412. package/dist/vendor/@jinn-network/sdk/dist/solvernets/swe-rebench-v2-held-out-slate.d.ts +65 -0
  413. package/dist/vendor/@jinn-network/sdk/dist/solvernets/swe-rebench-v2-held-out-slate.js +123 -0
  414. package/dist/vendor/@jinn-network/sdk/dist/solvernets/swe-rebench-v2.d.ts +2 -2
  415. package/dist/vendor/@jinn-network/sdk/dist/solvernets/swe-rebench-v2.js +1 -1
  416. package/dist/vendor/@jinn-network/sdk/package.json +4 -0
  417. package/docker-compose.yml +3 -2
  418. package/package.json +30 -18
  419. package/plugins/learner/.claude-plugin/plugin.json +1 -1
  420. package/plugins/learner/.codex-plugin/plugin.json +1 -1
  421. package/plugins/learner/hooks/session-start +30 -1
  422. package/plugins/learner/skills/learn/consolidator-prompt.md +22 -1
  423. package/plugins/learner/skills/learn/promoter-prompt.md +72 -1
  424. package/plugins/swe-rebench-v2-diffmin/README.md +10 -9
  425. package/plugins/swe-rebench-v2-diffmin/jinn.plugin.json +1 -1
  426. package/plugins/swe-rebench-v2-diffmin/skills/diffmin/SKILL.md +15 -10
  427. package/plugins/swe-rebench-v2-diffmin/skills/test-map/SKILL.md +10 -12
  428. package/plugins/swe-rebench-v2-runtime/.claude-plugin/plugin.json +1 -1
  429. package/plugins/swe-rebench-v2-runtime/.codex-plugin/plugin.json +3 -3
  430. package/plugins/swe-rebench-v2-runtime/README.md +6 -6
  431. package/plugins/swe-rebench-v2-runtime/hooks/hooks.json +16 -0
  432. package/plugins/swe-rebench-v2-runtime/hooks/session-start +74 -0
  433. package/plugins/swe-rebench-v2-runtime/jinn.plugin.json +2 -3
  434. package/plugins/swe-rebench-v2-runtime/skills/task/SKILL.md +81 -0
  435. package/dist/dashboard/assets/index-BUlE8F3Y.js +0 -330
  436. package/dist/dashboard/assets/index-blqc7eqq.css +0 -32
  437. package/plugins/swe-rebench-v2-runtime/skills/orient/SKILL.md +0 -29
  438. package/plugins/swe-rebench-v2-runtime/skills/plan/SKILL.md +0 -53
@@ -0,0 +1,123 @@
1
+ /**
2
+ * Daemon-faithful harness run for `jinn eval` (issue #818) and the #822 e2e.
3
+ *
4
+ * Live testing proved the eval's earlier hand-rolled harness invocation did NOT
5
+ * reproduce the daemon's agent runtime: it ran the agent with NO SolverNet
6
+ * runtime plugins, so the claude-code adapter passed no `--plugin-dir`, the
7
+ * bundled MCP server never loaded, and the agent had no `submit_typed_payload`
8
+ * tool — producing no gradeable patch and rendering every task unscorable. The
9
+ * daemon scores the same tasks because it builds the full HarnessContext with
10
+ * `solverPluginRoots` from the SolverNet's runtime plugins (engine.ts §runImpl).
11
+ *
12
+ * This module is the single source of truth for that daemon-equivalent context.
13
+ * It rebuilds the engine's HarnessContext MINUS persistence/on-chain wiring and
14
+ * runs the harness through the SAME freeze-fence the daemon uses. The CLI uses
15
+ * it now; the #822 e2e delegates to it next.
16
+ */
17
+ import { mkdtempSync, rmSync } from 'node:fs';
18
+ import { tmpdir } from 'node:os';
19
+ import { join } from 'node:path';
20
+ import { provisionWorkingDir } from '../harnesses/engine/packaging.js';
21
+ import { runHarnessWithFreezeFence } from '../daemon/freeze-fence.js';
22
+ import { TrajectoryCollector } from '../trajectory/index.js';
23
+ import { loadSolverNets, } from '../solver-nets/registry.js';
24
/**
 * Fallback wall-clock budget, in milliseconds, for one eval harness run.
 * Used only when the task does not carry its own window end (one hour).
 */
const EVAL_RUN_BUDGET_MS = 60 * 60 * 1000;
26
/**
 * Look up the runtime plugins of the joined SolverNet that serves `solverType`
 * in the given `role`.
 *
 * A registry is built with `loadSolverNets({ joinedSolverNets })` and the net
 * is selected with `forSolverType(solverType, role)`. Both failure modes throw
 * with an operator-actionable message rather than returning an empty list,
 * because (per the messages below) an agent run without these plugins has no
 * `submit_typed_payload` tool and cannot emit a gradeable patch.
 *
 * @param solverType - Solver type whose SolverNet should be resolved.
 * @param joinedSolverNets - Joined-SolverNet configuration, handed to `loadSolverNets` unchanged.
 * @param role - Role to match on the registry lookup; defaults to `'restoration'`.
 * @returns The matched SolverNet's `runtimePlugins` (never empty).
 * @throws Error when no SolverNet matches, or when the matched net has zero runtime plugins.
 */
export async function resolveRuntimePluginsForSolverType(solverType, joinedSolverNets, role = 'restoration') {
    const solverNets = await loadSolverNets({ joinedSolverNets });
    const matched = solverNets.forSolverType(solverType, role);

    // Guard 1: the operator has not joined any SolverNet for this solverType/role.
    if (!matched) {
        const notJoined = `no SolverNet for solverType ${solverType} (role ${role}): the eval runs the agent with the ` +
            `SAME runtime plugins the daemon would, but this operator has not joined a SolverNet for it. ` +
            `Join the SolverNet and install its plugins (e.g. set joinedSolverNets[<manifestCid>] in your ` +
            `config and run \`jinn harnesses enable\`), then re-run eval.`;
        throw new Error(notJoined);
    }

    // Guard 2: the net exists but its plugins were never installed.
    const plugins = matched.runtimePlugins;
    if (plugins.length === 0) {
        const noPlugins = `SolverNet ${matched.name} for solverType ${solverType} has no runtime plugins: the agent would run ` +
            `without the bundled MCP server (no submit_typed_payload) and produce no gradeable patch. ` +
            `Install the SolverNet's plugins (\`jinn harnesses enable\`) before running eval.`;
        throw new Error(noPlugins);
    }

    return plugins;
}
52
/**
 * Run a harness once for eval, building the full HarnessContext (runtime
 * plugins + `solverPluginRoots`) but with no persistence / on-chain wiring,
 * and routing the run through `runHarnessWithFreezeFence`.
 *
 * Owns the ephemeral working-dir lifecycle: the returned result references only
 * the patch string + codeDigest (nothing under `workingDir`), so the directory
 * is reclaimed in `finally` to avoid leaking repo clones across a full slate.
 *
 * The run is aborted (via `ctx.abort`) once the task window ends. The deadline
 * is `task.window.endTs` when present, otherwise now + `EVAL_RUN_BUDGET_MS`.
 *
 * @param args - Run inputs: `harness`, `task`, `solverType`, `runtimePlugins`,
 *   `implStateDir`, `mode`, plus optional `solverNetName` and `model`.
 * @returns `{ violation: { taskId } }` when the freeze-fence reports `ok: false`;
 *   otherwise `{ envelope: { executor: { mode, codeDigest } }, solution: { patch } }`.
 * @throws Error when the harness output carries no string `solutionPayload.patch`.
 */
export async function runHarnessForEval(args) {
    const { harness, task, solverType, runtimePlugins, implStateDir, mode } = args;
    const defaultEndTs = Date.now() + EVAL_RUN_BUDGET_MS;
    const windowEndTs = task.window?.endTs ?? defaultEndTs;
    // Fresh ephemeral scratch dir — SEPARATE from implStateDir so the harness's
    // repo clone + diff harvest never trips the freeze-fence (which, per the
    // original author's note, hashes implStateDir only).
    const workingDir = mkdtempSync(join(tmpdir(), 'jinn-eval-work-'));
    const abort = new AbortController();
    const msUntilEndTs = () => Math.max(0, windowEndTs - Date.now());
    // Node coerces any setTimeout delay above 2^31-1 ms (~24.8 days) to 1 ms,
    // which would abort a long-window task almost immediately. Arm the timer in
    // hops of at most that size and re-arm until the real deadline is reached.
    const MAX_TIMER_DELAY_MS = 2_147_483_647;
    let endTimer;
    const armEndTimer = () => {
        const remainingMs = msUntilEndTs();
        endTimer = remainingMs > MAX_TIMER_DELAY_MS
            ? setTimeout(armEndTimer, MAX_TIMER_DELAY_MS)
            : setTimeout(() => abort.abort(), remainingMs);
        // Don't keep the event loop alive on this timer (it's cleared in finally).
        endTimer.unref?.();
    };
    armEndTimer();
    try {
        provisionWorkingDir(workingDir, task);
        // Harness context without persistence/on-chain wiring. The load-bearing
        // fields are runtimePlugins + solverPluginRoots: per the module notes they
        // carry the bundled MCP server (submit_typed_payload).
        const ctx = {
            task,
            requestId: task.id ?? `eval-${solverType}`,
            taskCid: '',
            solverNet: {
                name: args.solverNetName ?? solverType,
                solverType,
                ...(args.model ? { model: args.model } : {}),
            },
            runtimePlugins,
            solverPluginRoots: runtimePlugins.map((plugin) => plugin.root),
            implStateDir,
            workingDir,
            log: () => {
                /* eval run: quiet harness logs */
            },
            abort: abort.signal,
            msUntilEndTs,
            trajectory: new TrajectoryCollector({ taskCid: '', runId: `eval-${task.id ?? solverType}` }),
            mode,
        };
        const fence = await runHarnessWithFreezeFence(harness, ctx);
        if (!fence.ok) {
            // Freeze-fence rejected the run: report which task violated it.
            return { violation: { taskId: task.id ?? `eval-${solverType}` } };
        }
        const patch = fence.output.solutionPayload?.patch;
        if (typeof patch !== 'string') {
            throw new Error(`eval harness produced no swe-rebench-v2 patch for task ${task.id ?? solverType} ` +
                `(solutionPayload.patch missing) — the agent ran but emitted no gradeable diff`);
        }
        return {
            envelope: { executor: { mode, codeDigest: `sha256:${fence.codeDigest}` } },
            solution: { patch },
        };
    }
    finally {
        // Clears whichever hop of the deadline timer is currently armed.
        clearTimeout(endTimer);
        // Always reclaim the per-task scratch dir — swe-rebench-v2 clones a real
        // upstream repo here; leaking it across a full slate piles onto the
        // disk-pressure this stack already guards (JINN_EVAL_DISK_FLOOR_GB).
        rmSync(workingDir, { recursive: true, force: true });
    }
}
123
+ //# sourceMappingURL=eval-harness-run.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"eval-harness-run.js","sourceRoot":"","sources":["../../src/eval/eval-harness-run.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;GAeG;AAEH,OAAO,EAAE,WAAW,EAAE,MAAM,EAAE,MAAM,SAAS,CAAC;AAC9C,OAAO,EAAE,MAAM,EAAE,MAAM,SAAS,CAAC;AACjC,OAAO,EAAE,IAAI,EAAE,MAAM,WAAW,CAAC;AAGjC,OAAO,EAAE,mBAAmB,EAAE,MAAM,kCAAkC,CAAC;AACvE,OAAO,EAAE,yBAAyB,EAAE,MAAM,2BAA2B,CAAC;AACtE,OAAO,EAAE,mBAAmB,EAAE,MAAM,wBAAwB,CAAC;AAC7D,OAAO,EACL,cAAc,GAGf,MAAM,4BAA4B,CAAC;AAEpC,4FAA4F;AAC5F,MAAM,kBAAkB,GAAG,SAAS,CAAC;AAErC;;;;;;;;;GASG;AACH,MAAM,CAAC,KAAK,UAAU,kCAAkC,CACtD,UAAkB,EAClB,gBAAmE,EACnE,OAA0B,aAAa;IAEvC,MAAM,QAAQ,GAAG,MAAM,cAAc,CAAC,EAAE,gBAAgB,EAAE,CAAC,CAAC;IAC5D,MAAM,GAAG,GAAG,QAAQ,CAAC,aAAa,CAAC,UAAU,EAAE,IAAI,CAAC,CAAC;IACrD,IAAI,CAAC,GAAG,EAAE,CAAC;QACT,MAAM,IAAI,KAAK,CACb,+BAA+B,UAAU,UAAU,IAAI,sCAAsC;YAC3F,8FAA8F;YAC9F,+FAA+F;YAC/F,8DAA8D,CACjE,CAAC;IACJ,CAAC;IACD,IAAI,GAAG,CAAC,cAAc,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QACpC,MAAM,IAAI,KAAK,CACb,aAAa,GAAG,CAAC,IAAI,mBAAmB,UAAU,+CAA+C;YAC/F,2FAA2F;YAC3F,kFAAkF,CACrF,CAAC;IACJ,CAAC;IACD,OAAO,GAAG,CAAC,cAAc,CAAC;AAC5B,CAAC;AAED;;;;;;;;;GASG;AACH,MAAM,CAAC,KAAK,UAAU,iBAAiB,CAAC,IASvC;IAKC,MAAM,EAAE,OAAO,EAAE,IAAI,EAAE,UAAU,EAAE,cAAc,EAAE,YAAY,EAAE,IAAI,EAAE,GAAG,IAAI,CAAC;IAC/E,MAAM,YAAY,GAAG,IAAI,CAAC,GAAG,EAAE,GAAG,kBAAkB,CAAC;IACrD,MAAM,WAAW,GAAG,IAAI,CAAC,MAAM,EAAE,KAAK,IAAI,YAAY,CAAC;IAEvD,4EAA4E;IAC5E,uEAAuE;IACvE,gEAAgE;IAChE,MAAM,UAAU,GAAG,WAAW,CAAC,IAAI,CAAC,MAAM,EAAE,EAAE,iBAAiB,CAAC,CAAC,CAAC;IAElE,MAAM,KAAK,GAAG,IAAI,eAAe,EAAE,CAAC;IACpC,MAAM,YAAY,GAAG,GAAG,EAAE,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,WAAW,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC,CAAC;IACjE,MAAM,QAAQ,GAAG,UAAU,CAAC,GAAG,EAAE,CAAC,KAAK,CAAC,KAAK,EAAE,EAAE,YAAY,EAAE,CAAC,CAAC;IACjE,2EAA2E;IAC3E,QAAQ,CAAC,KAAK,EAAE,EAAE,CAAC;IAEnB,IAAI,CAAC;QACH,mBAAmB,CAAC,UAAU,EAAE,IAAI,CAAC,CAAC;QAEtC,2EAA2E;QAC3E,qEAAqE;QACrE,+EAA+E;QAC/E,MAAM,GAAG,GAAmB;YAC1B,IAAI;YACJ,SAAS,EAAE,IAAI,CAAC,EAAE,IAAI,QAAQ,UAAU,EAAE;YAC1C,OAAO,EAAE,EAAE;YACX,SAAS,EAAE;gBACT,IAAI,EAAE,IAAI,CA
AC,aAAa,IAAI,UAAU;gBACtC,UAAU;gBACV,GAAG,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,EAAE,KAAK,EAAE,IAAI,CAAC,KAAK,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;aAC7C;YACD,cAAc;YACd,iBAAiB,EAAE,cAAc,CAAC,GAAG,CAAC,CAAC,MAAM,EAAE,EAAE,CAAC,MAAM,CAAC,IAAI,CAAC;YAC9D,YAAY;YACZ,UAAU;YACV,GAAG,EAAE,GAAG,EAAE;gBACR,kCAAkC;YACpC,CAAC;YACD,KAAK,EAAE,KAAK,CAAC,MAAM;YACnB,YAAY;YACZ,UAAU,EAAE,IAAI,mBAAmB,CAAC,EAAE,OAAO,EAAE,EAAE,EAAE,KAAK,EAAE,QAAQ,IAAI,CAAC,EAAE,IAAI,UAAU,EAAE,EAAE,CAAC;YAC5F,IAAI;SACL,CAAC;QAEF,MAAM,KAAK,GAAG,MAAM,yBAAyB,CAAC,OAAO,EAAE,GAAG,CAAC,CAAC;QAC5D,IAAI,CAAC,KAAK,CAAC,EAAE,EAAE,CAAC;YACd,OAAO,EAAE,SAAS,EAAE,EAAE,MAAM,EAAE,IAAI,CAAC,EAAE,IAAI,QAAQ,UAAU,EAAE,EAAE,EAAE,CAAC;QACpE,CAAC;QAED,MAAM,KAAK,GAAI,KAAK,CAAC,MAAM,CAAC,eAAmD,EAAE,KAAK,CAAC;QACvF,IAAI,OAAO,KAAK,KAAK,QAAQ,EAAE,CAAC;YAC9B,MAAM,IAAI,KAAK,CACb,0DAA0D,IAAI,CAAC,EAAE,IAAI,UAAU,GAAG;gBAChF,+EAA+E,CAClF,CAAC;QACJ,CAAC;QACD,OAAO;YACL,QAAQ,EAAE,EAAE,QAAQ,EAAE,EAAE,IAAI,EAAE,UAAU,EAAE,UAAU,KAAK,CAAC,UAAU,EAAE,EAAE,EAAE;YAC1E,QAAQ,EAAE,EAAE,KAAK,EAAE;SACpB,CAAC;IACJ,CAAC;YAAS,CAAC;QACT,YAAY,CAAC,QAAQ,CAAC,CAAC;QACvB,yEAAyE;QACzE,oEAAoE;QACpE,qEAAqE;QACrE,MAAM,CAAC,UAAU,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,KAAK,EAAE,IAAI,EAAE,CAAC,CAAC;IACvD,CAAC;AACH,CAAC"}
@@ -0,0 +1,163 @@
1
+ /**
2
+ * `jinn eval` held-out checkpoint orchestrator (issue #818).
3
+ *
4
+ * Runs a held-out slate against a checkpoint in FROZEN mode, persists per-task
5
+ * pass/fail, and emits a Wilson-CI resolved-rate comparison vs the parent
6
+ * checkpoint. Every external boundary is injected through `deps` — this is the
7
+ * thin slice that #819 drives a deterministic slate against, with NO Docker/IPFS.
8
+ *
9
+ * Acceptance criteria:
10
+ * AC#1 — run the slate frozen, write per-task pass/fail.
11
+ * AC#2 — emit a resolved-rate comparison vs the parent with a CI.
12
+ * AC#3 — the freeze-fence holds: a `runHarnessOnce` `{ violation }` throws
13
+ * LOUD and the instance is NOT recorded (no implStateDir mutation
14
+ * slips through; enforcement lives in `runHarnessWithFreezeFence`).
15
+ *
16
+ * Per log/decisions/2026-05-28-rl-eval-measurement.md §4: v1-simple. Only large
17
+ * deltas are trustworthy (disjoint Wilson intervals). No seed control, no
18
+ * multi-run averaging.
19
+ */
20
+ import type { Harness, HarnessContext } from '../harnesses/types.js';
21
+ import type { SweRebenchV2Evaluator } from '../harnesses/impls/swe-rebench-v2-evaluator/index.js';
22
+ import type { HarnessCheckpointManifest } from '../vendor/@jinn-network/sdk/dist/checkpoint.js';
23
+ import type { LoadedHeldOutSlate } from '../solver-types/_swe-rebench-v2-held-out-slate.js';
24
+ import type { ResolvedSlateTask } from './resolve-slate-tasks.js';
25
+ import type { EvalAggregate, EvalResultRecord } from '../store/store.js';
26
+ import { type RateComparison } from './wilson.js';
27
+ import { type PairedComparison, type PairedInput } from './paired.js';
28
+ /**
+ * Thrown when `runHarnessOnce` reports a freeze-fence violation (AC#3).
+ *
+ * `instanceId` is the slate instance id whose run returned `{ violation }`.
+ * `runEval` re-throws this error rather than recording the instance, so it
+ * aborts the whole eval.
+ */
29
+ export declare class FreezeFenceViolationError extends Error {
30
+ readonly instanceId: string;
31
+ constructor(instanceId: string);
32
+ }
33
+ /**
34
+ * Thrown when the locally-evaluated impl-state's codeDigest does not match the
35
+ * named checkpoint's manifest (C1). Guards against persisting + comparing
36
+ * results under a checkpoint name while the operator's local impl-state has
37
+ * drifted from that checkpoint.
+ *
+ * Fields (as passed by `runEval`):
+ * - `checkpointCid`: the checkpoint the eval was requested for.
+ * - `manifestCodeDigest`: `codeDigest` from that checkpoint's manifest.
+ * - `evaluatedCodeDigest`: `envelope.executor.codeDigest` reported by the run.
38
+ */
39
+ export declare class CheckpointStateMismatchError extends Error {
40
+ readonly checkpointCid: string;
41
+ readonly manifestCodeDigest: string;
42
+ readonly evaluatedCodeDigest: string;
43
+ constructor(checkpointCid: string, manifestCodeDigest: string, evaluatedCodeDigest: string);
44
+ }
45
+ /**
+ * Thrown when the parent checkpoint has no aggregate for this slate version (AC#2).
+ *
+ * `runEval` raises it AFTER the slate loop, when the parent's aggregate has
+ * both `scorable === 0` and `unscorable === 0`; the child's per-task rows have
+ * already been recorded by then.
+ */
46
+ export declare class ParentNotEvaluatedError extends Error {
47
+ readonly parentCheckpointCid: string;
48
+ readonly slateVersion: string;
49
+ constructor(parentCheckpointCid: string, slateVersion: string);
50
+ }
51
+ /**
+ * A checkpoint whose recorded results carry slate hashes other than the current one.
+ * `runEval` builds one entry per compared checkpoint (parent and child) and
+ * keeps only those with at least one drifted hash.
+ */
52
+ export interface SlateHashDrift {
53
+ checkpointCid: string;
54
+ /** The drifted hashes (each != the current slate hash); never empty in entries `runEval` reports. */
55
+ hashes: string[];
56
+ }
57
+ /**
58
+ * Thrown when EITHER compared checkpoint was scored on a DIFFERENT slate
59
+ * *content* under the same version label (DR-2026-05-28 §2 — confounder control).
60
+ * A slate is content-addressed and scores are comparable ONLY within identical
61
+ * content; the store keys aggregates by version, so a content edit that skipped
62
+ * the version bump (or re-derived the declared hash — which the slate loader
63
+ * admits passes), or stale rows surviving `recordEvalResult`'s by-instance
64
+ * upsert, would let the comparison silently compare two checkpoints scored on
65
+ * different task sets. That is confounder #1 (task-selection) — exactly what the
66
+ * held-out exam exists to defeat — so we refuse the comparison loudly instead.
+ *
+ * Fields: `slateVersion` and `currentSlateHash` identify the slate being run;
+ * `drifted` lists, per offending checkpoint, the stored hashes that differ from
+ * `currentSlateHash`. `runEval` throws this before fetching impl-state or
+ * running any instance.
67
+ */
68
+ export declare class SlateHashMismatchError extends Error {
69
+ readonly slateVersion: string;
70
+ readonly currentSlateHash: string;
71
+ readonly drifted: ReadonlyArray<SlateHashDrift>;
72
+ constructor(slateVersion: string, currentSlateHash: string, drifted: ReadonlyArray<SlateHashDrift>);
73
+ }
74
+ /**
75
+ * Injected `runHarnessOnce`-shaped boundary. Mirrors the production
76
+ * `runHarnessOnce` (freeze-fence + mode propagation) but also surfaces the
77
+ * harness `solution` so the orchestrator can extract the patch to grade. On a
78
+ * freeze violation it returns `{ violation }` and no solution.
+ *
+ * How `runEval` consumes the result:
+ * - `violation` present: throws `FreezeFenceViolationError` (terminal).
+ * - `solution` absent (and no violation): the instance is recorded unscorable.
+ * - `envelope.executor.codeDigest`: compared against the checkpoint manifest's
+ *   `codeDigest` (C1 guard); a mismatch throws `CheckpointStateMismatchError`.
+ * - `solution.patch`: forwarded to `evaluator.grade` as the solution payload.
+ *
+ * `runEval` always calls this with `mode: 'frozen'` and a populated `task`.
79
+ */
80
+ export type RunHarnessOnceForEval = (params: {
81
+ harness: Harness;
82
+ implStateDir: string;
83
+ mode: 'train' | 'frozen';
84
+ task?: HarnessContext['task'];
85
+ }) => Promise<{
86
+ envelope?: {
87
+ executor: {
88
+ mode: 'train' | 'frozen';
89
+ codeDigest: string;
90
+ };
91
+ };
92
+ violation?: {
93
+ taskId: string;
94
+ };
95
+ solution?: {
96
+ patch: string;
97
+ };
98
+ }>;
99
+ /** External boundaries injected into `runEval` through `args.deps`. */ export interface EvalOrchestratorDeps {
100
+ harness: Harness;
101
+ fetchImplStateDirToLocal(implStateDirCid: string, targetDir: string): Promise<string>;
102
+ evaluator: SweRebenchV2Evaluator;
103
+ runHarnessOnce: RunHarnessOnceForEval;
104
+ store: {
105
+ recordEvalResult(args: EvalResultRecord): void;
106
+ getEvalAggregate(checkpointCid: string, slateVersion: string): EvalAggregate;
107
+ /**
+ * Distinct slate_hash values the given checkpoint's results were recorded
+ * under for this slate version (drift guard). `runEval` queries it for BOTH
+ * the parent and the child checkpoint before running anything.
+ */
108
+ getEvalSlateHashes(checkpointCid: string, slateVersion: string): string[];
109
+ /**
110
+ * Optional: per-instance results for a (checkpoint, slate version). When
111
+ * present, the orchestrator also emits the paired (matched-design) McNemar
112
+ * verdict (DR-2026-06-02-b §2a) — the same slate is scored before & after, so
113
+ * the matched test is the statistically correct one and is far more powerful
114
+ * than the marginal disjoint-interval test. Reported ALONGSIDE `comparison`,
115
+ * never replacing it (the exam is strengthened, not weakened).
116
+ */
117
+ getEvalResults?(checkpointCid: string, slateVersion: string): PairedInput[];
118
+ };
119
+ }
120
+ /** Outcome of one slate instance as returned in `EvalRunResult.perTask`. */ export interface PerTaskResult {
121
+ instance_id: string;
122
+ /**
+ * The evaluator's `passed_match` verdict; null when unscorable (the harness
+ * run or the grading step threw a non-terminal error).
+ */
123
+ passed: boolean | null;
124
+ unscorable: boolean;
125
+ }
126
+ /**
127
+ * Provenance of the graded artifact (Legibility). The CLI grades the operator's
128
+ * LOCAL frozen impl-state — verified == the named checkpoint via the C1
129
+ * codeDigest guard (CheckpointStateMismatchError) — NOT a state re-fetched from
130
+ * the checkpoint CID. Surfacing this stops a reader from assuming the tool
131
+ * fetched and graded the published checkpoint's state.
132
+ */
133
+ export interface EvalProvenance {
134
+ /** Local impl-state directory that was actually run (the path returned by `fetchImplStateDirToLocal`). */
135
+ implStateDir: string;
136
+ /**
+ * codeDigest of the evaluated impl-state as reported by the harness run's
+ * envelope (verified == manifest.codeDigest). When no run reported a digest,
+ * `runEval` falls back to the manifest's `codeDigest` unverified.
+ */
137
+ codeDigest: string;
138
+ /** Always true on a returned result — a mismatch throws CheckpointStateMismatchError. */
139
+ matchedCheckpoint: true;
140
+ }
141
+ /** Value `runEval` resolves with once the whole slate has been run and compared. */ export interface EvalRunResult {
142
+ /* One entry per slate instance, in the order of `tasksWithRows`. */ perTask: PerTaskResult[];
143
+ /* Child-vs-parent marginal comparison, computed from the STORE's aggregates for this slate version. */ comparison: RateComparison;
144
+ /**
145
+ * Paired (matched-design) McNemar verdict vs the parent — present only when
146
+ * the store exposes `getEvalResults`. The marginal `comparison` above is
147
+ * conservative; this is the statistically correct test for the same-slate
148
+ * before/after design and is reported alongside it (DR-2026-06-02-b §2a).
149
+ */
150
+ paired?: PairedComparison;
151
+ /** What was actually graded (local impl-state), not the checkpoint identity. */
152
+ evaluated: EvalProvenance;
153
+ }
154
+ /**
+ * Run the held-out slate against `checkpointCid` in frozen mode, record one
+ * result per instance through `deps.store`, and compare the resolved rate
+ * with `parentCheckpointCid` at the same slate version.
+ *
+ * Rejects with `SlateHashMismatchError` (before any run),
+ * `FreezeFenceViolationError` / `CheckpointStateMismatchError` (during the
+ * slate), or `ParentNotEvaluatedError` (after the slate, when the parent has
+ * no rows). Other per-instance failures are recorded as unscorable.
+ */ export declare function runEval(args: {
155
+ checkpointManifest: HarnessCheckpointManifest;
156
+ checkpointCid: string;
157
+ slate: LoadedHeldOutSlate;
158
+ tasksWithRows: ResolvedSlateTask[];
159
+ parentCheckpointCid: string;
160
+ deps: EvalOrchestratorDeps;
161
+ /**
+ * Target dir handed to `deps.fetchImplStateDirToLocal` for the fetched
+ * impl-state-dir (defaults to `/tmp/jinn-eval-<checkpointCid>`).
+ */
162
+ implStateDir?: string;
163
+ }): Promise<EvalRunResult>;
@@ -0,0 +1,232 @@
1
+ /**
2
+ * `jinn eval` held-out checkpoint orchestrator (issue #818).
3
+ *
4
+ * Runs a held-out slate against a checkpoint in FROZEN mode, persists per-task
5
+ * pass/fail, and emits a Wilson-CI resolved-rate comparison vs the parent
6
+ * checkpoint. Every external boundary is injected through `deps` — this is the
7
+ * thin slice that #819 drives a deterministic slate against, with NO Docker/IPFS.
8
+ *
9
+ * Acceptance criteria:
10
+ * AC#1 — run the slate frozen, write per-task pass/fail.
11
+ * AC#2 — emit a resolved-rate comparison vs the parent with a CI.
12
+ * AC#3 — the freeze-fence holds: a `runHarnessOnce` `{ violation }` throws
13
+ * LOUD and the instance is NOT recorded (no implStateDir mutation
14
+ * slips through; enforcement lives in `runHarnessWithFreezeFence`).
15
+ *
16
+ * Per log/decisions/2026-05-28-rl-eval-measurement.md §4: v1-simple. Only large
17
+ * deltas are trustworthy (disjoint Wilson intervals). No seed control, no
18
+ * multi-run averaging.
19
+ */
20
+ import { compareRates } from './wilson.js';
21
+ import { comparePaired } from './paired.js';
22
/**
 * Raised when a frozen-mode harness run comes back flagged as a freeze-fence
 * violation (AC#3). The offending slate instance is exposed as `instanceId`.
 */
export class FreezeFenceViolationError extends Error {
    instanceId;
    /**
     * @param instanceId id of the slate instance whose run reported the violation
     */
    constructor(instanceId) {
        // Assemble the message first; `this` is not touched before super().
        const message = [
            `freeze-fence violation on instance ${instanceId}: the harness mutated implStateDir `,
            `during a frozen-mode eval run — refusing to record a tainted result`,
        ].join('');
        super(message);
        this.name = 'FreezeFenceViolationError';
        this.instanceId = instanceId;
    }
}
32
/**
 * Raised when the digest of the impl-state that was actually evaluated differs
 * from the `codeDigest` in the named checkpoint's manifest (C1). It prevents
 * results being stored and compared under a checkpoint whose state the local
 * impl-state no longer matches.
 */
export class CheckpointStateMismatchError extends Error {
    checkpointCid;
    manifestCodeDigest;
    evaluatedCodeDigest;
    /**
     * @param checkpointCid checkpoint the eval was requested for
     * @param manifestCodeDigest digest declared by that checkpoint's manifest
     * @param evaluatedCodeDigest digest of the impl-state that was actually run
     */
    constructor(checkpointCid, manifestCodeDigest, evaluatedCodeDigest) {
        const what = `local impl-state does not match checkpoint ${checkpointCid}: `;
        const digests = `manifest codeDigest=${manifestCodeDigest} but the evaluated impl-state hashes to ` +
            `${evaluatedCodeDigest}. `;
        const remedy = `Check out the impl-state for this checkpoint or pass the ` +
            `correct --impl-state-dir; refusing to record results under a codeDigest that is a lie`;
        super(what + digests + remedy);
        this.name = 'CheckpointStateMismatchError';
        Object.assign(this, { checkpointCid, manifestCodeDigest, evaluatedCodeDigest });
    }
}
53
/**
 * Raised when the parent checkpoint has no stored aggregate for the slate
 * version being compared (AC#2) — there is nothing to compare the child to.
 */
export class ParentNotEvaluatedError extends Error {
    parentCheckpointCid;
    slateVersion;
    /**
     * @param parentCheckpointCid checkpoint that lacks eval results
     * @param slateVersion slate version the comparison was attempted for
     */
    constructor(parentCheckpointCid, slateVersion) {
        super(`parent checkpoint ${parentCheckpointCid} has no eval results for slate ${slateVersion} — eval the parent checkpoint first (scores are only comparable within a slate version)`);
        this.name = 'ParentNotEvaluatedError';
        this.slateVersion = slateVersion;
        this.parentCheckpointCid = parentCheckpointCid;
    }
}
65
/**
 * Raised when at least one of the two compared checkpoints has results stored
 * under the same slate VERSION but a different slate CONTENT hash
 * (DR-2026-05-28 §2 — confounder control). Scores are only comparable within
 * identical slate content, so a comparison across differing hashes would mix
 * two task sets (confounder #1, task-selection) and is refused loudly.
 */
export class SlateHashMismatchError extends Error {
    slateVersion;
    currentSlateHash;
    drifted;
    /**
     * @param slateVersion version label of the slate being run
     * @param currentSlateHash content hash of the slate being run
     * @param drifted per-checkpoint list of stored hashes that differ from it
     */
    constructor(slateVersion, currentSlateHash, drifted) {
        // One "<cid> carries [h1, h2]" fragment per drifted checkpoint.
        const fragments = [];
        for (const arm of drifted) {
            fragments.push(`${arm.checkpointCid} carries [${arm.hashes.join(', ')}]`);
        }
        const detail = fragments.join('; ');
        const message = [
            `a checkpoint was evaluated against a different slate content under the same version label `,
            `${slateVersion}: current slate hash is ${currentSlateHash} but ${detail}. Same version must `,
            `mean the same exam — a slate content change is a measurement discontinuity that must bump the `,
            `version. Bump the slate version and re-evaluate; refusing to report a delta across two `,
            `different task sets (confounder #1, task-selection)`,
        ].join('');
        super(message);
        this.name = 'SlateHashMismatchError';
        this.slateVersion = slateVersion;
        this.currentSlateHash = currentSlateHash;
        this.drifted = drifted;
    }
}
95
/**
 * Evaluate one checkpoint against a held-out slate in frozen mode and compare
 * its resolved rate with the parent checkpoint's.
 *
 * Steps:
 *   1. Slate-drift guard: refuse before any run if either checkpoint already
 *      has rows for `slate.version` recorded under a hash other than
 *      `slate.hash`.
 *   2. Fetch the checkpoint's impl-state once; every instance reuses it.
 *   3. Per instance: run the harness frozen, grade the returned patch, and
 *      persist one row via `deps.store.recordEvalResult`. Non-terminal failures
 *      mark the instance unscorable (never a fail) and the slate continues.
 *   4. Compare child vs parent aggregates with `compareRates`; add the paired
 *      verdict from `comparePaired` when the store exposes `getEvalResults`.
 *
 * @param args
 * @param args.checkpointManifest manifest of the checkpoint under test; supplies
 *   `implStateDirCid` (fetched) and `codeDigest` (C1 reference, stored per row)
 * @param args.checkpointCid checkpoint the results are recorded under
 * @param args.slate loaded slate; `version` and `hash` key the stored rows
 * @param args.tasksWithRows `{ task, row }` pairs, one per slate instance
 * @param args.parentCheckpointCid checkpoint to compare against
 * @param args.deps injected boundaries (harness, fetcher, evaluator, runner, store)
 * @param [args.implStateDir] target dir for the fetched impl-state; defaults to
 *   `/tmp/jinn-eval-<checkpointCid>`
 * @returns per-task outcomes, the marginal comparison, the optional paired
 *   verdict, and provenance of what was actually graded
 * @throws {SlateHashMismatchError} either arm has rows under a different slate hash
 * @throws {FreezeFenceViolationError} a run reported a freeze-fence violation
 * @throws {CheckpointStateMismatchError} a run's digest differs from the manifest
 * @throws {ParentNotEvaluatedError} the parent has no rows for this slate version
 */
export async function runEval(args) {
    const { deps, slate, checkpointCid, parentCheckpointCid } = args;
    const manifestDigest = args.checkpointManifest.codeDigest;
    // Confounder guard (DR-2026-05-28 §2). The store keys aggregates by slate
    // VERSION; if EITHER checkpoint has results recorded under a different slate
    // CONTENT for that version — a content edit that skipped the version bump, or
    // stale rows surviving recordEvalResult's by-instance upsert — comparing
    // child-vs-parent reintroduces confounder #1 (task-selection: two scores over
    // different task sets). Both arms' hashes are known up front, so detect drift
    // in EITHER before the loop and refuse — never burn N× real spend producing a
    // number we'd have to throw away.
    const drifted = [parentCheckpointCid, checkpointCid]
        .map((cid) => ({
            checkpointCid: cid,
            hashes: deps.store.getEvalSlateHashes(cid, slate.version).filter((hash) => hash !== slate.hash),
        }))
        .filter((arm) => arm.hashes.length > 0);
    if (drifted.length > 0) {
        throw new SlateHashMismatchError(slate.version, slate.hash, drifted);
    }
    // Hoist the impl-state-dir fetch outside the loop: every slate instance runs
    // against the SAME checkpoint state.
    const implStateDir = args.implStateDir ?? `/tmp/jinn-eval-${checkpointCid}`;
    const localImplStateDir = await deps.fetchImplStateDirToLocal(args.checkpointManifest.implStateDirCid, implStateDir);
    const perTask = [];
    const runAtMs = Date.now();
    // True only once a run-reported digest has actually been compared against
    // the manifest (see the C1 block below).
    let digestChecked = false;
    // Digest of the evaluated impl-state as reported by the freeze-fence, kept
    // for provenance. Stays at the manifest digest if no run ever reports one.
    let evaluatedDigest = manifestDigest;
    for (const { task, row } of args.tasksWithRows) {
        let passed;
        let unscorable = false;
        let logExcerpt = '';
        try {
            const run = await deps.runHarnessOnce({
                harness: deps.harness,
                implStateDir: localImplStateDir,
                mode: 'frozen',
                // Carry the full SweRebenchV2Task in `spec` + the `solverType` dispatch
                // key so the harness can restore the repo (clone/checkout from
                // spec.repo/base_commit). The harness emits a swe-rebench-v2 patch from
                // spec; id/description/role alone are insufficient for a real run.
                task: {
                    id: task.instance_id,
                    description: task.problem_statement,
                    role: 'restoration',
                    solverType: 'swe-rebench-v2.v1',
                    spec: task,
                    // Window closes one hour (3_600_000 ms) after the run starts.
                    window: { startTs: 0, endTs: Date.now() + 3_600_000 },
                },
            });
            // AC#3: a freeze violation is loud and terminal — do NOT record this
            // instance (its implStateDir mutation taints the run). The catch below
            // re-throws it so it aborts the whole eval.
            if (run.violation) {
                throw new FreezeFenceViolationError(task.instance_id);
            }
            if (!run.solution) {
                throw new Error(`harness produced no solution for instance ${task.instance_id}`);
            }
            // C1: verify the real digest of the evaluated impl-state (computed by
            // the freeze-fence) against the named checkpoint's manifest, before
            // grading, to avoid Docker spend and to never persist a result under a
            // codeDigest that is a lie. The check is marked done only when a run
            // actually REPORTS a digest: a first run without an envelope must not
            // silently disable the guard for the rest of the slate.
            if (!digestChecked) {
                const runDigest = run.envelope?.executor.codeDigest;
                if (runDigest) {
                    if (runDigest !== manifestDigest) {
                        throw new CheckpointStateMismatchError(checkpointCid, manifestDigest, runDigest);
                    }
                    evaluatedDigest = runDigest;
                    digestChecked = true;
                }
            }
            const verdict = await deps.evaluator.grade({
                task,
                solutionPayload: { schemaVersion: 'swe-rebench-v2-solution.v1', patch: run.solution.patch },
                row,
            });
            passed = verdict.passed_match;
            logExcerpt = verdict.test_log.slice(0, 1000);
        }
        catch (err) {
            // A freeze-fence violation taints the run and must stay a LOUD terminal
            // abort (AC#3). Likewise the C1 digest mismatch — never record results
            // under a codeDigest that is a lie. Both re-throw and abort the eval.
            if (err instanceof FreezeFenceViolationError || err instanceof CheckpointStateMismatchError) {
                throw err;
            }
            // Everything else is unscorable: excluded from the denominator, NEVER
            // coerced to a fail (#476). This covers both grade-side failures
            // (EvalCouldNotGradeError / InsufficientDiskError) and harness-run
            // failures (Defect A — harvest missing-artifact, "produced no patch",
            // clone/timeout, etc.); a harness/infra failure to produce a gradeable
            // solution is environment-side, not an agent capability fail. The slate
            // continues to the next instance.
            passed = null;
            unscorable = true;
            logExcerpt = (err instanceof Error ? err.message : String(err)).slice(0, 1000);
        }
        deps.store.recordEvalResult({
            checkpoint_cid: checkpointCid,
            slate_hash: slate.hash,
            slate_version: slate.version,
            instance_id: task.instance_id,
            passed,
            unscorable,
            code_digest: manifestDigest,
            run_at_ms: runAtMs,
            test_log_excerpt: logExcerpt,
        });
        perTask.push({ instance_id: task.instance_id, passed, unscorable });
    }
    // AC#2: compare child vs parent at the SAME slate version. A parent with no
    // rows is a hard error — no cross-version compare, eval the parent first.
    const parentAgg = deps.store.getEvalAggregate(parentCheckpointCid, slate.version);
    if (parentAgg.scorable === 0 && parentAgg.unscorable === 0) {
        throw new ParentNotEvaluatedError(parentCheckpointCid, slate.version);
    }
    const childAgg = deps.store.getEvalAggregate(checkpointCid, slate.version);
    const comparison = compareRates({ passed: childAgg.passed, scorable: childAgg.scorable }, { passed: parentAgg.passed, scorable: parentAgg.scorable });
    // Paired (matched-design) verdict — same slate scored before & after, so the
    // matched McNemar test is the correct, higher-power one. Additive: only when
    // the store exposes per-instance results (DR-2026-06-02-b §2a).
    let paired;
    if (deps.store.getEvalResults) {
        const parentResults = deps.store.getEvalResults(parentCheckpointCid, slate.version);
        const childResults = deps.store.getEvalResults(checkpointCid, slate.version);
        paired = comparePaired(parentResults, childResults);
    }
    return {
        perTask,
        comparison,
        ...(paired ? { paired } : {}),
        evaluated: { implStateDir: localImplStateDir, codeDigest: evaluatedDigest, matchedCheckpoint: true },
    };
}
232
+ //# sourceMappingURL=orchestrator.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"orchestrator.js","sourceRoot":"","sources":["../../src/eval/orchestrator.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;GAkBG;AAQH,OAAO,EAAE,YAAY,EAAuB,MAAM,aAAa,CAAC;AAChE,OAAO,EAAE,aAAa,EAA2C,MAAM,aAAa,CAAC;AAErF,4EAA4E;AAC5E,MAAM,OAAO,yBAA0B,SAAQ,KAAK;IACtB;IAA5B,YAA4B,UAAkB;QAC5C,KAAK,CACH,sCAAsC,UAAU,qCAAqC;YACnF,qEAAqE,CACxE,CAAC;QAJwB,eAAU,GAAV,UAAU,CAAQ;QAK5C,IAAI,CAAC,IAAI,GAAG,2BAA2B,CAAC;IAC1C,CAAC;CACF;AAED;;;;;GAKG;AACH,MAAM,OAAO,4BAA6B,SAAQ,KAAK;IAEnC;IACA;IACA;IAHlB,YACkB,aAAqB,EACrB,kBAA0B,EAC1B,mBAA2B;QAE3C,KAAK,CACH,8CAA8C,aAAa,IAAI;YAC7D,uBAAuB,kBAAkB,0CAA0C;YACnF,GAAG,mBAAmB,6DAA6D;YACnF,uFAAuF,CAC1F,CAAC;QATc,kBAAa,GAAb,aAAa,CAAQ;QACrB,uBAAkB,GAAlB,kBAAkB,CAAQ;QAC1B,wBAAmB,GAAnB,mBAAmB,CAAQ;QAQ3C,IAAI,CAAC,IAAI,GAAG,8BAA8B,CAAC;IAC7C,CAAC;CACF;AAED,wFAAwF;AACxF,MAAM,OAAO,uBAAwB,SAAQ,KAAK;IACpB;IAA6C;IAAzE,YAA4B,mBAA2B,EAAkB,YAAoB;QAC3F,KAAK,CACH,qBAAqB,mBAAmB,kCAAkC,YAAY,KAAK;YACzF,sFAAsF,CACzF,CAAC;QAJwB,wBAAmB,GAAnB,mBAAmB,CAAQ;QAAkB,iBAAY,GAAZ,YAAY,CAAQ;QAK3F,IAAI,CAAC,IAAI,GAAG,yBAAyB,CAAC;IACxC,CAAC;CACF;AASD;;;;;;;;;;GAUG;AACH,MAAM,OAAO,sBAAuB,SAAQ,KAAK;IAE7B;IACA;IACA;IAHlB,YACkB,YAAoB,EACpB,gBAAwB,EACxB,OAAsC;QAEtD,MAAM,MAAM,GAAG,OAAO;aACnB,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,GAAG,CAAC,CAAC,aAAa,aAAa,CAAC,CAAC,MAAM,CAAC,IAAI,CAAC,IAAI,CAAC,GAAG,CAAC;aACjE,IAAI,CAAC,IAAI,CAAC,CAAC;QACd,KAAK,CACH,4FAA4F;YAC1F,GAAG,YAAY,2BAA2B,gBAAgB,QAAQ,MAAM,sBAAsB;YAC9F,gGAAgG;YAChG,yFAAyF;YACzF,qDAAqD,CACxD,CAAC;QAbc,iBAAY,GAAZ,YAAY,CAAQ;QACpB,qBAAgB,GAAhB,gBAAgB,CAAQ;QACxB,YAAO,GAAP,OAAO,CAA+B;QAYtD,IAAI,CAAC,IAAI,GAAG,wBAAwB,CAAC;IACvC,CAAC;CACF;AA8ED,MAAM,CAAC,KAAK,UAAU,OAAO,CAAC,IAS7B;IACC,MAAM,EAAE,IAAI,EAAE,KAAK,EAAE,aAAa,EAAE,mBAAmB,EAAE,GAAG,IAAI,CAAC;IAEjE,0EAA0E;IAC1E,6EAA6E;IAC7E,8EAA8E;IAC9E,yEAAyE;IACzE,8EAA8E;IAC9E,8EAA8E;IAC9E,8EAA8E;IAC9E,kCAAkC;IAClC,MAAM,OAAO,GAAG,CAAC,mBAAmB,EAAE,aAAa,CAAC;SACjD,GAAG,CAAC,CAAC,GAAG,EAAE,EAAE,CAAC,CAAC;QACb,aAAa,EAAE,GAAG;QAClB,MAAM,EAAE,IAAI,CAAC,KAAK,CAAC,kBAAkB,CAAC,GAAG,EAA
E,KAAK,CAAC,OAAO,CAAC,CAAC,MAAM,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,IAAI,KAAK,KAAK,CAAC,IAAI,CAAC;KAChG,CAAC,CAAC;SACF,MAAM,CAAC,CAAC,GAAG,EAAE,EAAE,CAAC,GAAG,CAAC,MAAM,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC;IAC1C,IAAI,OAAO,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QACvB,MAAM,IAAI,sBAAsB,CAAC,KAAK,CAAC,OAAO,EAAE,KAAK,CAAC,IAAI,EAAE,OAAO,CAAC,CAAC;IACvE,CAAC;IAED,6EAA6E;IAC7E,qCAAqC;IACrC,MAAM,YAAY,GAAG,IAAI,CAAC,YAAY,IAAI,kBAAkB,aAAa,EAAE,CAAC;IAC5E,MAAM,iBAAiB,GAAG,MAAM,IAAI,CAAC,wBAAwB,CAC3D,IAAI,CAAC,kBAAkB,CAAC,eAAe,EACvC,YAAY,CACb,CAAC;IAEF,MAAM,OAAO,GAAoB,EAAE,CAAC;IACpC,MAAM,OAAO,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC;IAC3B,IAAI,aAAa,GAAG,KAAK,CAAC;IAC1B,+EAA+E;IAC/E,6EAA6E;IAC7E,yBAAyB;IACzB,IAAI,eAAe,GAAG,IAAI,CAAC,kBAAkB,CAAC,UAAU,CAAC;IAEzD,KAAK,MAAM,EAAE,IAAI,EAAE,GAAG,EAAE,IAAI,IAAI,CAAC,aAAa,EAAE,CAAC;QAC/C,IAAI,MAAsB,CAAC;QAC3B,IAAI,UAAU,GAAG,KAAK,CAAC;QACvB,IAAI,UAAU,GAAG,EAAE,CAAC;QACpB,IAAI,CAAC;YACH,MAAM,GAAG,GAAG,MAAM,IAAI,CAAC,cAAc,CAAC;gBACpC,OAAO,EAAE,IAAI,CAAC,OAAO;gBACrB,YAAY,EAAE,iBAAiB;gBAC/B,IAAI,EAAE,QAAQ;gBACd,wEAAwE;gBACxE,+DAA+D;gBAC/D,wEAAwE;gBACxE,mEAAmE;gBACnE,IAAI,EAAE;oBACJ,EAAE,EAAE,IAAI,CAAC,WAAW;oBACpB,WAAW,EAAE,IAAI,CAAC,iBAAiB;oBACnC,IAAI,EAAE,aAAa;oBACnB,UAAU,EAAE,mBAAmB;oBAC/B,IAAI,EAAE,IAA0C;oBAChD,MAAM,EAAE,EAAE,OAAO,EAAE,CAAC,EAAE,KAAK,EAAE,IAAI,CAAC,GAAG,EAAE,GAAG,SAAS,EAAE;iBACtD;aACF,CAAC,CAAC;YAEH,qEAAqE;YACrE,0EAA0E;YAC1E,qEAAqE;YACrE,IAAI,GAAG,CAAC,SAAS,EAAE,CAAC;gBAClB,MAAM,IAAI,yBAAyB,CAAC,IAAI,CAAC,WAAW,CAAC,CAAC;YACxD,CAAC;YACD,IAAI,CAAC,GAAG,CAAC,QAAQ,EAAE,CAAC;gBAClB,MAAM,IAAI,KAAK,CAAC,6CAA6C,IAAI,CAAC,WAAW,EAAE,CAAC,CAAC;YACnF,CAAC;YAED,2EAA2E;YAC3E,2EAA2E;YAC3E,4EAA4E;YAC5E,kEAAkE;YAClE,IAAI,CAAC,aAAa,EAAE,CAAC;gBACnB,aAAa,GAAG,IAAI,CAAC;gBACrB,MAAM,SAAS,GAAG,GAAG,CAAC,QAAQ,EAAE,QAAQ,CAAC,UAAU,CAAC;gBACpD,IAAI,SAAS,IAAI,SAAS,KAAK,IAAI,CAAC,kBAAkB,CAAC,UAAU,EAAE,CAAC;oBAClE,MAAM,IAAI,4BAA4B,CACpC,aAAa,EACb,IAAI,CAAC,kBAAkB,CAAC,UAAU,EAClC,SAAS,CACV,CAAC;gBACJ,CAAC;gBACD,IAAI,SAAS;oBAAE,eAAe,GAAG,SAAS,CAAC;YAC7C,CAAC;YAED,MAAM,OAAO,GAAG,MAAM,
IAAI,CAAC,SAAS,CAAC,KAAK,CAAC;gBACzC,IAAI;gBACJ,eAAe,EAAE,EAAE,aAAa,EAAE,4BAA4B,EAAE,KAAK,EAAE,GAAG,CAAC,QAAQ,CAAC,KAAK,EAAE;gBAC3F,GAAG;aACJ,CAAC,CAAC;YACH,MAAM,GAAG,OAAO,CAAC,YAAY,CAAC;YAC9B,UAAU,GAAG,OAAO,CAAC,QAAQ,CAAC,KAAK,CAAC,CAAC,EAAE,IAAI,CAAC,CAAC;QAC/C,CAAC;QAAC,OAAO,GAAG,EAAE,CAAC;YACb,wEAAwE;YACxE,uEAAuE;YACvE,sEAAsE;YACtE,IAAI,GAAG,YAAY,yBAAyB,IAAI,GAAG,YAAY,4BAA4B,EAAE,CAAC;gBAC5F,MAAM,GAAG,CAAC;YACZ,CAAC;YACD,sEAAsE;YACtE,iEAAiE;YACjE,mEAAmE;YACnE,sEAAsE;YACtE,uEAAuE;YACvE,wEAAwE;YACxE,kCAAkC;YAClC,MAAM,GAAG,IAAI,CAAC;YACd,UAAU,GAAG,IAAI,CAAC;YAClB,UAAU,GAAG,CAAC,GAAG,YAAY,KAAK,CAAC,CAAC,CAAC,GAAG,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,EAAE,IAAI,CAAC,CAAC;QACjF,CAAC;QAED,IAAI,CAAC,KAAK,CAAC,gBAAgB,CAAC;YAC1B,cAAc,EAAE,aAAa;YAC7B,UAAU,EAAE,KAAK,CAAC,IAAI;YACtB,aAAa,EAAE,KAAK,CAAC,OAAO;YAC5B,WAAW,EAAE,IAAI,CAAC,WAAW;YAC7B,MAAM;YACN,UAAU;YACV,WAAW,EAAE,IAAI,CAAC,kBAAkB,CAAC,UAAU;YAC/C,SAAS,EAAE,OAAO;YAClB,gBAAgB,EAAE,UAAU;SAC7B,CAAC,CAAC;QAEH,OAAO,CAAC,IAAI,CAAC,EAAE,WAAW,EAAE,IAAI,CAAC,WAAW,EAAE,MAAM,EAAE,UAAU,EAAE,CAAC,CAAC;IACtE,CAAC;IAED,4EAA4E;IAC5E,0EAA0E;IAC1E,MAAM,SAAS,GAAG,IAAI,CAAC,KAAK,CAAC,gBAAgB,CAAC,mBAAmB,EAAE,KAAK,CAAC,OAAO,CAAC,CAAC;IAClF,IAAI,SAAS,CAAC,QAAQ,KAAK,CAAC,IAAI,SAAS,CAAC,UAAU,KAAK,CAAC,EAAE,CAAC;QAC3D,MAAM,IAAI,uBAAuB,CAAC,mBAAmB,EAAE,KAAK,CAAC,OAAO,CAAC,CAAC;IACxE,CAAC;IACD,MAAM,QAAQ,GAAG,IAAI,CAAC,KAAK,CAAC,gBAAgB,CAAC,aAAa,EAAE,KAAK,CAAC,OAAO,CAAC,CAAC;IAE3E,MAAM,UAAU,GAAG,YAAY,CAC7B,EAAE,MAAM,EAAE,QAAQ,CAAC,MAAM,EAAE,QAAQ,EAAE,QAAQ,CAAC,QAAQ,EAAE,EACxD,EAAE,MAAM,EAAE,SAAS,CAAC,MAAM,EAAE,QAAQ,EAAE,SAAS,CAAC,QAAQ,EAAE,CAC3D,CAAC;IAEF,6EAA6E;IAC7E,6EAA6E;IAC7E,gEAAgE;IAChE,IAAI,MAAoC,CAAC;IACzC,IAAI,IAAI,CAAC,KAAK,CAAC,cAAc,EAAE,CAAC;QAC9B,MAAM,aAAa,GAAG,IAAI,CAAC,KAAK,CAAC,cAAc,CAAC,mBAAmB,EAAE,KAAK,CAAC,OAAO,CAAC,CAAC;QACpF,MAAM,YAAY,GAAG,IAAI,CAAC,KAAK,CAAC,cAAc,CAAC,aAAa,EAAE,KAAK,CAAC,OAAO,CAAC,CAAC;QAC7E,MAAM,GAAG,aAAa,CAAC,aAAa,EAAE,YAAY,CAAC,CAAC;IACtD,CAAC;IAED,OAAO;QACL,OAAO;QACP,UAA
U;QACV,GAAG,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,MAAM,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;QAC7B,SAAS,EAAE,EAAE,YAAY,EAAE,iBAAiB,EAAE,UAAU,EAAE,eAAe,EAAE,iBAAiB,EAAE,IAAI,EAAE;KACrG,CAAC;AACJ,CAAC"}
@@ -0,0 +1,68 @@
1
+ /**
2
+ * Paired (matched-design) comparison for the held-out exam.
3
+ *
4
+ * The exam scores the SAME slate against the before- and after-checkpoints, so
5
+ * the two result sets are MATCHED, not independent. The marginal test in
6
+ * `wilson.ts` (`compareRates`) compares two INDEPENDENT Wilson intervals and
7
+ * calls a delta trustworthy only when they are disjoint. On a matched design
8
+ * that is statistically wasteful: it carries the full between-instance
9
+ * difficulty variance ("this bug is just hard") in BOTH intervals, which swamps
10
+ * a real, consistent within-instance improvement. Example: a learner that flips
11
+ * 3 hard instances fail→pass and regresses none moves 4/10→7/10 — two Wilson
12
+ * intervals ([17,69]% vs [40,89]%) overlap → "within-noise", even though every
13
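+ * observed change was an improvement. (Note that this tiny example stays "within-noise" under the paired test too: with 3 discordant pairs the exact two-sided McNemar p is 2 * 0.5^3 = 0.25; at alpha = 0.05 at least 6 flips in one direction with none in the other are needed.)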
14
+ *
15
+ * McNemar's test is the textbook test for matched binary data: it looks ONLY at
16
+ * the discordant pairs (instances that flipped) and asks whether the flips are
17
+ * asymmetric beyond chance. This is NOT a weaker bar — it is the CORRECT test
18
+ * for the design, and it still gates the claim on significance (exact two-sided
19
+ * p < alpha) AND on the improvement direction (more fail→pass than pass→fail).
20
+ * It is reported ALONGSIDE the conservative marginal verdict, never replacing it
21
+ * — so the exam is strengthened, never weakened (DR-2026-06-02-b §2a).
22
+ *
23
+ * R=1 (one run per instance) ships here: McNemar exact on per-instance flips.
24
+ * R>1 (per-instance pass RATES → Wilcoxon signed-rank / paired bootstrap) layers
25
+ * on once `eval_results` supports appended runs (DR-2026-06-02-b §2b).
26
+ */
27
+ /** Minimal per-instance result (subset of PerTaskResult / EvalResultRow) accepted by the paired comparison. */
28
+ export interface PairedInput {
29
+ instance_id: string; // presumably the key used to match an instance across the before/after arms — confirm against the implementation
30
+ /** Pass/fail outcome of the instance; null when unscorable. */
31
+ passed: boolean | null;
32
+ unscorable: boolean; // scorability flag: an instance forms a pair only when passed !== null and it is not unscorable in BOTH arms
33
+ }
34
+ export type PairedVerdict = 'trustworthy' | 'within-noise'; // outcome label of the paired test; the decision rule is documented on PairedComparison.verdict
35
+ export interface PairedComparison { // result returned by comparePaired: pair counts, exact p-value, and verdict
36
+ /** Instances scorable (passed !== null, not unscorable) in BOTH arms — the matched pairs. */
37
+ pairs: number;
38
+ /** b: before fail → after pass (improvements). */
39
+ improved: number;
40
+ /** c: before pass → after fail (regressions). */
41
+ regressed: number;
42
+ concordantPass: number; // presumably pairs that passed in both arms (no flip) — confirm against the implementation
43
+ concordantFail: number; // presumably pairs that failed in both arms (no flip) — confirm against the implementation
44
+ /** Instances unscorable/absent in either arm — excluded from the paired test. */
45
+ excluded: number;
46
+ /** Exact two-sided McNemar p-value over the discordant pairs (improved and regressed). */
47
+ pValue: number;
48
+ /** 'trustworthy' iff pValue < alpha AND improved > regressed; else 'within-noise'. */
49
+ verdict: PairedVerdict;
50
+ }
51
+ /**
52
+ * Exact two-sided McNemar p-value for discordant counts `b` (improvements) and
53
+ * `c` (regressions). Under H0 each discordant pair is an independent fair coin,
54
+ * so the count of one direction is Binomial(n=b+c, 0.5); the two-sided p is
55
+ * `2 * P(X <= min(b,c))`, capped at 1. When `b == c == 0` there are no discordant
56
+ * pairs, so it returns 1 (no evidence) rather than dividing by zero. Computed iteratively (term ratio) so it is exact and overflow-free
57
+ * for any slate size.
58
+ */
59
+ export declare function mcnemarExact(b: number, c: number): number;
60
+ /**
61
+ * Paired McNemar comparison of two matched per-instance result sets. An instance
62
+ * contributes a pair ONLY when it is scorable (passed !== null, not unscorable)
63
+ * in BOTH arms — an unscorable/missing side is excluded and counted in `excluded`, so a disk-skip or
64
+ * harness failure is never coerced into a flip. `opts.alpha` (default 0.05) is the significance threshold the p-value must fall below for a 'trustworthy' verdict.
65
+ */
66
+ export declare function comparePaired(before: readonly PairedInput[], after: readonly PairedInput[], opts?: {
67
+ alpha?: number;
68
+ }): PairedComparison;