@jinn-network/client 0.1.6 → 0.1.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (288)
  1. package/CHANGELOG.md +33 -0
  2. package/deployments/deployment-jinn-mvi-l1-sepolia-fast.json +23 -4
  3. package/deployments/deployment-jinn-mvi-l1-sepolia.json +23 -4
  4. package/deployments/deployment-jinn-mvi-l2-baseSepolia.json +5 -4
  5. package/dist/adapters/mech/adapter.d.ts +38 -1
  6. package/dist/adapters/mech/adapter.js +241 -54
  7. package/dist/adapters/mech/adapter.js.map +1 -1
  8. package/dist/adapters/mech/contracts.d.ts +17 -4
  9. package/dist/adapters/mech/contracts.js +8 -2
  10. package/dist/adapters/mech/contracts.js.map +1 -1
  11. package/dist/adapters/mech/safe-revert.d.ts +20 -0
  12. package/dist/adapters/mech/safe-revert.js +12 -4
  13. package/dist/adapters/mech/safe-revert.js.map +1 -1
  14. package/dist/adapters/mech/safe.d.ts +5 -1
  15. package/dist/adapters/mech/safe.js +27 -8
  16. package/dist/adapters/mech/safe.js.map +1 -1
  17. package/dist/adapters/mech/verdict-code.d.ts +1 -0
  18. package/dist/adapters/mech/verdict-code.js +18 -0
  19. package/dist/adapters/mech/verdict-code.js.map +1 -1
  20. package/dist/api/admin-endpoint.d.ts +15 -3
  21. package/dist/api/admin-endpoint.js +24 -2
  22. package/dist/api/admin-endpoint.js.map +1 -1
  23. package/dist/api/bootstrap-endpoint.js +49 -0
  24. package/dist/api/bootstrap-endpoint.js.map +1 -1
  25. package/dist/api/codex-doctor-endpoint.d.ts +73 -0
  26. package/dist/api/codex-doctor-endpoint.js +177 -0
  27. package/dist/api/codex-doctor-endpoint.js.map +1 -0
  28. package/dist/api/discovery-endpoint.d.ts +1 -0
  29. package/dist/api/discovery-endpoint.js +26 -0
  30. package/dist/api/discovery-endpoint.js.map +1 -1
  31. package/dist/api/fleet-build.d.ts +1 -0
  32. package/dist/api/fleet-build.js +2 -1
  33. package/dist/api/fleet-build.js.map +1 -1
  34. package/dist/api/gather-status.d.ts +11 -0
  35. package/dist/api/gather-status.js +400 -4
  36. package/dist/api/gather-status.js.map +1 -1
  37. package/dist/api/hermes-doctor-endpoint.d.ts +117 -0
  38. package/dist/api/hermes-doctor-endpoint.js +229 -23
  39. package/dist/api/hermes-doctor-endpoint.js.map +1 -1
  40. package/dist/api/launcher-status.d.ts +21 -16
  41. package/dist/api/launcher-status.js +2 -1
  42. package/dist/api/launcher-status.js.map +1 -1
  43. package/dist/api/portfolio-v0-build.d.ts +10 -0
  44. package/dist/api/portfolio-v0-build.js +24 -5
  45. package/dist/api/portfolio-v0-build.js.map +1 -1
  46. package/dist/api/prediction-v1-build.d.ts +10 -0
  47. package/dist/api/prediction-v1-build.js +7 -1
  48. package/dist/api/prediction-v1-build.js.map +1 -1
  49. package/dist/api/server.d.ts +31 -1
  50. package/dist/api/server.js +68 -1
  51. package/dist/api/server.js.map +1 -1
  52. package/dist/api/setup-endpoints.d.ts +16 -0
  53. package/dist/api/setup-endpoints.js +78 -4
  54. package/dist/api/setup-endpoints.js.map +1 -1
  55. package/dist/api/setup-retry-endpoint.d.ts +19 -0
  56. package/dist/api/setup-retry-endpoint.js +32 -0
  57. package/dist/api/setup-retry-endpoint.js.map +1 -0
  58. package/dist/api/solvernets-endpoints.d.ts +8 -0
  59. package/dist/api/solvernets-endpoints.js +71 -43
  60. package/dist/api/solvernets-endpoints.js.map +1 -1
  61. package/dist/api/status-build.d.ts +72 -0
  62. package/dist/api/status-build.js +73 -18
  63. package/dist/api/status-build.js.map +1 -1
  64. package/dist/api/task-run-routing.d.ts +7 -0
  65. package/dist/api/task-run-routing.js +12 -0
  66. package/dist/api/task-run-routing.js.map +1 -0
  67. package/dist/api/task-runs-build.d.ts +21 -0
  68. package/dist/api/task-runs-build.js +14 -1
  69. package/dist/api/task-runs-build.js.map +1 -1
  70. package/dist/build-info.json +4 -4
  71. package/dist/build-meta.json +1 -1
  72. package/dist/chain-read-errors.d.ts +10 -0
  73. package/dist/chain-read-errors.js +15 -0
  74. package/dist/chain-read-errors.js.map +1 -1
  75. package/dist/cli/commands/auth.js +1 -1
  76. package/dist/cli/commands/auth.js.map +1 -1
  77. package/dist/cli/commands/create.js +3 -2
  78. package/dist/cli/commands/create.js.map +1 -1
  79. package/dist/cli/commands/doctor.d.ts +2 -0
  80. package/dist/cli/commands/doctor.js +2 -0
  81. package/dist/cli/commands/doctor.js.map +1 -1
  82. package/dist/cli/commands/rewards.js +11 -7
  83. package/dist/cli/commands/rewards.js.map +1 -1
  84. package/dist/cli/commands/solver-nets.js +24 -9
  85. package/dist/cli/commands/solver-nets.js.map +1 -1
  86. package/dist/cli/commands/status.js +1 -1
  87. package/dist/cli/commands/status.js.map +1 -1
  88. package/dist/cli/commands/tasks.js +86 -9
  89. package/dist/cli/commands/tasks.js.map +1 -1
  90. package/dist/cli/commands/update.d.ts +10 -0
  91. package/dist/cli/commands/update.js +36 -0
  92. package/dist/cli/commands/update.js.map +1 -1
  93. package/dist/cli/introspection-context.js +5 -0
  94. package/dist/cli/introspection-context.js.map +1 -1
  95. package/dist/cli/task-native-readiness.d.ts +3 -1
  96. package/dist/cli/task-native-readiness.js +28 -6
  97. package/dist/cli/task-native-readiness.js.map +1 -1
  98. package/dist/config.d.ts +106 -5
  99. package/dist/config.js +97 -18
  100. package/dist/config.js.map +1 -1
  101. package/dist/daemon/checkpoint-loop.d.ts +48 -0
  102. package/dist/daemon/checkpoint-loop.js +76 -0
  103. package/dist/daemon/checkpoint-loop.js.map +1 -0
  104. package/dist/daemon/creator.d.ts +1 -1
  105. package/dist/daemon/creator.js +7 -3
  106. package/dist/daemon/creator.js.map +1 -1
  107. package/dist/daemon/daemon.d.ts +19 -0
  108. package/dist/daemon/daemon.js +68 -1
  109. package/dist/daemon/daemon.js.map +1 -1
  110. package/dist/daemon/eviction-loop.d.ts +40 -0
  111. package/dist/daemon/eviction-loop.js +67 -0
  112. package/dist/daemon/eviction-loop.js.map +1 -0
  113. package/dist/daemon/jinn-claim-loop-wiring.d.ts +33 -0
  114. package/dist/daemon/jinn-claim-loop-wiring.js +40 -0
  115. package/dist/daemon/jinn-claim-loop-wiring.js.map +1 -0
  116. package/dist/daemon/jinn-claim-loop.d.ts +24 -17
  117. package/dist/daemon/jinn-claim-loop.js +77 -23
  118. package/dist/daemon/jinn-claim-loop.js.map +1 -1
  119. package/dist/daemon/skip-log-dedup.d.ts +69 -0
  120. package/dist/daemon/skip-log-dedup.js +106 -0
  121. package/dist/daemon/skip-log-dedup.js.map +1 -0
  122. package/dist/dashboard/assets/index-BUlE8F3Y.js +330 -0
  123. package/dist/dashboard/assets/index-blqc7eqq.css +32 -0
  124. package/dist/dashboard/index.html +2 -2
  125. package/dist/discovery/factory.d.ts +17 -5
  126. package/dist/discovery/factory.js +46 -18
  127. package/dist/discovery/factory.js.map +1 -1
  128. package/dist/discovery/http.js +142 -3
  129. package/dist/discovery/http.js.map +1 -1
  130. package/dist/discovery/onchain.d.ts +5 -0
  131. package/dist/discovery/onchain.js +407 -15
  132. package/dist/discovery/onchain.js.map +1 -1
  133. package/dist/discovery/types.d.ts +45 -1
  134. package/dist/discovery/types.js +8 -10
  135. package/dist/discovery/types.js.map +1 -1
  136. package/dist/discovery/with-fallback.d.ts +7 -0
  137. package/dist/discovery/with-fallback.js +10 -0
  138. package/dist/discovery/with-fallback.js.map +1 -1
  139. package/dist/earning/bootstrap.d.ts +92 -1
  140. package/dist/earning/bootstrap.js +203 -63
  141. package/dist/earning/bootstrap.js.map +1 -1
  142. package/dist/earning/contracts.d.ts +14 -0
  143. package/dist/earning/contracts.js +17 -5
  144. package/dist/earning/contracts.js.map +1 -1
  145. package/dist/earning/funding-plan.js +27 -18
  146. package/dist/earning/funding-plan.js.map +1 -1
  147. package/dist/earning/jinn-rewards.d.ts +46 -0
  148. package/dist/earning/jinn-rewards.js +32 -0
  149. package/dist/earning/jinn-rewards.js.map +1 -1
  150. package/dist/earning/safe-adapter.d.ts +2 -0
  151. package/dist/earning/safe-adapter.js +26 -12
  152. package/dist/earning/safe-adapter.js.map +1 -1
  153. package/dist/earning/store.d.ts +8 -0
  154. package/dist/earning/store.js.map +1 -1
  155. package/dist/earning/testnet-setup-migration.d.ts +12 -0
  156. package/dist/earning/testnet-setup-migration.js +27 -1
  157. package/dist/earning/testnet-setup-migration.js.map +1 -1
  158. package/dist/earning/types.d.ts +15 -0
  159. package/dist/erc8004/reputation.d.ts +8 -0
  160. package/dist/erc8004/reputation.js +22 -3
  161. package/dist/erc8004/reputation.js.map +1 -1
  162. package/dist/harnesses/cost-estimates.d.ts +145 -0
  163. package/dist/harnesses/cost-estimates.js +297 -0
  164. package/dist/harnesses/cost-estimates.js.map +1 -0
  165. package/dist/harnesses/engine/engine.d.ts +72 -0
  166. package/dist/harnesses/engine/engine.js +105 -8
  167. package/dist/harnesses/engine/engine.js.map +1 -1
  168. package/dist/harnesses/engine/persistence.d.ts +51 -1
  169. package/dist/harnesses/engine/persistence.js +118 -5
  170. package/dist/harnesses/engine/persistence.js.map +1 -1
  171. package/dist/harnesses/engine/work-dir-reaper.d.ts +65 -0
  172. package/dist/harnesses/engine/work-dir-reaper.js +100 -0
  173. package/dist/harnesses/engine/work-dir-reaper.js.map +1 -0
  174. package/dist/harnesses/impls/hermes-agent/adapter.js +40 -0
  175. package/dist/harnesses/impls/hermes-agent/adapter.js.map +1 -1
  176. package/dist/harnesses/impls/hermes-agent/bootstrap.d.ts +20 -0
  177. package/dist/harnesses/impls/hermes-agent/bootstrap.js +40 -6
  178. package/dist/harnesses/impls/hermes-agent/bootstrap.js.map +1 -1
  179. package/dist/harnesses/impls/hermes-agent/harness.d.ts +59 -1
  180. package/dist/harnesses/impls/hermes-agent/harness.js +104 -0
  181. package/dist/harnesses/impls/hermes-agent/harness.js.map +1 -1
  182. package/dist/harnesses/impls/index.d.ts +7 -0
  183. package/dist/harnesses/impls/index.js +16 -1
  184. package/dist/harnesses/impls/index.js.map +1 -1
  185. package/dist/harnesses/impls/learner/harness.d.ts +38 -4
  186. package/dist/harnesses/impls/learner/harness.js +96 -2
  187. package/dist/harnesses/impls/learner/harness.js.map +1 -1
  188. package/dist/harnesses/impls/learner/plugin-path.d.ts +0 -13
  189. package/dist/harnesses/impls/learner/plugin-path.js +35 -15
  190. package/dist/harnesses/impls/learner/plugin-path.js.map +1 -1
  191. package/dist/harnesses/impls/learner/types.d.ts +11 -0
  192. package/dist/harnesses/impls/stub.d.ts +58 -0
  193. package/dist/harnesses/impls/stub.js +89 -0
  194. package/dist/harnesses/impls/stub.js.map +1 -0
  195. package/dist/harnesses/impls/swe-rebench-v2-evaluator/eval-runner.d.ts +69 -50
  196. package/dist/harnesses/impls/swe-rebench-v2-evaluator/eval-runner.js +178 -93
  197. package/dist/harnesses/impls/swe-rebench-v2-evaluator/eval-runner.js.map +1 -1
  198. package/dist/harnesses/impls/swe-rebench-v2-evaluator/harness.d.ts +12 -1
  199. package/dist/harnesses/impls/swe-rebench-v2-evaluator/harness.js +121 -7
  200. package/dist/harnesses/impls/swe-rebench-v2-evaluator/harness.js.map +1 -1
  201. package/dist/harnesses/impls/swe-rebench-v2-evaluator/hf-fetcher.d.ts +15 -0
  202. package/dist/harnesses/impls/swe-rebench-v2-evaluator/hf-fetcher.js +54 -4
  203. package/dist/harnesses/impls/swe-rebench-v2-evaluator/hf-fetcher.js.map +1 -1
  204. package/dist/harnesses/impls/swe-rebench-v2-evaluator/index.d.ts +6 -0
  205. package/dist/harnesses/impls/swe-rebench-v2-evaluator/index.js +1 -1
  206. package/dist/harnesses/impls/swe-rebench-v2-evaluator/index.js.map +1 -1
  207. package/dist/harnesses/readiness-registry.js +9 -1
  208. package/dist/harnesses/readiness-registry.js.map +1 -1
  209. package/dist/main.js +371 -82
  210. package/dist/main.js.map +1 -1
  211. package/dist/observability/emit-event.d.ts +1 -1
  212. package/dist/observability/emit-event.js.map +1 -1
  213. package/dist/operator-errors.d.ts +7 -0
  214. package/dist/operator-errors.js +13 -1
  215. package/dist/operator-errors.js.map +1 -1
  216. package/dist/plugins/learner/.claude-plugin/plugin.json +9 -0
  217. package/dist/plugins/learner/.codex-plugin/plugin.json +39 -0
  218. package/dist/plugins/learner/AGENTS.md +40 -0
  219. package/dist/plugins/learner/CLAUDE.md +33 -0
  220. package/dist/plugins/learner/README.md +59 -0
  221. package/dist/plugins/learner/hooks/hooks.json +16 -0
  222. package/dist/plugins/learner/hooks/session-start +38 -0
  223. package/dist/plugins/learner/skills/learn/SKILL.md +412 -0
  224. package/dist/plugins/learner/skills/learn/analyst-prompt.md +68 -0
  225. package/dist/plugins/learner/skills/learn/consolidator-prompt.md +94 -0
  226. package/dist/plugins/learner/skills/learn/explorer-prompt.md +53 -0
  227. package/dist/plugins/learner/skills/learn/planner-prompt.md +87 -0
  228. package/dist/plugins/learner/skills/learn/promoter-prompt.md +113 -0
  229. package/dist/plugins/learner/skills/learn/step-worker-prompt.md +47 -0
  230. package/dist/plugins/learner/skills/learn/strategist-prompt.md +85 -0
  231. package/dist/restart-daemon.d.ts +90 -0
  232. package/dist/restart-daemon.js +95 -0
  233. package/dist/restart-daemon.js.map +1 -0
  234. package/dist/setup/halt-mode.d.ts +14 -0
  235. package/dist/setup/halt-mode.js +17 -0
  236. package/dist/setup/halt-mode.js.map +1 -0
  237. package/dist/solver-nets/prediction-operator-ux.js +43 -3
  238. package/dist/solver-nets/prediction-operator-ux.js.map +1 -1
  239. package/dist/solver-nets/registry.d.ts +1 -0
  240. package/dist/solver-nets/registry.js +1 -1
  241. package/dist/solver-nets/registry.js.map +1 -1
  242. package/dist/solver-types/_swe-rebench-v2-pool-cache.d.ts +58 -0
  243. package/dist/solver-types/_swe-rebench-v2-pool-cache.js +87 -0
  244. package/dist/solver-types/_swe-rebench-v2-pool-cache.js.map +1 -0
  245. package/dist/solver-types/_swe-rebench-v2-substrate.d.ts +1 -0
  246. package/dist/solver-types/_swe-rebench-v2-substrate.js +10 -0
  247. package/dist/solver-types/_swe-rebench-v2-substrate.js.map +1 -1
  248. package/dist/solver-types/_swe-rebench-v2-validated-pool.d.ts +65 -0
  249. package/dist/solver-types/_swe-rebench-v2-validated-pool.js +243 -26
  250. package/dist/solver-types/_swe-rebench-v2-validated-pool.js.map +1 -1
  251. package/dist/solver-types/swe-rebench-v2-auto.d.ts +22 -7
  252. package/dist/solver-types/swe-rebench-v2-auto.js +45 -20
  253. package/dist/solver-types/swe-rebench-v2-auto.js.map +1 -1
  254. package/dist/solver-types/swe-rebench-v2.d.ts +13 -2
  255. package/dist/solver-types/swe-rebench-v2.js +233 -94
  256. package/dist/solver-types/swe-rebench-v2.js.map +1 -1
  257. package/dist/solvernets/daemon-init.d.ts +10 -2
  258. package/dist/solvernets/daemon-init.js +22 -2
  259. package/dist/solvernets/daemon-init.js.map +1 -1
  260. package/dist/solvernets/launched-record-dispatcher.js +35 -7
  261. package/dist/solvernets/launched-record-dispatcher.js.map +1 -1
  262. package/dist/solvernets/store.d.ts +5 -0
  263. package/dist/solvernets/store.js +1 -0
  264. package/dist/solvernets/store.js.map +1 -1
  265. package/dist/store/store.d.ts +15 -0
  266. package/dist/store/store.js +118 -3
  267. package/dist/store/store.js.map +1 -1
  268. package/dist/tasks/sources.d.ts +18 -1
  269. package/dist/tasks/sources.js +33 -5
  270. package/dist/tasks/sources.js.map +1 -1
  271. package/dist/tx-retry.d.ts +151 -19
  272. package/dist/tx-retry.js +286 -32
  273. package/dist/tx-retry.js.map +1 -1
  274. package/dist/types/payloads/prediction-apy-v0.d.ts +5 -5
  275. package/dist/types/payloads/prediction-v0.d.ts +5 -5
  276. package/dist/types/task-document.d.ts +392 -0
  277. package/dist/types/task-document.js +10 -0
  278. package/dist/types/task-document.js.map +1 -1
  279. package/dist/types/task.d.ts +28 -0
  280. package/dist/util/extract-tx-hash.d.ts +14 -0
  281. package/dist/util/extract-tx-hash.js +19 -0
  282. package/dist/util/extract-tx-hash.js.map +1 -0
  283. package/dist/vendor/@jinn-network/sdk/dist/contracts.js +1 -1
  284. package/dist/vendor/@jinn-network/sdk/dist/solvernets/manifest-schema.d.ts +3 -0
  285. package/dist/vendor/@jinn-network/sdk/dist/solvernets/manifest-schema.js +1 -0
  286. package/package.json +29 -12
  287. package/dist/dashboard/assets/index-DOlzFN8a.css +0 -32
  288. package/dist/dashboard/assets/index-NkZ7CTAT.js +0 -140
@@ -121,4 +121,15 @@ export interface LearnerHarnessConfig {
121
121
  * Defaults to 'bare'.
122
122
  */
123
123
  runtimeMode?: 'bare' | 'container' | 'docker-compose';
124
+ /**
125
+ * Path to the `codex` executable. Used by `isReady()` when this
126
+ * `LearnerHarness` is the Codex variant (`name === CODEX_HARNESS`) — it is
127
+ * passed to `probeCodexDoctor()`. Defaults to 'codex' (from PATH).
128
+ */
129
+ codexPath?: string;
130
+ /**
131
+ * Timeout (ms) for the `codex --version` probe in the Codex variant's
132
+ * `isReady()`. Defaults to 30s.
133
+ */
134
+ codexDoctorTimeoutMs?: number;
124
135
  }
@@ -0,0 +1,58 @@
1
+ /**
2
+ * Env-gated stub harness for the T2.2 producer/evaluator gate.
3
+ *
4
+ * When JINN_HARNESS_STUB_INSTANCE is set, the canned patch at
5
+ * <fixturesDir>/<instanceMatcher>.patch is returned as a SWE-rebench v2
6
+ * restoration solution. Never calls an LLM; never accepts tasks whose
7
+ * spec.instance_id differs from the configured matcher.
8
+ *
9
+ * PRODUCTION SAFETY — two-env-var requirement.
10
+ * This is a *fake* harness: it produces canned, non-genuine work. If it ever
11
+ * entered a real operator run it would generate fraudulent on-chain activity.
12
+ * To make accidental activation impossible, the factory requires BOTH:
13
+ * JINN_HARNESS_STUB_INSTANCE — instance ID this stub responds to
14
+ * JINN_TEST_MODE === '1' — explicit test-mode sentinel
15
+ * If JINN_HARNESS_STUB_INSTANCE is set but JINN_TEST_MODE is not '1', the
16
+ * factory THROWS rather than silently registering the stub. A single stray
17
+ * exported env var in an operator's shell can no longer activate it.
18
+ *
19
+ * Activated by environment variables:
20
+ * JINN_HARNESS_STUB_INSTANCE — instance ID this stub responds to (required to activate)
21
+ * JINN_TEST_MODE — must equal '1' (defense-in-depth; required to activate)
22
+ * JINN_HARNESS_STUB_FIXTURES_DIR — dir containing <instanceMatcher>.patch files
23
+ * (default: test/release/tier-2/fixtures, resolved against process.cwd())
24
+ */
25
+ import type { Harness, HarnessContext, ReadyStatus, Solution } from '../types.js';
26
+ export interface StubHarnessConfig {
27
+ /** Directory containing <instanceMatcher>.patch files. */
28
+ fixturesDir: string;
29
+ /** The instance ID this stub will accept. Tasks with other instance IDs are rejected. */
30
+ instanceMatcher: string;
31
+ }
32
+ /**
33
+ * A zero-LLM Harness that returns a canned patch for a specific SWE-rebench v2
34
+ * instance. Intended exclusively for T2.2 release-gate automation.
35
+ */
36
+ export declare class StubHarness implements Harness {
37
+ readonly name = "harness:stub";
38
+ readonly version = "0.1.0-stub";
39
+ private readonly fixturesDir;
40
+ private readonly instanceMatcher;
41
+ constructor(config: StubHarnessConfig);
42
+ supports(ctx: {
43
+ solverType: string;
44
+ role?: 'restoration' | 'evaluation';
45
+ }): boolean;
46
+ isReady(): Promise<ReadyStatus>;
47
+ run(ctx: HarnessContext): Promise<Solution>;
48
+ }
49
+ /**
50
+ * Factory that reads JINN_HARNESS_STUB_INSTANCE and JINN_HARNESS_STUB_FIXTURES_DIR
51
+ * from the environment and returns a configured StubHarness, or null if
52
+ * JINN_HARNESS_STUB_INSTANCE is unset or empty (allowing the registry to skip registration silently).
53
+ *
54
+ * Defense-in-depth: if JINN_HARNESS_STUB_INSTANCE is set but JINN_TEST_MODE is
55
+ * not exactly '1', this THROWS rather than returning a harness — a real
56
+ * operator run must never silently pick up the fake stub harness.
57
+ */
58
+ export declare function maybeCreateStubHarnessFromEnv(): StubHarness | null;
@@ -0,0 +1,89 @@
1
+ /**
2
+ * Env-gated stub harness for the T2.2 producer/evaluator gate.
3
+ *
4
+ * When JINN_HARNESS_STUB_INSTANCE is set, the canned patch at
5
+ * <fixturesDir>/<instanceMatcher>.patch is returned as a SWE-rebench v2
6
+ * restoration solution. Never calls an LLM; never accepts tasks whose
7
+ * spec.instance_id differs from the configured matcher.
8
+ *
9
+ * PRODUCTION SAFETY — two-env-var requirement.
10
+ * This is a *fake* harness: it produces canned, non-genuine work. If it ever
11
+ * entered a real operator run it would generate fraudulent on-chain activity.
12
+ * To make accidental activation impossible, the factory requires BOTH:
13
+ * JINN_HARNESS_STUB_INSTANCE — instance ID this stub responds to
14
+ * JINN_TEST_MODE === '1' — explicit test-mode sentinel
15
+ * If JINN_HARNESS_STUB_INSTANCE is set but JINN_TEST_MODE is not '1', the
16
+ * factory THROWS rather than silently registering the stub. A single stray
17
+ * exported env var in an operator's shell can no longer activate it.
18
+ *
19
+ * Activated by environment variables:
20
+ * JINN_HARNESS_STUB_INSTANCE — instance ID this stub responds to (required to activate)
21
+ * JINN_TEST_MODE — must equal '1' (defense-in-depth; required to activate)
22
+ * JINN_HARNESS_STUB_FIXTURES_DIR — dir containing <instanceMatcher>.patch files
23
+ * (default: test/release/tier-2/fixtures, resolved against process.cwd())
24
+ */
25
+ import * as fs from 'node:fs/promises';
26
+ import * as path from 'node:path';
27
+ /**
28
+ * A zero-LLM Harness that returns a canned patch for a specific SWE-rebench v2
29
+ * instance. Intended exclusively for T2.2 release-gate automation.
30
+ */
31
+ export class StubHarness {
32
+ name = 'harness:stub';
33
+ version = '0.1.0-stub';
34
+ fixturesDir;
35
+ instanceMatcher;
36
+ constructor(config) {
37
+ this.fixturesDir = config.fixturesDir;
38
+ this.instanceMatcher = config.instanceMatcher;
39
+ }
40
+ supports(ctx) {
41
+ if (ctx.role === 'evaluation')
42
+ return false;
43
+ return ctx.solverType === 'swe-rebench-v2.v1';
44
+ }
45
+ async isReady() {
46
+ return { ready: true };
47
+ }
48
+ async run(ctx) {
49
+ const taskInstanceId = ctx.task.spec?.['instance_id'];
50
+ if (taskInstanceId !== this.instanceMatcher) {
51
+ throw new Error(`stub harness: task.spec.instance_id=${String(taskInstanceId)} does not match configured instanceMatcher=${this.instanceMatcher}`);
52
+ }
53
+ const patchPath = path.join(this.fixturesDir, `${this.instanceMatcher}.patch`);
54
+ const patch = await fs.readFile(patchPath, 'utf-8');
55
+ return {
56
+ venueRef: { name: this.name },
57
+ gating: {},
58
+ solutionPayload: {
59
+ schemaVersion: 'swe-rebench-v2-solution.v1',
60
+ patch,
61
+ },
62
+ };
63
+ }
64
+ }
65
+ /**
66
+ * Factory that reads JINN_HARNESS_STUB_INSTANCE and JINN_HARNESS_STUB_FIXTURES_DIR
67
+ * from the environment and returns a configured StubHarness, or null if
68
+ * JINN_HARNESS_STUB_INSTANCE is unset or empty (allowing the registry to skip registration silently).
69
+ *
70
+ * Defense-in-depth: if JINN_HARNESS_STUB_INSTANCE is set but JINN_TEST_MODE is
71
+ * not exactly '1', this THROWS rather than returning a harness — a real
72
+ * operator run must never silently pick up the fake stub harness.
73
+ */
74
+ export function maybeCreateStubHarnessFromEnv() {
75
+ const instanceMatcher = process.env['JINN_HARNESS_STUB_INSTANCE'];
76
+ if (!instanceMatcher)
77
+ return null;
78
+ if (process.env['JINN_TEST_MODE'] !== '1') {
79
+ throw new Error('stub harness must never activate in a real operator run: ' +
80
+ 'JINN_HARNESS_STUB_INSTANCE is set but JINN_TEST_MODE is not "1". ' +
81
+ 'The stub harness produces canned, non-genuine work and would generate ' +
82
+ 'fraudulent on-chain activity. Set JINN_TEST_MODE=1 if this is a Tier 2 ' +
83
+ 'test; otherwise unset JINN_HARNESS_STUB_INSTANCE.');
84
+ }
85
+ const fixturesDir = process.env['JINN_HARNESS_STUB_FIXTURES_DIR'] ??
86
+ path.resolve(process.cwd(), 'test', 'release', 'tier-2', 'fixtures');
87
+ return new StubHarness({ instanceMatcher, fixturesDir });
88
+ }
89
+ //# sourceMappingURL=stub.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"stub.js","sourceRoot":"","sources":["../../../src/harnesses/impls/stub.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;GAuBG;AAEH,OAAO,KAAK,EAAE,MAAM,kBAAkB,CAAC;AACvC,OAAO,KAAK,IAAI,MAAM,WAAW,CAAC;AAUlC;;;GAGG;AACH,MAAM,OAAO,WAAW;IACb,IAAI,GAAG,cAAc,CAAC;IACtB,OAAO,GAAG,YAAY,CAAC;IAEf,WAAW,CAAS;IACpB,eAAe,CAAS;IAEzC,YAAY,MAAyB;QACnC,IAAI,CAAC,WAAW,GAAG,MAAM,CAAC,WAAW,CAAC;QACtC,IAAI,CAAC,eAAe,GAAG,MAAM,CAAC,eAAe,CAAC;IAChD,CAAC;IAED,QAAQ,CAAC,GAAgE;QACvE,IAAI,GAAG,CAAC,IAAI,KAAK,YAAY;YAAE,OAAO,KAAK,CAAC;QAC5C,OAAO,GAAG,CAAC,UAAU,KAAK,mBAAmB,CAAC;IAChD,CAAC;IAED,KAAK,CAAC,OAAO;QACX,OAAO,EAAE,KAAK,EAAE,IAAI,EAAE,CAAC;IACzB,CAAC;IAED,KAAK,CAAC,GAAG,CAAC,GAAmB;QAC3B,MAAM,cAAc,GAAI,GAAG,CAAC,IAAI,CAAC,IAA4C,EAAE,CAAC,aAAa,CAAC,CAAC;QAC/F,IAAI,cAAc,KAAK,IAAI,CAAC,eAAe,EAAE,CAAC;YAC5C,MAAM,IAAI,KAAK,CACb,uCAAuC,MAAM,CAAC,cAAc,CAAC,8CAA8C,IAAI,CAAC,eAAe,EAAE,CAClI,CAAC;QACJ,CAAC;QACD,MAAM,SAAS,GAAG,IAAI,CAAC,IAAI,CAAC,IAAI,CAAC,WAAW,EAAE,GAAG,IAAI,CAAC,eAAe,QAAQ,CAAC,CAAC;QAC/E,MAAM,KAAK,GAAG,MAAM,EAAE,CAAC,QAAQ,CAAC,SAAS,EAAE,OAAO,CAAC,CAAC;QACpD,OAAO;YACL,QAAQ,EAAE,EAAE,IAAI,EAAE,IAAI,CAAC,IAAI,EAAE;YAC7B,MAAM,EAAE,EAAE;YACV,eAAe,EAAE;gBACf,aAAa,EAAE,4BAA4B;gBAC3C,KAAK;aACN;SACF,CAAC;IACJ,CAAC;CACF;AAED;;;;;;;;GAQG;AACH,MAAM,UAAU,6BAA6B;IAC3C,MAAM,eAAe,GAAG,OAAO,CAAC,GAAG,CAAC,4BAA4B,CAAC,CAAC;IAClE,IAAI,CAAC,eAAe;QAAE,OAAO,IAAI,CAAC;IAClC,IAAI,OAAO,CAAC,GAAG,CAAC,gBAAgB,CAAC,KAAK,GAAG,EAAE,CAAC;QAC1C,MAAM,IAAI,KAAK,CACb,2DAA2D;YACzD,mEAAmE;YACnE,wEAAwE;YACxE,yEAAyE;YACzE,mDAAmD,CACtD,CAAC;IACJ,CAAC;IACD,MAAM,WAAW,GACf,OAAO,CAAC,GAAG,CAAC,gCAAgC,CAAC;QAC7C,IAAI,CAAC,OAAO,CAAC,OAAO,CAAC,GAAG,EAAE,EAAE,MAAM,EAAE,SAAS,EAAE,QAAQ,EAAE,UAAU,CAAC,CAAC;IACvE,OAAO,IAAI,WAAW,CAAC,EAAE,eAAe,EAAE,WAAW,EAAE,CAAC,CAAC;AAC3D,CAAC"}
@@ -37,6 +37,36 @@ export declare class EvalCouldNotGradeError extends Error {
37
37
  readonly logExcerpt: string;
38
38
  constructor(reason: string, logExcerpt?: string);
39
39
  }
40
+ /**
41
+ * Thrown by `runEval` when the disk cannot be brought above the eval
42
+ * disk-floor even after a broad prune. A clean abort — the caller stops
43
+ * gracefully; no instance is graded, nothing is marked. Distinct from
44
+ * `EvalCouldNotGradeError`: this is operator-environment, retryable, and must
45
+ * never be turned into a `scorable: false` admission (#476).
46
+ */
47
+ export declare class InsufficientDiskError extends Error {
48
+ readonly freeBytes: number;
49
+ readonly floorBytes: number;
50
+ constructor(freeBytes: number, floorBytes: number);
51
+ }
52
+ /**
53
+ * Default free-disk floor required before an eval round: 20 GB. A single
54
+ * SWE-rebench eval image was observed to peak transiently at ~12.6 GB, so the
55
+ * floor clears the worst observed instance with real margin. Override with
56
+ * `JINN_EVAL_DISK_FLOOR_GB` on constrained hosts.
57
+ */
58
+ export declare const DEFAULT_EVAL_DISK_FLOOR_BYTES = 20000000000;
59
+ /** Resolve the disk floor: explicit option > `JINN_EVAL_DISK_FLOOR_GB` env > default. */
60
+ export declare function resolveDiskFloorBytes(opt: number | undefined): number;
61
+ /**
62
+ * Default wall-clock limit for one upstream eval.py invocation: 2 hours. Some
63
+ * linux/amd64 SWE-rebench images can wedge indefinitely under Apple Silicon
64
+ * emulation after a native crash, so the subprocess gets a hard guardrail.
65
+ * Override with `JINN_SWE_REBENCH_EVAL_TIMEOUT_MS`; set `0` to disable.
66
+ */
67
+ export declare const DEFAULT_EVAL_TIMEOUT_MS: number;
68
+ /** Resolve the eval timeout: explicit option > env > default. */
69
+ export declare function resolveEvalTimeoutMs(opt: number | undefined): number;
40
70
  export interface PythonEvalRunnerOptions {
41
71
  /** Path to the cloned SWE-rebench-V2 repo (cached locally). */
42
72
  upstreamRepoDir: string;
@@ -45,66 +75,55 @@ export interface PythonEvalRunnerOptions {
45
75
  /** Workers for parallel eval (defaults to 1; we run one task at a time). */
46
76
  maxWorkers?: number;
47
77
  /**
48
- * Max number of distinct eval images to keep in the local Docker cache.
49
- * The runner tracks an in-process LRU keyed by image tag; once usage exceeds
50
- * this cap, the least-recently-used images are removed via
51
- * {@link PythonEvalRunnerOptions.cleanupImage}.
52
- *
53
- * The leaderboard pool has hundreds of unique instances at ~3 GB/image, so
54
- * an unbounded cache fills operator disks in days (jinn-mono-uy6v.11).
78
+ * Removes a completed round's entire Docker footprint — the round's image,
79
+ * stopped containers, and build cache — so eval disk usage never
80
+ * accumulates across instances (#476). Called once per `runEval`, in a
81
+ * `finally`, even when the eval threw.
55
82
  *
56
- * Default: `process.env.JINN_EVAL_IMAGE_CACHE_MAX` parsed as an integer, or
57
- * `DEFAULT_EVAL_IMAGE_CACHE_MAX` (20) if unset/invalid.
83
+ * Defaults to {@link defaultPruneRound}. Implementations MUST NOT throw —
84
+ * `runEval` guards defensively, but cleanup failures should be swallowed
85
+ * (logged elsewhere if desired) so a flaky `docker` never escapes `runEval`.
58
86
  */
59
- imageCacheMax?: number;
87
+ pruneRound?: (image: string) => Promise<void>;
60
88
  /**
61
- * Removes an image from the local Docker cache (or no-ops if the operator
62
- * has chosen not to GC). Called for each eviction from the LRU.
63
- *
64
- * Defaults to `docker rmi <image>` via the system `docker` binary. Test
65
- * suites inject a stub to capture the eviction order without shelling out.
66
- *
67
- * Implementations MUST NOT throw — failures should be swallowed (logged
68
- * elsewhere if desired) so a missing/failed `docker rmi` never escapes
69
- * `runEval`. The runner enforces this defensively too.
89
+ * Resolves the eval image digest while the image is still local, before
90
+ * per-round pruning removes it. Defaults to `docker image inspect`.
91
+ */
92
+ resolveImageDigest?: (image: string) => Promise<string | null>;
93
+ /**
94
+ * Required free disk (bytes) before an eval round starts. Explicit value >
95
+ * `JINN_EVAL_DISK_FLOOR_GB` env > {@link DEFAULT_EVAL_DISK_FLOOR_BYTES}.
70
96
  */
71
- cleanupImage?: (image: string) => Promise<void>;
97
+ diskFloorBytes?: number;
98
+ /** Probe of free disk (bytes). Defaults to a `statfs` on the temp dir. */
99
+ freeDiskBytes?: () => Promise<number>;
100
+ /**
101
+ * Broad reclaim invoked when free disk is below the floor. Defaults to
102
+ * `docker system prune -f`. MUST NOT throw.
103
+ */
104
+ systemPrune?: () => Promise<void>;
105
+ /**
106
+ * Wall-clock timeout (ms) for one upstream eval.py invocation. Explicit value
107
+ * > `JINN_SWE_REBENCH_EVAL_TIMEOUT_MS` env > {@link DEFAULT_EVAL_TIMEOUT_MS}.
108
+ * Set to 0 to disable.
109
+ */
110
+ evalTimeoutMs?: number;
72
111
  }
73
- /**
74
- * Default cap on the per-instance Docker image cache when no explicit
75
- * `imageCacheMax` and no `JINN_EVAL_IMAGE_CACHE_MAX` env var are configured.
76
- *
77
- * 20 images × ~3 GB/image ≈ 60 GB working set — small enough that even a
78
- * 256 GB disk has headroom, large enough that the steady-state loop on a
79
- * frequently-repeating subset of the pool rarely re-pulls.
80
- */
81
- export declare const DEFAULT_EVAL_IMAGE_CACHE_MAX = 20;
82
- export declare function resolveImageCacheMax(opt: number | undefined): number;
83
112
  export declare function matchInfraSignature(log: string): string | null;
84
113
  export declare class PythonEvalRunner implements EvalRunner {
85
114
  private readonly opts;
86
- /**
87
- * LRU of image tags whose Docker layers may be cached locally. Stored as a
88
- * `Set<string>` because `Set` preserves insertion order; we delete-then-add
89
- * to refresh recency and `next()` on the keys iterator to find the
90
- * least-recently-used entry.
91
- */
92
- private readonly imageLru;
93
- private readonly imageCacheMax;
94
- private readonly cleanupImage;
115
+ private readonly pruneRound;
116
+ private readonly diskFloorBytes;
117
+ private readonly freeDiskBytes;
118
+ private readonly systemPrune;
119
+ private readonly resolveImageDigest;
120
+ private readonly evalTimeoutMs;
95
121
  constructor(opts: PythonEvalRunnerOptions);
96
- runEval(args: Parameters<EvalRunner['runEval']>[0]): ReturnType<EvalRunner['runEval']>;
97
122
  /**
98
- * Move `image` to the most-recently-used slot of the in-process LRU; if the
99
- * set now exceeds {@link imageCacheMax}, evict the oldest entries via
100
- * {@link cleanupImage}. Eviction failures are swallowed so a flaky
101
- * `docker rmi` cannot escape `runEval`.
102
- *
103
- * The cap is enforced after the just-used image is inserted: the
104
- * just-evaluated image is the *most* recent, so repeat-evals of recently
105
- * used instances never re-pull. Only when more than N distinct images have
106
- * been used does the oldest get rmi'd.
123
+ * Ensure enough free disk for an eval round. Below the floor → broad prune →
124
+ * re-probe; still below → `InsufficientDiskError` (clean abort). (#476)
107
125
  */
108
- private recordImageUsage;
126
+ private ensureDiskHeadroom;
127
+ runEval(args: Parameters<EvalRunner['runEval']>[0]): ReturnType<EvalRunner['runEval']>;
109
128
  private runEvalImpl;
110
129
  }
@@ -26,9 +26,10 @@
26
26
  * re-raises as `SkippableError` (no signed verdict).
27
27
  */
28
28
  import { spawn } from 'node:child_process';
29
- import { mkdtemp, writeFile, readFile, rm } from 'node:fs/promises';
29
+ import { mkdtemp, writeFile, readFile, rm, statfs } from 'node:fs/promises';
30
30
  import { tmpdir } from 'node:os';
31
31
  import { isAbsolute, join } from 'node:path';
32
+ import { defaultCommandRunner, resolveImageDigest as resolveSubstrateImageDigest, } from '../../../solver-types/_swe-rebench-v2-substrate.js';
32
33
  /**
33
34
  * Thrown when the eval could not actually grade the solution. There is no
34
35
  * signal about the solver here, only about the operator's environment — the
@@ -46,62 +47,102 @@ export class EvalCouldNotGradeError extends Error {
46
47
  }
47
48
  }
48
49
  /**
49
- * Default cap on the per-instance Docker image cache when no explicit
50
- * `imageCacheMax` and no `JINN_EVAL_IMAGE_CACHE_MAX` env var are configured.
51
- *
52
- * 20 images × ~3 GB/image ≈ 60 GB working set — small enough that even a
53
- * 256 GB disk has headroom, large enough that the steady-state loop on a
54
- * frequently-repeating subset of the pool rarely re-pulls.
50
+ * Thrown by `runEval` when the disk cannot be brought above the eval
51
+ * disk-floor even after a broad prune. A clean abort — the caller stops
52
+ * gracefully; no instance is graded, nothing is marked. Distinct from
53
+ * `EvalCouldNotGradeError`: this is operator-environment, retryable, and must
54
+ * never be turned into a `scorable: false` admission (#476).
55
+ */
56
+ export class InsufficientDiskError extends Error {
57
+ freeBytes;
58
+ floorBytes;
59
+ constructor(freeBytes, floorBytes) {
60
+ const gb = (n) => (n / 1_000_000_000).toFixed(1);
61
+ super(`insufficient disk for swe-rebench eval: ${gb(freeBytes)} GB free, ` +
62
+ `need ≥ ${gb(floorBytes)} GB`);
63
+ this.name = 'InsufficientDiskError';
64
+ this.freeBytes = freeBytes;
65
+ this.floorBytes = floorBytes;
66
+ }
67
+ }
68
+ /**
69
+ * Default free-disk floor required before an eval round: 20 GB. A single
70
+ * SWE-rebench eval image was observed to peak transiently at ~12.6 GB, so the
71
+ * floor clears the worst observed instance with real margin. Override with
72
+ * `JINN_EVAL_DISK_FLOOR_GB` on constrained hosts.
55
73
  */
56
- export const DEFAULT_EVAL_IMAGE_CACHE_MAX = 20;
57
- export function resolveImageCacheMax(opt) {
74
+ export const DEFAULT_EVAL_DISK_FLOOR_BYTES = 20_000_000_000;
75
+ /** Resolve the disk floor: explicit option > `JINN_EVAL_DISK_FLOOR_GB` env > default. */
76
+ export function resolveDiskFloorBytes(opt) {
58
77
  if (typeof opt === 'number' && Number.isFinite(opt) && opt > 0)
59
78
  return Math.floor(opt);
60
- const envRaw = process.env['JINN_EVAL_IMAGE_CACHE_MAX'];
79
+ const envRaw = process.env['JINN_EVAL_DISK_FLOOR_GB'];
61
80
  if (envRaw !== undefined) {
62
- // `Number()` returns 0 for `""` / whitespace and NaN for strings with
63
- // non-numeric content (e.g. `"garbage"`, `"1e3oops"`) — unlike `parseInt`,
64
- // which would silently accept `parseInt("1e3oops") === 1`. Either way we
65
- // reject anything that isn't a positive integer.
66
81
  const parsed = Number(envRaw);
67
- if (Number.isFinite(parsed) && Number.isInteger(parsed) && parsed > 0)
68
- return parsed;
69
- // Surface the typo so operators discover it before the disk fills,
70
- // rather than silently running on the default.
71
- console.warn(`[swe-rebench-v2] JINN_EVAL_IMAGE_CACHE_MAX=${JSON.stringify(envRaw)} is not a positive integer — using default ${DEFAULT_EVAL_IMAGE_CACHE_MAX}`);
82
+ if (Number.isFinite(parsed) && parsed > 0)
83
+ return Math.floor(parsed * 1_000_000_000);
84
+ console.warn(`[swe-rebench-v2] JINN_EVAL_DISK_FLOOR_GB=${JSON.stringify(envRaw)} is not a positive ` +
85
+ `number — using default ${DEFAULT_EVAL_DISK_FLOOR_BYTES / 1_000_000_000} GB`);
72
86
  }
73
- return DEFAULT_EVAL_IMAGE_CACHE_MAX;
87
+ return DEFAULT_EVAL_DISK_FLOOR_BYTES;
74
88
  }
75
89
  /**
76
- * Production `cleanupImage`: spawn `docker rmi <image>`. Errors are tolerated
77
- * a missing/failed `docker rmi` is operationally survivable (the image
78
- * stays on disk; cache stays bloated for a while; not a correctness failure)
79
- * but we warn on non-zero exit and on failed-to-spawn so a persistently-flaky
80
- * daemon (or a permission slip) becomes visible before disks fill. Silent
81
- * leaks were the original failure mode `jinn-mono-uy6v.11` exists to fix.
82
- *
83
- * We listen on `'exit'` rather than `'close'` and route stdio to `'ignore'`
84
- * so the resolve path doesn't depend on parent-side stream draining (which
85
- * can fail to fire `'close'` cleanly when piped without backpressure on the
86
- * right tick). The image tag + exit code is sufficient signal; operators can
87
- * grep the docker daemon log for the underlying reason.
90
+ * Default wall-clock limit for one upstream eval.py invocation: 2 hours. Some
91
+ * linux/amd64 SWE-rebench images can wedge indefinitely under Apple Silicon
92
+ * emulation after a native crash, so the subprocess gets a hard guardrail.
93
+ * Override with `JINN_SWE_REBENCH_EVAL_TIMEOUT_MS`; set `0` to disable.
88
94
  */
89
- function defaultCleanupImage(image) {
90
- return new Promise((resolve) => {
91
- const child = spawn('docker', ['rmi', image], { stdio: ['ignore', 'ignore', 'ignore'] });
92
- child.on('exit', (code, signal) => {
93
- if (code !== 0) {
94
- const status = code !== null ? `exited ${code}` : `terminated by signal ${signal ?? 'unknown'}`;
95
- console.warn(`[swe-rebench-v2] docker rmi ${image} ${status}`);
96
- }
97
- resolve();
98
- });
99
- child.on('error', (err) => {
100
- console.warn(`[swe-rebench-v2] docker rmi ${image} failed to spawn: ${err.message}`);
101
- resolve();
102
- });
95
+ export const DEFAULT_EVAL_TIMEOUT_MS = 2 * 60 * 60 * 1000;
96
+ /** Resolve the eval timeout: explicit option > env > default. */
97
+ export function resolveEvalTimeoutMs(opt) {
98
+ if (typeof opt === 'number' && Number.isFinite(opt) && opt >= 0)
99
+ return Math.floor(opt);
100
+ const envRaw = process.env['JINN_SWE_REBENCH_EVAL_TIMEOUT_MS'];
101
+ if (envRaw !== undefined) {
102
+ const parsed = Number(envRaw);
103
+ if (Number.isFinite(parsed) && parsed >= 0)
104
+ return Math.floor(parsed);
105
+ console.warn(`[swe-rebench-v2] JINN_SWE_REBENCH_EVAL_TIMEOUT_MS=${JSON.stringify(envRaw)} is not a ` +
106
+ `non-negative number — using default ${DEFAULT_EVAL_TIMEOUT_MS} ms`);
107
+ }
108
+ return DEFAULT_EVAL_TIMEOUT_MS;
109
+ }
110
+ /** Production disk probe: free bytes on the filesystem backing the temp dir. */
111
+ async function defaultFreeDiskBytes() {
112
+ const s = await statfs(tmpdir());
113
+ return s.bavail * s.bsize;
114
+ }
115
+ /**
116
+ * Spawn `docker <args>`, resolving regardless of outcome — a failed cleanup
117
+ * command is logged, never thrown (#476: cleanup must not break the eval loop).
118
+ */
119
+ function runDocker(args) {
120
+ return defaultCommandRunner('docker', args)
121
+ .then((res) => {
122
+ if (res.exitCode !== 0) {
123
+ const detail = (res.stderr || res.stdout).trim();
124
+ console.warn(`[swe-rebench-v2] docker ${args.join(' ')} exited ${res.exitCode}` +
125
+ `${detail ? `: ${detail.slice(-500)}` : ''}`);
126
+ }
127
+ })
128
+ .catch((err) => {
129
+ const reason = err instanceof Error ? err.message : String(err);
130
+ console.warn(`[swe-rebench-v2] docker ${args.join(' ')} failed to spawn: ${reason}`);
103
131
  });
104
132
  }
133
+ /**
134
+ * Production `pruneRound`: remove the round's image, then prune stopped
135
+ * containers and build cache. Each step is best-effort.
136
+ */
137
+ async function defaultPruneRound(image) {
138
+ if (image)
139
+ await runDocker(['rmi', '-f', image]);
140
+ await runDocker(['container', 'prune', '-f']);
141
+ await runDocker(['builder', 'prune', '-f']);
142
+ }
143
+ async function defaultResolveImageDigest(imageName) {
144
+ return resolveSubstrateImageDigest(imageName, defaultCommandRunner);
145
+ }
105
146
  /**
106
147
  * Container-output signatures that mean the eval aborted before producing a
107
148
  * usable result — i.e. the operator's environment is the problem, not the
@@ -120,6 +161,7 @@ const INFRA_SIGNATURES = [
120
161
  { rx: /Failed building editable|Failed to build installable wheels/i, reason: 'install_build_failed' },
121
162
  { rx: /No virtual environment found/i, reason: 'venv_missing' },
122
163
  { rx: /exec format error|the requested image's platform .* does not match/i, reason: 'image_arch_mismatch' },
164
+ { rx: /Fatal Python error:\s*Illegal instruction|Illegal instruction(?:\s+\(core dumped\))?/i, reason: 'image_arch_mismatch' },
123
165
  // 2026-05-14 triage (jinn-mono-fufn) — failure fingerprints from real verdicts:
124
166
  { rx: /A virtual environment already exists at \S+\.venv\b/i, reason: 'venv_collision' },
125
167
  { rx: /No module named pytest\b/i, reason: 'pytest_missing' },
@@ -178,64 +220,62 @@ function buildTestCommands(args) {
178
220
  }
179
221
  export class PythonEvalRunner {
180
222
  opts;
181
- /**
182
- * LRU of image tags whose Docker layers may be cached locally. Stored as a
183
- * `Set<string>` because `Set` preserves insertion order; we delete-then-add
184
- * to refresh recency and `next()` on the keys iterator to find the
185
- * least-recently-used entry.
186
- */
187
- imageLru = new Set();
188
- imageCacheMax;
189
- cleanupImage;
223
+ pruneRound;
224
+ diskFloorBytes;
225
+ freeDiskBytes;
226
+ systemPrune;
227
+ resolveImageDigest;
228
+ evalTimeoutMs;
190
229
  constructor(opts) {
191
230
  this.opts = opts;
192
- this.imageCacheMax = resolveImageCacheMax(opts.imageCacheMax);
193
- this.cleanupImage = opts.cleanupImage ?? defaultCleanupImage;
231
+ this.pruneRound = opts.pruneRound ?? defaultPruneRound;
232
+ this.diskFloorBytes = resolveDiskFloorBytes(opts.diskFloorBytes);
233
+ this.freeDiskBytes = opts.freeDiskBytes ?? defaultFreeDiskBytes;
234
+ this.systemPrune = opts.systemPrune ?? (() => runDocker(['system', 'prune', '-f']));
235
+ this.resolveImageDigest = opts.resolveImageDigest ?? defaultResolveImageDigest;
236
+ this.evalTimeoutMs = resolveEvalTimeoutMs(opts.evalTimeoutMs);
237
+ }
238
+ /**
239
+ * Ensure enough free disk for an eval round. Below the floor → broad prune →
240
+ * re-probe; still below → `InsufficientDiskError` (clean abort). (#476)
241
+ */
242
+ async ensureDiskHeadroom() {
243
+ const free = await this.freeDiskBytes();
244
+ if (free >= this.diskFloorBytes)
245
+ return;
246
+ console.warn(`[swe-rebench-v2] low disk (${(free / 1e9).toFixed(1)} GB) — running docker system prune`);
247
+ await this.systemPrune();
248
+ const afterPrune = await this.freeDiskBytes();
249
+ if (afterPrune < this.diskFloorBytes) {
250
+ throw new InsufficientDiskError(afterPrune, this.diskFloorBytes);
251
+ }
194
252
  }
195
253
  async runEval(args) {
254
+ await this.ensureDiskHeadroom();
196
255
  try {
197
- return await this.runEvalImpl(args);
256
+ const result = await this.runEvalImpl(args);
257
+ let imageDigest = null;
258
+ try {
259
+ imageDigest = await this.resolveImageDigest(args.image);
260
+ }
261
+ catch (err) {
262
+ const reason = err instanceof Error ? err.message : String(err);
263
+ console.warn(`[swe-rebench-v2] resolveImageDigest failed for ${args.image}: ${reason}`);
264
+ }
265
+ return {
266
+ ...result,
267
+ ...(imageDigest ? { imageDigest } : {}),
268
+ };
198
269
  }
199
270
  finally {
200
- // Always record the image and run GC — even when the eval threw. A
201
- // pull-and-crash failure (Docker storage IO error, image_arch_mismatch,
202
- // patch_corrupt, eval_no_report) still left an image on disk; we must
203
- // count it toward the cache cap so the failure path can't leak the LRU.
204
- await this.recordImageUsage(args.image);
205
- }
206
- }
207
- /**
208
- * Move `image` to the most-recently-used slot of the in-process LRU; if the
209
- * set now exceeds {@link imageCacheMax}, evict the oldest entries via
210
- * {@link cleanupImage}. Eviction failures are swallowed so a flaky
211
- * `docker rmi` cannot escape `runEval`.
212
- *
213
- * The cap is enforced after the just-used image is inserted: the
214
- * just-evaluated image is the *most* recent, so repeat-evals of recently
215
- * used instances never re-pull. Only when more than N distinct images have
216
- * been used does the oldest get rmi'd.
217
- */
218
- async recordImageUsage(image) {
219
- if (!image)
220
- return;
221
- // Refresh recency: delete-then-add reinserts at the tail of the set.
222
- this.imageLru.delete(image);
223
- this.imageLru.add(image);
224
- while (this.imageLru.size > this.imageCacheMax) {
225
- const oldest = this.imageLru.values().next().value;
226
- if (!oldest)
227
- break;
228
- this.imageLru.delete(oldest);
271
+ // Prune this round's full Docker footprint — even when the eval threw,
272
+ // a pull-and-crash still left an image on disk (#476).
229
273
  try {
230
- await this.cleanupImage(oldest);
274
+ await this.pruneRound(args.image);
231
275
  }
232
276
  catch (err) {
233
- // Best-effort GC: a failed rmi leaves the image on disk but mustn't
234
- // break the loop. Warn so a flaky `docker` (or a permission slip)
235
- // becomes visible before disks fill — silent leaks were the whole
236
- // problem this bead exists to fix.
237
277
  const reason = err instanceof Error ? err.message : String(err);
238
- console.warn(`[swe-rebench-v2] eval-image cleanup failed for ${oldest}: ${reason}`);
278
+ console.warn(`[swe-rebench-v2] pruneRound failed for ${args.image}: ${reason}`);
239
279
  }
240
280
  }
241
281
  }
@@ -280,6 +320,7 @@ export class PythonEvalRunner {
280
320
  const child = spawn(this.opts.pythonBin ?? 'python3', pyArgs, {
281
321
  cwd: this.opts.upstreamRepoDir,
282
322
  stdio: ['ignore', 'pipe', 'pipe'],
323
+ detached: process.platform !== 'win32',
283
324
  // SWE-rebench eval images are published for linux/amd64. Pin the platform
284
325
  // so the upstream `docker run` is consistent on amd64 hosts and does not
285
326
  // silently crash under arm64 emulation on dev machines.
@@ -289,10 +330,54 @@ export class PythonEvalRunner {
289
330
  child.stderr.on('data', (d) => { stderr += d.toString(); });
290
331
  let stdout = '';
291
332
  child.stdout.on('data', (d) => { stdout += d.toString(); });
333
+ let timedOut = false;
334
+ let closed = false;
335
+ let killTimer;
336
+ const killChild = (signal) => {
337
+ const pid = child.pid;
338
+ if (!pid)
339
+ return;
340
+ try {
341
+ if (process.platform === 'win32') {
342
+ child.kill(signal);
343
+ }
344
+ else {
345
+ process.kill(-pid, signal);
346
+ }
347
+ }
348
+ catch {
349
+ try {
350
+ child.kill(signal);
351
+ }
352
+ catch { }
353
+ }
354
+ };
355
+ const timeoutTimer = this.evalTimeoutMs > 0
356
+ ? setTimeout(() => {
357
+ timedOut = true;
358
+ killChild('SIGTERM');
359
+ killTimer = setTimeout(() => {
360
+ if (!closed)
361
+ killChild('SIGKILL');
362
+ }, 10_000);
363
+ killTimer.unref?.();
364
+ }, this.evalTimeoutMs)
365
+ : undefined;
366
+ timeoutTimer?.unref?.();
292
367
  const exitCode = await new Promise((resolve, reject) => {
293
368
  child.on('close', (code) => resolve(code ?? 1));
294
369
  child.on('error', reject);
370
+ }).finally(() => {
371
+ closed = true;
372
+ if (timeoutTimer)
373
+ clearTimeout(timeoutTimer);
374
+ if (killTimer)
375
+ clearTimeout(killTimer);
295
376
  });
377
+ if (timedOut) {
378
+ await rm(tmp, { recursive: true, force: true });
379
+ throw new EvalCouldNotGradeError('eval_timeout', `python eval timed out after ${this.evalTimeoutMs}ms; ${(stderr || stdout).slice(-800)}`);
380
+ }
296
381
  let report;
297
382
  try {
298
383
  report = JSON.parse(await readFile(reportPath, 'utf8'));