create-walle 0.9.21 → 0.9.23

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (500) hide show
  1. package/README.md +27 -5
  2. package/package.json +2 -2
  3. package/template/CLAUDE.md +2 -2
  4. package/template/LICENSE +1 -1
  5. package/template/bin/ctm-dev-cleanup.js +24 -3
  6. package/template/bin/ctm-launch.sh +13 -0
  7. package/template/bin/dev.sh +156 -18
  8. package/template/bin/node-bin.sh +84 -0
  9. package/template/bin/pin-node.sh +51 -0
  10. package/template/claude-task-manager/api-prompts.js +1203 -182
  11. package/template/claude-task-manager/api-reviews.js +109 -15
  12. package/template/claude-task-manager/approval-agent.js +1360 -280
  13. package/template/claude-task-manager/bin/restart-ctm.sh +64 -23
  14. package/template/claude-task-manager/bin/storage-migration-supervisor.js +338 -0
  15. package/template/claude-task-manager/db.js +4417 -295
  16. package/template/claude-task-manager/docs/app-update-refresh-protocol.md +69 -0
  17. package/template/claude-task-manager/docs/approval-ai-refinement.md +138 -0
  18. package/template/claude-task-manager/docs/approval-rescue-loop.md +74 -0
  19. package/template/claude-task-manager/docs/codex-operational-warning-health.md +107 -0
  20. package/template/claude-task-manager/docs/codex-resume-state-guard-design.md +17 -12
  21. package/template/claude-task-manager/docs/codex-terminal-render-controller-handoff.md +311 -0
  22. package/template/claude-task-manager/docs/coding-agent-hooks-architecture.md +418 -0
  23. package/template/claude-task-manager/docs/conversation-import-freshness.md +20 -0
  24. package/template/claude-task-manager/docs/google-workspace-auth-health.md +77 -0
  25. package/template/claude-task-manager/docs/image-paste-ux.md +13 -0
  26. package/template/claude-task-manager/docs/ipad-web-preview.md +88 -0
  27. package/template/claude-task-manager/docs/main-loop-offload-architecture.md +66 -0
  28. package/template/claude-task-manager/docs/microsoft-dev-tunnel-phone-access-design.md +274 -519
  29. package/template/claude-task-manager/docs/mobile-live-streaming.md +27 -5
  30. package/template/claude-task-manager/docs/mobile-remote-submission-lifecycle.md +69 -0
  31. package/template/claude-task-manager/docs/phone-access-design.md +53 -15
  32. package/template/claude-task-manager/docs/phone-passkey-identity.md +122 -0
  33. package/template/claude-task-manager/docs/phone-setup.md +3 -0
  34. package/template/claude-task-manager/docs/prompt-editing-tree-design.md +25 -1
  35. package/template/claude-task-manager/docs/remote-desktop-access-design.md +268 -0
  36. package/template/claude-task-manager/docs/restart-lifecycle-architecture.md +95 -0
  37. package/template/claude-task-manager/docs/runtime-work-control-plane.md +53 -0
  38. package/template/claude-task-manager/docs/session-interactive-wait-surfaces.md +38 -0
  39. package/template/claude-task-manager/docs/session-needs-you-dismissal.md +84 -0
  40. package/template/claude-task-manager/docs/session-render-state-management-design.md +91 -3
  41. package/template/claude-task-manager/docs/session-standup-command-center-design.md +25 -1
  42. package/template/claude-task-manager/docs/session-title-authority.md +32 -0
  43. package/template/claude-task-manager/docs/session-workspace-binding.md +33 -0
  44. package/template/claude-task-manager/docs/skill-intent-resolution-design.md +72 -0
  45. package/template/claude-task-manager/docs/walle-mcp-supervisor-health.md +86 -0
  46. package/template/claude-task-manager/docs/walle-relay-phone-access-design.md +24 -15
  47. package/template/claude-task-manager/docs/walle-session-history-hydration.md +114 -0
  48. package/template/claude-task-manager/docs/walle-session-input-queue.md +104 -0
  49. package/template/claude-task-manager/docs/walle-session-model-catalog.md +90 -0
  50. package/template/claude-task-manager/docs/walle-session-model-preferences.md +15 -6
  51. package/template/claude-task-manager/git-utils.js +897 -27
  52. package/template/claude-task-manager/lib/agent-capabilities.js +33 -0
  53. package/template/claude-task-manager/lib/agent-cli-cache.js +37 -7
  54. package/template/claude-task-manager/lib/agent-hooks-installer.js +26 -2
  55. package/template/claude-task-manager/lib/agent-presets.js +17 -1
  56. package/template/claude-task-manager/lib/all-sessions-query.js +108 -0
  57. package/template/claude-task-manager/lib/approval-ai-refinement.js +488 -0
  58. package/template/claude-task-manager/lib/approval-self-adapt.js +168 -0
  59. package/template/claude-task-manager/lib/async-semaphore.js +44 -0
  60. package/template/claude-task-manager/lib/auth-context.js +5 -0
  61. package/template/claude-task-manager/lib/auth-rate-limit.js +47 -4
  62. package/template/claude-task-manager/lib/auth-rules.js +29 -2
  63. package/template/claude-task-manager/lib/auto-approval-verifier.js +129 -16
  64. package/template/claude-task-manager/lib/background-llm.js +144 -17
  65. package/template/claude-task-manager/lib/branch-inventory.js +212 -0
  66. package/template/claude-task-manager/lib/claude-desktop-sessions.js +15 -3
  67. package/template/claude-task-manager/lib/coalesce-sync-frames.js +151 -0
  68. package/template/claude-task-manager/lib/codex-launch-health.js +762 -0
  69. package/template/claude-task-manager/lib/codex-transcript-pager.js +51 -0
  70. package/template/claude-task-manager/lib/codex-zst.js +124 -0
  71. package/template/claude-task-manager/lib/coding-agent-models.js +233 -30
  72. package/template/claude-task-manager/lib/connection-health.js +232 -0
  73. package/template/claude-task-manager/lib/conversation-blob-parser.js +42 -0
  74. package/template/claude-task-manager/lib/conversation-tail-merge.js +89 -26
  75. package/template/claude-task-manager/lib/ctm-session-context-api.js +39 -10
  76. package/template/claude-task-manager/lib/cursor-conversation-store.js +354 -0
  77. package/template/claude-task-manager/lib/db-owner-worker-client.js +315 -0
  78. package/template/claude-task-manager/lib/document-review.js +141 -6
  79. package/template/claude-task-manager/lib/escalation-review.js +152 -0
  80. package/template/claude-task-manager/lib/graceful-shutdown.js +159 -0
  81. package/template/claude-task-manager/lib/headless-term-service.js +678 -0
  82. package/template/claude-task-manager/lib/heavy-worker-fallback.js +38 -0
  83. package/template/claude-task-manager/lib/jsonl-conversation-parser.js +542 -0
  84. package/template/claude-task-manager/lib/jsonl-range-reader.js +112 -0
  85. package/template/claude-task-manager/lib/main-db-census.js +216 -0
  86. package/template/claude-task-manager/lib/message-pagination.js +106 -4
  87. package/template/claude-task-manager/lib/microsoft-dev-tunnel-setup.js +750 -26
  88. package/template/claude-task-manager/lib/mobile-auth-api.js +274 -7
  89. package/template/claude-task-manager/lib/mobile-auth-store.js +592 -10
  90. package/template/claude-task-manager/lib/mobile-notification-dispatcher.js +15 -0
  91. package/template/claude-task-manager/lib/model-overview-brain-fallback.js +311 -0
  92. package/template/claude-task-manager/lib/model-overview-cache.js +141 -0
  93. package/template/claude-task-manager/lib/models-health-routing-notice.js +126 -0
  94. package/template/claude-task-manager/lib/node-pin-guard.js +93 -0
  95. package/template/claude-task-manager/lib/perf-tracker.js +242 -6
  96. package/template/claude-task-manager/lib/permission-match.js +76 -0
  97. package/template/claude-task-manager/lib/permission-sync.js +133 -20
  98. package/template/claude-task-manager/lib/process-title.js +35 -0
  99. package/template/claude-task-manager/lib/prompt-executions-query.js +25 -0
  100. package/template/claude-task-manager/lib/prompt-index-disk-cache.js +44 -0
  101. package/template/claude-task-manager/lib/prompt-intent.js +132 -0
  102. package/template/claude-task-manager/lib/provider-user-context.js +34 -0
  103. package/template/claude-task-manager/lib/read-pool-client.js +313 -0
  104. package/template/claude-task-manager/lib/readpool-breaker.js +31 -0
  105. package/template/claude-task-manager/lib/recent-sessions-breaker.js +12 -0
  106. package/template/claude-task-manager/lib/remote-feedback-client.js +72 -0
  107. package/template/claude-task-manager/lib/remote-relay-protocol.js +37 -4
  108. package/template/claude-task-manager/lib/remote-relay-store.js +159 -0
  109. package/template/claude-task-manager/lib/remote-submission-observer.js +278 -0
  110. package/template/claude-task-manager/lib/restart-guard.js +109 -0
  111. package/template/claude-task-manager/lib/restore-interruption-detector.js +439 -0
  112. package/template/claude-task-manager/lib/restore-policy.js +13 -0
  113. package/template/claude-task-manager/lib/restore-resume-batch.js +74 -0
  114. package/template/claude-task-manager/lib/restore-runtime.js +68 -0
  115. package/template/claude-task-manager/lib/restore-storm.js +34 -0
  116. package/template/claude-task-manager/lib/resume-cwd.js +36 -0
  117. package/template/claude-task-manager/lib/resume-preflight.js +313 -0
  118. package/template/claude-task-manager/lib/runtime-work-registry.js +444 -0
  119. package/template/claude-task-manager/lib/sanitize-openai-auth.js +31 -0
  120. package/template/claude-task-manager/lib/scheduler.js +21 -1
  121. package/template/claude-task-manager/lib/scrollback-snapshot-store.js +159 -0
  122. package/template/claude-task-manager/lib/serial-task-queue.js +64 -0
  123. package/template/claude-task-manager/lib/server-listeners.js +239 -0
  124. package/template/claude-task-manager/lib/session-capture.js +42 -7
  125. package/template/claude-task-manager/lib/session-content-backfill.js +131 -0
  126. package/template/claude-task-manager/lib/session-history.js +388 -43
  127. package/template/claude-task-manager/lib/session-host-manager.js +287 -0
  128. package/template/claude-task-manager/lib/session-image-refs.js +209 -0
  129. package/template/claude-task-manager/lib/session-jobs.js +399 -59
  130. package/template/claude-task-manager/lib/session-prompt-index.js +137 -0
  131. package/template/claude-task-manager/lib/session-restore.js +53 -0
  132. package/template/claude-task-manager/lib/session-standup.js +123 -23
  133. package/template/claude-task-manager/lib/session-state-bus.js +14 -0
  134. package/template/claude-task-manager/lib/session-stream.js +64 -16
  135. package/template/claude-task-manager/lib/session-timeline-summary.js +260 -0
  136. package/template/claude-task-manager/lib/session-token-usage.js +494 -0
  137. package/template/claude-task-manager/lib/session-workspace-binding.js +356 -0
  138. package/template/claude-task-manager/lib/setup-network-config.js +9 -0
  139. package/template/claude-task-manager/lib/size-cap.js +45 -0
  140. package/template/claude-task-manager/lib/size-cap.test.js +62 -0
  141. package/template/claude-task-manager/lib/skill-autocomplete.js +180 -1
  142. package/template/claude-task-manager/lib/skill-intent-resolver.js +304 -0
  143. package/template/claude-task-manager/lib/sqlite-driver.js +19 -3
  144. package/template/claude-task-manager/lib/standup-attention.js +7 -3
  145. package/template/claude-task-manager/lib/status-authority.js +39 -0
  146. package/template/claude-task-manager/lib/status-hooks.js +4 -0
  147. package/template/claude-task-manager/lib/storage-migration.js +235 -0
  148. package/template/claude-task-manager/lib/structured-capture.js +298 -0
  149. package/template/claude-task-manager/lib/sync-io-census.js +163 -0
  150. package/template/claude-task-manager/lib/tailscale-setup.js +6 -0
  151. package/template/claude-task-manager/lib/terminal-activity-evidence.js +33 -0
  152. package/template/claude-task-manager/lib/terminal-choice.js +364 -0
  153. package/template/claude-task-manager/lib/terminal-control-sanitize.js +17 -0
  154. package/template/claude-task-manager/lib/terminal-fingerprint.js +48 -0
  155. package/template/claude-task-manager/lib/terminal-output-flush.js +84 -0
  156. package/template/claude-task-manager/lib/timeline-order.js +122 -0
  157. package/template/claude-task-manager/lib/transcript-store.js +348 -43
  158. package/template/claude-task-manager/lib/transport-security.js +84 -1
  159. package/template/claude-task-manager/lib/wait-state.js +184 -0
  160. package/template/claude-task-manager/lib/walle-client.js +47 -5
  161. package/template/claude-task-manager/lib/walle-ctm-history.js +564 -4
  162. package/template/claude-task-manager/lib/walle-external-actions.js +135 -16
  163. package/template/claude-task-manager/lib/walle-history-hydration.js +46 -0
  164. package/template/claude-task-manager/lib/walle-native-health.js +403 -0
  165. package/template/claude-task-manager/lib/walle-repair.js +701 -0
  166. package/template/claude-task-manager/lib/walle-session-cache.js +109 -0
  167. package/template/claude-task-manager/lib/walle-session-context.js +57 -21
  168. package/template/claude-task-manager/lib/walle-session-model-catalog.js +34 -0
  169. package/template/claude-task-manager/lib/walle-supervisor.js +539 -63
  170. package/template/claude-task-manager/lib/walle-transcript.js +52 -0
  171. package/template/claude-task-manager/lib/worktree-active-sync.js +11 -7
  172. package/template/claude-task-manager/lib/worktree-cwd.js +32 -1
  173. package/template/claude-task-manager/package.json +1 -1
  174. package/template/claude-task-manager/prompt-harvest.js +89 -66
  175. package/template/claude-task-manager/providers/claude-code.js +51 -3
  176. package/template/claude-task-manager/providers/cursor.js +140 -45
  177. package/template/claude-task-manager/public/css/reviews.css +551 -61
  178. package/template/claude-task-manager/public/css/setup.css +191 -0
  179. package/template/claude-task-manager/public/css/walle-session.css +865 -10
  180. package/template/claude-task-manager/public/css/walle.css +154 -0
  181. package/template/claude-task-manager/public/designs/ai-providers-consolidation-v2.html +830 -0
  182. package/template/claude-task-manager/public/index.html +18516 -2058
  183. package/template/claude-task-manager/public/ipad.html +363 -0
  184. package/template/claude-task-manager/public/js/document-review-links.js +301 -0
  185. package/template/claude-task-manager/public/js/image-normalize.js +69 -36
  186. package/template/claude-task-manager/public/js/message-renderer.js +1265 -77
  187. package/template/claude-task-manager/public/js/prompts.js +66 -29
  188. package/template/claude-task-manager/public/js/reviews.js +901 -133
  189. package/template/claude-task-manager/public/js/session-activity-utils.js +11 -1
  190. package/template/claude-task-manager/public/js/session-search-utils.js +94 -10
  191. package/template/claude-task-manager/public/js/session-status-precedence.js +23 -5
  192. package/template/claude-task-manager/public/js/setup.js +1273 -176
  193. package/template/claude-task-manager/public/js/stream-view.js +691 -73
  194. package/template/claude-task-manager/public/js/terminal-reconciler.js +210 -0
  195. package/template/claude-task-manager/public/js/walle-session.js +2455 -158
  196. package/template/claude-task-manager/public/js/walle.js +455 -28
  197. package/template/claude-task-manager/public/m/app.css +2909 -262
  198. package/template/claude-task-manager/public/m/app.js +6601 -398
  199. package/template/claude-task-manager/public/m/claim.html +224 -17
  200. package/template/claude-task-manager/public/m/index.html +117 -21
  201. package/template/claude-task-manager/public/m/sw.js +3 -1
  202. package/template/claude-task-manager/public/manifest.json +2 -2
  203. package/template/claude-task-manager/public/prompts.html +30 -14
  204. package/template/claude-task-manager/queue-engine.js +507 -28
  205. package/template/claude-task-manager/scripts/repair-claude-session-images.js +27 -8
  206. package/template/claude-task-manager/server.js +14341 -2197
  207. package/template/claude-task-manager/session-integrity.js +160 -18
  208. package/template/claude-task-manager/session-search-ranking.js +1 -0
  209. package/template/claude-task-manager/session-utils.js +25 -5
  210. package/template/claude-task-manager/workers/approval-blocklist.js +96 -6
  211. package/template/claude-task-manager/workers/approval-widget-validator.js +14 -8
  212. package/template/claude-task-manager/workers/conversation-import-worker.js +11 -50
  213. package/template/claude-task-manager/workers/db-owner-worker.js +386 -0
  214. package/template/claude-task-manager/workers/harvest-worker.js +9 -55
  215. package/template/claude-task-manager/workers/headless-term-worker.js +9 -530
  216. package/template/claude-task-manager/workers/read-pool-worker.js +387 -0
  217. package/template/claude-task-manager/workers/scrollback-worker.js +11 -72
  218. package/template/claude-task-manager/workers/session-host-process.js +146 -0
  219. package/template/claude-task-manager/workers/session-integrity-worker.js +10 -54
  220. package/template/claude-task-manager/workers/state-detectors/base.js +18 -1
  221. package/template/claude-task-manager/workers/state-detectors/claude-code.js +182 -9
  222. package/template/claude-task-manager/workers/state-detectors/codex.js +150 -2
  223. package/template/claude-task-manager/workers/state-detectors/cursor.js +127 -0
  224. package/template/claude-task-manager/workers/state-detectors/gemini.js +21 -0
  225. package/template/claude-task-manager/workers/state-detectors/index.js +29 -0
  226. package/template/claude-task-manager/workers/state-detectors/opencode.js +103 -0
  227. package/template/docs/design/markdown-review-pane.md +206 -0
  228. package/template/docs/designs/2026-05-17-portkey-gateway-provider-ux.md +129 -38
  229. package/template/docs/designs/2026-05-20-mobile-worktree-finish-command.md +27 -0
  230. package/template/docs/designs/2026-05-22-ai-configuration-consolidation.md +248 -0
  231. package/template/docs/designs/ai-configuration-consolidation-mock.html +812 -0
  232. package/template/docs/private-memory-and-pii-policy.md +69 -0
  233. package/template/package.json +2 -1
  234. package/template/scripts/check-private-data.js +201 -0
  235. package/template/shared/sqlite-owner-guard.js +30 -0
  236. package/template/shared/sqlite-owner-write-queue.js +225 -0
  237. package/template/shared/sqlite-storage-policy.js +111 -0
  238. package/template/shared/sqlite-write-lock.js +428 -0
  239. package/template/wall-e/agent-runners/claude-code.js +5 -0
  240. package/template/wall-e/agent.js +166 -22
  241. package/template/wall-e/api-walle.js +524 -70
  242. package/template/wall-e/auth/provider-flows.js +11 -1
  243. package/template/wall-e/bin/walle-mcp-stdio.js +341 -17
  244. package/template/wall-e/brain.js +1614 -141
  245. package/template/wall-e/chat/attachment-blocks.js +96 -0
  246. package/template/wall-e/chat/attachments.js +2 -1
  247. package/template/wall-e/chat/capability-resolver.js +7 -7
  248. package/template/wall-e/chat/context-messages.js +28 -0
  249. package/template/wall-e/chat/conversation-frame.js +630 -0
  250. package/template/wall-e/chat/provider-messages.js +125 -0
  251. package/template/wall-e/chat.js +1002 -233
  252. package/template/wall-e/coding/acceptance-contract.js +170 -0
  253. package/template/wall-e/coding/acp-adapter.js +1 -1
  254. package/template/wall-e/coding/agent-catalog.js +3 -0
  255. package/template/wall-e/coding/artifact-store.js +93 -0
  256. package/template/wall-e/coding/capability-router.js +120 -0
  257. package/template/wall-e/coding/coding-run-controller.js +423 -0
  258. package/template/wall-e/coding/compaction-service.js +157 -12
  259. package/template/wall-e/coding/frontend-verification.js +258 -0
  260. package/template/wall-e/coding/lifecycle-hooks.js +75 -0
  261. package/template/wall-e/coding/local-preview-contract.js +157 -0
  262. package/template/wall-e/coding/permission-service.js +57 -13
  263. package/template/wall-e/coding/prompt-bundle.js +19 -1
  264. package/template/wall-e/coding/prompt-section-registry.js +227 -0
  265. package/template/wall-e/coding/provider-compat.js +15 -0
  266. package/template/wall-e/coding/runtime-events.js +224 -0
  267. package/template/wall-e/coding/runtime-mode.js +3 -0
  268. package/template/wall-e/coding/side-git-snapshot.js +160 -4
  269. package/template/wall-e/coding/snapshot-service.js +143 -1
  270. package/template/wall-e/coding/stream-processor.js +388 -34
  271. package/template/wall-e/coding/task-tool.js +141 -4
  272. package/template/wall-e/coding/tool-execution-controller.js +365 -0
  273. package/template/wall-e/coding/tool-registry.js +43 -5
  274. package/template/wall-e/coding/user-hooks.js +217 -0
  275. package/template/wall-e/coding-orchestrator.js +1330 -221
  276. package/template/wall-e/coding-prompts.js +20 -4
  277. package/template/wall-e/context/context-builder.js +15 -2
  278. package/template/wall-e/decision/confidence.js +1 -1
  279. package/template/wall-e/docs/coding-acceptance-contract.md +41 -0
  280. package/template/wall-e/docs/external-action-controller.md +26 -6
  281. package/template/wall-e/docs/telemetry-lifecycle.md +8 -2
  282. package/template/wall-e/embeddings.js +591 -53
  283. package/template/wall-e/external-action-controller.js +12 -0
  284. package/template/wall-e/http/auth.js +1 -0
  285. package/template/wall-e/http/chat-api.js +46 -11
  286. package/template/wall-e/http/model-admin.js +836 -34
  287. package/template/wall-e/lib/boot-profile.js +88 -0
  288. package/template/wall-e/lib/event-loop-monitor.js +93 -0
  289. package/template/wall-e/lib/service-health.js +194 -0
  290. package/template/wall-e/llm/anthropic.js +130 -5
  291. package/template/wall-e/llm/client.js +266 -63
  292. package/template/wall-e/llm/default-fallback.js +382 -0
  293. package/template/wall-e/llm/health.js +19 -0
  294. package/template/wall-e/llm/message-guard.js +78 -0
  295. package/template/wall-e/llm/model-catalog.js +252 -1
  296. package/template/wall-e/llm/openai.js +26 -4
  297. package/template/wall-e/llm/portkey-sync.js +654 -0
  298. package/template/wall-e/llm/provider-error.js +30 -2
  299. package/template/wall-e/llm/registry.js +5 -1
  300. package/template/wall-e/llm/request-compat.js +67 -0
  301. package/template/wall-e/loops/backfill.js +79 -23
  302. package/template/wall-e/loops/brain-optimize.js +67 -0
  303. package/template/wall-e/loops/ingest.js +25 -10
  304. package/template/wall-e/loops/question-digest.js +160 -0
  305. package/template/wall-e/loops/reflect.js +6 -4
  306. package/template/wall-e/loops/think.js +39 -12
  307. package/template/wall-e/mcp-server.js +318 -36
  308. package/template/wall-e/memory/ctm-context-client.js +52 -14
  309. package/template/wall-e/memory/ctm-operational-context.js +237 -0
  310. package/template/wall-e/memory/ctm-prompt-executions-client.js +128 -0
  311. package/template/wall-e/memory/ctm-session-context.js +111 -63
  312. package/template/wall-e/prompts/coding/deepseek.txt +3 -0
  313. package/template/wall-e/prompts/coding/gemini.txt +6 -0
  314. package/template/wall-e/prompts/coding/gpt.txt +6 -0
  315. package/template/wall-e/prompts/coding/local.txt +7 -0
  316. package/template/wall-e/runtime/decision-hooks.js +115 -0
  317. package/template/wall-e/runtime/devbox-gateway.js +82 -8
  318. package/template/wall-e/runtime/prompt-manifest.js +86 -0
  319. package/template/wall-e/runtime/tool-executor.js +269 -0
  320. package/template/wall-e/runtime/tool-result-envelope.js +138 -0
  321. package/template/wall-e/runtime/transcript-projection.js +60 -0
  322. package/template/wall-e/runtime/walle-runtime.js +224 -0
  323. package/template/wall-e/scripts/db-optimize/migrate.js +162 -0
  324. package/template/wall-e/scripts/db-optimize/recall-eval.js +117 -0
  325. package/template/wall-e/server.js +15 -0
  326. package/template/wall-e/session-files.js +9 -0
  327. package/template/wall-e/skills/_bundled/google-calendar/run.js +1 -1
  328. package/template/wall-e/skills/_bundled/gws-workspace/run.js +1 -1
  329. package/template/wall-e/skills/_bundled/slack-mentions/run.js +76 -6
  330. package/template/wall-e/skills/claude-code-reader.js +7 -3
  331. package/template/wall-e/skills/script-skill-runner.js +10 -0
  332. package/template/wall-e/skills/skill-planner.js +38 -0
  333. package/template/wall-e/tools/builtin-middleware.js +19 -9
  334. package/template/wall-e/tools/local-tools.js +1428 -16
  335. package/template/wall-e/tools/permission-checker.js +73 -5
  336. package/template/wall-e/tools/question-manager.js +117 -7
  337. package/template/wall-e/training/harvester.js +12 -28
  338. package/template/wall-e/training/replay.js +25 -80
  339. package/template/website/index.html +10 -10
  340. package/template/wall-e/eval/ab-test.js +0 -203
  341. package/template/wall-e/eval/agent-runner.js +0 -772
  342. package/template/wall-e/eval/agent-scorer.js +0 -461
  343. package/template/wall-e/eval/aggregator.js +0 -414
  344. package/template/wall-e/eval/allowed-test-commands.js +0 -34
  345. package/template/wall-e/eval/benchmark-generator.js +0 -113
  346. package/template/wall-e/eval/benchmarks/chat-eval.json +0 -1662
  347. package/template/wall-e/eval/benchmarks/chat.json +0 -82
  348. package/template/wall-e/eval/benchmarks/coding-agent-real.json +0 -1
  349. package/template/wall-e/eval/benchmarks/coding-agent.json +0 -1581
  350. package/template/wall-e/eval/benchmarks/coding.json +0 -122
  351. package/template/wall-e/eval/benchmarks/memory-retrieval.json +0 -234
  352. package/template/wall-e/eval/benchmarks/reasoning.json +0 -82
  353. package/template/wall-e/eval/benchmarks/swebench-lite-30.json +0 -212
  354. package/template/wall-e/eval/benchmarks.js +0 -669
  355. package/template/wall-e/eval/cc-replay.js +0 -719
  356. package/template/wall-e/eval/chat-eval.js +0 -525
  357. package/template/wall-e/eval/check-keys.js +0 -15
  358. package/template/wall-e/eval/check-providers.js +0 -42
  359. package/template/wall-e/eval/codex-cli-baseline.js +0 -669
  360. package/template/wall-e/eval/coding-agent-real.js +0 -570
  361. package/template/wall-e/eval/context-compactor.js +0 -251
  362. package/template/wall-e/eval/debug-agent003.js +0 -68
  363. package/template/wall-e/eval/diagnostics.js +0 -216
  364. package/template/wall-e/eval/eval-orchestrator.js +0 -642
  365. package/template/wall-e/eval/evaluate.js +0 -202
  366. package/template/wall-e/eval/evaluator.js +0 -373
  367. package/template/wall-e/eval/exporter.js +0 -212
  368. package/template/wall-e/eval/fixtures/express-basic/package.json +0 -9
  369. package/template/wall-e/eval/fixtures/express-basic/server.js +0 -115
  370. package/template/wall-e/eval/fixtures/express-basic/test.js +0 -83
  371. package/template/wall-e/eval/fixtures/express-buggy/package.json +0 -9
  372. package/template/wall-e/eval/fixtures/express-buggy/server.js +0 -113
  373. package/template/wall-e/eval/fixtures/express-buggy/test.js +0 -83
  374. package/template/wall-e/eval/fixtures/express-buggy-items/package.json +0 -9
  375. package/template/wall-e/eval/fixtures/express-buggy-items/server.js +0 -112
  376. package/template/wall-e/eval/fixtures/express-buggy-items/test.js +0 -83
  377. package/template/wall-e/eval/fixtures/express-buggy-search/package.json +0 -9
  378. package/template/wall-e/eval/fixtures/express-buggy-search/server.js +0 -121
  379. package/template/wall-e/eval/fixtures/express-buggy-search/test.js +0 -83
  380. package/template/wall-e/eval/fixtures/express-rename-data/data.js +0 -34
  381. package/template/wall-e/eval/fixtures/express-rename-data/package.json +0 -9
  382. package/template/wall-e/eval/fixtures/express-rename-data/server.js +0 -97
  383. package/template/wall-e/eval/fixtures/express-rename-data/test.js +0 -88
  384. package/template/wall-e/eval/fixtures/express-xss/package.json +0 -12
  385. package/template/wall-e/eval/fixtures/express-xss/server.js +0 -90
  386. package/template/wall-e/eval/fixtures/express-xss/test.js +0 -67
  387. package/template/wall-e/eval/fixtures/express-xss/views/profile.ejs +0 -9
  388. package/template/wall-e/eval/fixtures/fullstack-app/config/default.js +0 -9
  389. package/template/wall-e/eval/fixtures/fullstack-app/config/test.js +0 -13
  390. package/template/wall-e/eval/fixtures/fullstack-app/package.json +0 -11
  391. package/template/wall-e/eval/fixtures/fullstack-app/public/css/style.css +0 -137
  392. package/template/wall-e/eval/fixtures/fullstack-app/public/index.html +0 -46
  393. package/template/wall-e/eval/fixtures/fullstack-app/public/js/app.js +0 -121
  394. package/template/wall-e/eval/fixtures/fullstack-app/public/js/auth.js +0 -71
  395. package/template/wall-e/eval/fixtures/fullstack-app/public/js/items.js +0 -80
  396. package/template/wall-e/eval/fixtures/fullstack-app/public/js/users.js +0 -46
  397. package/template/wall-e/eval/fixtures/fullstack-app/public/login.html +0 -45
  398. package/template/wall-e/eval/fixtures/fullstack-app/public/register.html +0 -38
  399. package/template/wall-e/eval/fixtures/fullstack-app/scripts/migrate.js +0 -23
  400. package/template/wall-e/eval/fixtures/fullstack-app/scripts/seed.js +0 -46
  401. package/template/wall-e/eval/fixtures/fullstack-app/server/db.js +0 -99
  402. package/template/wall-e/eval/fixtures/fullstack-app/server/index.js +0 -94
  403. package/template/wall-e/eval/fixtures/fullstack-app/server/middleware/auth.js +0 -19
  404. package/template/wall-e/eval/fixtures/fullstack-app/server/middleware/logger.js +0 -19
  405. package/template/wall-e/eval/fixtures/fullstack-app/server/router.js +0 -50
  406. package/template/wall-e/eval/fixtures/fullstack-app/server/routes/auth.js +0 -69
  407. package/template/wall-e/eval/fixtures/fullstack-app/server/routes/health.js +0 -23
  408. package/template/wall-e/eval/fixtures/fullstack-app/server/routes/items.js +0 -88
  409. package/template/wall-e/eval/fixtures/fullstack-app/server/routes/users.js +0 -75
  410. package/template/wall-e/eval/fixtures/fullstack-app/server/test.js +0 -198
  411. package/template/wall-e/eval/fixtures/fullstack-app/server/utils/response.js +0 -34
  412. package/template/wall-e/eval/fixtures/fullstack-app/server/utils/validate.js +0 -26
  413. package/template/wall-e/eval/fixtures/fullstack-app/server.js +0 -8
  414. package/template/wall-e/eval/fixtures/fullstack-app/test.js +0 -12
  415. package/template/wall-e/eval/fixtures/monorepo-basic/package.json +0 -8
  416. package/template/wall-e/eval/fixtures/monorepo-basic/packages/api/data.js +0 -58
  417. package/template/wall-e/eval/fixtures/monorepo-basic/packages/api/middleware.js +0 -46
  418. package/template/wall-e/eval/fixtures/monorepo-basic/packages/api/package.json +0 -8
  419. package/template/wall-e/eval/fixtures/monorepo-basic/packages/api/routes.js +0 -64
  420. package/template/wall-e/eval/fixtures/monorepo-basic/packages/api/server.js +0 -56
  421. package/template/wall-e/eval/fixtures/monorepo-basic/packages/api/test.js +0 -116
  422. package/template/wall-e/eval/fixtures/monorepo-basic/packages/cli/commands.js +0 -61
  423. package/template/wall-e/eval/fixtures/monorepo-basic/packages/cli/index.js +0 -62
  424. package/template/wall-e/eval/fixtures/monorepo-basic/packages/cli/output.js +0 -43
  425. package/template/wall-e/eval/fixtures/monorepo-basic/packages/cli/package.json +0 -11
  426. package/template/wall-e/eval/fixtures/monorepo-basic/packages/cli/test.js +0 -44
  427. package/template/wall-e/eval/fixtures/monorepo-basic/packages/shared/formatters.js +0 -43
  428. package/template/wall-e/eval/fixtures/monorepo-basic/packages/shared/index.js +0 -12
  429. package/template/wall-e/eval/fixtures/monorepo-basic/packages/shared/package.json +0 -5
  430. package/template/wall-e/eval/fixtures/monorepo-basic/packages/shared/test.js +0 -55
  431. package/template/wall-e/eval/fixtures/monorepo-basic/packages/shared/validators.js +0 -29
  432. package/template/wall-e/eval/fixtures/monorepo-basic/test.js +0 -46
  433. package/template/wall-e/eval/fixtures/node-cli/index.js +0 -78
  434. package/template/wall-e/eval/fixtures/node-cli/package.json +0 -10
  435. package/template/wall-e/eval/fixtures/node-cli/test.js +0 -57
  436. package/template/wall-e/eval/fixtures/node-typed/package.json +0 -8
  437. package/template/wall-e/eval/fixtures/node-typed/src/handlers.js +0 -31
  438. package/template/wall-e/eval/fixtures/node-typed/src/utils.js +0 -33
  439. package/template/wall-e/eval/fixtures/node-typed/test.js +0 -36
  440. package/template/wall-e/eval/fixtures/python-flask/app.py +0 -14
  441. package/template/wall-e/eval/fixtures/python-flask/requirements.txt +0 -2
  442. package/template/wall-e/eval/fixtures/python-flask/test_app.py +0 -25
  443. package/template/wall-e/eval/fixtures/wall-e-subset/brain.js +0 -105
  444. package/template/wall-e/eval/fixtures/wall-e-subset/eval/aggregator.js +0 -101
  445. package/template/wall-e/eval/fixtures/wall-e-subset/eval/benchmarks/chat.json +0 -20
  446. package/template/wall-e/eval/fixtures/wall-e-subset/eval/benchmarks/coding.json +0 -32
  447. package/template/wall-e/eval/fixtures/wall-e-subset/eval/benchmarks.js +0 -64
  448. package/template/wall-e/eval/fixtures/wall-e-subset/eval/fixtures/simple-project/package.json +0 -6
  449. package/template/wall-e/eval/fixtures/wall-e-subset/eval/fixtures/simple-project/server.js +0 -31
  450. package/template/wall-e/eval/fixtures/wall-e-subset/eval/fixtures/simple-project/test.js +0 -18
  451. package/template/wall-e/eval/fixtures/wall-e-subset/eval/fixtures/simple-project/utils.js +0 -34
  452. package/template/wall-e/eval/fixtures/wall-e-subset/eval/runner.js +0 -104
  453. package/template/wall-e/eval/fixtures/wall-e-subset/eval/scorer.js +0 -73
  454. package/template/wall-e/eval/fixtures/wall-e-subset/eval/test.js +0 -134
  455. package/template/wall-e/eval/fixtures/wall-e-subset/llm/client.js +0 -99
  456. package/template/wall-e/eval/fixtures/wall-e-subset/llm/providers.js +0 -63
  457. package/template/wall-e/eval/fixtures/wall-e-subset/llm/test.js +0 -70
  458. package/template/wall-e/eval/fixtures/wall-e-subset/package.json +0 -10
  459. package/template/wall-e/eval/fixtures/wall-e-subset/test.js +0 -86
  460. package/template/wall-e/eval/harvester.js +0 -685
  461. package/template/wall-e/eval/head-to-head.js +0 -388
  462. package/template/wall-e/eval/humaneval-adapter.js +0 -321
  463. package/template/wall-e/eval/list-models.js +0 -31
  464. package/template/wall-e/eval/livecodebench-adapter.js +0 -291
  465. package/template/wall-e/eval/mail-integration.js +0 -443
  466. package/template/wall-e/eval/manifest.js +0 -186
  467. package/template/wall-e/eval/meta-harness/adapters/coding-agent.js +0 -57
  468. package/template/wall-e/eval/meta-harness/bootstrap-snapshot.js +0 -149
  469. package/template/wall-e/eval/meta-harness/candidate-store.js +0 -117
  470. package/template/wall-e/eval/meta-harness/cli.js +0 -86
  471. package/template/wall-e/eval/meta-harness/domain-spec.js +0 -154
  472. package/template/wall-e/eval/meta-harness/domains/coding-agent.domain.json +0 -84
  473. package/template/wall-e/eval/meta-harness/examples/env-bootstrap-candidate.js +0 -29
  474. package/template/wall-e/eval/meta-harness/experience-store.js +0 -174
  475. package/template/wall-e/eval/meta-harness/frontier.js +0 -96
  476. package/template/wall-e/eval/meta-harness/harness-interface.js +0 -90
  477. package/template/wall-e/eval/meta-harness/leakage-guard.js +0 -80
  478. package/template/wall-e/eval/meta-harness/optimizer.js +0 -207
  479. package/template/wall-e/eval/meta-harness/proposer-runner.js +0 -110
  480. package/template/wall-e/eval/meta-harness/reporting.js +0 -58
  481. package/template/wall-e/eval/meta-harness/telemetry.js +0 -27
  482. package/template/wall-e/eval/meta-harness/validation.js +0 -81
  483. package/template/wall-e/eval/promoter.js +0 -228
  484. package/template/wall-e/eval/provider-normalizer.js +0 -33
  485. package/template/wall-e/eval/replay.js +0 -395
  486. package/template/wall-e/eval/run-agent-benchmarks.js +0 -386
  487. package/template/wall-e/eval/run-codex-cli-baseline.js +0 -177
  488. package/template/wall-e/eval/run-coding-agent-real.js +0 -187
  489. package/template/wall-e/eval/run-eval.js +0 -435
  490. package/template/wall-e/eval/run-model-comparison.js +0 -142
  491. package/template/wall-e/eval/session-evaluator.js +0 -187
  492. package/template/wall-e/eval/session-miner.js +0 -207
  493. package/template/wall-e/eval/session-retrieval-benchmark.js +0 -150
  494. package/template/wall-e/eval/session-transcripts.js +0 -509
  495. package/template/wall-e/eval/shadow.js +0 -161
  496. package/template/wall-e/eval/swebench-adapter.js +0 -345
  497. package/template/wall-e/eval/swebench-docker.js +0 -192
  498. package/template/wall-e/eval/train.py +0 -320
  499. package/template/wall-e/eval/trainer.js +0 -232
  500. package/template/wall-e/eval/weekly-eval-loop.js +0 -241
@@ -1,345 +0,0 @@
1
- 'use strict';
2
-
3
- /**
4
- * SWE-bench Lite adapter for Wall-E evaluation pipeline.
5
- *
6
- * Downloads the SWE-bench Lite dataset (JSON) from the SWE-bench GitHub repository,
7
- * maps tasks to Wall-E prompt format, and delegates execution
8
- * to Docker containers (or returns placeholder results when Docker
9
- * is unavailable).
10
- *
11
- * Results are stored with suite = 'swebench-lite'.
12
- */
13
-
14
- const fs = require('fs');
15
- const path = require('path');
16
- const os = require('os');
17
- const https = require('https');
18
- const crypto = require('crypto');
19
- const { resolveModelName } = require('./agent-runner');
20
- const { decorateBenchmarkResult, DEFAULT_SCORER_VERSION } = require('./manifest');
21
-
22
- const CACHE_DIR = path.join(os.homedir(), '.walle', 'swebench-cache');
23
- const DATASET_URL =
24
- 'https://raw.githubusercontent.com/princeton-nlp/SWE-bench/main/swebench/collect/tasks/swe-bench-lite.json';
25
- const CACHE_FILE = path.join(CACHE_DIR, 'swe-bench-lite.json');
26
-
/**
 * Load the SWE-bench Lite dataset, preferring the on-disk cache.
 *
 * A cache file younger than 7 days is parsed and returned without any
 * network access. Otherwise the dataset is fetched from DATASET_URL
 * (following at most 5 redirects), validated as JSON, written to
 * CACHE_FILE, and returned.
 *
 * @returns {Promise<*>} The parsed dataset JSON.
 * @throws {Error} On a non-200 response, too many redirects, a network
 *   error, or a downloaded body that is not valid JSON.
 */
async function downloadDataset() {
  const MAX_CACHE_AGE_MS = 7 * 24 * 60 * 60 * 1000;
  const MAX_REDIRECTS = 5;

  fs.mkdirSync(CACHE_DIR, { recursive: true });

  // Serve from cache when it exists and is less than 7 days old.
  if (fs.existsSync(CACHE_FILE)) {
    const ageMs = Date.now() - fs.statSync(CACHE_FILE).mtimeMs;
    if (ageMs < MAX_CACHE_AGE_MS) {
      try {
        return JSON.parse(fs.readFileSync(CACHE_FILE, 'utf8'));
      } catch {
        // A truncated or corrupt cache file must not break every call until
        // it expires; fall through and fetch a fresh copy instead.
      }
    }
  }

  const body = await new Promise((resolve, reject) => {
    const request = (url, redirectCount = 0) => {
      if (redirectCount > MAX_REDIRECTS) return reject(new Error('Too many redirects'));

      try {
        https.get(url, { headers: { 'User-Agent': 'wall-e-eval/1.0' } }, (res) => {
          // Follow redirects
          if (res.statusCode >= 300 && res.statusCode < 400 && res.headers.location) {
            res.resume(); // discard the redirect body so the socket is released
            let nextUrl;
            try {
              // Location may be relative; resolve it against the current URL.
              nextUrl = new URL(res.headers.location, url).toString();
            } catch (err) {
              return reject(err);
            }
            return request(nextUrl, redirectCount + 1);
          }
          if (res.statusCode !== 200) {
            res.resume();
            return reject(new Error(`HTTP ${res.statusCode} fetching SWE-bench dataset`));
          }

          const chunks = [];
          res.on('data', (chunk) => chunks.push(chunk));
          res.on('end', () => resolve(Buffer.concat(chunks).toString('utf8')));
          res.on('error', reject);
        }).on('error', reject);
      } catch (err) {
        // https.get throws synchronously for unusable URLs (e.g. a redirect
        // to a non-https protocol); surface that as a rejection.
        reject(err);
      }
    };
    request(DATASET_URL);
  });

  // Parse BEFORE caching so an invalid body (HTML error page, truncated
  // download) is never persisted and replayed from the cache.
  const dataset = JSON.parse(body);
  fs.writeFileSync(CACHE_FILE, body, 'utf8');
  return dataset;
}
69
-
/**
 * Translate a raw SWE-bench record into the benchmark-entry shape used by
 * the rest of this module.
 *
 * The prompt always contains the repo name and the problem statement; a
 * "Hints:" section is appended only when `hints_text` is truthy.
 *
 * @param {Object} task - Raw record with instance_id, repo, base_commit,
 *   problem_statement, hints_text, test_patch and patch fields.
 * @returns {Object} `{ id, prompt, suite, repo, baseCommit, goldenPatch, testPatch }`
 */
function mapTaskToPrompt(task) {
  const {
    instance_id: id,
    repo,
    base_commit: baseCommit,
    problem_statement: problemStatement,
    hints_text: hints,
    patch: goldenPatch,
    test_patch: testPatch,
  } = task;

  const issueSection = `Fix the following issue in ${repo}:\n\n${problemStatement}`;
  const hintsSection = hints ? `\n\nHints:\n${hints}` : '';

  return {
    id,
    prompt: issueSection + hintsSection,
    suite: 'swebench-lite',
    repo,
    baseCommit,
    goldenPatch,
    testPatch,
  };
}
93
-
/**
 * Build the zero-score result object reported for a task that could not be
 * evaluated.
 *
 * @param {string} taskId - Task identifier.
 * @param {string} error - Human-readable failure reason.
 * @param {number} startTime - `Date.now()` value captured when the task began.
 * @returns {Object} Failure result with `success: false` and a composite score of 0.
 */
function failedSWEBenchResult(taskId, error, startTime) {
  return {
    taskId,
    suite: 'swebench-lite',
    success: false,
    error,
    score: { composite: 0, dimensions: {} },
    elapsedMs: Date.now() - startTime,
  };
}

/**
 * Run a single SWE-bench task.
 *
 * When `./swebench-docker` cannot be loaded or reports Docker as
 * unavailable, a failure result is returned immediately. Otherwise the
 * function builds a container for the repo at the base commit, applies the
 * test patch, runs the agent (bounded by `timeoutMs`), applies the agent's
 * patch, runs pytest in the container, and scores the outcome. The
 * container is always removed afterwards.
 *
 * Agent errors, agent timeouts, and an agent patch that does not apply are
 * reported as failure results. Errors from building the container or from
 * applying the test patch are not caught here and propagate to the caller.
 *
 * @param {Object} task - Raw SWE-bench record, or an entry already mapped by
 *   mapTaskToPrompt() (detected by a string `prompt` field).
 * @param {Object} options
 * @param {Function} options.runAgentLoop - Agent entry point, called as
 *   `runAgentLoop(prompt, { brain, provider, model, maxTurns, persistTranscript })`.
 * @param {Object} options.brain - Passed through to the agent.
 * @param {Object} options.provider - Passed through to the agent.
 * @param {string} options.model - Passed through to the agent.
 * @param {number} options.timeoutMs - Agent timeout in ms (default 900000 = 15min)
 * @returns {Promise<Object>} Result with { taskId, suite, success, score, elapsedMs, ... }
 */
async function runSWEBenchTask(task, options = {}) {
  const { runAgentLoop, brain, provider, model, timeoutMs = 900_000 } = options;

  const mapped = typeof task.prompt === 'string' ? task : mapTaskToPrompt(task);
  const startTime = Date.now();

  // Load the Docker helper once; a load failure counts as "no Docker".
  let docker = null;
  try {
    const candidate = require('./swebench-docker');
    if (await candidate.isDockerAvailable()) docker = candidate;
  } catch {
    docker = null;
  }

  if (!docker) {
    return failedSWEBenchResult(
      mapped.id,
      'Docker not available — SWE-bench tasks require Docker for sandboxed execution',
      startTime,
    );
  }

  let containerId = null;

  try {
    // 1. Build container with the repo at the base commit
    containerId = await docker.buildContainer(mapped.repo, mapped.baseCommit);

    // 2. Apply test patch so the new tests exist
    if (mapped.testPatch) {
      await docker.applyPatch(containerId, mapped.testPatch);
    }

    // 3. Run the agent, racing it against the timeout
    let agentResult = null;
    if (runAgentLoop) {
      let timeoutHandle;
      try {
        agentResult = await Promise.race([
          runAgentLoop(mapped.prompt, {
            brain,
            provider,
            model,
            maxTurns: 30,
            persistTranscript: false,
          }),
          new Promise((_, reject) => {
            timeoutHandle = setTimeout(() => reject(new Error('Agent timeout')), timeoutMs);
            if (typeof timeoutHandle.unref === 'function') timeoutHandle.unref();
          }),
        ]);
      } catch (err) {
        return failedSWEBenchResult(mapped.id, `Agent execution failed: ${err.message}`, startTime);
      } finally {
        // Clear the timer on every path, including agent rejection.
        clearTimeout(timeoutHandle);
      }
    }

    // 4. Apply agent's produced patch. A patch that does not apply is a
    //    failed attempt by the agent, not a harness crash.
    if (agentResult?.patch) {
      try {
        await docker.applyPatch(containerId, agentResult.patch);
      } catch (err) {
        return {
          ...failedSWEBenchResult(mapped.id, `Agent patch could not be applied: ${err.message}`, startTime),
          agentTurns: agentResult.turns || 0,
        };
      }
    }

    // 5. Run tests to determine pass/fail
    const testResult = await docker.runTests(containerId, 'python -m pytest --tb=short -q');
    const passed = testResult.exitCode === 0;

    // 6. Refine the binary score with ./agent-scorer when it can be loaded
    let score = { composite: passed ? 1.0 : 0.0, dimensions: {} };
    try {
      const { scoreAgentResult } = require('./agent-scorer');
      if (scoreAgentResult && agentResult) {
        score = scoreAgentResult({
          success: passed,
          actualToolCalls: agentResult.toolCalls || [],
          turnCount: agentResult.turns || 0,
          fileChanges: agentResult.fileChanges || [],
        }, {
          expectedToolCalls: ['read_file', 'edit_file', 'glob'],
          maxTurns: 30,
        });
        // Override correctness with binary pass/fail from tests
        if (score.dimensions) {
          score.dimensions.correctness = passed ? 1.0 : 0.0;
        }
      }
    } catch {
      // Scorer not available; binary score is fine
    }

    return {
      taskId: mapped.id,
      suite: 'swebench-lite',
      success: passed,
      score,
      testOutput: testResult.stdout?.slice(0, 2000),
      agentTurns: agentResult?.turns || 0,
      elapsedMs: Date.now() - startTime,
    };
  } finally {
    // Best-effort container removal on every exit path.
    if (containerId) {
      try { await docker.cleanup(containerId); } catch {}
    }
  }
}
229
-
/**
 * Read the curated task subset that ships next to this module at
 * `benchmarks/swebench-lite-30.json` and return it parsed.
 *
 * The file is read synchronously; a missing file or invalid JSON surfaces
 * as a rejection of the returned promise.
 *
 * @returns {Promise<*>} Parsed contents of the subset file.
 */
async function loadCuratedSubset() {
  const subsetPath = path.join(__dirname, 'benchmarks', 'swebench-lite-30.json');
  const rawJson = fs.readFileSync(subsetPath, 'utf8');
  return JSON.parse(rawJson);
}
238
-
/**
 * Run the curated SWE-bench subset sequentially and summarise the outcome.
 *
 * Each task is mapped with mapTaskToPrompt() and executed with
 * runSWEBenchTask(). A task that throws is recorded as a failed result so
 * one broken task cannot abort the run. When `brain.insertBenchmarkResult`
 * is a function, every result is persisted through it on a best-effort
 * basis.
 *
 * @param {Object} [options]
 * @param {Object} [options.brain] - Optional store exposing insertBenchmarkResult(row).
 * @param {Function} [options.runAgentLoop] - Forwarded to runSWEBenchTask.
 * @param {Object} [options.provider] - Forwarded to runSWEBenchTask.
 * @param {string} [options.providerType] - Provider label stored with results ('unknown' if omitted).
 * @param {string} [options.model] - Model id; normalised via resolveModelName().
 * @param {number} [options.maxTasks] - When truthy, only the first maxTasks tasks are run.
 * @param {{aborted: boolean}} [options.signal] - Checked before each task; stops the loop when aborted.
 * @param {number} [options.timeoutMs] - Forwarded to runSWEBenchTask.
 * @param {string} [options.runId] - Run identifier; a random UUID is generated when omitted.
 * @returns {Promise<Object>} `{ runId, suite, model, totalTasks, passed, passAt1, avgScore, totalCost, results }`.
 *   `passAt1` divides by the number of selected tasks, while `avgScore`
 *   divides by the number of results actually produced.
 */
async function runSWEBenchSuite(options = {}) {
  const {
    brain,
    runAgentLoop,
    provider,
    providerType,
    model,
    maxTasks,
    signal,
    timeoutMs,
    runId: providedRunId,
  } = options;

  const allTasks = await loadCuratedSubset();
  const tasks = maxTasks ? allTasks.slice(0, maxTasks) : allTasks;
  const runId = providedRunId || crypto.randomUUID();
  const results = [];
  let totalPassed = 0;

  for (const rawTask of tasks) {
    if (signal?.aborted) break;

    const mapped = mapTaskToPrompt(rawTask);
    const taskStartedAt = Date.now();

    let result;
    try {
      result = await runSWEBenchTask(mapped, {
        brain,
        runAgentLoop,
        provider,
        model,
        timeoutMs,
      });
    } catch (err) {
      // Infrastructure failures (e.g. container build) must not discard the
      // results already collected; record this task as failed and move on.
      result = {
        taskId: mapped.id,
        suite: 'swebench-lite',
        success: false,
        error: `Task execution failed: ${err?.message ?? err}`,
        score: { composite: 0, dimensions: {} },
        elapsedMs: Date.now() - taskStartedAt,
      };
    }
    results.push(result);
    if (result.success) totalPassed++;

    if (brain && typeof brain.insertBenchmarkResult === 'function') {
      try {
        const scoringMethod = 'swebench-docker-tests';
        const datasetVersion = 'swebench-lite:curated-30';
        const difficulty = rawTask.difficulty || 'medium';
        const providerName = providerType || 'unknown';
        const modelName = resolveModelName(model);
        const trusted = !result.error;
        const runConfig = { maxTasks, timeoutMs, scoringMethod };

        brain.insertBenchmarkResult(decorateBenchmarkResult({
          runId,
          suite: 'swebench-lite',
          promptId: mapped.id,
          taskType: 'coding-agent',
          difficulty,
          provider: providerName,
          model: modelName,
          prompt: mapped.prompt,
          response: result.testOutput || '',
          traitScore: null,
          compositeScore: result.score?.composite || 0,
          latencyMs: result.elapsedMs || null,
          error: result.error || null,
          testsBefore: null,
          testsAfter: result.success ? 1 : 0,
          totalTests: result.error ? 0 : 1,
          dimensionsJson: JSON.stringify(result.score?.dimensions || {}),
          modelMetadataJson: JSON.stringify({
            repo: mapped.repo,
            baseCommit: mapped.baseCommit,
            agentTurns: result.agentTurns || 0,
            testOutput: result.testOutput || null,
          }),
          datasetVersion,
          scorerVersion: DEFAULT_SCORER_VERSION,
          scoringMethod,
          trusted,
          runConfig,
        }, {
          suite: 'swebench-lite',
          benchmark: {
            id: mapped.id,
            prompt: mapped.prompt,
            taskType: 'coding-agent',
            difficulty,
            datasetVersion,
          },
          runId,
          provider: providerName,
          model: modelName,
          scorerVersion: DEFAULT_SCORER_VERSION,
          scoringMethod,
          trusted,
          runConfig: { ...runConfig },
        }));
      } catch (err) {
        // Persistence is best-effort, but a silent failure hides lost data.
        console.warn(`[swebench] failed to persist result for ${mapped.id}: ${err?.message ?? err}`);
      }
    }
  }

  return {
    runId,
    suite: 'swebench-lite',
    model: resolveModelName(model),
    totalTasks: tasks.length,
    passed: totalPassed,
    passAt1: tasks.length > 0 ? totalPassed / tasks.length : 0,
    avgScore: results.reduce((s, r) => s + (r.score?.composite || 0), 0) / Math.max(results.length, 1),
    totalCost: results.reduce((s, r) => s + (r.costDollars || 0), 0),
    results,
  };
}
337
-
338
- module.exports = {
339
- downloadDataset,
340
- mapTaskToPrompt,
341
- runSWEBenchTask,
342
- runSWEBenchSuite,
343
- loadCuratedSubset,
344
- CACHE_DIR,
345
- };
@@ -1,192 +0,0 @@
1
- 'use strict';
2
-
3
- /**
4
- * Docker container lifecycle management for SWE-bench tasks.
5
- *
6
- * Each task gets an isolated container with the target repo checked
7
- * out at the specified base commit. The agent's patch is applied
8
- * inside the container, then tests are run to determine pass/fail.
9
- */
10
-
11
- const { execFile } = require('child_process');
12
- const { promisify } = require('util');
13
- const crypto = require('crypto');
14
- const fs = require('fs');
15
- const os = require('os');
16
- const path = require('path');
17
- const execFileAsync = promisify(execFile);
18
-
19
- // Strict validation patterns for inputs interpolated into shell commands
20
- const REPO_PATTERN = /^[\w.-]+\/[\w.-]+$/;
21
- const COMMIT_PATTERN = /^[0-9a-f]{7,40}$/i;
22
-
23
- const DOCKER_TIMEOUT = 5000;
24
- const DEFAULT_RUN_TIMEOUT = 900_000; // 15 minutes
25
-
/**
 * Probe for a usable Docker installation by running `docker info` with a
 * DOCKER_TIMEOUT limit.
 *
 * @returns {Promise<boolean>} true when the command succeeds, false on any
 *   failure.
 */
async function isDockerAvailable() {
  let daemonReachable = true;
  try {
    await execFileAsync('docker', ['info'], { timeout: DOCKER_TIMEOUT });
  } catch {
    daemonReachable = false;
  }
  return daemonReachable;
}
37
-
/**
 * Start a container for the given repo at the given commit and wait until
 * its setup script has finished.
 *
 * The container is created from python:3.11-slim and runs a setup script
 * that installs git, clones the repo into /workspace, checks out the
 * commit, and installs dependencies from requirements.txt / setup.py /
 * pyproject.toml when present. The script creates a marker file as its
 * last step; readiness is detected by polling for that marker, so callers
 * never see a container that is still cloning or installing.
 *
 * @param {string} repo - GitHub repo in "owner/name" format (e.g. "django/django")
 * @param {string} baseCommit - Git commit SHA to check out (7-40 hex chars)
 * @returns {Promise<string>} Container ID
 * @throws {Error} When repo or baseCommit fail validation, when `docker run`
 *   fails, when the container exits before setup completes, or when setup
 *   does not complete within the polling budget.
 */
async function buildContainer(repo, baseCommit) {
  // Validate inputs to prevent shell injection: both values are interpolated
  // into the bash setup script below.
  if (!REPO_PATTERN.test(repo)) {
    throw new Error(`Invalid repo format: ${repo} (expected owner/name)`);
  }
  if (!COMMIT_PATTERN.test(baseCommit)) {
    throw new Error(`Invalid commit SHA: ${baseCommit}`);
  }

  const READY_MARKER = '/tmp/.swebench-ready';
  const POLL_ATTEMPTS = 60;
  const POLL_INTERVAL_MS = 5000;

  const containerName = `swebench-${crypto.randomUUID().slice(0, 8)}`;

  const setupScript = [
    'apt-get update -qq && apt-get install -y -qq git > /dev/null 2>&1',
    `git clone --quiet https://github.com/${repo}.git /workspace`,
    `cd /workspace && git checkout ${baseCommit}`,
    'cd /workspace && if [ -f requirements.txt ]; then pip install -q -r requirements.txt; fi',
    'cd /workspace && if [ -f setup.py ]; then pip install -q -e .; fi',
    'cd /workspace && if [ -f pyproject.toml ]; then pip install -q -e . 2>/dev/null || true; fi',
    // Written last: its presence means every step above succeeded.
    `touch ${READY_MARKER}`,
  ].join(' && ');

  // `-d` detaches, so this call returns once the container has started; the
  // setup script keeps running inside it. The timeout therefore bounds image
  // pull + container start, not clone + install.
  const { stdout: containerId } = await execFileAsync('docker', [
    'run', '-d',
    '--name', containerName,
    '--workdir', '/workspace',
    'python:3.11-slim',
    'bash', '-c', `${setupScript} && tail -f /dev/null`,
  ], { timeout: 300_000 });

  const id = containerId.trim();

  let ready = false;
  let exited = false;
  for (let attempt = 0; attempt < POLL_ATTEMPTS; attempt++) {
    try {
      await execFileAsync('docker', [
        'exec', id, 'test', '-f', READY_MARKER,
      ], { timeout: 5000 });
      ready = true;
      break;
    } catch {
      // Marker not there yet (or exec failed); decide below whether to keep waiting.
    }

    // A failed setup step ends the `&&` chain and the container exits. Stop
    // polling as soon as Docker positively reports it is no longer running.
    try {
      const { stdout } = await execFileAsync('docker', [
        'inspect', '--format', '{{.State.Running}}', id,
      ], { timeout: 5000 });
      if (stdout.trim() === 'false') {
        exited = true;
        break;
      }
    } catch {
      // Could not determine the state; keep polling rather than guess.
    }

    await new Promise((resolve) => setTimeout(resolve, POLL_INTERVAL_MS));
  }

  if (!ready) {
    await cleanup(id).catch(() => {});
    throw new Error(exited
      ? `Container setup failed for ${repo}@${baseCommit}: container exited before setup completed`
      : `Container setup timed out for ${repo}@${baseCommit}`);
  }

  return id;
}
101
-
/**
 * Execute a shell command inside the container via `docker exec ... bash -c`
 * and report its output. This function does not throw for command
 * failures: any error from the exec call is converted into a result object.
 *
 * @param {string} containerId
 * @param {string} command - Shell command to run
 * @param {Object} opts
 * @param {number} opts.timeoutMs - Timeout in ms (default 15min)
 * @returns {Promise<{ stdout: string, stderr: string, exitCode: number }>}
 *   `exitCode` is 0 on success, the process exit code when the command ran
 *   and failed, and 1 for every other failure (spawn error, timeout kill,
 *   output larger than the 10 MiB buffer).
 */
async function runInContainer(containerId, command, { timeoutMs = DEFAULT_RUN_TIMEOUT } = {}) {
  try {
    const { stdout, stderr } = await execFileAsync('docker', [
      'exec', containerId, 'bash', '-c', command,
    ], { timeout: timeoutMs, maxBuffer: 10 * 1024 * 1024 });
    return { stdout, stderr, exitCode: 0 };
  } catch (err) {
    // err.code is only a number when the process actually exited; spawn and
    // buffer errors carry string codes, and a killed process has none. Keep
    // the documented numeric contract by mapping all of those to 1.
    const exitCode = typeof err.code === 'number' && err.code !== 0 ? err.code : 1;
    return {
      stdout: err.stdout || '',
      stderr: err.stderr || err.message,
      exitCode,
    };
  }
}
125
-
/**
 * Apply a patch to /workspace inside the container.
 *
 * The patch text is written to a temporary file on the host and copied in
 * with `docker cp` as /tmp/agent.patch, so the patch content never passes
 * through a shell command line. The host file is removed whether or not
 * the copy succeeds. `git apply` is tried first, then `git apply --3way`.
 *
 * @param {string} containerId
 * @param {string} patch - Unified diff / git patch content
 * @returns {Promise<void>}
 * @throws {Error} When both apply attempts exit non-zero; the message holds
 *   the first 500 characters of the second attempt's stderr.
 */
async function applyPatch(containerId, patch) {
  const uniqueSuffix = crypto.randomUUID().slice(0, 8);
  const hostPatchPath = path.join(os.tmpdir(), `swebench-patch-${uniqueSuffix}.patch`);

  try {
    fs.writeFileSync(hostPatchPath, patch);
    const copyArgs = ['cp', hostPatchPath, `${containerId}:/tmp/agent.patch`];
    await execFileAsync('docker', copyArgs, { timeout: 15_000 });
  } finally {
    try { fs.unlinkSync(hostPatchPath); } catch {}
  }

  const plainAttempt = await runInContainer(
    containerId,
    'cd /workspace && git apply /tmp/agent.patch',
    { timeoutMs: 30_000 },
  );
  if (plainAttempt.exitCode === 0) return;

  // Plain apply failed; retry with a three-way merge before giving up.
  const threeWayAttempt = await runInContainer(
    containerId,
    'cd /workspace && git apply --3way /tmp/agent.patch',
    { timeoutMs: 30_000 },
  );
  if (threeWayAttempt.exitCode === 0) return;

  throw new Error(`Failed to apply patch: ${threeWayAttempt.stderr.slice(0, 500)}`);
}
158
-
/**
 * Run a test command from /workspace inside the container, using the
 * default run timeout.
 *
 * @param {string} containerId
 * @param {string} testCommand - e.g. "python -m pytest --tb=short -q"
 * @returns {Promise<{ stdout: string, stderr: string, exitCode: number }>}
 *   Whatever runInContainer() reports for the command.
 */
async function runTests(containerId, testCommand) {
  const shellCommand = `cd /workspace && ${testCommand}`;
  const execOptions = { timeoutMs: DEFAULT_RUN_TIMEOUT };
  return runInContainer(containerId, shellCommand, execOptions);
}
171
-
/**
 * Force-remove the container with `docker rm -f`. Failures are ignored, so
 * the returned promise always resolves.
 *
 * @param {string} containerId
 * @returns {Promise<void>}
 */
async function cleanup(containerId) {
  const removeArgs = ['rm', '-f', containerId];
  try {
    await execFileAsync('docker', removeArgs, { timeout: 15_000 });
  } catch {
    // Removal is best-effort; a container that is already gone is fine.
  }
}
184
-
185
- module.exports = {
186
- isDockerAvailable,
187
- buildContainer,
188
- runInContainer,
189
- applyPatch,
190
- runTests,
191
- cleanup,
192
- };