create-walle 0.9.21 → 0.9.23

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (500) hide show
  1. package/README.md +27 -5
  2. package/package.json +2 -2
  3. package/template/CLAUDE.md +2 -2
  4. package/template/LICENSE +1 -1
  5. package/template/bin/ctm-dev-cleanup.js +24 -3
  6. package/template/bin/ctm-launch.sh +13 -0
  7. package/template/bin/dev.sh +156 -18
  8. package/template/bin/node-bin.sh +84 -0
  9. package/template/bin/pin-node.sh +51 -0
  10. package/template/claude-task-manager/api-prompts.js +1203 -182
  11. package/template/claude-task-manager/api-reviews.js +109 -15
  12. package/template/claude-task-manager/approval-agent.js +1360 -280
  13. package/template/claude-task-manager/bin/restart-ctm.sh +64 -23
  14. package/template/claude-task-manager/bin/storage-migration-supervisor.js +338 -0
  15. package/template/claude-task-manager/db.js +4417 -295
  16. package/template/claude-task-manager/docs/app-update-refresh-protocol.md +69 -0
  17. package/template/claude-task-manager/docs/approval-ai-refinement.md +138 -0
  18. package/template/claude-task-manager/docs/approval-rescue-loop.md +74 -0
  19. package/template/claude-task-manager/docs/codex-operational-warning-health.md +107 -0
  20. package/template/claude-task-manager/docs/codex-resume-state-guard-design.md +17 -12
  21. package/template/claude-task-manager/docs/codex-terminal-render-controller-handoff.md +311 -0
  22. package/template/claude-task-manager/docs/coding-agent-hooks-architecture.md +418 -0
  23. package/template/claude-task-manager/docs/conversation-import-freshness.md +20 -0
  24. package/template/claude-task-manager/docs/google-workspace-auth-health.md +77 -0
  25. package/template/claude-task-manager/docs/image-paste-ux.md +13 -0
  26. package/template/claude-task-manager/docs/ipad-web-preview.md +88 -0
  27. package/template/claude-task-manager/docs/main-loop-offload-architecture.md +66 -0
  28. package/template/claude-task-manager/docs/microsoft-dev-tunnel-phone-access-design.md +274 -519
  29. package/template/claude-task-manager/docs/mobile-live-streaming.md +27 -5
  30. package/template/claude-task-manager/docs/mobile-remote-submission-lifecycle.md +69 -0
  31. package/template/claude-task-manager/docs/phone-access-design.md +53 -15
  32. package/template/claude-task-manager/docs/phone-passkey-identity.md +122 -0
  33. package/template/claude-task-manager/docs/phone-setup.md +3 -0
  34. package/template/claude-task-manager/docs/prompt-editing-tree-design.md +25 -1
  35. package/template/claude-task-manager/docs/remote-desktop-access-design.md +268 -0
  36. package/template/claude-task-manager/docs/restart-lifecycle-architecture.md +95 -0
  37. package/template/claude-task-manager/docs/runtime-work-control-plane.md +53 -0
  38. package/template/claude-task-manager/docs/session-interactive-wait-surfaces.md +38 -0
  39. package/template/claude-task-manager/docs/session-needs-you-dismissal.md +84 -0
  40. package/template/claude-task-manager/docs/session-render-state-management-design.md +91 -3
  41. package/template/claude-task-manager/docs/session-standup-command-center-design.md +25 -1
  42. package/template/claude-task-manager/docs/session-title-authority.md +32 -0
  43. package/template/claude-task-manager/docs/session-workspace-binding.md +33 -0
  44. package/template/claude-task-manager/docs/skill-intent-resolution-design.md +72 -0
  45. package/template/claude-task-manager/docs/walle-mcp-supervisor-health.md +86 -0
  46. package/template/claude-task-manager/docs/walle-relay-phone-access-design.md +24 -15
  47. package/template/claude-task-manager/docs/walle-session-history-hydration.md +114 -0
  48. package/template/claude-task-manager/docs/walle-session-input-queue.md +104 -0
  49. package/template/claude-task-manager/docs/walle-session-model-catalog.md +90 -0
  50. package/template/claude-task-manager/docs/walle-session-model-preferences.md +15 -6
  51. package/template/claude-task-manager/git-utils.js +897 -27
  52. package/template/claude-task-manager/lib/agent-capabilities.js +33 -0
  53. package/template/claude-task-manager/lib/agent-cli-cache.js +37 -7
  54. package/template/claude-task-manager/lib/agent-hooks-installer.js +26 -2
  55. package/template/claude-task-manager/lib/agent-presets.js +17 -1
  56. package/template/claude-task-manager/lib/all-sessions-query.js +108 -0
  57. package/template/claude-task-manager/lib/approval-ai-refinement.js +488 -0
  58. package/template/claude-task-manager/lib/approval-self-adapt.js +168 -0
  59. package/template/claude-task-manager/lib/async-semaphore.js +44 -0
  60. package/template/claude-task-manager/lib/auth-context.js +5 -0
  61. package/template/claude-task-manager/lib/auth-rate-limit.js +47 -4
  62. package/template/claude-task-manager/lib/auth-rules.js +29 -2
  63. package/template/claude-task-manager/lib/auto-approval-verifier.js +129 -16
  64. package/template/claude-task-manager/lib/background-llm.js +144 -17
  65. package/template/claude-task-manager/lib/branch-inventory.js +212 -0
  66. package/template/claude-task-manager/lib/claude-desktop-sessions.js +15 -3
  67. package/template/claude-task-manager/lib/coalesce-sync-frames.js +151 -0
  68. package/template/claude-task-manager/lib/codex-launch-health.js +762 -0
  69. package/template/claude-task-manager/lib/codex-transcript-pager.js +51 -0
  70. package/template/claude-task-manager/lib/codex-zst.js +124 -0
  71. package/template/claude-task-manager/lib/coding-agent-models.js +233 -30
  72. package/template/claude-task-manager/lib/connection-health.js +232 -0
  73. package/template/claude-task-manager/lib/conversation-blob-parser.js +42 -0
  74. package/template/claude-task-manager/lib/conversation-tail-merge.js +89 -26
  75. package/template/claude-task-manager/lib/ctm-session-context-api.js +39 -10
  76. package/template/claude-task-manager/lib/cursor-conversation-store.js +354 -0
  77. package/template/claude-task-manager/lib/db-owner-worker-client.js +315 -0
  78. package/template/claude-task-manager/lib/document-review.js +141 -6
  79. package/template/claude-task-manager/lib/escalation-review.js +152 -0
  80. package/template/claude-task-manager/lib/graceful-shutdown.js +159 -0
  81. package/template/claude-task-manager/lib/headless-term-service.js +678 -0
  82. package/template/claude-task-manager/lib/heavy-worker-fallback.js +38 -0
  83. package/template/claude-task-manager/lib/jsonl-conversation-parser.js +542 -0
  84. package/template/claude-task-manager/lib/jsonl-range-reader.js +112 -0
  85. package/template/claude-task-manager/lib/main-db-census.js +216 -0
  86. package/template/claude-task-manager/lib/message-pagination.js +106 -4
  87. package/template/claude-task-manager/lib/microsoft-dev-tunnel-setup.js +750 -26
  88. package/template/claude-task-manager/lib/mobile-auth-api.js +274 -7
  89. package/template/claude-task-manager/lib/mobile-auth-store.js +592 -10
  90. package/template/claude-task-manager/lib/mobile-notification-dispatcher.js +15 -0
  91. package/template/claude-task-manager/lib/model-overview-brain-fallback.js +311 -0
  92. package/template/claude-task-manager/lib/model-overview-cache.js +141 -0
  93. package/template/claude-task-manager/lib/models-health-routing-notice.js +126 -0
  94. package/template/claude-task-manager/lib/node-pin-guard.js +93 -0
  95. package/template/claude-task-manager/lib/perf-tracker.js +242 -6
  96. package/template/claude-task-manager/lib/permission-match.js +76 -0
  97. package/template/claude-task-manager/lib/permission-sync.js +133 -20
  98. package/template/claude-task-manager/lib/process-title.js +35 -0
  99. package/template/claude-task-manager/lib/prompt-executions-query.js +25 -0
  100. package/template/claude-task-manager/lib/prompt-index-disk-cache.js +44 -0
  101. package/template/claude-task-manager/lib/prompt-intent.js +132 -0
  102. package/template/claude-task-manager/lib/provider-user-context.js +34 -0
  103. package/template/claude-task-manager/lib/read-pool-client.js +313 -0
  104. package/template/claude-task-manager/lib/readpool-breaker.js +31 -0
  105. package/template/claude-task-manager/lib/recent-sessions-breaker.js +12 -0
  106. package/template/claude-task-manager/lib/remote-feedback-client.js +72 -0
  107. package/template/claude-task-manager/lib/remote-relay-protocol.js +37 -4
  108. package/template/claude-task-manager/lib/remote-relay-store.js +159 -0
  109. package/template/claude-task-manager/lib/remote-submission-observer.js +278 -0
  110. package/template/claude-task-manager/lib/restart-guard.js +109 -0
  111. package/template/claude-task-manager/lib/restore-interruption-detector.js +439 -0
  112. package/template/claude-task-manager/lib/restore-policy.js +13 -0
  113. package/template/claude-task-manager/lib/restore-resume-batch.js +74 -0
  114. package/template/claude-task-manager/lib/restore-runtime.js +68 -0
  115. package/template/claude-task-manager/lib/restore-storm.js +34 -0
  116. package/template/claude-task-manager/lib/resume-cwd.js +36 -0
  117. package/template/claude-task-manager/lib/resume-preflight.js +313 -0
  118. package/template/claude-task-manager/lib/runtime-work-registry.js +444 -0
  119. package/template/claude-task-manager/lib/sanitize-openai-auth.js +31 -0
  120. package/template/claude-task-manager/lib/scheduler.js +21 -1
  121. package/template/claude-task-manager/lib/scrollback-snapshot-store.js +159 -0
  122. package/template/claude-task-manager/lib/serial-task-queue.js +64 -0
  123. package/template/claude-task-manager/lib/server-listeners.js +239 -0
  124. package/template/claude-task-manager/lib/session-capture.js +42 -7
  125. package/template/claude-task-manager/lib/session-content-backfill.js +131 -0
  126. package/template/claude-task-manager/lib/session-history.js +388 -43
  127. package/template/claude-task-manager/lib/session-host-manager.js +287 -0
  128. package/template/claude-task-manager/lib/session-image-refs.js +209 -0
  129. package/template/claude-task-manager/lib/session-jobs.js +399 -59
  130. package/template/claude-task-manager/lib/session-prompt-index.js +137 -0
  131. package/template/claude-task-manager/lib/session-restore.js +53 -0
  132. package/template/claude-task-manager/lib/session-standup.js +123 -23
  133. package/template/claude-task-manager/lib/session-state-bus.js +14 -0
  134. package/template/claude-task-manager/lib/session-stream.js +64 -16
  135. package/template/claude-task-manager/lib/session-timeline-summary.js +260 -0
  136. package/template/claude-task-manager/lib/session-token-usage.js +494 -0
  137. package/template/claude-task-manager/lib/session-workspace-binding.js +356 -0
  138. package/template/claude-task-manager/lib/setup-network-config.js +9 -0
  139. package/template/claude-task-manager/lib/size-cap.js +45 -0
  140. package/template/claude-task-manager/lib/size-cap.test.js +62 -0
  141. package/template/claude-task-manager/lib/skill-autocomplete.js +180 -1
  142. package/template/claude-task-manager/lib/skill-intent-resolver.js +304 -0
  143. package/template/claude-task-manager/lib/sqlite-driver.js +19 -3
  144. package/template/claude-task-manager/lib/standup-attention.js +7 -3
  145. package/template/claude-task-manager/lib/status-authority.js +39 -0
  146. package/template/claude-task-manager/lib/status-hooks.js +4 -0
  147. package/template/claude-task-manager/lib/storage-migration.js +235 -0
  148. package/template/claude-task-manager/lib/structured-capture.js +298 -0
  149. package/template/claude-task-manager/lib/sync-io-census.js +163 -0
  150. package/template/claude-task-manager/lib/tailscale-setup.js +6 -0
  151. package/template/claude-task-manager/lib/terminal-activity-evidence.js +33 -0
  152. package/template/claude-task-manager/lib/terminal-choice.js +364 -0
  153. package/template/claude-task-manager/lib/terminal-control-sanitize.js +17 -0
  154. package/template/claude-task-manager/lib/terminal-fingerprint.js +48 -0
  155. package/template/claude-task-manager/lib/terminal-output-flush.js +84 -0
  156. package/template/claude-task-manager/lib/timeline-order.js +122 -0
  157. package/template/claude-task-manager/lib/transcript-store.js +348 -43
  158. package/template/claude-task-manager/lib/transport-security.js +84 -1
  159. package/template/claude-task-manager/lib/wait-state.js +184 -0
  160. package/template/claude-task-manager/lib/walle-client.js +47 -5
  161. package/template/claude-task-manager/lib/walle-ctm-history.js +564 -4
  162. package/template/claude-task-manager/lib/walle-external-actions.js +135 -16
  163. package/template/claude-task-manager/lib/walle-history-hydration.js +46 -0
  164. package/template/claude-task-manager/lib/walle-native-health.js +403 -0
  165. package/template/claude-task-manager/lib/walle-repair.js +701 -0
  166. package/template/claude-task-manager/lib/walle-session-cache.js +109 -0
  167. package/template/claude-task-manager/lib/walle-session-context.js +57 -21
  168. package/template/claude-task-manager/lib/walle-session-model-catalog.js +34 -0
  169. package/template/claude-task-manager/lib/walle-supervisor.js +539 -63
  170. package/template/claude-task-manager/lib/walle-transcript.js +52 -0
  171. package/template/claude-task-manager/lib/worktree-active-sync.js +11 -7
  172. package/template/claude-task-manager/lib/worktree-cwd.js +32 -1
  173. package/template/claude-task-manager/package.json +1 -1
  174. package/template/claude-task-manager/prompt-harvest.js +89 -66
  175. package/template/claude-task-manager/providers/claude-code.js +51 -3
  176. package/template/claude-task-manager/providers/cursor.js +140 -45
  177. package/template/claude-task-manager/public/css/reviews.css +551 -61
  178. package/template/claude-task-manager/public/css/setup.css +191 -0
  179. package/template/claude-task-manager/public/css/walle-session.css +865 -10
  180. package/template/claude-task-manager/public/css/walle.css +154 -0
  181. package/template/claude-task-manager/public/designs/ai-providers-consolidation-v2.html +830 -0
  182. package/template/claude-task-manager/public/index.html +18516 -2058
  183. package/template/claude-task-manager/public/ipad.html +363 -0
  184. package/template/claude-task-manager/public/js/document-review-links.js +301 -0
  185. package/template/claude-task-manager/public/js/image-normalize.js +69 -36
  186. package/template/claude-task-manager/public/js/message-renderer.js +1265 -77
  187. package/template/claude-task-manager/public/js/prompts.js +66 -29
  188. package/template/claude-task-manager/public/js/reviews.js +901 -133
  189. package/template/claude-task-manager/public/js/session-activity-utils.js +11 -1
  190. package/template/claude-task-manager/public/js/session-search-utils.js +94 -10
  191. package/template/claude-task-manager/public/js/session-status-precedence.js +23 -5
  192. package/template/claude-task-manager/public/js/setup.js +1273 -176
  193. package/template/claude-task-manager/public/js/stream-view.js +691 -73
  194. package/template/claude-task-manager/public/js/terminal-reconciler.js +210 -0
  195. package/template/claude-task-manager/public/js/walle-session.js +2455 -158
  196. package/template/claude-task-manager/public/js/walle.js +455 -28
  197. package/template/claude-task-manager/public/m/app.css +2909 -262
  198. package/template/claude-task-manager/public/m/app.js +6601 -398
  199. package/template/claude-task-manager/public/m/claim.html +224 -17
  200. package/template/claude-task-manager/public/m/index.html +117 -21
  201. package/template/claude-task-manager/public/m/sw.js +3 -1
  202. package/template/claude-task-manager/public/manifest.json +2 -2
  203. package/template/claude-task-manager/public/prompts.html +30 -14
  204. package/template/claude-task-manager/queue-engine.js +507 -28
  205. package/template/claude-task-manager/scripts/repair-claude-session-images.js +27 -8
  206. package/template/claude-task-manager/server.js +14341 -2197
  207. package/template/claude-task-manager/session-integrity.js +160 -18
  208. package/template/claude-task-manager/session-search-ranking.js +1 -0
  209. package/template/claude-task-manager/session-utils.js +25 -5
  210. package/template/claude-task-manager/workers/approval-blocklist.js +96 -6
  211. package/template/claude-task-manager/workers/approval-widget-validator.js +14 -8
  212. package/template/claude-task-manager/workers/conversation-import-worker.js +11 -50
  213. package/template/claude-task-manager/workers/db-owner-worker.js +386 -0
  214. package/template/claude-task-manager/workers/harvest-worker.js +9 -55
  215. package/template/claude-task-manager/workers/headless-term-worker.js +9 -530
  216. package/template/claude-task-manager/workers/read-pool-worker.js +387 -0
  217. package/template/claude-task-manager/workers/scrollback-worker.js +11 -72
  218. package/template/claude-task-manager/workers/session-host-process.js +146 -0
  219. package/template/claude-task-manager/workers/session-integrity-worker.js +10 -54
  220. package/template/claude-task-manager/workers/state-detectors/base.js +18 -1
  221. package/template/claude-task-manager/workers/state-detectors/claude-code.js +182 -9
  222. package/template/claude-task-manager/workers/state-detectors/codex.js +150 -2
  223. package/template/claude-task-manager/workers/state-detectors/cursor.js +127 -0
  224. package/template/claude-task-manager/workers/state-detectors/gemini.js +21 -0
  225. package/template/claude-task-manager/workers/state-detectors/index.js +29 -0
  226. package/template/claude-task-manager/workers/state-detectors/opencode.js +103 -0
  227. package/template/docs/design/markdown-review-pane.md +206 -0
  228. package/template/docs/designs/2026-05-17-portkey-gateway-provider-ux.md +129 -38
  229. package/template/docs/designs/2026-05-20-mobile-worktree-finish-command.md +27 -0
  230. package/template/docs/designs/2026-05-22-ai-configuration-consolidation.md +248 -0
  231. package/template/docs/designs/ai-configuration-consolidation-mock.html +812 -0
  232. package/template/docs/private-memory-and-pii-policy.md +69 -0
  233. package/template/package.json +2 -1
  234. package/template/scripts/check-private-data.js +201 -0
  235. package/template/shared/sqlite-owner-guard.js +30 -0
  236. package/template/shared/sqlite-owner-write-queue.js +225 -0
  237. package/template/shared/sqlite-storage-policy.js +111 -0
  238. package/template/shared/sqlite-write-lock.js +428 -0
  239. package/template/wall-e/agent-runners/claude-code.js +5 -0
  240. package/template/wall-e/agent.js +166 -22
  241. package/template/wall-e/api-walle.js +524 -70
  242. package/template/wall-e/auth/provider-flows.js +11 -1
  243. package/template/wall-e/bin/walle-mcp-stdio.js +341 -17
  244. package/template/wall-e/brain.js +1614 -141
  245. package/template/wall-e/chat/attachment-blocks.js +96 -0
  246. package/template/wall-e/chat/attachments.js +2 -1
  247. package/template/wall-e/chat/capability-resolver.js +7 -7
  248. package/template/wall-e/chat/context-messages.js +28 -0
  249. package/template/wall-e/chat/conversation-frame.js +630 -0
  250. package/template/wall-e/chat/provider-messages.js +125 -0
  251. package/template/wall-e/chat.js +1002 -233
  252. package/template/wall-e/coding/acceptance-contract.js +170 -0
  253. package/template/wall-e/coding/acp-adapter.js +1 -1
  254. package/template/wall-e/coding/agent-catalog.js +3 -0
  255. package/template/wall-e/coding/artifact-store.js +93 -0
  256. package/template/wall-e/coding/capability-router.js +120 -0
  257. package/template/wall-e/coding/coding-run-controller.js +423 -0
  258. package/template/wall-e/coding/compaction-service.js +157 -12
  259. package/template/wall-e/coding/frontend-verification.js +258 -0
  260. package/template/wall-e/coding/lifecycle-hooks.js +75 -0
  261. package/template/wall-e/coding/local-preview-contract.js +157 -0
  262. package/template/wall-e/coding/permission-service.js +57 -13
  263. package/template/wall-e/coding/prompt-bundle.js +19 -1
  264. package/template/wall-e/coding/prompt-section-registry.js +227 -0
  265. package/template/wall-e/coding/provider-compat.js +15 -0
  266. package/template/wall-e/coding/runtime-events.js +224 -0
  267. package/template/wall-e/coding/runtime-mode.js +3 -0
  268. package/template/wall-e/coding/side-git-snapshot.js +160 -4
  269. package/template/wall-e/coding/snapshot-service.js +143 -1
  270. package/template/wall-e/coding/stream-processor.js +388 -34
  271. package/template/wall-e/coding/task-tool.js +141 -4
  272. package/template/wall-e/coding/tool-execution-controller.js +365 -0
  273. package/template/wall-e/coding/tool-registry.js +43 -5
  274. package/template/wall-e/coding/user-hooks.js +217 -0
  275. package/template/wall-e/coding-orchestrator.js +1330 -221
  276. package/template/wall-e/coding-prompts.js +20 -4
  277. package/template/wall-e/context/context-builder.js +15 -2
  278. package/template/wall-e/decision/confidence.js +1 -1
  279. package/template/wall-e/docs/coding-acceptance-contract.md +41 -0
  280. package/template/wall-e/docs/external-action-controller.md +26 -6
  281. package/template/wall-e/docs/telemetry-lifecycle.md +8 -2
  282. package/template/wall-e/embeddings.js +591 -53
  283. package/template/wall-e/external-action-controller.js +12 -0
  284. package/template/wall-e/http/auth.js +1 -0
  285. package/template/wall-e/http/chat-api.js +46 -11
  286. package/template/wall-e/http/model-admin.js +836 -34
  287. package/template/wall-e/lib/boot-profile.js +88 -0
  288. package/template/wall-e/lib/event-loop-monitor.js +93 -0
  289. package/template/wall-e/lib/service-health.js +194 -0
  290. package/template/wall-e/llm/anthropic.js +130 -5
  291. package/template/wall-e/llm/client.js +266 -63
  292. package/template/wall-e/llm/default-fallback.js +382 -0
  293. package/template/wall-e/llm/health.js +19 -0
  294. package/template/wall-e/llm/message-guard.js +78 -0
  295. package/template/wall-e/llm/model-catalog.js +252 -1
  296. package/template/wall-e/llm/openai.js +26 -4
  297. package/template/wall-e/llm/portkey-sync.js +654 -0
  298. package/template/wall-e/llm/provider-error.js +30 -2
  299. package/template/wall-e/llm/registry.js +5 -1
  300. package/template/wall-e/llm/request-compat.js +67 -0
  301. package/template/wall-e/loops/backfill.js +79 -23
  302. package/template/wall-e/loops/brain-optimize.js +67 -0
  303. package/template/wall-e/loops/ingest.js +25 -10
  304. package/template/wall-e/loops/question-digest.js +160 -0
  305. package/template/wall-e/loops/reflect.js +6 -4
  306. package/template/wall-e/loops/think.js +39 -12
  307. package/template/wall-e/mcp-server.js +318 -36
  308. package/template/wall-e/memory/ctm-context-client.js +52 -14
  309. package/template/wall-e/memory/ctm-operational-context.js +237 -0
  310. package/template/wall-e/memory/ctm-prompt-executions-client.js +128 -0
  311. package/template/wall-e/memory/ctm-session-context.js +111 -63
  312. package/template/wall-e/prompts/coding/deepseek.txt +3 -0
  313. package/template/wall-e/prompts/coding/gemini.txt +6 -0
  314. package/template/wall-e/prompts/coding/gpt.txt +6 -0
  315. package/template/wall-e/prompts/coding/local.txt +7 -0
  316. package/template/wall-e/runtime/decision-hooks.js +115 -0
  317. package/template/wall-e/runtime/devbox-gateway.js +82 -8
  318. package/template/wall-e/runtime/prompt-manifest.js +86 -0
  319. package/template/wall-e/runtime/tool-executor.js +269 -0
  320. package/template/wall-e/runtime/tool-result-envelope.js +138 -0
  321. package/template/wall-e/runtime/transcript-projection.js +60 -0
  322. package/template/wall-e/runtime/walle-runtime.js +224 -0
  323. package/template/wall-e/scripts/db-optimize/migrate.js +162 -0
  324. package/template/wall-e/scripts/db-optimize/recall-eval.js +117 -0
  325. package/template/wall-e/server.js +15 -0
  326. package/template/wall-e/session-files.js +9 -0
  327. package/template/wall-e/skills/_bundled/google-calendar/run.js +1 -1
  328. package/template/wall-e/skills/_bundled/gws-workspace/run.js +1 -1
  329. package/template/wall-e/skills/_bundled/slack-mentions/run.js +76 -6
  330. package/template/wall-e/skills/claude-code-reader.js +7 -3
  331. package/template/wall-e/skills/script-skill-runner.js +10 -0
  332. package/template/wall-e/skills/skill-planner.js +38 -0
  333. package/template/wall-e/tools/builtin-middleware.js +19 -9
  334. package/template/wall-e/tools/local-tools.js +1428 -16
  335. package/template/wall-e/tools/permission-checker.js +73 -5
  336. package/template/wall-e/tools/question-manager.js +117 -7
  337. package/template/wall-e/training/harvester.js +12 -28
  338. package/template/wall-e/training/replay.js +25 -80
  339. package/template/website/index.html +10 -10
  340. package/template/wall-e/eval/ab-test.js +0 -203
  341. package/template/wall-e/eval/agent-runner.js +0 -772
  342. package/template/wall-e/eval/agent-scorer.js +0 -461
  343. package/template/wall-e/eval/aggregator.js +0 -414
  344. package/template/wall-e/eval/allowed-test-commands.js +0 -34
  345. package/template/wall-e/eval/benchmark-generator.js +0 -113
  346. package/template/wall-e/eval/benchmarks/chat-eval.json +0 -1662
  347. package/template/wall-e/eval/benchmarks/chat.json +0 -82
  348. package/template/wall-e/eval/benchmarks/coding-agent-real.json +0 -1
  349. package/template/wall-e/eval/benchmarks/coding-agent.json +0 -1581
  350. package/template/wall-e/eval/benchmarks/coding.json +0 -122
  351. package/template/wall-e/eval/benchmarks/memory-retrieval.json +0 -234
  352. package/template/wall-e/eval/benchmarks/reasoning.json +0 -82
  353. package/template/wall-e/eval/benchmarks/swebench-lite-30.json +0 -212
  354. package/template/wall-e/eval/benchmarks.js +0 -669
  355. package/template/wall-e/eval/cc-replay.js +0 -719
  356. package/template/wall-e/eval/chat-eval.js +0 -525
  357. package/template/wall-e/eval/check-keys.js +0 -15
  358. package/template/wall-e/eval/check-providers.js +0 -42
  359. package/template/wall-e/eval/codex-cli-baseline.js +0 -669
  360. package/template/wall-e/eval/coding-agent-real.js +0 -570
  361. package/template/wall-e/eval/context-compactor.js +0 -251
  362. package/template/wall-e/eval/debug-agent003.js +0 -68
  363. package/template/wall-e/eval/diagnostics.js +0 -216
  364. package/template/wall-e/eval/eval-orchestrator.js +0 -642
  365. package/template/wall-e/eval/evaluate.js +0 -202
  366. package/template/wall-e/eval/evaluator.js +0 -373
  367. package/template/wall-e/eval/exporter.js +0 -212
  368. package/template/wall-e/eval/fixtures/express-basic/package.json +0 -9
  369. package/template/wall-e/eval/fixtures/express-basic/server.js +0 -115
  370. package/template/wall-e/eval/fixtures/express-basic/test.js +0 -83
  371. package/template/wall-e/eval/fixtures/express-buggy/package.json +0 -9
  372. package/template/wall-e/eval/fixtures/express-buggy/server.js +0 -113
  373. package/template/wall-e/eval/fixtures/express-buggy/test.js +0 -83
  374. package/template/wall-e/eval/fixtures/express-buggy-items/package.json +0 -9
  375. package/template/wall-e/eval/fixtures/express-buggy-items/server.js +0 -112
  376. package/template/wall-e/eval/fixtures/express-buggy-items/test.js +0 -83
  377. package/template/wall-e/eval/fixtures/express-buggy-search/package.json +0 -9
  378. package/template/wall-e/eval/fixtures/express-buggy-search/server.js +0 -121
  379. package/template/wall-e/eval/fixtures/express-buggy-search/test.js +0 -83
  380. package/template/wall-e/eval/fixtures/express-rename-data/data.js +0 -34
  381. package/template/wall-e/eval/fixtures/express-rename-data/package.json +0 -9
  382. package/template/wall-e/eval/fixtures/express-rename-data/server.js +0 -97
  383. package/template/wall-e/eval/fixtures/express-rename-data/test.js +0 -88
  384. package/template/wall-e/eval/fixtures/express-xss/package.json +0 -12
  385. package/template/wall-e/eval/fixtures/express-xss/server.js +0 -90
  386. package/template/wall-e/eval/fixtures/express-xss/test.js +0 -67
  387. package/template/wall-e/eval/fixtures/express-xss/views/profile.ejs +0 -9
  388. package/template/wall-e/eval/fixtures/fullstack-app/config/default.js +0 -9
  389. package/template/wall-e/eval/fixtures/fullstack-app/config/test.js +0 -13
  390. package/template/wall-e/eval/fixtures/fullstack-app/package.json +0 -11
  391. package/template/wall-e/eval/fixtures/fullstack-app/public/css/style.css +0 -137
  392. package/template/wall-e/eval/fixtures/fullstack-app/public/index.html +0 -46
  393. package/template/wall-e/eval/fixtures/fullstack-app/public/js/app.js +0 -121
  394. package/template/wall-e/eval/fixtures/fullstack-app/public/js/auth.js +0 -71
  395. package/template/wall-e/eval/fixtures/fullstack-app/public/js/items.js +0 -80
  396. package/template/wall-e/eval/fixtures/fullstack-app/public/js/users.js +0 -46
  397. package/template/wall-e/eval/fixtures/fullstack-app/public/login.html +0 -45
  398. package/template/wall-e/eval/fixtures/fullstack-app/public/register.html +0 -38
  399. package/template/wall-e/eval/fixtures/fullstack-app/scripts/migrate.js +0 -23
  400. package/template/wall-e/eval/fixtures/fullstack-app/scripts/seed.js +0 -46
  401. package/template/wall-e/eval/fixtures/fullstack-app/server/db.js +0 -99
  402. package/template/wall-e/eval/fixtures/fullstack-app/server/index.js +0 -94
  403. package/template/wall-e/eval/fixtures/fullstack-app/server/middleware/auth.js +0 -19
  404. package/template/wall-e/eval/fixtures/fullstack-app/server/middleware/logger.js +0 -19
  405. package/template/wall-e/eval/fixtures/fullstack-app/server/router.js +0 -50
  406. package/template/wall-e/eval/fixtures/fullstack-app/server/routes/auth.js +0 -69
  407. package/template/wall-e/eval/fixtures/fullstack-app/server/routes/health.js +0 -23
  408. package/template/wall-e/eval/fixtures/fullstack-app/server/routes/items.js +0 -88
  409. package/template/wall-e/eval/fixtures/fullstack-app/server/routes/users.js +0 -75
  410. package/template/wall-e/eval/fixtures/fullstack-app/server/test.js +0 -198
  411. package/template/wall-e/eval/fixtures/fullstack-app/server/utils/response.js +0 -34
  412. package/template/wall-e/eval/fixtures/fullstack-app/server/utils/validate.js +0 -26
  413. package/template/wall-e/eval/fixtures/fullstack-app/server.js +0 -8
  414. package/template/wall-e/eval/fixtures/fullstack-app/test.js +0 -12
  415. package/template/wall-e/eval/fixtures/monorepo-basic/package.json +0 -8
  416. package/template/wall-e/eval/fixtures/monorepo-basic/packages/api/data.js +0 -58
  417. package/template/wall-e/eval/fixtures/monorepo-basic/packages/api/middleware.js +0 -46
  418. package/template/wall-e/eval/fixtures/monorepo-basic/packages/api/package.json +0 -8
  419. package/template/wall-e/eval/fixtures/monorepo-basic/packages/api/routes.js +0 -64
  420. package/template/wall-e/eval/fixtures/monorepo-basic/packages/api/server.js +0 -56
  421. package/template/wall-e/eval/fixtures/monorepo-basic/packages/api/test.js +0 -116
  422. package/template/wall-e/eval/fixtures/monorepo-basic/packages/cli/commands.js +0 -61
  423. package/template/wall-e/eval/fixtures/monorepo-basic/packages/cli/index.js +0 -62
  424. package/template/wall-e/eval/fixtures/monorepo-basic/packages/cli/output.js +0 -43
  425. package/template/wall-e/eval/fixtures/monorepo-basic/packages/cli/package.json +0 -11
  426. package/template/wall-e/eval/fixtures/monorepo-basic/packages/cli/test.js +0 -44
  427. package/template/wall-e/eval/fixtures/monorepo-basic/packages/shared/formatters.js +0 -43
  428. package/template/wall-e/eval/fixtures/monorepo-basic/packages/shared/index.js +0 -12
  429. package/template/wall-e/eval/fixtures/monorepo-basic/packages/shared/package.json +0 -5
  430. package/template/wall-e/eval/fixtures/monorepo-basic/packages/shared/test.js +0 -55
  431. package/template/wall-e/eval/fixtures/monorepo-basic/packages/shared/validators.js +0 -29
  432. package/template/wall-e/eval/fixtures/monorepo-basic/test.js +0 -46
  433. package/template/wall-e/eval/fixtures/node-cli/index.js +0 -78
  434. package/template/wall-e/eval/fixtures/node-cli/package.json +0 -10
  435. package/template/wall-e/eval/fixtures/node-cli/test.js +0 -57
  436. package/template/wall-e/eval/fixtures/node-typed/package.json +0 -8
  437. package/template/wall-e/eval/fixtures/node-typed/src/handlers.js +0 -31
  438. package/template/wall-e/eval/fixtures/node-typed/src/utils.js +0 -33
  439. package/template/wall-e/eval/fixtures/node-typed/test.js +0 -36
  440. package/template/wall-e/eval/fixtures/python-flask/app.py +0 -14
  441. package/template/wall-e/eval/fixtures/python-flask/requirements.txt +0 -2
  442. package/template/wall-e/eval/fixtures/python-flask/test_app.py +0 -25
  443. package/template/wall-e/eval/fixtures/wall-e-subset/brain.js +0 -105
  444. package/template/wall-e/eval/fixtures/wall-e-subset/eval/aggregator.js +0 -101
  445. package/template/wall-e/eval/fixtures/wall-e-subset/eval/benchmarks/chat.json +0 -20
  446. package/template/wall-e/eval/fixtures/wall-e-subset/eval/benchmarks/coding.json +0 -32
  447. package/template/wall-e/eval/fixtures/wall-e-subset/eval/benchmarks.js +0 -64
  448. package/template/wall-e/eval/fixtures/wall-e-subset/eval/fixtures/simple-project/package.json +0 -6
  449. package/template/wall-e/eval/fixtures/wall-e-subset/eval/fixtures/simple-project/server.js +0 -31
  450. package/template/wall-e/eval/fixtures/wall-e-subset/eval/fixtures/simple-project/test.js +0 -18
  451. package/template/wall-e/eval/fixtures/wall-e-subset/eval/fixtures/simple-project/utils.js +0 -34
  452. package/template/wall-e/eval/fixtures/wall-e-subset/eval/runner.js +0 -104
  453. package/template/wall-e/eval/fixtures/wall-e-subset/eval/scorer.js +0 -73
  454. package/template/wall-e/eval/fixtures/wall-e-subset/eval/test.js +0 -134
  455. package/template/wall-e/eval/fixtures/wall-e-subset/llm/client.js +0 -99
  456. package/template/wall-e/eval/fixtures/wall-e-subset/llm/providers.js +0 -63
  457. package/template/wall-e/eval/fixtures/wall-e-subset/llm/test.js +0 -70
  458. package/template/wall-e/eval/fixtures/wall-e-subset/package.json +0 -10
  459. package/template/wall-e/eval/fixtures/wall-e-subset/test.js +0 -86
  460. package/template/wall-e/eval/harvester.js +0 -685
  461. package/template/wall-e/eval/head-to-head.js +0 -388
  462. package/template/wall-e/eval/humaneval-adapter.js +0 -321
  463. package/template/wall-e/eval/list-models.js +0 -31
  464. package/template/wall-e/eval/livecodebench-adapter.js +0 -291
  465. package/template/wall-e/eval/mail-integration.js +0 -443
  466. package/template/wall-e/eval/manifest.js +0 -186
  467. package/template/wall-e/eval/meta-harness/adapters/coding-agent.js +0 -57
  468. package/template/wall-e/eval/meta-harness/bootstrap-snapshot.js +0 -149
  469. package/template/wall-e/eval/meta-harness/candidate-store.js +0 -117
  470. package/template/wall-e/eval/meta-harness/cli.js +0 -86
  471. package/template/wall-e/eval/meta-harness/domain-spec.js +0 -154
  472. package/template/wall-e/eval/meta-harness/domains/coding-agent.domain.json +0 -84
  473. package/template/wall-e/eval/meta-harness/examples/env-bootstrap-candidate.js +0 -29
  474. package/template/wall-e/eval/meta-harness/experience-store.js +0 -174
  475. package/template/wall-e/eval/meta-harness/frontier.js +0 -96
  476. package/template/wall-e/eval/meta-harness/harness-interface.js +0 -90
  477. package/template/wall-e/eval/meta-harness/leakage-guard.js +0 -80
  478. package/template/wall-e/eval/meta-harness/optimizer.js +0 -207
  479. package/template/wall-e/eval/meta-harness/proposer-runner.js +0 -110
  480. package/template/wall-e/eval/meta-harness/reporting.js +0 -58
  481. package/template/wall-e/eval/meta-harness/telemetry.js +0 -27
  482. package/template/wall-e/eval/meta-harness/validation.js +0 -81
  483. package/template/wall-e/eval/promoter.js +0 -228
  484. package/template/wall-e/eval/provider-normalizer.js +0 -33
  485. package/template/wall-e/eval/replay.js +0 -395
  486. package/template/wall-e/eval/run-agent-benchmarks.js +0 -386
  487. package/template/wall-e/eval/run-codex-cli-baseline.js +0 -177
  488. package/template/wall-e/eval/run-coding-agent-real.js +0 -187
  489. package/template/wall-e/eval/run-eval.js +0 -435
  490. package/template/wall-e/eval/run-model-comparison.js +0 -142
  491. package/template/wall-e/eval/session-evaluator.js +0 -187
  492. package/template/wall-e/eval/session-miner.js +0 -207
  493. package/template/wall-e/eval/session-retrieval-benchmark.js +0 -150
  494. package/template/wall-e/eval/session-transcripts.js +0 -509
  495. package/template/wall-e/eval/shadow.js +0 -161
  496. package/template/wall-e/eval/swebench-adapter.js +0 -345
  497. package/template/wall-e/eval/swebench-docker.js +0 -192
  498. package/template/wall-e/eval/train.py +0 -320
  499. package/template/wall-e/eval/trainer.js +0 -232
  500. package/template/wall-e/eval/weekly-eval-loop.js +0 -241
@@ -1,719 +0,0 @@
1
- 'use strict';
2
-
3
- /**
4
- * Claude Code session replay — runs past Claude Code sessions through
5
- * the wall-e coding agent in isolated git worktrees, scores wall-e's
6
- * output against Claude's actual tool calls + diff for the same task
7
- * on the same repo state.
8
- *
9
- * Pipeline:
10
- * 1. pickRecentSessions — walks ~/.claude/projects, filters by cwd + mtime
11
- * 2. recoverRepoState — finds git commit at session start time
12
- * 3. makeReplaySandbox — git worktree add at recovered commit
13
- * 4. replayAndScore — runs wall-e on concatenated user messages,
14
- * scores tool/file Jaccard + LLM judge
15
- * 5. removeReplaySandbox — git worktree remove
16
- *
17
- * v1 uses concatenated-prompt scripted multi-turn (joins user messages
18
- * with `\n\n[user follow-up]\n`) — coding-orchestrator's resumeFromCheckpoint
19
- * does not natively support injecting new user messages mid-thread.
20
- */
21
-
22
- const fs = require('fs');
23
- const path = require('path');
24
- const os = require('os');
25
- const { execFileSync } = require('child_process');
26
-
27
- const {
28
- findJsonlFiles,
29
- extractContent,
30
- extractToolCalls,
31
- findCommitsInWindow,
32
- getCommitDiff,
33
- } = require('./harvester');
34
-
35
- const CLAUDE_PROJECTS_DIR = path.join(os.homedir(), '.claude', 'projects');
36
- const SANDBOX_PREFIX = '/tmp/cc-replay-';
37
- const MIN_PROMPT_CHARS = 20;
38
-
39
- // ---------------------------------------------------------------------------
40
- // pickRecentSessions
41
- // ---------------------------------------------------------------------------
42
-
43
/**
 * Collect replay candidates from the Claude Code session logs.
 *
 * Every JSONL under CLAUDE_PROJECTS_DIR whose mtime falls inside the
 * `sinceDays` window is parsed; sessions outside `repoPath`, without a user
 * message, or whose first message is shorter than MIN_PROMPT_CHARS are dropped.
 *
 * @param {object} opts
 * @param {string} opts.repoPath - Keep only sessions whose `cwd` is under this path
 * @param {number} [opts.sinceDays=14] - Maximum file age (by mtime) in days
 * @param {number} [opts.limit=5] - Maximum number of sessions returned
 * @returns {Array<{
 *   sessionId: string, jsonlPath: string, cwd: string, gitBranch: string|null,
 *   userMessages: string[], tsStart: string, tsEnd: string, turnCount: number,
 *   claudeToolCalls: string[], claudeFilesEdited: string[]
 * }>} Sessions ordered newest-mtime first
 * @throws {Error} When `repoPath` is missing
 */
function pickRecentSessions({ repoPath, sinceDays = 14, limit = 5 } = {}) {
  if (!repoPath) throw new Error('repoPath is required');
  if (!fs.existsSync(CLAUDE_PROJECTS_DIR)) return [];

  const cutoffMs = Date.now() - sinceDays * 24 * 60 * 60 * 1000;
  const dated = [];

  for (const jsonlPath of findJsonlFiles(CLAUDE_PROJECTS_DIR)) {
    let mtimeMs;
    try {
      mtimeMs = fs.statSync(jsonlPath).mtime.getTime();
    } catch {
      // File vanished or is unreadable; just move on.
      continue;
    }
    if (mtimeMs < cutoffMs) continue;

    const session = parseSessionJsonl(jsonlPath, repoPath);
    if (!session || session.userMessages.length < 1) continue;
    if (session.userMessages[0].length < MIN_PROMPT_CHARS) continue;

    dated.push({ mtimeMs, session: { ...session } });
  }

  dated.sort((left, right) => right.mtimeMs - left.mtimeMs);
  return dated.slice(0, limit).map((entry) => entry.session);
}
79
-
80
/**
 * Parse one Claude Code session JSONL file into a replayable summary.
 *
 * Lines that are not valid JSON are skipped. The first `cwd` and `gitBranch`
 * seen win; `tsStart`/`tsEnd` are the first and last `timestamp` values.
 * User messages shorter than MIN_PROMPT_CHARS (after trimming) or that look
 * like tool/command plumbing are ignored.
 *
 * @param {string} jsonlPath - Path to the session file; its basename is the session id
 * @param {string} repoPath - Repository root the session's cwd must equal or live under
 * @returns {object|null} Session summary, or null when the file is unreadable,
 *   has no `cwd`, or its `cwd` is outside `repoPath`
 */
function parseSessionJsonl(jsonlPath, repoPath) {
  let lines;
  try {
    lines = fs.readFileSync(jsonlPath, 'utf8').split('\n').filter(Boolean);
  } catch {
    return null;
  }

  const userMessages = [];
  const claudeToolCalls = [];
  const claudeFilesEditedSet = new Set();
  let cwd = null;
  let gitBranch = null;
  let tsStart = null;
  let tsEnd = null;
  const sessionId = path.basename(jsonlPath, '.jsonl');

  for (const line of lines) {
    let evt;
    try { evt = JSON.parse(line); } catch { continue; }

    if (evt.cwd && !cwd) cwd = evt.cwd;
    if (evt.gitBranch && !gitBranch) gitBranch = evt.gitBranch;
    if (evt.timestamp) {
      if (!tsStart) tsStart = evt.timestamp;
      tsEnd = evt.timestamp;
    }

    if (evt.type === 'user' && evt.message?.role === 'user') {
      const text = extractContent(evt.message);
      if (text && text.trim().length >= MIN_PROMPT_CHARS && !looksLikeToolResult(text)) {
        userMessages.push(text);
      }
    }

    if (evt.type === 'assistant' && evt.message?.role === 'assistant') {
      for (const tc of extractToolCalls(evt.message)) {
        if (tc.name) claudeToolCalls.push(tc.name);
        // Only count files actually MODIFIED — read_file/glob/grep don't
        // count toward claudeFilesEdited.
        if (!isEditToolName(tc.name)) continue;
        const fp = editedFileFromToolInput(tc.input || {});
        if (fp) claudeFilesEditedSet.add(normalizeSessionFile(fp, cwd));
      }
    }
  }

  if (!cwd) return null;

  // Compare on a path boundary: a bare string-prefix test would accept a
  // sibling directory such as "/work/repo-old" for the repo "/work/repo".
  const repoRoot = repoPath.replace(/\/+$/, '');
  if (cwd !== repoRoot && !cwd.startsWith(`${repoRoot}/`)) return null;

  return {
    sessionId,
    jsonlPath,
    cwd,
    gitBranch,
    userMessages,
    tsStart: tsStart || new Date(0).toISOString(),
    tsEnd: tsEnd || new Date(0).toISOString(),
    turnCount: userMessages.length,
    claudeToolCalls,
    claudeFilesEdited: [...claudeFilesEditedSet],
  };
}
144
-
145
/**
 * Decide whether a "user" transcript entry is really plumbing rather than a
 * request typed by the user.
 *
 * @param {string} text - Extracted message text
 * @returns {boolean} true for empty text, for text whose first 400 characters
 *   start with a known wrapper marker, and for a bare slash command
 */
function looksLikeToolResult(text) {
  if (!text) return true;

  // Markers Claude Code wraps around system/command/tool output, plus the
  // preamble of an injected skill body (wall-e has no matching skill, so
  // replaying that body would be meaningless).
  const markers = [
    /^\s*\[Request interrupted/i,
    /^\s*<tool_use_error>/i,
    /^\s*<system-reminder>/i,
    /^\s*<local-command-caveat>/i,
    /^\s*<local-command-stdout>/i,
    /^\s*<local-command-stderr>/i,
    /^\s*<command-message>/i,
    /^\s*<command-name>/i,
    /^\s*<command-args>/i,
    /^\s*Base directory for this skill:/i,
  ];

  const leading = text.slice(0, 400);
  for (const marker of markers) {
    if (marker.test(leading)) return true;
  }

  // A slash command with nothing after it carries no request of its own.
  return /^\s*\/[a-z][\w:-]+\s*$/i.test(text.trim());
}
169
-
170
/**
 * Tell whether a tool name denotes a file-modifying tool.
 *
 * @param {string} [name] - Tool name as recorded in a transcript
 * @returns {boolean} true when the whole name matches, case-insensitively,
 *   one of the known edit/write/patch tool names
 */
function isEditToolName(name) {
  const editTools = [
    'Edit', 'Write', 'MultiEdit', 'NotebookEdit', 'Patch', 'ApplyPatch',
    'edit_file', 'write_file', 'apply_patch', 'multi_edit', 'str_replace', 'create_file',
  ];
  const wholeName = new RegExp(`^(${editTools.join('|')})$`, 'i');
  return wholeName.test(name || '');
}
173
-
174
/**
 * Pull the target file path out of an edit tool's input object.
 *
 * @param {object} input - Tool input
 * @returns {*} The first truthy value among `path`, `file_path`, `filePath`,
 *   `notebook_path`, `notebookPath` (checked in that order), or null
 */
function editedFileFromToolInput(input) {
  const candidateKeys = ['path', 'file_path', 'filePath', 'notebook_path', 'notebookPath'];
  for (const key of candidateKeys) {
    const value = input[key];
    if (value) return value;
  }
  return null;
}
182
-
183
/**
 * Express an absolute file path relative to the session's working directory.
 *
 * @param {string} filePath - Path taken from a tool call
 * @param {string|null} cwd - Session working directory
 * @returns {string} The cwd-relative path when `filePath` is absolute and lies
 *   strictly inside `cwd`; otherwise `filePath` unchanged
 */
function normalizeSessionFile(filePath, cwd) {
  if (!filePath || !cwd || !path.isAbsolute(filePath)) return filePath;
  const rel = path.relative(cwd, filePath);
  // Only ".." itself or a leading "../" segment leaves cwd; a plain
  // startsWith('..') would also reject in-tree names such as "..cache/x".
  const leavesCwd = rel === '..' || rel.startsWith(`..${path.sep}`) || path.isAbsolute(rel);
  return rel && !leavesCwd ? rel : filePath;
}
188
-
189
- // ---------------------------------------------------------------------------
190
- // recoverRepoState
191
- // ---------------------------------------------------------------------------
192
-
193
/**
 * Approximate the repository state at the moment a session started.
 *
 * Runs `git log --all --before=<tsStart> -n 1 --format=%H` in `repoPath` and
 * uses the single commit it prints, i.e. a commit from any ref dated before
 * the session start (not necessarily the branch that was checked out).
 *
 * @param {string} repoPath - Directory expected to contain a `.git` entry
 * @param {string|number|Date} tsStart - Session start, anything `new Date()` accepts
 * @returns {{commitSha: string, branch: null}|null} null when `.git` is
 *   missing, git fails, the timestamp is invalid, or no commit is printed
 */
function recoverRepoState(repoPath, tsStart) {
  const gitEntry = path.join(repoPath, '.git');
  if (!fs.existsSync(gitEntry)) return null;

  let commitSha;
  try {
    const before = new Date(tsStart).toISOString();
    const gitArgs = ['log', '--all', `--before=${before}`, '-n', '1', '--format=%H'];
    const gitOptions = {
      cwd: repoPath, encoding: 'utf8', timeout: 10000, stdio: ['pipe', 'pipe', 'pipe'],
    };
    commitSha = execFileSync('git', gitArgs, gitOptions).trim();
  } catch {
    return null;
  }

  return commitSha ? { commitSha, branch: null } : null;
}
212
-
213
- // ---------------------------------------------------------------------------
214
- // sandbox lifecycle (git worktree)
215
- // ---------------------------------------------------------------------------
216
-
217
/**
 * Create a detached git worktree for one replay.
 *
 * The worktree lives at SANDBOX_PREFIX + sessionId; a leftover directory at
 * that path is removed first.
 *
 * @param {string} repoPath - Repository the worktree is created from
 * @param {string} commitSha - Commit to check out
 * @param {string} sessionId - Used to derive the sandbox path
 * @returns {string} Path of the new worktree
 * @throws {Error} When `git worktree add` fails or times out
 */
function makeReplaySandbox(repoPath, commitSha, sessionId) {
  const sandbox = `${SANDBOX_PREFIX}${sessionId}`;

  const alreadyThere = fs.existsSync(sandbox);
  if (alreadyThere) {
    removeReplaySandbox(sandbox, repoPath);
  }

  const gitArgs = ['worktree', 'add', '--detach', sandbox, commitSha];
  execFileSync('git', gitArgs, { cwd: repoPath, stdio: 'pipe', timeout: 30000 });
  return sandbox;
}
225
-
226
/**
 * Tear down a replay worktree. Every step is best-effort and independent:
 * a failure in one never prevents the next, and nothing is thrown.
 *
 * @param {string} sandbox - Worktree path to remove
 * @param {string} repoPath - Repository that owns the worktree
 * @returns {void}
 */
function removeReplaySandbox(sandbox, repoPath) {
  const steps = [
    // Ask git to drop the worktree first ...
    () => execFileSync('git', ['worktree', 'remove', '--force', sandbox], {
      cwd: repoPath, stdio: 'pipe', timeout: 30000,
    }),
    // ... then delete whatever is still on disk ...
    () => {
      if (fs.existsSync(sandbox)) fs.rmSync(sandbox, { recursive: true, force: true });
    },
    // ... and finally let git forget worktrees whose directory is gone.
    () => execFileSync('git', ['worktree', 'prune'], {
      cwd: repoPath, stdio: 'pipe', timeout: 10000,
    }),
  ];

  for (const step of steps) {
    try {
      step();
    } catch {
      /* best-effort */
    }
  }
}
241
-
242
/**
 * Sweep stale replay worktrees registered in `repoPath`. Intended to run at
 * the start of a batch to clean up after crashes. Best-effort: if the
 * worktree listing cannot be obtained nothing happens.
 *
 * A worktree counts as a replay sandbox when its path starts with
 * SANDBOX_PREFIX, or with "/private" + SANDBOX_PREFIX — the same /tmp vs
 * /private/tmp aliasing this file already handles when comparing file paths.
 *
 * @param {string} repoPath - Repository whose worktree list is inspected
 * @returns {void}
 */
function gitWorktreeCleanup(repoPath) {
  let listing;
  try {
    listing = execFileSync('git', ['worktree', 'list', '--porcelain'], {
      cwd: repoPath, encoding: 'utf8', timeout: 10000,
    });
  } catch {
    return; // best-effort
  }

  const marker = 'worktree ';
  for (const line of listing.split('\n')) {
    if (!line.startsWith(marker)) continue;
    const worktreePath = line.slice(marker.length);
    const prefixes = [SANDBOX_PREFIX, `/private${SANDBOX_PREFIX}`];
    const isSandbox = prefixes.some(
      (prefix) => worktreePath.startsWith(prefix) && worktreePath.length > prefix.length
    );
    if (isSandbox) removeReplaySandbox(worktreePath, repoPath);
  }
}
256
-
257
- // ---------------------------------------------------------------------------
258
- // replayAndScore
259
- // ---------------------------------------------------------------------------
260
-
261
/**
 * Replay a Claude Code session through wall-e and score the result.
 *
 * All user messages are sent as one prompt; follow-ups are appended as
 * `\n\n[user follow-up N]\n<message>`. A watchdog rejects the run 60 s after
 * `timeoutMs` in case the agent loop ignores its own timeout; the watchdog
 * timer is always cleared once the run settles so it cannot keep the process
 * alive or pile up across a batch.
 *
 * Composite score: 0.25·tool + 0.25·file + 0.50·judge when a judge score is
 * available, otherwise 0.5·tool + 0.5·file.
 *
 * @param {object} session - From pickRecentSessions
 * @param {string} sandbox - Path returned by makeReplaySandbox
 * @param {Function} runAgentLoop - Called as runAgentLoop(prompt, options)
 * @param {object} [opts]
 * @param {boolean} [opts.useLlmJudge=true]
 * @param {string} [opts.model]
 * @param {object} [opts.provider]
 * @param {number} [opts.timeoutMs=600000]
 * @returns {Promise<object>} {sessionId, turnCount, scores, walleTools,
 *   claudeTools, walleDiff, claudeDiff, latencyMs, ...}; when the agent run
 *   fails the object carries `error` and all-zero scores instead of throwing
 */
async function replayAndScore(session, sandbox, runAgentLoop, opts = {}) {
  const { useLlmJudge = true, model, provider, timeoutMs = 600_000 } = opts;
  const startTime = Date.now();

  const concatenatedPrompt = session.userMessages.length === 1
    ? session.userMessages[0]
    : session.userMessages
      .map((msg, i) => (i === 0 ? msg : `\n\n[user follow-up ${i + 1}]\n${msg}`))
      .join('');

  let agentResult;
  let watchdog;
  try {
    const agentPromise = runAgentLoop(concatenatedPrompt, {
      cwd: sandbox,
      timeoutMs,
      provider,
      model,
      mode: 'build',
      persistTranscript: false,
    });
    const hardTimeout = new Promise((_, reject) => {
      watchdog = setTimeout(
        () => reject(new Error('cc-replay hard timeout exceeded')),
        timeoutMs + 60_000
      );
    });
    agentResult = await Promise.race([agentPromise, hardTimeout]);
  } catch (e) {
    return {
      sessionId: session.sessionId,
      turnCount: session.turnCount,
      error: e.message,
      latencyMs: Date.now() - startTime,
      scores: { composite: 0, tool_jaccard: 0, file_jaccard: 0, judge_score: 0 },
      walleTools: [],
      claudeTools: session.claudeToolCalls,
      walleDiff: '',
      claudeDiff: '',
    };
  } finally {
    // Without this the losing timer stays pending for timeoutMs + 60 s.
    clearTimeout(watchdog);
  }

  const latencyMs = Date.now() - startTime;
  const walleTools = extractWalleTools(agentResult);
  const claudeTools = canonicalizeToolList(session.claudeToolCalls);
  const walleFiles = extractWalleFiles(agentResult, sandbox);
  const walleDiff = computeSandboxDiff(sandbox);
  const claudeDiff = computeClaudeDiff(session);

  // ---- scoring -----------------------------------------------------------
  const tool_jaccard = jaccard(new Set(walleTools), new Set(claudeTools));
  const file_jaccard = jaccard(
    canonicalizeReplayFiles(walleFiles),
    canonicalizeReplayFiles(session.claudeFilesEdited),
  );

  let judge_score = null;
  let judge_reason = null;
  if (useLlmJudge && (walleDiff || claudeDiff)) {
    try {
      const judged = await scoreLlmJudge({
        prompt: session.userMessages[0],
        walleDiff,
        claudeDiff,
        model,
      });
      judge_score = judged.score;
      judge_reason = judged.reason;
    } catch (e) {
      // A failed judge must not fail the replay; the reason records why.
      judge_reason = `judge failed: ${e.message}`;
    }
  }

  const composite = judge_score != null
    ? 0.25 * tool_jaccard + 0.25 * file_jaccard + 0.50 * judge_score
    : 0.5 * tool_jaccard + 0.5 * file_jaccard;

  return {
    sessionId: session.sessionId,
    turnCount: session.turnCount,
    latencyMs,
    scores: { composite, tool_jaccard, file_jaccard, judge_score, judge_reason },
    walleTools,
    walleFiles,
    claudeTools,
    claudeFiles: session.claudeFilesEdited,
    walleDiff: walleDiff.slice(0, 8000),
    claudeDiff: (claudeDiff || '').slice(0, 8000),
    // The helpers above tolerate a null/undefined result; so must this read.
    usage: agentResult?.usage || null,
    agentResult: summarizeAgentResult(agentResult),
  };
}
367
-
368
- // ---------------------------------------------------------------------------
369
- // scoring helpers
370
- // ---------------------------------------------------------------------------
371
-
372
/**
 * Jaccard similarity |A ∩ B| / |A ∪ B| of two collections.
 *
 * @param {Iterable<*>|null|undefined} setA - A Set, or any iterable (duplicates are ignored)
 * @param {Iterable<*>|null|undefined} setB - A Set, or any iterable (duplicates are ignored)
 * @returns {number} Value in [0, 1]; two empty collections count as identical (1)
 */
function jaccard(setA, setB) {
  const a = setA instanceof Set ? setA : new Set(setA || []);
  const b = setB instanceof Set ? setB : new Set(setB || []);
  if (a.size === 0 && b.size === 0) return 1;

  let shared = 0;
  for (const item of a) {
    if (b.has(item)) shared += 1;
  }
  // |A ∪ B| = |A| + |B| − |A ∩ B|; never zero here because one side is non-empty.
  return shared / (a.size + b.size - shared);
}
378
-
379
/**
 * Canonicalize a list of file paths into a set for comparison.
 *
 * @param {string[]|null} [files] - Raw paths
 * @returns {Set<string>} Canonical paths, with empty results dropped
 */
function canonicalizeReplayFiles(files = []) {
  const cleaned = (files || []).map((file) => canonicalReplayFilePath(file));
  return new Set(cleaned.filter((entry) => Boolean(entry)));
}
382
-
383
/**
 * Reduce a file path to the form used when comparing edited-file sets.
 *
 * Trims the value, converts backslashes to forward slashes, removes one
 * leading "./", and strips a leading "create-walle/template/" prefix.
 *
 * @param {*} filePath - Raw path; falsy values become ''
 * @returns {string} Canonical path
 */
function canonicalReplayFilePath(filePath) {
  const TEMPLATE_ROOT = 'create-walle/template/';

  let cleaned = String(filePath || '').trim();
  cleaned = cleaned.replace(/\\/g, '/');
  cleaned = cleaned.replace(/^\.\//, '');

  return cleaned.startsWith(TEMPLATE_ROOT)
    ? cleaned.slice(TEMPLATE_ROOT.length)
    : cleaned;
}
391
-
392
/**
 * List the canonical names of every tool call in an agent result log.
 *
 * @param {object|null} agentResult - Result whose `log` holds turns with `toolCalls`
 * @returns {string[]} Canonical tool names in call order, duplicates kept
 */
function extractWalleTools(agentResult) {
  const names = [];
  for (const { toolCalls } of agentResult?.log || []) {
    if (!toolCalls) continue;
    for (const call of toolCalls) {
      const name = canonicalToolName(call.name);
      if (name) names.push(name);
    }
  }
  return names;
}
403
-
404
/**
 * Map raw tool names to their canonical names.
 *
 * @param {string[]|null} [tools] - Raw tool names
 * @returns {string[]} Canonical names in the same order; blank names are dropped
 */
function canonicalizeToolList(tools = []) {
  const rawNames = tools || [];
  return rawNames
    .map((rawName) => canonicalToolName(rawName))
    .filter((canonical) => Boolean(canonical));
}
407
-
408
/**
 * Collapse agent-specific tool names onto a shared vocabulary so tool usage
 * from different agents can be compared.
 *
 * @param {*} name - Raw tool name
 * @returns {string|null} The canonical family name, the lower-cased raw name
 *   when no family matches, or null for a blank name
 */
function canonicalToolName(name) {
  const raw = String(name || '').trim();
  if (!raw) return null;

  // First matching family wins.
  const families = [
    [/^(read|read_file)$/i, 'read_file'],
    [/^(bash|run_shell|exec_command|shell)$/i, 'run_shell'],
    [/^(grep|glob|toolsearch|list_directory|grep_files|search|rg)$/i, 'search'],
    [/^(edit|write|multiedit|notebookedit|patch|applypatch|apply_patch|edit_file|write_file|multi_edit|str_replace|create_file)$/i, 'edit'],
    [/^(agent|task|taskoutput)$/i, 'agent'],
    [/^askuserquestion$/i, 'ask_user'],
    [/^skill$/i, 'skill'],
    [/^mcp__/i, 'mcp'],
  ];

  const hit = families.find(([pattern]) => pattern.test(raw));
  return hit ? hit[1] : raw.toLowerCase();
}
421
-
422
/**
 * Collect the files touched by edit-type tool calls in an agent result.
 *
 * Each edit call contributes its direct path input (if any) plus every path
 * named in its patch text (`patch_text`, `patchText` or `patch`).
 *
 * @param {object|null} agentResult - Result whose `log` holds turns with `toolCalls`
 * @param {string} sandbox - Sandbox root used to relativize absolute paths
 * @returns {string[]} Unique normalized file paths
 */
function extractWalleFiles(agentResult, sandbox) {
  const touched = new Set();
  const record = (filePath) => {
    touched.add(normalizeWalleFilePath(filePath, sandbox));
  };

  for (const turn of agentResult?.log || []) {
    for (const call of turn.toolCalls || []) {
      if (!isEditToolName(call.name)) continue;

      const input = call.input || {};
      const directPath = editedFileFromToolInput(input);
      if (directPath) record(directPath);

      const patchText = input.patch_text || input.patchText || input.patch || '';
      for (const patchedPath of extractPatchFilePaths(patchText)) {
        record(patchedPath);
      }
    }
  }

  return Array.from(touched);
}
440
-
441
/**
 * Turn a path reported by wall-e into a sandbox-relative path when possible.
 *
 * Relative paths only lose a leading "./". Absolute paths are tried against
 * every spelling of the sandbox root; the first one that contains the file
 * yields the relative path.
 *
 * @param {string} filePath - Path from a tool call
 * @param {string} sandbox - Sandbox root
 * @returns {string} Sandbox-relative path, or `filePath` unchanged when it is
 *   falsy, the sandbox is falsy, or the file is not under the sandbox
 */
function normalizeWalleFilePath(filePath, sandbox) {
  if (!filePath || !sandbox) return filePath;
  if (!path.isAbsolute(filePath)) return filePath.replace(/^\.\//, '');

  const fileSpellings = pathVariants(filePath);
  const rootSpellings = pathVariants(sandbox);

  for (const spelling of fileSpellings) {
    for (const root of rootSpellings) {
      const inside = relativeIfUnder(spelling, root);
      if (inside) return inside;
    }
  }
  return filePath;
}
454
-
455
/**
 * Extract the file paths named by the headers of an apply-patch payload.
 *
 * Recognized header lines: `*** Add File: <p>`, `*** Update File: <p>`,
 * `*** Delete File: <p>` and `*** Move to: <p>`.
 *
 * @param {*} patchText - Patch body; anything that is not a non-empty string yields []
 * @returns {string[]} Unique paths in order of first appearance
 */
function extractPatchFilePaths(patchText) {
  if (!patchText || typeof patchText !== 'string') return [];

  const fileHeader = /^\*\*\* (?:Add|Update|Delete) File:\s+(.+?)\s*$/;
  const moveHeader = /^\*\*\* Move to:\s+(.+?)\s*$/;

  const seen = new Set();
  for (const line of patchText.split('\n')) {
    const fileHit = fileHeader.exec(line);
    if (fileHit && fileHit[1]) seen.add(fileHit[1].trim());
    const moveHit = moveHeader.exec(line);
    if (moveHit && moveHit[1]) seen.add(moveHit[1].trim());
  }
  return Array.from(seen);
}
466
-
467
/**
 * Enumerate equivalent spellings of a path: as given, resolved, real-path
 * (when it exists on disk), and each of those with the `/tmp/` and
 * `/private/tmp/` prefixes swapped.
 *
 * @param {string} filePath - Path to expand
 * @returns {string[]} Unique, non-empty spellings
 */
function pathVariants(filePath) {
  const spellings = new Set([filePath]);

  const derivations = [
    (p) => path.resolve(p),
    (p) => fs.realpathSync(p),
  ];
  for (const derive of derivations) {
    try { spellings.add(derive(filePath)); } catch {}
  }

  const PRIVATE_TMP = '/private/tmp/';
  const TMP = '/tmp/';
  // Iterate over a snapshot so additions made here are not revisited.
  for (const spelling of Array.from(spellings)) {
    if (spelling.startsWith(PRIVATE_TMP)) {
      spellings.add(`${TMP}${spelling.slice(PRIVATE_TMP.length)}`);
    }
    if (spelling.startsWith(TMP)) {
      spellings.add(`${PRIVATE_TMP}${spelling.slice(TMP.length)}`);
    }
  }

  return Array.from(spellings).filter(Boolean);
}
477
-
478
/**
 * Return `filePath` relative to `root` when it lies strictly inside `root`.
 *
 * @param {string} filePath - Absolute candidate path
 * @param {string} root - Absolute directory
 * @returns {string|null} Forward-slash relative path, or null when either
 *   argument is missing or not absolute, the two are equal, or `filePath` is
 *   outside `root`
 */
function relativeIfUnder(filePath, root) {
  if (!filePath || !root || !path.isAbsolute(filePath) || !path.isAbsolute(root)) return null;
  const rel = path.relative(root, filePath);
  // Only ".." itself or a leading "../" segment leaves the root; a plain
  // startsWith('..') would also reject in-tree names such as "..cfg/a.js".
  const leavesRoot = rel === '..' || rel.startsWith(`..${path.sep}`) || path.isAbsolute(rel);
  if (!rel || leavesRoot) return null;
  return rel.replace(/\\/g, '/');
}
484
-
485
/**
 * Diff the sandbox working tree against its HEAD commit.
 *
 * Untracked files are first registered with `git add --intent-to-add` so
 * that files the agent CREATED show up in the diff; a plain `git diff HEAD`
 * only reports changes to tracked files. Ignored files stay excluded.
 *
 * @param {string} sandbox - Worktree path
 * @returns {string} Unified diff text, or '' when the diff cannot be produced
 */
function computeSandboxDiff(sandbox) {
  try {
    execFileSync('git', ['add', '--intent-to-add', '--', '.'], {
      cwd: sandbox, stdio: 'pipe', timeout: 15000,
    });
  } catch {
    // Best-effort: fall back to a diff of tracked files only.
  }

  try {
    return execFileSync('git', ['diff', 'HEAD'], {
      cwd: sandbox, encoding: 'utf8', timeout: 15000, maxBuffer: 10 * 1024 * 1024,
    });
  } catch {
    return '';
  }
}
494
-
495
/**
 * Build a compact, size-bounded summary of an agent result for reports.
 *
 * @param {*} agentResult - Raw value returned by the agent loop
 * @returns {object|null} null for non-objects; otherwise success, exitCode,
 *   stderr (at most 1000 chars), output (at most 2000 chars), logLength and
 *   sessionId
 */
function summarizeAgentResult(agentResult) {
  if (agentResult === null || typeof agentResult !== 'object') return null;

  const clip = (value, maxChars) => (value ? String(value).slice(0, maxChars) : '');
  const { success, exitCode, stderr, output, log, sessionId } = agentResult;

  return {
    success,
    exitCode,
    stderr: clip(stderr, 1000),
    output: clip(output, 2000),
    logLength: Array.isArray(log) ? log.length : 0,
    sessionId: sessionId || null,
  };
}
506
-
507
/**
 * Best-effort ground truth for what Claude produced in a session: looks up
 * commits in the session's cwd within the [tsStart, tsEnd] window and returns
 * the diff of the first commit reported by findCommitsInWindow.
 *
 * @param {object} session - Needs `cwd`, `tsStart` and `tsEnd`
 * @returns {string|null} Commit diff, or null when no commit is found, the
 *   diff is empty, or any lookup throws
 */
function computeClaudeDiff(session) {
  try {
    const windowCommits = findCommitsInWindow(session.cwd, session.tsStart, session.tsEnd);
    if (windowCommits.length > 0) {
      const diff = getCommitDiff(session.cwd, windowCommits[0].hash);
      if (diff) return diff;
    }
  } catch {
    // fall through to the "nothing found" result
  }
  return null;
}
520
-
521
/**
 * LLM-as-judge: ask a model whether wall-e's diff achieves the same intent
 * as Claude's diff.
 *
 * The request is truncated to 1000 characters and each diff to 4000. The
 * reply is expected as two lines (`SCORE:` and `REASON:`); the previous
 * wording asked for "a single line" while requiring both, which invited
 * replies without a reason.
 *
 * @param {object} args
 * @param {string} args.prompt - The user's original request
 * @param {string} args.walleDiff - Diff produced by wall-e
 * @param {string|null} args.claudeDiff - Ground-truth diff, if any
 * @param {string} [args.model] - Judge model; defaults to 'claude-haiku-4-5-20251001'
 * @returns {Promise<{score: number, reason: string|null}>} Score clamped to
 *   [0, 1]; 0.5 when the reply contains no parsable score
 */
async function scoreLlmJudge({ prompt, walleDiff, claudeDiff, model }) {
  const { getDefaultClient } = require('../llm/client');
  const client = getDefaultClient();
  const judgePrompt = [
    `User's request: ${(prompt || '').slice(0, 1000)}`,
    '',
    `--- Diff A (Claude Code's actual output, ground truth) ---`,
    (claudeDiff || '<no commit landed for this session>').slice(0, 4000),
    '',
    `--- Diff B (wall-e's output) ---`,
    (walleDiff || '<empty>').slice(0, 4000),
    '',
    'Score 0.0 to 1.0: how well does Diff B accomplish the same intent as Diff A',
    'for the user\'s request? Consider goal achievement, not byte equality. If both',
    'diffs are empty/missing, score 0.5. Reply with ONLY these two lines:',
    'SCORE: <0.0–1.0>',
    'REASON: <one sentence>',
  ].join('\n');

  const resp = await client.chat({
    model: model || 'claude-haiku-4-5-20251001',
    messages: [{ role: 'user', content: judgePrompt }],
    maxTokens: 200,
  });
  const text = (resp.content || '').trim();
  const scoreMatch = text.match(/SCORE:\s*([0-9]*\.?[0-9]+)/i);
  const reasonMatch = text.match(/REASON:\s*(.+)/i);

  // No parsable score means "undecided" (0.5), not "failed" (0).
  const parsed = scoreMatch ? Number.parseFloat(scoreMatch[1]) : 0.5;
  const score = Number.isFinite(parsed) ? Math.min(1, Math.max(0, parsed)) : 0;
  return { score, reason: reasonMatch ? reasonMatch[1].trim() : null };
}
557
-
558
- // ---------------------------------------------------------------------------
559
- // Reap mode — harvest sessions into the persistent benchmark catalog
560
- // ---------------------------------------------------------------------------
561
-
562
- const crypto = require('crypto');
563
- const CODING_AGENT_BENCHMARKS_PATH = path.join(__dirname, 'benchmarks', 'coding-agent.json');
564
-
565
/**
 * Convert a picked session into a self-contained benchmark catalog entry.
 * The entry embeds everything needed to replay later (userMessages, recovered
 * commit, claudeTools, claudeFiles) so the original JSONL can be deleted
 * without breaking the entry.
 *
 * The id is `agent-cc-` plus the first 8 hex characters of
 * sha256(sessionId + '|' + first user message), so it is stable per session.
 * Trait detection matches tool names case-insensitively, so Claude Code's
 * capitalized names (`Glob`, `Grep`, `Read`, `Bash`, `Edit`) and snake_case
 * names are treated alike.
 *
 * @param {object} session - From pickRecentSessions
 * @param {string} recoveredCommit - Commit sha the replay should start from
 * @returns {object} Catalog entry with `id`, `prompt`, `taskType`,
 *   `difficulty`, `expectedTraits`, `agentExpectations` and `ccReplay`
 */
function sessionToCatalogEntry(session, recoveredCommit) {
  const firstPrompt = session.userMessages[0] || '';
  const sha = crypto
    .createHash('sha256')
    .update(`${session.sessionId}|${firstPrompt}`)
    .digest('hex')
    .slice(0, 8);
  const id = `agent-cc-${sha}`;

  const tools = session.claudeToolCalls || [];
  const usesTool = (pattern) => tools.some((tool) => pattern.test(tool));
  const traits = ['has code block'];
  if (usesTool(/read|glob|grep/i)) traits.push('reads before writing');
  if (usesTool(/edit|str_replace/i)) traits.push('uses edit over write');
  if (usesTool(/bash/i)) traits.push('runs commands');

  const turnCount = session.turnCount || 1;
  const difficulty = turnCount > 10 ? 'hard' : turnCount > 4 ? 'medium' : 'easy';

  return {
    id,
    prompt: firstPrompt,
    taskType: 'coding-agent',
    difficulty,
    expectedTraits: traits,
    agentExpectations: {
      expectedToolCalls: [...new Set(tools)].slice(0, 12),
      maxTurns: Math.min(turnCount * 2, 50),
      expectedFileChanges: session.claudeFilesEdited || [],
    },
    ccReplay: {
      sourceSessionId: session.sessionId,
      cwd: session.cwd,
      gitBranch: session.gitBranch || null,
      tsStart: session.tsStart,
      tsEnd: session.tsEnd,
      recoveredCommit,
      userMessages: session.userMessages,
      turnCount,
      claudeToolCalls: tools,
      claudeFilesEdited: session.claudeFilesEdited || [],
      reapedAt: new Date().toISOString(),
    },
  };
}
609
-
610
/**
 * Read the benchmark catalog from disk.
 *
 * Only a missing or empty file counts as "no catalog yet". An unreadable,
 * unparsable or non-array catalog throws, so the caller never overwrites
 * existing entries with a freshly started list.
 *
 * @param {string} benchmarksPath - Catalog file path
 * @returns {object[]} Parsed catalog entries
 * @throws {Error} When the file exists but cannot be used as a catalog
 */
function loadBenchmarkCatalog(benchmarksPath) {
  let raw;
  try {
    raw = fs.readFileSync(benchmarksPath, 'utf8');
  } catch (err) {
    if (err && err.code === 'ENOENT') return [];
    throw new Error(`Cannot read benchmark catalog ${benchmarksPath}: ${err.message}`, { cause: err });
  }
  if (!raw.trim()) return [];

  let parsed;
  try {
    parsed = JSON.parse(raw);
  } catch (err) {
    throw new Error(
      `Benchmark catalog ${benchmarksPath} is not valid JSON; refusing to overwrite it`,
      { cause: err }
    );
  }
  if (!Array.isArray(parsed)) {
    throw new Error(`Benchmark catalog ${benchmarksPath} must contain a JSON array; refusing to overwrite it`);
  }
  return parsed;
}

/**
 * Harvest fresh Claude Code sessions and append them to the benchmark
 * catalog (coding-agent.json). Sessions are skipped when their session id or
 * derived entry id is already in the catalog, or when no starting commit can
 * be recovered. The catalog file is rewritten only when something was added.
 *
 * @param {object} opts
 * @param {string} opts.repoPath
 * @param {number} [opts.limit=5] - Max NEW entries to add this run
 * @param {number} [opts.sinceDays=14] - Only consider JSONLs within this mtime window
 * @param {string} [opts.benchmarksPath] - Override for tests
 * @returns {{added: object[], skipped: number, total: number}}
 * @throws {Error} When `repoPath` is missing or the existing catalog is
 *   unreadable, malformed, or not an array
 */
function reapSessions({ repoPath, limit = 5, sinceDays = 14, benchmarksPath = CODING_AGENT_BENCHMARKS_PATH } = {}) {
  if (!repoPath) throw new Error('repoPath is required');

  // Load existing IDs for dedup
  const existing = loadBenchmarkCatalog(benchmarksPath);
  const existingIds = new Set(existing.map((e) => e?.id));
  const existingSourceIds = new Set(
    existing.filter((e) => e?.ccReplay?.sourceSessionId).map((e) => e.ccReplay.sourceSessionId)
  );

  // Pick more candidates than `limit` so we can skip non-replayable ones
  const candidates = pickRecentSessions({ repoPath, sinceDays, limit: limit * 4 });

  const added = [];
  let skipped = 0;
  for (const session of candidates) {
    if (added.length >= limit) break;

    if (existingSourceIds.has(session.sessionId)) { skipped++; continue; }

    const repo = recoverRepoState(session.cwd, session.tsStart);
    if (!repo) { skipped++; continue; } // can't reconstruct repo state

    const entry = sessionToCatalogEntry(session, repo.commitSha);
    if (existingIds.has(entry.id)) { skipped++; continue; }

    existing.push(entry);
    existingIds.add(entry.id);
    existingSourceIds.add(session.sessionId);
    added.push(entry);
  }

  if (added.length > 0) {
    fs.writeFileSync(benchmarksPath, JSON.stringify(existing, null, 2) + '\n');
  }

  return { added, skipped, total: existing.length };
}
662
-
663
/**
 * Replay a catalog entry using only its embedded `ccReplay` block — no live
 * JSONL is needed. A sandbox worktree is created at the recorded commit, the
 * session is replayed and scored, and the sandbox is removed afterwards
 * whether or not the replay succeeded.
 *
 * @param {object} bench - Catalog entry carrying `bench.ccReplay`
 * @param {Function} runAgentLoop
 * @param {object} [opts] - Forwarded to replayAndScore
 * @returns {Promise<object>} Same shape as replayAndScore
 * @throws {Error} When the entry has no `ccReplay` block or the sandbox
 *   cannot be created
 */
async function replayFromCatalog(bench, runAgentLoop, opts = {}) {
  const cc = bench.ccReplay;
  if (!cc) throw new Error(`Benchmark ${bench.id} has no ccReplay block`);

  // Older entries may lack userMessages; fall back to the single prompt.
  const messages = cc.userMessages || [bench.prompt];
  const session = {
    sessionId: cc.sourceSessionId,
    cwd: cc.cwd,
    gitBranch: cc.gitBranch,
    userMessages: messages,
    tsStart: cc.tsStart,
    tsEnd: cc.tsEnd,
    turnCount: cc.turnCount || messages.length,
    claudeToolCalls: cc.claudeToolCalls || [],
    claudeFilesEdited: cc.claudeFilesEdited || [],
  };

  const sandbox = makeReplaySandbox(cc.cwd, cc.recoveredCommit, session.sessionId);
  try {
    return await replayAndScore(session, sandbox, runAgentLoop, opts);
  } finally {
    try {
      removeReplaySandbox(sandbox, cc.cwd);
    } catch (e) {
      console.warn('[cc-replay] catalog sandbox cleanup failed:', e.message);
    }
  }
}
700
-
701
- module.exports = {
702
- pickRecentSessions,
703
- recoverRepoState,
704
- makeReplaySandbox,
705
- removeReplaySandbox,
706
- gitWorktreeCleanup,
707
- replayAndScore,
708
- reapSessions,
709
- replayFromCatalog,
710
- sessionToCatalogEntry,
711
- // exposed for tests
712
- parseSessionJsonl,
713
- jaccard,
714
- canonicalReplayFilePath,
715
- canonicalToolName,
716
- canonicalizeToolList,
717
- extractWalleFiles,
718
- extractPatchFilePaths,
719
- };