create-walle 0.9.21 → 0.9.23

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (500)
  1. package/README.md +27 -5
  2. package/package.json +2 -2
  3. package/template/CLAUDE.md +2 -2
  4. package/template/LICENSE +1 -1
  5. package/template/bin/ctm-dev-cleanup.js +24 -3
  6. package/template/bin/ctm-launch.sh +13 -0
  7. package/template/bin/dev.sh +156 -18
  8. package/template/bin/node-bin.sh +84 -0
  9. package/template/bin/pin-node.sh +51 -0
  10. package/template/claude-task-manager/api-prompts.js +1203 -182
  11. package/template/claude-task-manager/api-reviews.js +109 -15
  12. package/template/claude-task-manager/approval-agent.js +1360 -280
  13. package/template/claude-task-manager/bin/restart-ctm.sh +64 -23
  14. package/template/claude-task-manager/bin/storage-migration-supervisor.js +338 -0
  15. package/template/claude-task-manager/db.js +4417 -295
  16. package/template/claude-task-manager/docs/app-update-refresh-protocol.md +69 -0
  17. package/template/claude-task-manager/docs/approval-ai-refinement.md +138 -0
  18. package/template/claude-task-manager/docs/approval-rescue-loop.md +74 -0
  19. package/template/claude-task-manager/docs/codex-operational-warning-health.md +107 -0
  20. package/template/claude-task-manager/docs/codex-resume-state-guard-design.md +17 -12
  21. package/template/claude-task-manager/docs/codex-terminal-render-controller-handoff.md +311 -0
  22. package/template/claude-task-manager/docs/coding-agent-hooks-architecture.md +418 -0
  23. package/template/claude-task-manager/docs/conversation-import-freshness.md +20 -0
  24. package/template/claude-task-manager/docs/google-workspace-auth-health.md +77 -0
  25. package/template/claude-task-manager/docs/image-paste-ux.md +13 -0
  26. package/template/claude-task-manager/docs/ipad-web-preview.md +88 -0
  27. package/template/claude-task-manager/docs/main-loop-offload-architecture.md +66 -0
  28. package/template/claude-task-manager/docs/microsoft-dev-tunnel-phone-access-design.md +274 -519
  29. package/template/claude-task-manager/docs/mobile-live-streaming.md +27 -5
  30. package/template/claude-task-manager/docs/mobile-remote-submission-lifecycle.md +69 -0
  31. package/template/claude-task-manager/docs/phone-access-design.md +53 -15
  32. package/template/claude-task-manager/docs/phone-passkey-identity.md +122 -0
  33. package/template/claude-task-manager/docs/phone-setup.md +3 -0
  34. package/template/claude-task-manager/docs/prompt-editing-tree-design.md +25 -1
  35. package/template/claude-task-manager/docs/remote-desktop-access-design.md +268 -0
  36. package/template/claude-task-manager/docs/restart-lifecycle-architecture.md +95 -0
  37. package/template/claude-task-manager/docs/runtime-work-control-plane.md +53 -0
  38. package/template/claude-task-manager/docs/session-interactive-wait-surfaces.md +38 -0
  39. package/template/claude-task-manager/docs/session-needs-you-dismissal.md +84 -0
  40. package/template/claude-task-manager/docs/session-render-state-management-design.md +91 -3
  41. package/template/claude-task-manager/docs/session-standup-command-center-design.md +25 -1
  42. package/template/claude-task-manager/docs/session-title-authority.md +32 -0
  43. package/template/claude-task-manager/docs/session-workspace-binding.md +33 -0
  44. package/template/claude-task-manager/docs/skill-intent-resolution-design.md +72 -0
  45. package/template/claude-task-manager/docs/walle-mcp-supervisor-health.md +86 -0
  46. package/template/claude-task-manager/docs/walle-relay-phone-access-design.md +24 -15
  47. package/template/claude-task-manager/docs/walle-session-history-hydration.md +114 -0
  48. package/template/claude-task-manager/docs/walle-session-input-queue.md +104 -0
  49. package/template/claude-task-manager/docs/walle-session-model-catalog.md +90 -0
  50. package/template/claude-task-manager/docs/walle-session-model-preferences.md +15 -6
  51. package/template/claude-task-manager/git-utils.js +897 -27
  52. package/template/claude-task-manager/lib/agent-capabilities.js +33 -0
  53. package/template/claude-task-manager/lib/agent-cli-cache.js +37 -7
  54. package/template/claude-task-manager/lib/agent-hooks-installer.js +26 -2
  55. package/template/claude-task-manager/lib/agent-presets.js +17 -1
  56. package/template/claude-task-manager/lib/all-sessions-query.js +108 -0
  57. package/template/claude-task-manager/lib/approval-ai-refinement.js +488 -0
  58. package/template/claude-task-manager/lib/approval-self-adapt.js +168 -0
  59. package/template/claude-task-manager/lib/async-semaphore.js +44 -0
  60. package/template/claude-task-manager/lib/auth-context.js +5 -0
  61. package/template/claude-task-manager/lib/auth-rate-limit.js +47 -4
  62. package/template/claude-task-manager/lib/auth-rules.js +29 -2
  63. package/template/claude-task-manager/lib/auto-approval-verifier.js +129 -16
  64. package/template/claude-task-manager/lib/background-llm.js +144 -17
  65. package/template/claude-task-manager/lib/branch-inventory.js +212 -0
  66. package/template/claude-task-manager/lib/claude-desktop-sessions.js +15 -3
  67. package/template/claude-task-manager/lib/coalesce-sync-frames.js +151 -0
  68. package/template/claude-task-manager/lib/codex-launch-health.js +762 -0
  69. package/template/claude-task-manager/lib/codex-transcript-pager.js +51 -0
  70. package/template/claude-task-manager/lib/codex-zst.js +124 -0
  71. package/template/claude-task-manager/lib/coding-agent-models.js +233 -30
  72. package/template/claude-task-manager/lib/connection-health.js +232 -0
  73. package/template/claude-task-manager/lib/conversation-blob-parser.js +42 -0
  74. package/template/claude-task-manager/lib/conversation-tail-merge.js +89 -26
  75. package/template/claude-task-manager/lib/ctm-session-context-api.js +39 -10
  76. package/template/claude-task-manager/lib/cursor-conversation-store.js +354 -0
  77. package/template/claude-task-manager/lib/db-owner-worker-client.js +315 -0
  78. package/template/claude-task-manager/lib/document-review.js +141 -6
  79. package/template/claude-task-manager/lib/escalation-review.js +152 -0
  80. package/template/claude-task-manager/lib/graceful-shutdown.js +159 -0
  81. package/template/claude-task-manager/lib/headless-term-service.js +678 -0
  82. package/template/claude-task-manager/lib/heavy-worker-fallback.js +38 -0
  83. package/template/claude-task-manager/lib/jsonl-conversation-parser.js +542 -0
  84. package/template/claude-task-manager/lib/jsonl-range-reader.js +112 -0
  85. package/template/claude-task-manager/lib/main-db-census.js +216 -0
  86. package/template/claude-task-manager/lib/message-pagination.js +106 -4
  87. package/template/claude-task-manager/lib/microsoft-dev-tunnel-setup.js +750 -26
  88. package/template/claude-task-manager/lib/mobile-auth-api.js +274 -7
  89. package/template/claude-task-manager/lib/mobile-auth-store.js +592 -10
  90. package/template/claude-task-manager/lib/mobile-notification-dispatcher.js +15 -0
  91. package/template/claude-task-manager/lib/model-overview-brain-fallback.js +311 -0
  92. package/template/claude-task-manager/lib/model-overview-cache.js +141 -0
  93. package/template/claude-task-manager/lib/models-health-routing-notice.js +126 -0
  94. package/template/claude-task-manager/lib/node-pin-guard.js +93 -0
  95. package/template/claude-task-manager/lib/perf-tracker.js +242 -6
  96. package/template/claude-task-manager/lib/permission-match.js +76 -0
  97. package/template/claude-task-manager/lib/permission-sync.js +133 -20
  98. package/template/claude-task-manager/lib/process-title.js +35 -0
  99. package/template/claude-task-manager/lib/prompt-executions-query.js +25 -0
  100. package/template/claude-task-manager/lib/prompt-index-disk-cache.js +44 -0
  101. package/template/claude-task-manager/lib/prompt-intent.js +132 -0
  102. package/template/claude-task-manager/lib/provider-user-context.js +34 -0
  103. package/template/claude-task-manager/lib/read-pool-client.js +313 -0
  104. package/template/claude-task-manager/lib/readpool-breaker.js +31 -0
  105. package/template/claude-task-manager/lib/recent-sessions-breaker.js +12 -0
  106. package/template/claude-task-manager/lib/remote-feedback-client.js +72 -0
  107. package/template/claude-task-manager/lib/remote-relay-protocol.js +37 -4
  108. package/template/claude-task-manager/lib/remote-relay-store.js +159 -0
  109. package/template/claude-task-manager/lib/remote-submission-observer.js +278 -0
  110. package/template/claude-task-manager/lib/restart-guard.js +109 -0
  111. package/template/claude-task-manager/lib/restore-interruption-detector.js +439 -0
  112. package/template/claude-task-manager/lib/restore-policy.js +13 -0
  113. package/template/claude-task-manager/lib/restore-resume-batch.js +74 -0
  114. package/template/claude-task-manager/lib/restore-runtime.js +68 -0
  115. package/template/claude-task-manager/lib/restore-storm.js +34 -0
  116. package/template/claude-task-manager/lib/resume-cwd.js +36 -0
  117. package/template/claude-task-manager/lib/resume-preflight.js +313 -0
  118. package/template/claude-task-manager/lib/runtime-work-registry.js +444 -0
  119. package/template/claude-task-manager/lib/sanitize-openai-auth.js +31 -0
  120. package/template/claude-task-manager/lib/scheduler.js +21 -1
  121. package/template/claude-task-manager/lib/scrollback-snapshot-store.js +159 -0
  122. package/template/claude-task-manager/lib/serial-task-queue.js +64 -0
  123. package/template/claude-task-manager/lib/server-listeners.js +239 -0
  124. package/template/claude-task-manager/lib/session-capture.js +42 -7
  125. package/template/claude-task-manager/lib/session-content-backfill.js +131 -0
  126. package/template/claude-task-manager/lib/session-history.js +388 -43
  127. package/template/claude-task-manager/lib/session-host-manager.js +287 -0
  128. package/template/claude-task-manager/lib/session-image-refs.js +209 -0
  129. package/template/claude-task-manager/lib/session-jobs.js +399 -59
  130. package/template/claude-task-manager/lib/session-prompt-index.js +137 -0
  131. package/template/claude-task-manager/lib/session-restore.js +53 -0
  132. package/template/claude-task-manager/lib/session-standup.js +123 -23
  133. package/template/claude-task-manager/lib/session-state-bus.js +14 -0
  134. package/template/claude-task-manager/lib/session-stream.js +64 -16
  135. package/template/claude-task-manager/lib/session-timeline-summary.js +260 -0
  136. package/template/claude-task-manager/lib/session-token-usage.js +494 -0
  137. package/template/claude-task-manager/lib/session-workspace-binding.js +356 -0
  138. package/template/claude-task-manager/lib/setup-network-config.js +9 -0
  139. package/template/claude-task-manager/lib/size-cap.js +45 -0
  140. package/template/claude-task-manager/lib/size-cap.test.js +62 -0
  141. package/template/claude-task-manager/lib/skill-autocomplete.js +180 -1
  142. package/template/claude-task-manager/lib/skill-intent-resolver.js +304 -0
  143. package/template/claude-task-manager/lib/sqlite-driver.js +19 -3
  144. package/template/claude-task-manager/lib/standup-attention.js +7 -3
  145. package/template/claude-task-manager/lib/status-authority.js +39 -0
  146. package/template/claude-task-manager/lib/status-hooks.js +4 -0
  147. package/template/claude-task-manager/lib/storage-migration.js +235 -0
  148. package/template/claude-task-manager/lib/structured-capture.js +298 -0
  149. package/template/claude-task-manager/lib/sync-io-census.js +163 -0
  150. package/template/claude-task-manager/lib/tailscale-setup.js +6 -0
  151. package/template/claude-task-manager/lib/terminal-activity-evidence.js +33 -0
  152. package/template/claude-task-manager/lib/terminal-choice.js +364 -0
  153. package/template/claude-task-manager/lib/terminal-control-sanitize.js +17 -0
  154. package/template/claude-task-manager/lib/terminal-fingerprint.js +48 -0
  155. package/template/claude-task-manager/lib/terminal-output-flush.js +84 -0
  156. package/template/claude-task-manager/lib/timeline-order.js +122 -0
  157. package/template/claude-task-manager/lib/transcript-store.js +348 -43
  158. package/template/claude-task-manager/lib/transport-security.js +84 -1
  159. package/template/claude-task-manager/lib/wait-state.js +184 -0
  160. package/template/claude-task-manager/lib/walle-client.js +47 -5
  161. package/template/claude-task-manager/lib/walle-ctm-history.js +564 -4
  162. package/template/claude-task-manager/lib/walle-external-actions.js +135 -16
  163. package/template/claude-task-manager/lib/walle-history-hydration.js +46 -0
  164. package/template/claude-task-manager/lib/walle-native-health.js +403 -0
  165. package/template/claude-task-manager/lib/walle-repair.js +701 -0
  166. package/template/claude-task-manager/lib/walle-session-cache.js +109 -0
  167. package/template/claude-task-manager/lib/walle-session-context.js +57 -21
  168. package/template/claude-task-manager/lib/walle-session-model-catalog.js +34 -0
  169. package/template/claude-task-manager/lib/walle-supervisor.js +539 -63
  170. package/template/claude-task-manager/lib/walle-transcript.js +52 -0
  171. package/template/claude-task-manager/lib/worktree-active-sync.js +11 -7
  172. package/template/claude-task-manager/lib/worktree-cwd.js +32 -1
  173. package/template/claude-task-manager/package.json +1 -1
  174. package/template/claude-task-manager/prompt-harvest.js +89 -66
  175. package/template/claude-task-manager/providers/claude-code.js +51 -3
  176. package/template/claude-task-manager/providers/cursor.js +140 -45
  177. package/template/claude-task-manager/public/css/reviews.css +551 -61
  178. package/template/claude-task-manager/public/css/setup.css +191 -0
  179. package/template/claude-task-manager/public/css/walle-session.css +865 -10
  180. package/template/claude-task-manager/public/css/walle.css +154 -0
  181. package/template/claude-task-manager/public/designs/ai-providers-consolidation-v2.html +830 -0
  182. package/template/claude-task-manager/public/index.html +18516 -2058
  183. package/template/claude-task-manager/public/ipad.html +363 -0
  184. package/template/claude-task-manager/public/js/document-review-links.js +301 -0
  185. package/template/claude-task-manager/public/js/image-normalize.js +69 -36
  186. package/template/claude-task-manager/public/js/message-renderer.js +1265 -77
  187. package/template/claude-task-manager/public/js/prompts.js +66 -29
  188. package/template/claude-task-manager/public/js/reviews.js +901 -133
  189. package/template/claude-task-manager/public/js/session-activity-utils.js +11 -1
  190. package/template/claude-task-manager/public/js/session-search-utils.js +94 -10
  191. package/template/claude-task-manager/public/js/session-status-precedence.js +23 -5
  192. package/template/claude-task-manager/public/js/setup.js +1273 -176
  193. package/template/claude-task-manager/public/js/stream-view.js +691 -73
  194. package/template/claude-task-manager/public/js/terminal-reconciler.js +210 -0
  195. package/template/claude-task-manager/public/js/walle-session.js +2455 -158
  196. package/template/claude-task-manager/public/js/walle.js +455 -28
  197. package/template/claude-task-manager/public/m/app.css +2909 -262
  198. package/template/claude-task-manager/public/m/app.js +6601 -398
  199. package/template/claude-task-manager/public/m/claim.html +224 -17
  200. package/template/claude-task-manager/public/m/index.html +117 -21
  201. package/template/claude-task-manager/public/m/sw.js +3 -1
  202. package/template/claude-task-manager/public/manifest.json +2 -2
  203. package/template/claude-task-manager/public/prompts.html +30 -14
  204. package/template/claude-task-manager/queue-engine.js +507 -28
  205. package/template/claude-task-manager/scripts/repair-claude-session-images.js +27 -8
  206. package/template/claude-task-manager/server.js +14341 -2197
  207. package/template/claude-task-manager/session-integrity.js +160 -18
  208. package/template/claude-task-manager/session-search-ranking.js +1 -0
  209. package/template/claude-task-manager/session-utils.js +25 -5
  210. package/template/claude-task-manager/workers/approval-blocklist.js +96 -6
  211. package/template/claude-task-manager/workers/approval-widget-validator.js +14 -8
  212. package/template/claude-task-manager/workers/conversation-import-worker.js +11 -50
  213. package/template/claude-task-manager/workers/db-owner-worker.js +386 -0
  214. package/template/claude-task-manager/workers/harvest-worker.js +9 -55
  215. package/template/claude-task-manager/workers/headless-term-worker.js +9 -530
  216. package/template/claude-task-manager/workers/read-pool-worker.js +387 -0
  217. package/template/claude-task-manager/workers/scrollback-worker.js +11 -72
  218. package/template/claude-task-manager/workers/session-host-process.js +146 -0
  219. package/template/claude-task-manager/workers/session-integrity-worker.js +10 -54
  220. package/template/claude-task-manager/workers/state-detectors/base.js +18 -1
  221. package/template/claude-task-manager/workers/state-detectors/claude-code.js +182 -9
  222. package/template/claude-task-manager/workers/state-detectors/codex.js +150 -2
  223. package/template/claude-task-manager/workers/state-detectors/cursor.js +127 -0
  224. package/template/claude-task-manager/workers/state-detectors/gemini.js +21 -0
  225. package/template/claude-task-manager/workers/state-detectors/index.js +29 -0
  226. package/template/claude-task-manager/workers/state-detectors/opencode.js +103 -0
  227. package/template/docs/design/markdown-review-pane.md +206 -0
  228. package/template/docs/designs/2026-05-17-portkey-gateway-provider-ux.md +129 -38
  229. package/template/docs/designs/2026-05-20-mobile-worktree-finish-command.md +27 -0
  230. package/template/docs/designs/2026-05-22-ai-configuration-consolidation.md +248 -0
  231. package/template/docs/designs/ai-configuration-consolidation-mock.html +812 -0
  232. package/template/docs/private-memory-and-pii-policy.md +69 -0
  233. package/template/package.json +2 -1
  234. package/template/scripts/check-private-data.js +201 -0
  235. package/template/shared/sqlite-owner-guard.js +30 -0
  236. package/template/shared/sqlite-owner-write-queue.js +225 -0
  237. package/template/shared/sqlite-storage-policy.js +111 -0
  238. package/template/shared/sqlite-write-lock.js +428 -0
  239. package/template/wall-e/agent-runners/claude-code.js +5 -0
  240. package/template/wall-e/agent.js +166 -22
  241. package/template/wall-e/api-walle.js +524 -70
  242. package/template/wall-e/auth/provider-flows.js +11 -1
  243. package/template/wall-e/bin/walle-mcp-stdio.js +341 -17
  244. package/template/wall-e/brain.js +1614 -141
  245. package/template/wall-e/chat/attachment-blocks.js +96 -0
  246. package/template/wall-e/chat/attachments.js +2 -1
  247. package/template/wall-e/chat/capability-resolver.js +7 -7
  248. package/template/wall-e/chat/context-messages.js +28 -0
  249. package/template/wall-e/chat/conversation-frame.js +630 -0
  250. package/template/wall-e/chat/provider-messages.js +125 -0
  251. package/template/wall-e/chat.js +1002 -233
  252. package/template/wall-e/coding/acceptance-contract.js +170 -0
  253. package/template/wall-e/coding/acp-adapter.js +1 -1
  254. package/template/wall-e/coding/agent-catalog.js +3 -0
  255. package/template/wall-e/coding/artifact-store.js +93 -0
  256. package/template/wall-e/coding/capability-router.js +120 -0
  257. package/template/wall-e/coding/coding-run-controller.js +423 -0
  258. package/template/wall-e/coding/compaction-service.js +157 -12
  259. package/template/wall-e/coding/frontend-verification.js +258 -0
  260. package/template/wall-e/coding/lifecycle-hooks.js +75 -0
  261. package/template/wall-e/coding/local-preview-contract.js +157 -0
  262. package/template/wall-e/coding/permission-service.js +57 -13
  263. package/template/wall-e/coding/prompt-bundle.js +19 -1
  264. package/template/wall-e/coding/prompt-section-registry.js +227 -0
  265. package/template/wall-e/coding/provider-compat.js +15 -0
  266. package/template/wall-e/coding/runtime-events.js +224 -0
  267. package/template/wall-e/coding/runtime-mode.js +3 -0
  268. package/template/wall-e/coding/side-git-snapshot.js +160 -4
  269. package/template/wall-e/coding/snapshot-service.js +143 -1
  270. package/template/wall-e/coding/stream-processor.js +388 -34
  271. package/template/wall-e/coding/task-tool.js +141 -4
  272. package/template/wall-e/coding/tool-execution-controller.js +365 -0
  273. package/template/wall-e/coding/tool-registry.js +43 -5
  274. package/template/wall-e/coding/user-hooks.js +217 -0
  275. package/template/wall-e/coding-orchestrator.js +1330 -221
  276. package/template/wall-e/coding-prompts.js +20 -4
  277. package/template/wall-e/context/context-builder.js +15 -2
  278. package/template/wall-e/decision/confidence.js +1 -1
  279. package/template/wall-e/docs/coding-acceptance-contract.md +41 -0
  280. package/template/wall-e/docs/external-action-controller.md +26 -6
  281. package/template/wall-e/docs/telemetry-lifecycle.md +8 -2
  282. package/template/wall-e/embeddings.js +591 -53
  283. package/template/wall-e/external-action-controller.js +12 -0
  284. package/template/wall-e/http/auth.js +1 -0
  285. package/template/wall-e/http/chat-api.js +46 -11
  286. package/template/wall-e/http/model-admin.js +836 -34
  287. package/template/wall-e/lib/boot-profile.js +88 -0
  288. package/template/wall-e/lib/event-loop-monitor.js +93 -0
  289. package/template/wall-e/lib/service-health.js +194 -0
  290. package/template/wall-e/llm/anthropic.js +130 -5
  291. package/template/wall-e/llm/client.js +266 -63
  292. package/template/wall-e/llm/default-fallback.js +382 -0
  293. package/template/wall-e/llm/health.js +19 -0
  294. package/template/wall-e/llm/message-guard.js +78 -0
  295. package/template/wall-e/llm/model-catalog.js +252 -1
  296. package/template/wall-e/llm/openai.js +26 -4
  297. package/template/wall-e/llm/portkey-sync.js +654 -0
  298. package/template/wall-e/llm/provider-error.js +30 -2
  299. package/template/wall-e/llm/registry.js +5 -1
  300. package/template/wall-e/llm/request-compat.js +67 -0
  301. package/template/wall-e/loops/backfill.js +79 -23
  302. package/template/wall-e/loops/brain-optimize.js +67 -0
  303. package/template/wall-e/loops/ingest.js +25 -10
  304. package/template/wall-e/loops/question-digest.js +160 -0
  305. package/template/wall-e/loops/reflect.js +6 -4
  306. package/template/wall-e/loops/think.js +39 -12
  307. package/template/wall-e/mcp-server.js +318 -36
  308. package/template/wall-e/memory/ctm-context-client.js +52 -14
  309. package/template/wall-e/memory/ctm-operational-context.js +237 -0
  310. package/template/wall-e/memory/ctm-prompt-executions-client.js +128 -0
  311. package/template/wall-e/memory/ctm-session-context.js +111 -63
  312. package/template/wall-e/prompts/coding/deepseek.txt +3 -0
  313. package/template/wall-e/prompts/coding/gemini.txt +6 -0
  314. package/template/wall-e/prompts/coding/gpt.txt +6 -0
  315. package/template/wall-e/prompts/coding/local.txt +7 -0
  316. package/template/wall-e/runtime/decision-hooks.js +115 -0
  317. package/template/wall-e/runtime/devbox-gateway.js +82 -8
  318. package/template/wall-e/runtime/prompt-manifest.js +86 -0
  319. package/template/wall-e/runtime/tool-executor.js +269 -0
  320. package/template/wall-e/runtime/tool-result-envelope.js +138 -0
  321. package/template/wall-e/runtime/transcript-projection.js +60 -0
  322. package/template/wall-e/runtime/walle-runtime.js +224 -0
  323. package/template/wall-e/scripts/db-optimize/migrate.js +162 -0
  324. package/template/wall-e/scripts/db-optimize/recall-eval.js +117 -0
  325. package/template/wall-e/server.js +15 -0
  326. package/template/wall-e/session-files.js +9 -0
  327. package/template/wall-e/skills/_bundled/google-calendar/run.js +1 -1
  328. package/template/wall-e/skills/_bundled/gws-workspace/run.js +1 -1
  329. package/template/wall-e/skills/_bundled/slack-mentions/run.js +76 -6
  330. package/template/wall-e/skills/claude-code-reader.js +7 -3
  331. package/template/wall-e/skills/script-skill-runner.js +10 -0
  332. package/template/wall-e/skills/skill-planner.js +38 -0
  333. package/template/wall-e/tools/builtin-middleware.js +19 -9
  334. package/template/wall-e/tools/local-tools.js +1428 -16
  335. package/template/wall-e/tools/permission-checker.js +73 -5
  336. package/template/wall-e/tools/question-manager.js +117 -7
  337. package/template/wall-e/training/harvester.js +12 -28
  338. package/template/wall-e/training/replay.js +25 -80
  339. package/template/website/index.html +10 -10
  340. package/template/wall-e/eval/ab-test.js +0 -203
  341. package/template/wall-e/eval/agent-runner.js +0 -772
  342. package/template/wall-e/eval/agent-scorer.js +0 -461
  343. package/template/wall-e/eval/aggregator.js +0 -414
  344. package/template/wall-e/eval/allowed-test-commands.js +0 -34
  345. package/template/wall-e/eval/benchmark-generator.js +0 -113
  346. package/template/wall-e/eval/benchmarks/chat-eval.json +0 -1662
  347. package/template/wall-e/eval/benchmarks/chat.json +0 -82
  348. package/template/wall-e/eval/benchmarks/coding-agent-real.json +0 -1
  349. package/template/wall-e/eval/benchmarks/coding-agent.json +0 -1581
  350. package/template/wall-e/eval/benchmarks/coding.json +0 -122
  351. package/template/wall-e/eval/benchmarks/memory-retrieval.json +0 -234
  352. package/template/wall-e/eval/benchmarks/reasoning.json +0 -82
  353. package/template/wall-e/eval/benchmarks/swebench-lite-30.json +0 -212
  354. package/template/wall-e/eval/benchmarks.js +0 -669
  355. package/template/wall-e/eval/cc-replay.js +0 -719
  356. package/template/wall-e/eval/chat-eval.js +0 -525
  357. package/template/wall-e/eval/check-keys.js +0 -15
  358. package/template/wall-e/eval/check-providers.js +0 -42
  359. package/template/wall-e/eval/codex-cli-baseline.js +0 -669
  360. package/template/wall-e/eval/coding-agent-real.js +0 -570
  361. package/template/wall-e/eval/context-compactor.js +0 -251
  362. package/template/wall-e/eval/debug-agent003.js +0 -68
  363. package/template/wall-e/eval/diagnostics.js +0 -216
  364. package/template/wall-e/eval/eval-orchestrator.js +0 -642
  365. package/template/wall-e/eval/evaluate.js +0 -202
  366. package/template/wall-e/eval/evaluator.js +0 -373
  367. package/template/wall-e/eval/exporter.js +0 -212
  368. package/template/wall-e/eval/fixtures/express-basic/package.json +0 -9
  369. package/template/wall-e/eval/fixtures/express-basic/server.js +0 -115
  370. package/template/wall-e/eval/fixtures/express-basic/test.js +0 -83
  371. package/template/wall-e/eval/fixtures/express-buggy/package.json +0 -9
  372. package/template/wall-e/eval/fixtures/express-buggy/server.js +0 -113
  373. package/template/wall-e/eval/fixtures/express-buggy/test.js +0 -83
  374. package/template/wall-e/eval/fixtures/express-buggy-items/package.json +0 -9
  375. package/template/wall-e/eval/fixtures/express-buggy-items/server.js +0 -112
  376. package/template/wall-e/eval/fixtures/express-buggy-items/test.js +0 -83
  377. package/template/wall-e/eval/fixtures/express-buggy-search/package.json +0 -9
  378. package/template/wall-e/eval/fixtures/express-buggy-search/server.js +0 -121
  379. package/template/wall-e/eval/fixtures/express-buggy-search/test.js +0 -83
  380. package/template/wall-e/eval/fixtures/express-rename-data/data.js +0 -34
  381. package/template/wall-e/eval/fixtures/express-rename-data/package.json +0 -9
  382. package/template/wall-e/eval/fixtures/express-rename-data/server.js +0 -97
  383. package/template/wall-e/eval/fixtures/express-rename-data/test.js +0 -88
  384. package/template/wall-e/eval/fixtures/express-xss/package.json +0 -12
  385. package/template/wall-e/eval/fixtures/express-xss/server.js +0 -90
  386. package/template/wall-e/eval/fixtures/express-xss/test.js +0 -67
  387. package/template/wall-e/eval/fixtures/express-xss/views/profile.ejs +0 -9
  388. package/template/wall-e/eval/fixtures/fullstack-app/config/default.js +0 -9
  389. package/template/wall-e/eval/fixtures/fullstack-app/config/test.js +0 -13
  390. package/template/wall-e/eval/fixtures/fullstack-app/package.json +0 -11
  391. package/template/wall-e/eval/fixtures/fullstack-app/public/css/style.css +0 -137
  392. package/template/wall-e/eval/fixtures/fullstack-app/public/index.html +0 -46
  393. package/template/wall-e/eval/fixtures/fullstack-app/public/js/app.js +0 -121
  394. package/template/wall-e/eval/fixtures/fullstack-app/public/js/auth.js +0 -71
  395. package/template/wall-e/eval/fixtures/fullstack-app/public/js/items.js +0 -80
  396. package/template/wall-e/eval/fixtures/fullstack-app/public/js/users.js +0 -46
  397. package/template/wall-e/eval/fixtures/fullstack-app/public/login.html +0 -45
  398. package/template/wall-e/eval/fixtures/fullstack-app/public/register.html +0 -38
  399. package/template/wall-e/eval/fixtures/fullstack-app/scripts/migrate.js +0 -23
  400. package/template/wall-e/eval/fixtures/fullstack-app/scripts/seed.js +0 -46
  401. package/template/wall-e/eval/fixtures/fullstack-app/server/db.js +0 -99
  402. package/template/wall-e/eval/fixtures/fullstack-app/server/index.js +0 -94
  403. package/template/wall-e/eval/fixtures/fullstack-app/server/middleware/auth.js +0 -19
  404. package/template/wall-e/eval/fixtures/fullstack-app/server/middleware/logger.js +0 -19
  405. package/template/wall-e/eval/fixtures/fullstack-app/server/router.js +0 -50
  406. package/template/wall-e/eval/fixtures/fullstack-app/server/routes/auth.js +0 -69
  407. package/template/wall-e/eval/fixtures/fullstack-app/server/routes/health.js +0 -23
  408. package/template/wall-e/eval/fixtures/fullstack-app/server/routes/items.js +0 -88
  409. package/template/wall-e/eval/fixtures/fullstack-app/server/routes/users.js +0 -75
  410. package/template/wall-e/eval/fixtures/fullstack-app/server/test.js +0 -198
  411. package/template/wall-e/eval/fixtures/fullstack-app/server/utils/response.js +0 -34
  412. package/template/wall-e/eval/fixtures/fullstack-app/server/utils/validate.js +0 -26
  413. package/template/wall-e/eval/fixtures/fullstack-app/server.js +0 -8
  414. package/template/wall-e/eval/fixtures/fullstack-app/test.js +0 -12
  415. package/template/wall-e/eval/fixtures/monorepo-basic/package.json +0 -8
  416. package/template/wall-e/eval/fixtures/monorepo-basic/packages/api/data.js +0 -58
  417. package/template/wall-e/eval/fixtures/monorepo-basic/packages/api/middleware.js +0 -46
  418. package/template/wall-e/eval/fixtures/monorepo-basic/packages/api/package.json +0 -8
  419. package/template/wall-e/eval/fixtures/monorepo-basic/packages/api/routes.js +0 -64
  420. package/template/wall-e/eval/fixtures/monorepo-basic/packages/api/server.js +0 -56
  421. package/template/wall-e/eval/fixtures/monorepo-basic/packages/api/test.js +0 -116
  422. package/template/wall-e/eval/fixtures/monorepo-basic/packages/cli/commands.js +0 -61
  423. package/template/wall-e/eval/fixtures/monorepo-basic/packages/cli/index.js +0 -62
  424. package/template/wall-e/eval/fixtures/monorepo-basic/packages/cli/output.js +0 -43
  425. package/template/wall-e/eval/fixtures/monorepo-basic/packages/cli/package.json +0 -11
  426. package/template/wall-e/eval/fixtures/monorepo-basic/packages/cli/test.js +0 -44
  427. package/template/wall-e/eval/fixtures/monorepo-basic/packages/shared/formatters.js +0 -43
  428. package/template/wall-e/eval/fixtures/monorepo-basic/packages/shared/index.js +0 -12
  429. package/template/wall-e/eval/fixtures/monorepo-basic/packages/shared/package.json +0 -5
  430. package/template/wall-e/eval/fixtures/monorepo-basic/packages/shared/test.js +0 -55
  431. package/template/wall-e/eval/fixtures/monorepo-basic/packages/shared/validators.js +0 -29
  432. package/template/wall-e/eval/fixtures/monorepo-basic/test.js +0 -46
  433. package/template/wall-e/eval/fixtures/node-cli/index.js +0 -78
  434. package/template/wall-e/eval/fixtures/node-cli/package.json +0 -10
  435. package/template/wall-e/eval/fixtures/node-cli/test.js +0 -57
  436. package/template/wall-e/eval/fixtures/node-typed/package.json +0 -8
  437. package/template/wall-e/eval/fixtures/node-typed/src/handlers.js +0 -31
  438. package/template/wall-e/eval/fixtures/node-typed/src/utils.js +0 -33
  439. package/template/wall-e/eval/fixtures/node-typed/test.js +0 -36
  440. package/template/wall-e/eval/fixtures/python-flask/app.py +0 -14
  441. package/template/wall-e/eval/fixtures/python-flask/requirements.txt +0 -2
  442. package/template/wall-e/eval/fixtures/python-flask/test_app.py +0 -25
  443. package/template/wall-e/eval/fixtures/wall-e-subset/brain.js +0 -105
  444. package/template/wall-e/eval/fixtures/wall-e-subset/eval/aggregator.js +0 -101
  445. package/template/wall-e/eval/fixtures/wall-e-subset/eval/benchmarks/chat.json +0 -20
  446. package/template/wall-e/eval/fixtures/wall-e-subset/eval/benchmarks/coding.json +0 -32
  447. package/template/wall-e/eval/fixtures/wall-e-subset/eval/benchmarks.js +0 -64
  448. package/template/wall-e/eval/fixtures/wall-e-subset/eval/fixtures/simple-project/package.json +0 -6
  449. package/template/wall-e/eval/fixtures/wall-e-subset/eval/fixtures/simple-project/server.js +0 -31
  450. package/template/wall-e/eval/fixtures/wall-e-subset/eval/fixtures/simple-project/test.js +0 -18
  451. package/template/wall-e/eval/fixtures/wall-e-subset/eval/fixtures/simple-project/utils.js +0 -34
  452. package/template/wall-e/eval/fixtures/wall-e-subset/eval/runner.js +0 -104
  453. package/template/wall-e/eval/fixtures/wall-e-subset/eval/scorer.js +0 -73
  454. package/template/wall-e/eval/fixtures/wall-e-subset/eval/test.js +0 -134
  455. package/template/wall-e/eval/fixtures/wall-e-subset/llm/client.js +0 -99
  456. package/template/wall-e/eval/fixtures/wall-e-subset/llm/providers.js +0 -63
  457. package/template/wall-e/eval/fixtures/wall-e-subset/llm/test.js +0 -70
  458. package/template/wall-e/eval/fixtures/wall-e-subset/package.json +0 -10
  459. package/template/wall-e/eval/fixtures/wall-e-subset/test.js +0 -86
  460. package/template/wall-e/eval/harvester.js +0 -685
  461. package/template/wall-e/eval/head-to-head.js +0 -388
  462. package/template/wall-e/eval/humaneval-adapter.js +0 -321
  463. package/template/wall-e/eval/list-models.js +0 -31
  464. package/template/wall-e/eval/livecodebench-adapter.js +0 -291
  465. package/template/wall-e/eval/mail-integration.js +0 -443
  466. package/template/wall-e/eval/manifest.js +0 -186
  467. package/template/wall-e/eval/meta-harness/adapters/coding-agent.js +0 -57
  468. package/template/wall-e/eval/meta-harness/bootstrap-snapshot.js +0 -149
  469. package/template/wall-e/eval/meta-harness/candidate-store.js +0 -117
  470. package/template/wall-e/eval/meta-harness/cli.js +0 -86
  471. package/template/wall-e/eval/meta-harness/domain-spec.js +0 -154
  472. package/template/wall-e/eval/meta-harness/domains/coding-agent.domain.json +0 -84
  473. package/template/wall-e/eval/meta-harness/examples/env-bootstrap-candidate.js +0 -29
  474. package/template/wall-e/eval/meta-harness/experience-store.js +0 -174
  475. package/template/wall-e/eval/meta-harness/frontier.js +0 -96
  476. package/template/wall-e/eval/meta-harness/harness-interface.js +0 -90
  477. package/template/wall-e/eval/meta-harness/leakage-guard.js +0 -80
  478. package/template/wall-e/eval/meta-harness/optimizer.js +0 -207
  479. package/template/wall-e/eval/meta-harness/proposer-runner.js +0 -110
  480. package/template/wall-e/eval/meta-harness/reporting.js +0 -58
  481. package/template/wall-e/eval/meta-harness/telemetry.js +0 -27
  482. package/template/wall-e/eval/meta-harness/validation.js +0 -81
  483. package/template/wall-e/eval/promoter.js +0 -228
  484. package/template/wall-e/eval/provider-normalizer.js +0 -33
  485. package/template/wall-e/eval/replay.js +0 -395
  486. package/template/wall-e/eval/run-agent-benchmarks.js +0 -386
  487. package/template/wall-e/eval/run-codex-cli-baseline.js +0 -177
  488. package/template/wall-e/eval/run-coding-agent-real.js +0 -187
  489. package/template/wall-e/eval/run-eval.js +0 -435
  490. package/template/wall-e/eval/run-model-comparison.js +0 -142
  491. package/template/wall-e/eval/session-evaluator.js +0 -187
  492. package/template/wall-e/eval/session-miner.js +0 -207
  493. package/template/wall-e/eval/session-retrieval-benchmark.js +0 -150
  494. package/template/wall-e/eval/session-transcripts.js +0 -509
  495. package/template/wall-e/eval/shadow.js +0 -161
  496. package/template/wall-e/eval/swebench-adapter.js +0 -345
  497. package/template/wall-e/eval/swebench-docker.js +0 -192
  498. package/template/wall-e/eval/train.py +0 -320
  499. package/template/wall-e/eval/trainer.js +0 -232
  500. package/template/wall-e/eval/weekly-eval-loop.js +0 -241
@@ -1,570 +0,0 @@
1
- 'use strict';
2
-
3
- const crypto = require('crypto');
4
- const fs = require('fs');
5
- const os = require('os');
6
- const path = require('path');
7
- const { execFileSync } = require('child_process');
8
-
9
- const {
10
- findTranscriptJsonlFiles,
11
- parseTranscriptJsonl,
12
- } = require('./session-transcripts');
13
-
14
- const {
15
- recoverRepoState,
16
- makeReplaySandbox,
17
- removeReplaySandbox,
18
- replayAndScore,
19
- } = require('./cc-replay');
20
-
21
// Default location of the replay catalog file, resolved next to this module
// under benchmarks/. loadCatalog() expects the file to contain a JSON array.
const DEFAULT_REAL_CATALOG_PATH = path.join(__dirname, 'benchmarks', 'coding-agent-real.json');
// Default directory for per-run result artifacts, under the current user's
// home directory.
const DEFAULT_RESULTS_DIR = path.join(os.homedir(), '.walle', 'eval-results', 'coding-agent-real');
23
-
24
/**
 * Scan transcript files and append replayable sessions to the catalog.
 *
 * Asks findTranscriptJsonlFiles for up to `limit * 8` candidate files, parses
 * each one, and runs the session through a fixed sequence of filters. Every
 * rejected file is tallied under exactly one reason in `skipped`. Accepted
 * sessions are turned into catalog entries and, when `catalogPath` is truthy,
 * the whole catalog is written back as pretty-printed JSON.
 *
 * @param {object} [options]
 * @param {string} options.repoPath - Required; forwarded to parseTranscriptJsonl.
 * @param {*} [options.roots] - Forwarded to findTranscriptJsonlFiles.
 * @param {string} [options.source='all'] - Forwarded to findTranscriptJsonlFiles.
 * @param {number} [options.sinceDays=14] - Forwarded to findTranscriptJsonlFiles.
 * @param {number} [options.limit=20] - Maximum number of entries to add in this call.
 * @param {string} [options.catalogPath] - Catalog file to load and rewrite.
 * @param {boolean} [options.requireEdits=true] - Require repo-relative reference edits.
 * @param {boolean} [options.requireCodingIntent=true] - Require a self-contained coding conversation.
 * @returns {{added: object[], skipped: object, total: number, scanned: number, catalogPath: string}}
 * @throws {Error} When `repoPath` is missing.
 */
function buildReplayCatalog({
  repoPath,
  roots,
  source = 'all',
  sinceDays = 14,
  limit = 20,
  catalogPath = DEFAULT_REAL_CATALOG_PATH,
  requireEdits = true,
  requireCodingIntent = true,
} = {}) {
  if (!repoPath) throw new Error('repoPath is required');

  const files = findTranscriptJsonlFiles({ roots, source, sinceDays, limit: limit * 8 });
  const catalog = loadCatalog(catalogPath);
  const knownIds = new Set(catalog.map((item) => item.id));
  const knownSourceKeys = new Set(catalog.map(sourceKeyForEntry).filter(Boolean));

  const added = [];
  const skipped = {
    unreadable: 0,
    outOfRepo: 0,
    noPrompt: 0,
    nonCodingPrompt: 0,
    nonReplayableConversation: 0,
    noReferenceEdits: 0,
    externalReferenceEdits: 0,
    noCommit: 0,
    duplicate: 0,
    limit: 0,
  };

  // Content filters, evaluated in a fixed order; returns the `skipped`
  // counter to bump, or null when the session passes all of them.
  const rejectionFor = (session) => {
    if (!session) return 'outOfRepo';
    if (!session.userMessages.length) return 'noPrompt';
    if (requireCodingIntent) {
      if (!isStandaloneCodingPrompt(session.userMessages[0])) return 'nonCodingPrompt';
      if (!isReplayConversationSelfContained(session)) return 'nonReplayableConversation';
    }
    if (requireEdits) {
      if (!session.filesEdited.length) return 'noReferenceEdits';
      if (!hasOnlyRepoRelativeEdits(session.filesEdited)) return 'externalReferenceEdits';
    }
    return null;
  };

  for (const file of files) {
    // Keep iterating after the cap so the remaining files are still counted.
    if (added.length >= limit) {
      skipped.limit += 1;
      continue;
    }

    const session = parseTranscriptJsonl(file, { repoPath });
    const rejection = rejectionFor(session);
    if (rejection) {
      skipped[rejection] += 1;
      continue;
    }

    const sourceKey = `${session.source}:${session.sessionId}`;
    if (knownSourceKeys.has(sourceKey)) {
      skipped.duplicate += 1;
      continue;
    }

    const recovered = recoverRepoState(session.cwd, session.tsStart);
    if (!recovered?.commitSha) {
      skipped.noCommit += 1;
      continue;
    }

    const entry = sessionToRealCatalogEntry(session, recovered.commitSha);
    if (knownIds.has(entry.id)) {
      skipped.duplicate += 1;
      continue;
    }

    catalog.push(entry);
    knownIds.add(entry.id);
    knownSourceKeys.add(sourceKey);
    added.push(entry);
  }

  if (catalogPath) {
    fs.mkdirSync(path.dirname(catalogPath), { recursive: true });
    fs.writeFileSync(catalogPath, JSON.stringify(catalog, null, 2) + '\n');
  }

  return {
    added,
    skipped,
    total: catalog.length,
    scanned: files.length,
    catalogPath,
  };
}
94
-
95
/**
 * Convert a parsed transcript session into a catalog entry.
 *
 * The entry id is `agent-real-<source>-<hash>`, where the hash is the first
 * 10 hex characters of a SHA-256 over source, session id, cwd and the first
 * user message joined with `|`.
 *
 * @param {object} session - Parsed session; must provide `userMessages`,
 *   `toolCalls` and `assistantMessages`.
 * @param {string} [recoveredCommit] - Commit to replay from; stored as null when falsy.
 * @returns {object} Catalog entry with `agentExpectations` and `realReplay` sections.
 */
function sessionToRealCatalogEntry(session, recoveredCommit) {
  const prompt = session.userMessages[0] || '';
  const toolNames = session.toolCalls.map((toolCall) => toolCall.name).filter(Boolean);
  const editedFiles = session.filesEdited || [];
  const turnCount = session.turnCount || session.userMessages.length || 1;

  const hashInput = [session.source, session.sessionId, session.cwd, prompt].join('|');
  const idHash = crypto.createHash('sha256').update(hashInput).digest('hex').slice(0, 10);

  const difficulty = classifyDifficulty({
    turnCount,
    files: editedFiles.length,
    tools: toolNames.length,
  });

  const agentExpectations = {
    expectedToolCalls: [...new Set(toolNames)].slice(0, 16),
    expectedFileChanges: editedFiles,
    // Turn budget scales with the reference session, clamped to [12, 80].
    maxTurns: Math.max(12, Math.min(turnCount * 4 + 8, 80)),
    testCommand: null,
  };

  const realReplay = {
    source: session.source,
    sourceSessionId: session.sessionId,
    sourceJsonlPath: session.jsonlPath,
    cwd: session.cwd,
    gitBranch: session.gitBranch || null,
    tsStart: session.tsStart,
    tsEnd: session.tsEnd,
    recoveredCommit: recoveredCommit || null,
    userMessages: session.userMessages,
    assistantMessageCount: session.assistantMessages.length,
    turnCount,
    referenceToolCalls: toolNames,
    referenceFilesEdited: editedFiles,
    shellCommands: session.shellCommands || [],
    rawEventCount: session.rawEventCount || 0,
    reapedAt: new Date().toISOString(),
  };

  return {
    id: `agent-real-${session.source}-${idHash}`,
    prompt,
    taskType: 'coding-agent-real',
    difficulty,
    expectedTraits: expectedTraitsFor(toolNames, editedFiles),
    agentExpectations,
    realReplay,
  };
}
139
-
140
/**
 * Decide whether a prompt reads as a self-contained coding instruction.
 *
 * A prompt qualifies when it is at least 25 characters long after trimming,
 * is not a discussion/planning opener, does not start with a bare
 * acknowledgement, and mentions at least one coding keyword.
 *
 * @param {*} prompt - Candidate prompt; coerced with String(), falsy values become ''.
 * @returns {boolean} True when the prompt looks like a concrete coding task.
 */
function isStandaloneCodingPrompt(prompt) {
  const text = String(prompt || '').trim();
  if (text.length < 25) return false;
  const lower = text.toLowerCase();

  // These are often useful conversations, but they do not make good replay
  // prompts because the expected output is explanation or shared context, not
  // a concrete codebase action.
  if (/\b(let'?s discuss|let us discuss|let'?s have a discussion|discussion\s*\/\s*plan|walk me through|how should i|what do you think|take a close look)\b/.test(lower)) {
    return false;
  }

  // Leading acknowledgements depend on earlier context. The short words
  // (yes/yep/ok/okay) must be followed by a comma or space; a lookahead is
  // used because a trailing \b after the separator never matches "yes, ...",
  // where the separator is followed by another non-word character.
  if (/^\s*(?:(?:go ahead|goa ahead|proceed|do it|everything in main)\b|(?:yes|yep|ok|okay)(?=[, ]))/.test(lower)) {
    return false;
  }

  return /\b(fix|implement|add|change|update|refactor|test|merge|commit|build|debug|make|write|delete|remove|harden|wire|land|ship|bug|failing|error|regression|feature|endpoint|api|ui|server|component|tests?|code review|review.*code)\b/.test(lower);
}
157
-
158
/**
 * Check that every user turn of a session can be replayed without hidden context.
 *
 * The first message must be a standalone coding prompt, and every message
 * (including the first) must pass isReplayableCodingTurn for its index.
 *
 * @param {{userMessages?: string[]}|null|undefined} session
 * @returns {boolean} False when there are no user messages or any turn fails.
 */
function isReplayConversationSelfContained(session) {
  const turns = session?.userMessages || [];
  if (!turns.length) return false;
  if (!isStandaloneCodingPrompt(turns[0])) return false;

  const hasBlockedTurn = turns.some((turn, position) => !isReplayableCodingTurn(turn, position));
  return !hasBlockedTurn;
}
164
-
165
/**
 * Decide whether a single user turn can be replayed on its own.
 *
 * Empty turns, turns that reference attachments or images, "continue"
 * requests, bare acknowledgements and references to earlier out-of-band work
 * are rejected. Turns after the first must additionally be standalone coding
 * prompts.
 *
 * @param {*} message - User message; coerced with String(), falsy values become ''.
 * @param {number} [index=0] - Position of the message in the conversation.
 * @returns {boolean}
 */
function isReplayableCodingTurn(message, index = 0) {
  const text = String(message || '').trim();
  if (!text) return false;
  const lower = text.toLowerCase();

  // Each pair is [subject, pattern]; any match disqualifies the turn.
  const disqualifiers = [
    [text, /\[note:\s*user originally attached[\s\S]*not visible in this replay/i],
    [text, /\[image:/i],
    [text, /<image\b/i],
    [text, /^continue from where you left off\b/i],
    [lower, /^\s*(go ahead|yes|yep|ok|okay|check again|have you committed|everything in main)\b/i],
    [lower, /\b(i have ran|i ran it|check the log|pending fixes|auto approver missed|look at recent changes)\b/i],
  ];
  if (disqualifiers.some(([subject, pattern]) => pattern.test(subject))) return false;

  // Follow-ups are replayable only when they carry fresh standalone coding
  // instructions; otherwise they depend on invisible prior UI/session state.
  return index > 0 ? isStandaloneCodingPrompt(text) : true;
}
181
-
182
/**
 * Check that every edited file path is a plain repo-relative path.
 *
 * A path is rejected when it is empty or not a string, when it is absolute
 * under either POSIX or Windows rules, or when any of its segments is `..`.
 * Absoluteness is tested with both flavours because segments are already
 * split on `/` and `\`, so a Windows-style path such as `C:\x\y.js` must not
 * pass as relative when this runs on a POSIX host.
 *
 * @param {string[]|null} [files=[]] - Edited file paths; null is treated as empty.
 * @returns {boolean} True when all paths are repo-relative (vacuously true for an empty list).
 */
function hasOnlyRepoRelativeEdits(files = []) {
  return (files || []).every((file) => {
    if (!file || typeof file !== 'string') return false;
    if (path.posix.isAbsolute(file) || path.win32.isAbsolute(file)) return false;
    return !file.split(/[\\/]+/).includes('..');
  });
}
189
-
190
/**
 * Validate a catalog entry before attempting a replay.
 *
 * Issues are collected in a fixed order; `status` is the first issue found,
 * or 'replayable_git' when there are none. The returned `issues` list is
 * de-duplicated.
 *
 * @param {object|null|undefined} entry - Catalog entry to inspect.
 * @returns {{ok: boolean, status: string, issues: string[], repoRoot: (string|null), recoveredCommit: (string|null)}}
 */
function preflightCatalogEntry(entry) {
  const replay = entry?.realReplay || null;
  const prompt = entry?.prompt || '';
  const issues = [];
  let repoRoot = null;

  if (!replay) issues.push('missing_replay');
  if (!prompt.trim()) issues.push('no_prompt');

  if (replay) {
    const hasRecordedMessages = Boolean((replay.userMessages || []).length);
    if (!hasRecordedMessages) issues.push('no_user_messages');
    if (!(replay.referenceFilesEdited || []).length) issues.push('no_reference_edits');
    if (!(replay.referenceToolCalls || []).length) issues.push('no_reference_tools');

    // Fall back to the entry prompt when no user messages were recorded.
    const userMessages = hasRecordedMessages ? replay.userMessages : [prompt];
    if (!isReplayConversationSelfContained({ userMessages })) {
      issues.push('non_replayable_conversation');
    }

    const expectedFiles = replay.referenceFilesEdited || entry.agentExpectations?.expectedFileChanges || [];
    if (!hasOnlyRepoRelativeEdits(expectedFiles)) issues.push('external_reference_edits');

    if (replay.cwd && fs.existsSync(replay.cwd)) {
      repoRoot = gitRoot(replay.cwd);
      if (!repoRoot) issues.push('missing_git_repo');
    } else {
      issues.push('missing_repo');
    }

    // The commit is only looked up in git when a repo root was resolved.
    const commitMissing = !replay.recoveredCommit
      || (repoRoot && !gitHasCommit(repoRoot, replay.recoveredCommit));
    if (commitMissing) issues.push('missing_commit');
  }

  const ok = issues.length === 0;
  return {
    ok,
    status: ok ? 'replayable_git' : issues[0],
    issues: [...new Set(issues)],
    repoRoot,
    recoveredCommit: replay?.recoveredCommit || null,
  };
}
232
-
233
/**
 * Replay one catalog entry in a sandbox and score the outcome.
 *
 * Runs preflightCatalogEntry first and returns a failed result without
 * creating a sandbox when it does not pass. Otherwise a sandbox is created
 * with makeReplaySandbox; in dry-run mode the function returns right after
 * that, and in a real run it calls replayAndScore and classifies the result.
 * The sandbox is removed in `finally` unless the run failed and
 * `keepFailures` is set.
 *
 * @param {object} entry - Catalog entry.
 * @param {object} [options]
 * @param {Function} [options.runAgentLoop] - Required for non-dry-run replays.
 * @param {*} [options.provider=null] - Provider object or name, forwarded to replayAndScore.
 * @param {?string} [options.model=null] - Model name, forwarded to replayAndScore.
 * @param {boolean} [options.dryRun=false] - Only verify that the sandbox can be created.
 * @param {boolean} [options.record=false] - Write a result artifact and attach its path.
 * @param {string} [options.resultsDir] - Directory for result artifacts.
 * @param {number} [options.timeoutMs=600000] - Forwarded to replayAndScore.
 * @param {boolean} [options.keepFailures=false] - Keep the sandbox of failed runs.
 * @returns {Promise<object>} Result with `success`, timing and, for real runs, `replay`.
 */
async function runReplayEntry(entry, {
  runAgentLoop,
  provider = null,
  model = null,
  dryRun = false,
  record = false,
  resultsDir = DEFAULT_RESULTS_DIR,
  timeoutMs = 600_000,
  keepFailures = false,
} = {}) {
  const startedAt = new Date().toISOString();
  const startMs = Date.now();
  const preflight = preflightCatalogEntry(entry);
  const base = {
    id: entry?.id || 'unknown',
    prompt: entry?.prompt || '',
    dryRun,
    preflight,
    startedAt,
    provider: provider?.type || provider || null,
    model: model || null,
  };

  const elapsedMs = () => Date.now() - startMs;
  // Optionally persist the result, then hand it back. The artifact path is
  // attached after writing, so it is not part of the stored JSON.
  const finalize = (result) => {
    if (record) result.artifactPath = writeResultArtifact(result, { resultsDir });
    return result;
  };

  if (!preflight.ok) {
    return finalize({
      ...base,
      success: false,
      failureType: classifyFailure({ preflight }),
      latencyMs: elapsedMs(),
    });
  }

  const rr = entry.realReplay;
  const session = catalogEntryToReplaySession(entry);
  let sandbox = null;
  let keepSandbox = false;

  try {
    sandbox = makeReplaySandbox(rr.cwd, rr.recoveredCommit, safeId(`${rr.source}-${rr.sourceSessionId}`));

    if (dryRun) {
      return finalize({
        ...base,
        success: true,
        status: 'dry_run_ok',
        sandboxCreated: sandbox,
        latencyMs: elapsedMs(),
      });
    }

    if (!runAgentLoop) throw new Error('runAgentLoop function is required for non-dry-run replay');

    const replayOptions = { provider, model, timeoutMs, useLlmJudge: false };
    const replay = await replayAndScore(session, sandbox, runAgentLoop, replayOptions);
    const failureType = classifyFailure({ ...replay, expectedFiles: session.claudeFilesEdited });
    const success = !replay.error && replay.scores?.composite >= 0.35 && failureType === null;

    keepSandbox = !success && !!keepFailures;
    return finalize({
      ...base,
      success,
      failureType,
      latencyMs: elapsedMs(),
      replay,
    });
  } catch (err) {
    keepSandbox = !!keepFailures;
    return finalize({
      ...base,
      success: false,
      failureType: classifyFailure({ error: err.message, preflight }),
      error: err.message,
      latencyMs: elapsedMs(),
    });
  } finally {
    if (sandbox && !keepSandbox) {
      try { removeReplaySandbox(sandbox, rr.cwd); } catch { /* best-effort */ }
    }
  }
}
325
-
326
/**
 * Pick a replay timeout from an entry's difficulty and expected workload.
 *
 * @param {object} [entry={}] - Catalog entry; file and tool expectations are
 *   read from `agentExpectations`, falling back to `realReplay`.
 * @returns {number} 1_200_000 for hard entries, 900_000 for medium, 600_000 otherwise.
 */
function defaultReplayTimeoutMs(entry = {}) {
  const fileCount = (entry.agentExpectations?.expectedFileChanges || entry.realReplay?.referenceFilesEdited || []).length;
  const toolCount = (entry.agentExpectations?.expectedToolCalls || entry.realReplay?.referenceToolCalls || []).length;

  // Ordered from most to least demanding; the first matching tier wins.
  const tiers = [
    { difficulty: 'hard', maxFiles: 4, maxTools: 30, timeoutMs: 1_200_000 },
    { difficulty: 'medium', maxFiles: 1, maxTools: 10, timeoutMs: 900_000 },
  ];
  const matched = tiers.find((tier) => (
    entry.difficulty === tier.difficulty || fileCount > tier.maxFiles || toolCount > tier.maxTools
  ));
  return matched ? matched.timeoutMs : 600_000;
}
333
-
334
/**
 * Build the session object passed to replayAndScore from a catalog entry.
 *
 * Recorded replay data wins; the entry prompt and `agentExpectations` are
 * used only as fallbacks when the replay section lacks the value.
 *
 * @param {object} entry - Catalog entry with a `realReplay` section.
 * @returns {object} Session with ids, timestamps, user messages and reference tool/file lists.
 */
function catalogEntryToReplaySession(entry) {
  const replay = entry.realReplay;
  const recordedTurns = replay.userMessages?.length;

  const userMessages = recordedTurns ? replay.userMessages : [entry.prompt];
  const claudeToolCalls = replay.referenceToolCalls || entry.agentExpectations?.expectedToolCalls || [];
  const claudeFilesEdited = replay.referenceFilesEdited || entry.agentExpectations?.expectedFileChanges || [];

  return {
    sessionId: replay.sourceSessionId,
    cwd: replay.cwd,
    gitBranch: replay.gitBranch,
    userMessages,
    tsStart: replay.tsStart,
    tsEnd: replay.tsEnd,
    turnCount: replay.turnCount || recordedTurns || 1,
    claudeToolCalls,
    claudeFilesEdited,
  };
}
348
-
349
/**
 * Map a result-like object to a failure label, or null when nothing looks wrong.
 *
 * Checks run in this order: a failed preflight, keyword matches in the
 * combined error/stderr/output text, then structural signals (no tool calls,
 * no or wrong file changes, low scores). The first match wins.
 *
 * HTTP status codes (401, 403, 429) only count when they are not part of a
 * longer number, so output such as "wrote 14030 bytes" is not mistaken for a
 * provider auth failure.
 *
 * @param {object} [resultLike={}] - Preflight result, replay result or error holder.
 * @returns {?string} Failure label, 'unknown' when `success === false` without a
 *   recognised cause, or null.
 */
function classifyFailure(resultLike = {}) {
  const preflight = resultLike.preflight || null;
  if (preflight && !preflight.ok) {
    if (preflight.issues?.includes('non_replayable_conversation') || preflight.issues?.includes('external_reference_edits')) return 'bad_replay_entry';
    if (preflight.issues?.includes('missing_repo') || preflight.issues?.includes('missing_git_repo')) return 'path_blocked';
    if (preflight.issues?.includes('missing_commit')) return 'missing_commit';
    if (preflight.issues?.includes('no_prompt') || preflight.issues?.includes('no_user_messages')) return 'planning_failed';
    if (preflight.issues?.includes('no_reference_edits')) return 'no_reference_edits';
    return 'preflight_failed';
  }

  const text = [
    resultLike.error,
    resultLike.message,
    resultLike.stderr,
    resultLike.replay?.error,
    resultLike.agentResult?.stderr,
    resultLike.agentResult?.output,
    resultLike.replay?.agentResult?.stderr,
    resultLike.replay?.agentResult?.output,
  ].filter(Boolean).join('\n').toLowerCase();

  // Ordered keyword rules over the lower-cased text; the first match wins.
  const textRules = [
    [/timeout|timed out|hard timeout|aborted|aborterror/, 'timeout'],
    [/api key|unauthorized|forbidden|(?<!\d)(?:401|403)(?!\d)|invalid auth|authentication/, 'provider_auth'],
    [/quota|rate limit|(?<!\d)429(?!\d)|insufficient credits|billing/, 'provider_quota'],
    [/econn|enotfound|network|socket hang up|fetch failed|dns/, 'provider_network'],
    [/permission denied|operation not permitted|sandbox|worktree add|outside project|path.*blocked|not allowed/, 'path_blocked'],
    [/git add -a|git add --all|unsafe commit|git commit/, 'unsafe_commit'],
    [/test failed|tests failed|assertionerror|npm test.*failed/, 'tests_failed'],
  ];
  for (const [pattern, label] of textRules) {
    if (pattern.test(text)) return label;
  }

  const tools = resultLike.walleTools || resultLike.actualToolCalls || resultLike.replay?.walleTools || [];
  const files = resultLike.walleFiles || resultLike.actualFileChanges || resultLike.replay?.walleFiles || [];
  const expectedFiles = resultLike.expectedFiles || resultLike.claudeFiles || resultLike.replay?.claudeFiles || [];
  const score = resultLike.scores || resultLike.score || resultLike.replay?.scores || {};

  if (Array.isArray(tools) && tools.length === 0) return 'no_tool_calls';
  if (Array.isArray(files) && Array.isArray(expectedFiles) && expectedFiles.length > 0 && files.length === 0) return 'no_file_changes';
  if (expectedFiles.length > 0 && score.file_jaccard === 0) return 'wrong_files';
  if (expectedFiles.length > 0 && typeof score.file_jaccard === 'number' && score.file_jaccard < 0.5) return 'partial_file_coverage';
  if (typeof score.composite === 'number' && score.composite < 0.35) return 'low_score';

  if (resultLike.success === false) return 'unknown';
  return null;
}
393
-
394
/**
 * Aggregate a list of replay results into pass/fail statistics.
 *
 * @param {object[]} results - Results as returned by runReplayEntry.
 * @returns {{total: number, passed: number, failed: number, passRate: number,
 *   avgComposite: (number|null), failureCounts: Object<string, number>}}
 *   `avgComposite` is null when no result carries a numeric composite score.
 */
function summarizeResults(results) {
  const total = results.length;
  const failureCounts = {};
  const composites = [];
  let passed = 0;

  for (const outcome of results) {
    const composite = outcome.replay?.scores?.composite;
    if (typeof composite === 'number') composites.push(composite);

    if (outcome.success) {
      passed += 1;
    } else {
      const label = outcome.failureType || 'unknown';
      failureCounts[label] = (failureCounts[label] || 0) + 1;
    }
  }

  const compositeSum = composites.reduce((sum, value) => sum + value, 0);
  return {
    total,
    passed,
    failed: total - passed,
    passRate: total ? passed / total : 0,
    avgComposite: composites.length ? compositeSum / composites.length : null,
    failureCounts,
  };
}
422
-
423
/**
 * Persist a result object as a pretty-printed JSON file.
 *
 * The file name is `<ISO timestamp with ':' and '.' replaced by '-'>-<safe id>.json`.
 *
 * @param {object} result - Result to serialise; `result.id` feeds the file name.
 * @param {object} [options]
 * @param {string} [options.resultsDir] - Target directory, created if missing.
 * @returns {string} Path of the written file.
 */
function writeResultArtifact(result, { resultsDir = DEFAULT_RESULTS_DIR } = {}) {
  fs.mkdirSync(resultsDir, { recursive: true });

  const timestamp = new Date().toISOString().replace(/[:.]/g, '-');
  const fileName = `${timestamp}-${safeId(result.id || 'result')}.json`;
  const target = path.join(resultsDir, fileName);

  const payload = JSON.stringify(result, null, 2) + '\n';
  fs.writeFileSync(target, payload);
  return target;
}
430
-
431
/**
 * Read the replay catalog from disk.
 *
 * @param {string} [catalogPath] - Catalog file; defaults to DEFAULT_REAL_CATALOG_PATH.
 * @returns {object[]} The parsed array, or an empty array when the path is
 *   falsy, the file is missing or unreadable, the JSON is invalid, or the
 *   top-level value is not an array.
 */
function loadCatalog(catalogPath = DEFAULT_REAL_CATALOG_PATH) {
  const isPresent = Boolean(catalogPath) && fs.existsSync(catalogPath);
  if (!isPresent) return [];

  let parsed;
  try {
    parsed = JSON.parse(fs.readFileSync(catalogPath, 'utf8'));
  } catch {
    // Unreadable or malformed catalogs are treated as empty.
    return [];
  }
  return Array.isArray(parsed) ? parsed : [];
}
440
-
441
/**
 * Look up an API key for a provider.
 *
 * Environment variables are checked first, in the listed order. When none is
 * set, the most recently updated enabled row for the provider type is read
 * from the `model_providers` table via `../brain`.
 *
 * The env-var table is a Map so that provider names which collide with
 * Object.prototype members (for example 'constructor') resolve to "no
 * variables" instead of a non-iterable inherited value.
 *
 * @param {string} providerType - Provider identifier such as 'openai'.
 * @returns {{key: (string|null), source: (string|null)}} `source` is the
 *   env-var name, 'brain_db', or null when nothing was found.
 */
function loadProviderKey(providerType) {
  const envByProvider = new Map([
    ['anthropic', ['ANTHROPIC_API_KEY', 'ANTHROPIC_AUTH_TOKEN']],
    ['openai', ['OPENAI_API_KEY']],
    ['google', ['GOOGLE_API_KEY', 'GEMINI_API_KEY']],
    ['deepseek', ['DEEPSEEK_API_KEY']],
    ['moonshot', ['MOONSHOT_API_KEY']],
  ]);
  for (const key of envByProvider.get(providerType) ?? []) {
    if (process.env[key]) return { key: process.env[key], source: key };
  }

  try {
    const brain = require('../brain');
    const row = brain.getDb().prepare(
      'SELECT api_key_encrypted FROM model_providers WHERE type = ? AND enabled = 1 AND api_key_encrypted IS NOT NULL ORDER BY updated_at DESC LIMIT 1'
    ).get(providerType);
    // NOTE(review): the column is named api_key_encrypted but the value is
    // returned as-is; confirm whether callers expect a decrypted key.
    if (row?.api_key_encrypted) return { key: row.api_key_encrypted, source: 'brain_db' };
  } catch {
    // Best-effort: a missing or failing brain database means "no stored key".
  }

  return { key: null, source: null };
}
463
-
464
/**
 * Check whether a provider can be used before starting a run.
 *
 * No provider type means the default provider is used. Dry runs defer the
 * check. Local providers need nothing, CLI providers need their command on
 * PATH, and every other provider needs an API key.
 *
 * @param {?string} providerType - Provider identifier, or falsy for the default.
 * @param {object} [options]
 * @param {boolean} [options.dryRun=false] - Skip credential and CLI checks.
 * @returns {object} `{ ok, status }` plus `providerType`, `command` or `keySource` where relevant.
 */
function preflightProvider(providerType, { dryRun = false } = {}) {
  if (!providerType) {
    const status = dryRun ? 'default_provider_deferred' : 'default_provider';
    return { ok: true, status };
  }
  if (dryRun) return { ok: true, status: 'provider_deferred_dry_run', providerType };

  switch (providerType) {
    case 'ollama':
    case 'mlx':
      return { ok: true, status: 'local_provider', providerType };
    case 'claude-cli':
      return commandPreflight('claude', providerType);
    case 'codex-cli':
      return commandPreflight('codex', providerType);
    default:
      break;
  }

  const { key, source } = loadProviderKey(providerType);
  return key
    ? { ok: true, status: 'provider_key_found', providerType, keySource: source }
    : { ok: false, status: 'missing_provider_key', providerType };
}
475
-
476
/**
 * Check that a provider's CLI command can be found by running `which <command>`.
 *
 * @param {string} command - Executable name to look up.
 * @param {string} providerType - Provider identifier echoed in the result.
 * @returns {{ok: boolean, status: string, providerType: string, command: string}}
 */
function commandPreflight(command, providerType) {
  let found = true;
  try {
    execFileSync('which', [command], { encoding: 'utf8', stdio: ['pipe', 'pipe', 'pipe'], timeout: 5000 });
  } catch {
    // Any failure of `which` (non-zero exit, timeout, spawn error) counts as missing.
    found = false;
  }

  const status = found ? 'provider_cli_found' : 'missing_provider_cli';
  return { ok: found, status, providerType, command };
}
484
-
485
/**
 * Create an LLM client for a provider via `../llm/client`.
 *
 * Local and CLI providers get an empty config. Other providers receive the
 * API key from loadProviderKey when one exists, an optional base URL from the
 * provider-specific environment variable, and OAuth settings for Google when
 * GOOGLE_AUTH_MODE is 'oauth'.
 *
 * @param {?string} providerType - Provider identifier.
 * @returns {?object} The value returned by createClient, or null when no provider type is given.
 */
function createProviderClient(providerType) {
  if (!providerType) return null;
  const { createClient } = require('../llm/client');

  const keylessProviders = ['ollama', 'mlx', 'claude-cli', 'codex-cli'];
  if (keylessProviders.includes(providerType)) {
    return createClient(providerType, {});
  }

  const { key } = loadProviderKey(providerType);
  const config = {};
  if (key) config.apiKey = key;

  // Only these providers honour a base-URL override from the environment.
  let baseUrlVar = null;
  switch (providerType) {
    case 'openai':
      baseUrlVar = 'OPENAI_BASE_URL';
      break;
    case 'deepseek':
      baseUrlVar = 'DEEPSEEK_BASE_URL';
      break;
    case 'moonshot':
      baseUrlVar = 'MOONSHOT_BASE_URL';
      break;
    default:
      break;
  }
  if (baseUrlVar && process.env[baseUrlVar]) config.baseUrl = process.env[baseUrlVar];

  if (providerType === 'google' && process.env.GOOGLE_AUTH_MODE === 'oauth') {
    config.authMode = 'oauth';
    config.refreshToken = process.env.GOOGLE_REFRESH_TOKEN;
  }
  return createClient(providerType, config);
}
503
-
504
/**
 * Bucket a session into 'easy', 'medium' or 'hard' by size.
 *
 * @param {object} metrics
 * @param {number} metrics.turnCount - Number of conversation turns.
 * @param {number} metrics.files - Number of edited files.
 * @param {number} metrics.tools - Number of tool calls.
 * @returns {'easy'|'medium'|'hard'} 'hard' above 8 turns, 4 files or 30 tools;
 *   'medium' above 3 turns, 1 file or 10 tools; otherwise 'easy'.
 */
function classifyDifficulty({ turnCount, files, tools }) {
  const exceeds = (turnLimit, fileLimit, toolLimit) => (
    turnCount > turnLimit || files > fileLimit || tools > toolLimit
  );

  if (exceeds(8, 4, 30)) return 'hard';
  return exceeds(3, 1, 10) ? 'medium' : 'easy';
}
509
-
510
/**
 * Derive descriptive trait labels from a session's tool names and edited files.
 *
 * @param {string[]} tools - Tool names used in the reference session.
 * @param {string[]} files - Files edited in the reference session.
 * @returns {string[]} Always starts with 'real transcript'; further labels are
 *   appended in a fixed order when their condition holds.
 */
function expectedTraitsFor(tools, files) {
  const usesToolMatching = (pattern) => tools.some((toolName) => pattern.test(toolName));

  const readsFirst = usesToolMatching(/read|grep|glob|list|search/i);
  const runsCommands = usesToolMatching(/bash|exec|shell|command/i);
  const editCount = files.length;

  return [
    'real transcript',
    ...(readsFirst ? ['reads before writing'] : []),
    ...(runsCommands ? ['runs commands'] : []),
    ...(editCount > 0 ? ['edits files'] : []),
    ...(editCount > 1 ? ['multi-file change'] : []),
  ];
}
518
-
519
/**
 * Build the `<source>:<sessionId>` de-duplication key for a catalog entry.
 *
 * @param {object} entry - Catalog entry.
 * @returns {?string} The key, or null when the entry lacks a replay source or session id.
 */
function sourceKeyForEntry(entry) {
  const replay = entry.realReplay;
  if (!replay?.source || !replay?.sourceSessionId) return null;
  return `${replay.source}:${replay.sourceSessionId}`;
}
523
-
524
/**
 * Resolve the top-level directory of the git repository containing `cwd`.
 *
 * @param {string} cwd - Directory to query with `git -C <cwd> rev-parse --show-toplevel`.
 * @returns {?string} Trimmed command output, or null when the command fails.
 */
function gitRoot(cwd) {
  const args = ['-C', cwd, 'rev-parse', '--show-toplevel'];
  const options = {
    encoding: 'utf8',
    stdio: ['pipe', 'pipe', 'pipe'],
    timeout: 10000,
  };

  try {
    const stdout = execFileSync('git', args, options);
    return stdout.trim();
  } catch {
    return null;
  }
}
535
-
536
/**
 * Check whether a commit object exists in a repository.
 *
 * Runs `git -C <repoRoot> cat-file -e <commit>^{commit}`; any failure,
 * including a timeout or a missing git binary, is reported as false.
 *
 * @param {string} repoRoot - Repository directory.
 * @param {string} commit - Commit-ish to verify.
 * @returns {boolean}
 */
function gitHasCommit(repoRoot, commit) {
  const args = ['-C', repoRoot, 'cat-file', '-e', `${commit}^{commit}`];
  const options = {
    stdio: ['pipe', 'pipe', 'pipe'],
    timeout: 10000,
  };

  let exists = true;
  try {
    execFileSync('git', args, options);
  } catch {
    exists = false;
  }
  return exists;
}
547
-
548
/**
 * Turn an arbitrary value into an identifier safe for file and sandbox names.
 *
 * Runs of characters outside `a-z`, `0-9`, `_`, `.` and `-` (case-insensitive)
 * collapse to a single '-', and the result is cut to 80 characters.
 *
 * @param {*} value - Value to sanitise; falsy values become 'unknown'.
 * @returns {string}
 */
function safeId(value) {
  const raw = String(value || 'unknown');
  const sanitized = raw.replace(/[^a-z0-9_.-]+/gi, '-');
  return sanitized.slice(0, 80);
}
551
-
552
// Public API of this module. Helpers not listed here are module-private.
module.exports = {
  // Default paths.
  DEFAULT_REAL_CATALOG_PATH,
  DEFAULT_RESULTS_DIR,
  // Catalog construction and validation.
  buildReplayCatalog,
  sessionToRealCatalogEntry,
  preflightCatalogEntry,
  // Replay execution and result handling.
  runReplayEntry,
  catalogEntryToReplaySession,
  defaultReplayTimeoutMs,
  classifyFailure,
  summarizeResults,
  writeResultArtifact,
  loadCatalog,
  // Provider checks and client creation.
  preflightProvider,
  createProviderClient,
  // Prompt and edit-path filters.
  isStandaloneCodingPrompt,
  isReplayConversationSelfContained,
  hasOnlyRepoRelativeEdits,
};