create-walle 0.9.21 → 0.9.23

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (500) hide show
  1. package/README.md +27 -5
  2. package/package.json +2 -2
  3. package/template/CLAUDE.md +2 -2
  4. package/template/LICENSE +1 -1
  5. package/template/bin/ctm-dev-cleanup.js +24 -3
  6. package/template/bin/ctm-launch.sh +13 -0
  7. package/template/bin/dev.sh +156 -18
  8. package/template/bin/node-bin.sh +84 -0
  9. package/template/bin/pin-node.sh +51 -0
  10. package/template/claude-task-manager/api-prompts.js +1203 -182
  11. package/template/claude-task-manager/api-reviews.js +109 -15
  12. package/template/claude-task-manager/approval-agent.js +1360 -280
  13. package/template/claude-task-manager/bin/restart-ctm.sh +64 -23
  14. package/template/claude-task-manager/bin/storage-migration-supervisor.js +338 -0
  15. package/template/claude-task-manager/db.js +4417 -295
  16. package/template/claude-task-manager/docs/app-update-refresh-protocol.md +69 -0
  17. package/template/claude-task-manager/docs/approval-ai-refinement.md +138 -0
  18. package/template/claude-task-manager/docs/approval-rescue-loop.md +74 -0
  19. package/template/claude-task-manager/docs/codex-operational-warning-health.md +107 -0
  20. package/template/claude-task-manager/docs/codex-resume-state-guard-design.md +17 -12
  21. package/template/claude-task-manager/docs/codex-terminal-render-controller-handoff.md +311 -0
  22. package/template/claude-task-manager/docs/coding-agent-hooks-architecture.md +418 -0
  23. package/template/claude-task-manager/docs/conversation-import-freshness.md +20 -0
  24. package/template/claude-task-manager/docs/google-workspace-auth-health.md +77 -0
  25. package/template/claude-task-manager/docs/image-paste-ux.md +13 -0
  26. package/template/claude-task-manager/docs/ipad-web-preview.md +88 -0
  27. package/template/claude-task-manager/docs/main-loop-offload-architecture.md +66 -0
  28. package/template/claude-task-manager/docs/microsoft-dev-tunnel-phone-access-design.md +274 -519
  29. package/template/claude-task-manager/docs/mobile-live-streaming.md +27 -5
  30. package/template/claude-task-manager/docs/mobile-remote-submission-lifecycle.md +69 -0
  31. package/template/claude-task-manager/docs/phone-access-design.md +53 -15
  32. package/template/claude-task-manager/docs/phone-passkey-identity.md +122 -0
  33. package/template/claude-task-manager/docs/phone-setup.md +3 -0
  34. package/template/claude-task-manager/docs/prompt-editing-tree-design.md +25 -1
  35. package/template/claude-task-manager/docs/remote-desktop-access-design.md +268 -0
  36. package/template/claude-task-manager/docs/restart-lifecycle-architecture.md +95 -0
  37. package/template/claude-task-manager/docs/runtime-work-control-plane.md +53 -0
  38. package/template/claude-task-manager/docs/session-interactive-wait-surfaces.md +38 -0
  39. package/template/claude-task-manager/docs/session-needs-you-dismissal.md +84 -0
  40. package/template/claude-task-manager/docs/session-render-state-management-design.md +91 -3
  41. package/template/claude-task-manager/docs/session-standup-command-center-design.md +25 -1
  42. package/template/claude-task-manager/docs/session-title-authority.md +32 -0
  43. package/template/claude-task-manager/docs/session-workspace-binding.md +33 -0
  44. package/template/claude-task-manager/docs/skill-intent-resolution-design.md +72 -0
  45. package/template/claude-task-manager/docs/walle-mcp-supervisor-health.md +86 -0
  46. package/template/claude-task-manager/docs/walle-relay-phone-access-design.md +24 -15
  47. package/template/claude-task-manager/docs/walle-session-history-hydration.md +114 -0
  48. package/template/claude-task-manager/docs/walle-session-input-queue.md +104 -0
  49. package/template/claude-task-manager/docs/walle-session-model-catalog.md +90 -0
  50. package/template/claude-task-manager/docs/walle-session-model-preferences.md +15 -6
  51. package/template/claude-task-manager/git-utils.js +897 -27
  52. package/template/claude-task-manager/lib/agent-capabilities.js +33 -0
  53. package/template/claude-task-manager/lib/agent-cli-cache.js +37 -7
  54. package/template/claude-task-manager/lib/agent-hooks-installer.js +26 -2
  55. package/template/claude-task-manager/lib/agent-presets.js +17 -1
  56. package/template/claude-task-manager/lib/all-sessions-query.js +108 -0
  57. package/template/claude-task-manager/lib/approval-ai-refinement.js +488 -0
  58. package/template/claude-task-manager/lib/approval-self-adapt.js +168 -0
  59. package/template/claude-task-manager/lib/async-semaphore.js +44 -0
  60. package/template/claude-task-manager/lib/auth-context.js +5 -0
  61. package/template/claude-task-manager/lib/auth-rate-limit.js +47 -4
  62. package/template/claude-task-manager/lib/auth-rules.js +29 -2
  63. package/template/claude-task-manager/lib/auto-approval-verifier.js +129 -16
  64. package/template/claude-task-manager/lib/background-llm.js +144 -17
  65. package/template/claude-task-manager/lib/branch-inventory.js +212 -0
  66. package/template/claude-task-manager/lib/claude-desktop-sessions.js +15 -3
  67. package/template/claude-task-manager/lib/coalesce-sync-frames.js +151 -0
  68. package/template/claude-task-manager/lib/codex-launch-health.js +762 -0
  69. package/template/claude-task-manager/lib/codex-transcript-pager.js +51 -0
  70. package/template/claude-task-manager/lib/codex-zst.js +124 -0
  71. package/template/claude-task-manager/lib/coding-agent-models.js +233 -30
  72. package/template/claude-task-manager/lib/connection-health.js +232 -0
  73. package/template/claude-task-manager/lib/conversation-blob-parser.js +42 -0
  74. package/template/claude-task-manager/lib/conversation-tail-merge.js +89 -26
  75. package/template/claude-task-manager/lib/ctm-session-context-api.js +39 -10
  76. package/template/claude-task-manager/lib/cursor-conversation-store.js +354 -0
  77. package/template/claude-task-manager/lib/db-owner-worker-client.js +315 -0
  78. package/template/claude-task-manager/lib/document-review.js +141 -6
  79. package/template/claude-task-manager/lib/escalation-review.js +152 -0
  80. package/template/claude-task-manager/lib/graceful-shutdown.js +159 -0
  81. package/template/claude-task-manager/lib/headless-term-service.js +678 -0
  82. package/template/claude-task-manager/lib/heavy-worker-fallback.js +38 -0
  83. package/template/claude-task-manager/lib/jsonl-conversation-parser.js +542 -0
  84. package/template/claude-task-manager/lib/jsonl-range-reader.js +112 -0
  85. package/template/claude-task-manager/lib/main-db-census.js +216 -0
  86. package/template/claude-task-manager/lib/message-pagination.js +106 -4
  87. package/template/claude-task-manager/lib/microsoft-dev-tunnel-setup.js +750 -26
  88. package/template/claude-task-manager/lib/mobile-auth-api.js +274 -7
  89. package/template/claude-task-manager/lib/mobile-auth-store.js +592 -10
  90. package/template/claude-task-manager/lib/mobile-notification-dispatcher.js +15 -0
  91. package/template/claude-task-manager/lib/model-overview-brain-fallback.js +311 -0
  92. package/template/claude-task-manager/lib/model-overview-cache.js +141 -0
  93. package/template/claude-task-manager/lib/models-health-routing-notice.js +126 -0
  94. package/template/claude-task-manager/lib/node-pin-guard.js +93 -0
  95. package/template/claude-task-manager/lib/perf-tracker.js +242 -6
  96. package/template/claude-task-manager/lib/permission-match.js +76 -0
  97. package/template/claude-task-manager/lib/permission-sync.js +133 -20
  98. package/template/claude-task-manager/lib/process-title.js +35 -0
  99. package/template/claude-task-manager/lib/prompt-executions-query.js +25 -0
  100. package/template/claude-task-manager/lib/prompt-index-disk-cache.js +44 -0
  101. package/template/claude-task-manager/lib/prompt-intent.js +132 -0
  102. package/template/claude-task-manager/lib/provider-user-context.js +34 -0
  103. package/template/claude-task-manager/lib/read-pool-client.js +313 -0
  104. package/template/claude-task-manager/lib/readpool-breaker.js +31 -0
  105. package/template/claude-task-manager/lib/recent-sessions-breaker.js +12 -0
  106. package/template/claude-task-manager/lib/remote-feedback-client.js +72 -0
  107. package/template/claude-task-manager/lib/remote-relay-protocol.js +37 -4
  108. package/template/claude-task-manager/lib/remote-relay-store.js +159 -0
  109. package/template/claude-task-manager/lib/remote-submission-observer.js +278 -0
  110. package/template/claude-task-manager/lib/restart-guard.js +109 -0
  111. package/template/claude-task-manager/lib/restore-interruption-detector.js +439 -0
  112. package/template/claude-task-manager/lib/restore-policy.js +13 -0
  113. package/template/claude-task-manager/lib/restore-resume-batch.js +74 -0
  114. package/template/claude-task-manager/lib/restore-runtime.js +68 -0
  115. package/template/claude-task-manager/lib/restore-storm.js +34 -0
  116. package/template/claude-task-manager/lib/resume-cwd.js +36 -0
  117. package/template/claude-task-manager/lib/resume-preflight.js +313 -0
  118. package/template/claude-task-manager/lib/runtime-work-registry.js +444 -0
  119. package/template/claude-task-manager/lib/sanitize-openai-auth.js +31 -0
  120. package/template/claude-task-manager/lib/scheduler.js +21 -1
  121. package/template/claude-task-manager/lib/scrollback-snapshot-store.js +159 -0
  122. package/template/claude-task-manager/lib/serial-task-queue.js +64 -0
  123. package/template/claude-task-manager/lib/server-listeners.js +239 -0
  124. package/template/claude-task-manager/lib/session-capture.js +42 -7
  125. package/template/claude-task-manager/lib/session-content-backfill.js +131 -0
  126. package/template/claude-task-manager/lib/session-history.js +388 -43
  127. package/template/claude-task-manager/lib/session-host-manager.js +287 -0
  128. package/template/claude-task-manager/lib/session-image-refs.js +209 -0
  129. package/template/claude-task-manager/lib/session-jobs.js +399 -59
  130. package/template/claude-task-manager/lib/session-prompt-index.js +137 -0
  131. package/template/claude-task-manager/lib/session-restore.js +53 -0
  132. package/template/claude-task-manager/lib/session-standup.js +123 -23
  133. package/template/claude-task-manager/lib/session-state-bus.js +14 -0
  134. package/template/claude-task-manager/lib/session-stream.js +64 -16
  135. package/template/claude-task-manager/lib/session-timeline-summary.js +260 -0
  136. package/template/claude-task-manager/lib/session-token-usage.js +494 -0
  137. package/template/claude-task-manager/lib/session-workspace-binding.js +356 -0
  138. package/template/claude-task-manager/lib/setup-network-config.js +9 -0
  139. package/template/claude-task-manager/lib/size-cap.js +45 -0
  140. package/template/claude-task-manager/lib/size-cap.test.js +62 -0
  141. package/template/claude-task-manager/lib/skill-autocomplete.js +180 -1
  142. package/template/claude-task-manager/lib/skill-intent-resolver.js +304 -0
  143. package/template/claude-task-manager/lib/sqlite-driver.js +19 -3
  144. package/template/claude-task-manager/lib/standup-attention.js +7 -3
  145. package/template/claude-task-manager/lib/status-authority.js +39 -0
  146. package/template/claude-task-manager/lib/status-hooks.js +4 -0
  147. package/template/claude-task-manager/lib/storage-migration.js +235 -0
  148. package/template/claude-task-manager/lib/structured-capture.js +298 -0
  149. package/template/claude-task-manager/lib/sync-io-census.js +163 -0
  150. package/template/claude-task-manager/lib/tailscale-setup.js +6 -0
  151. package/template/claude-task-manager/lib/terminal-activity-evidence.js +33 -0
  152. package/template/claude-task-manager/lib/terminal-choice.js +364 -0
  153. package/template/claude-task-manager/lib/terminal-control-sanitize.js +17 -0
  154. package/template/claude-task-manager/lib/terminal-fingerprint.js +48 -0
  155. package/template/claude-task-manager/lib/terminal-output-flush.js +84 -0
  156. package/template/claude-task-manager/lib/timeline-order.js +122 -0
  157. package/template/claude-task-manager/lib/transcript-store.js +348 -43
  158. package/template/claude-task-manager/lib/transport-security.js +84 -1
  159. package/template/claude-task-manager/lib/wait-state.js +184 -0
  160. package/template/claude-task-manager/lib/walle-client.js +47 -5
  161. package/template/claude-task-manager/lib/walle-ctm-history.js +564 -4
  162. package/template/claude-task-manager/lib/walle-external-actions.js +135 -16
  163. package/template/claude-task-manager/lib/walle-history-hydration.js +46 -0
  164. package/template/claude-task-manager/lib/walle-native-health.js +403 -0
  165. package/template/claude-task-manager/lib/walle-repair.js +701 -0
  166. package/template/claude-task-manager/lib/walle-session-cache.js +109 -0
  167. package/template/claude-task-manager/lib/walle-session-context.js +57 -21
  168. package/template/claude-task-manager/lib/walle-session-model-catalog.js +34 -0
  169. package/template/claude-task-manager/lib/walle-supervisor.js +539 -63
  170. package/template/claude-task-manager/lib/walle-transcript.js +52 -0
  171. package/template/claude-task-manager/lib/worktree-active-sync.js +11 -7
  172. package/template/claude-task-manager/lib/worktree-cwd.js +32 -1
  173. package/template/claude-task-manager/package.json +1 -1
  174. package/template/claude-task-manager/prompt-harvest.js +89 -66
  175. package/template/claude-task-manager/providers/claude-code.js +51 -3
  176. package/template/claude-task-manager/providers/cursor.js +140 -45
  177. package/template/claude-task-manager/public/css/reviews.css +551 -61
  178. package/template/claude-task-manager/public/css/setup.css +191 -0
  179. package/template/claude-task-manager/public/css/walle-session.css +865 -10
  180. package/template/claude-task-manager/public/css/walle.css +154 -0
  181. package/template/claude-task-manager/public/designs/ai-providers-consolidation-v2.html +830 -0
  182. package/template/claude-task-manager/public/index.html +18516 -2058
  183. package/template/claude-task-manager/public/ipad.html +363 -0
  184. package/template/claude-task-manager/public/js/document-review-links.js +301 -0
  185. package/template/claude-task-manager/public/js/image-normalize.js +69 -36
  186. package/template/claude-task-manager/public/js/message-renderer.js +1265 -77
  187. package/template/claude-task-manager/public/js/prompts.js +66 -29
  188. package/template/claude-task-manager/public/js/reviews.js +901 -133
  189. package/template/claude-task-manager/public/js/session-activity-utils.js +11 -1
  190. package/template/claude-task-manager/public/js/session-search-utils.js +94 -10
  191. package/template/claude-task-manager/public/js/session-status-precedence.js +23 -5
  192. package/template/claude-task-manager/public/js/setup.js +1273 -176
  193. package/template/claude-task-manager/public/js/stream-view.js +691 -73
  194. package/template/claude-task-manager/public/js/terminal-reconciler.js +210 -0
  195. package/template/claude-task-manager/public/js/walle-session.js +2455 -158
  196. package/template/claude-task-manager/public/js/walle.js +455 -28
  197. package/template/claude-task-manager/public/m/app.css +2909 -262
  198. package/template/claude-task-manager/public/m/app.js +6601 -398
  199. package/template/claude-task-manager/public/m/claim.html +224 -17
  200. package/template/claude-task-manager/public/m/index.html +117 -21
  201. package/template/claude-task-manager/public/m/sw.js +3 -1
  202. package/template/claude-task-manager/public/manifest.json +2 -2
  203. package/template/claude-task-manager/public/prompts.html +30 -14
  204. package/template/claude-task-manager/queue-engine.js +507 -28
  205. package/template/claude-task-manager/scripts/repair-claude-session-images.js +27 -8
  206. package/template/claude-task-manager/server.js +14341 -2197
  207. package/template/claude-task-manager/session-integrity.js +160 -18
  208. package/template/claude-task-manager/session-search-ranking.js +1 -0
  209. package/template/claude-task-manager/session-utils.js +25 -5
  210. package/template/claude-task-manager/workers/approval-blocklist.js +96 -6
  211. package/template/claude-task-manager/workers/approval-widget-validator.js +14 -8
  212. package/template/claude-task-manager/workers/conversation-import-worker.js +11 -50
  213. package/template/claude-task-manager/workers/db-owner-worker.js +386 -0
  214. package/template/claude-task-manager/workers/harvest-worker.js +9 -55
  215. package/template/claude-task-manager/workers/headless-term-worker.js +9 -530
  216. package/template/claude-task-manager/workers/read-pool-worker.js +387 -0
  217. package/template/claude-task-manager/workers/scrollback-worker.js +11 -72
  218. package/template/claude-task-manager/workers/session-host-process.js +146 -0
  219. package/template/claude-task-manager/workers/session-integrity-worker.js +10 -54
  220. package/template/claude-task-manager/workers/state-detectors/base.js +18 -1
  221. package/template/claude-task-manager/workers/state-detectors/claude-code.js +182 -9
  222. package/template/claude-task-manager/workers/state-detectors/codex.js +150 -2
  223. package/template/claude-task-manager/workers/state-detectors/cursor.js +127 -0
  224. package/template/claude-task-manager/workers/state-detectors/gemini.js +21 -0
  225. package/template/claude-task-manager/workers/state-detectors/index.js +29 -0
  226. package/template/claude-task-manager/workers/state-detectors/opencode.js +103 -0
  227. package/template/docs/design/markdown-review-pane.md +206 -0
  228. package/template/docs/designs/2026-05-17-portkey-gateway-provider-ux.md +129 -38
  229. package/template/docs/designs/2026-05-20-mobile-worktree-finish-command.md +27 -0
  230. package/template/docs/designs/2026-05-22-ai-configuration-consolidation.md +248 -0
  231. package/template/docs/designs/ai-configuration-consolidation-mock.html +812 -0
  232. package/template/docs/private-memory-and-pii-policy.md +69 -0
  233. package/template/package.json +2 -1
  234. package/template/scripts/check-private-data.js +201 -0
  235. package/template/shared/sqlite-owner-guard.js +30 -0
  236. package/template/shared/sqlite-owner-write-queue.js +225 -0
  237. package/template/shared/sqlite-storage-policy.js +111 -0
  238. package/template/shared/sqlite-write-lock.js +428 -0
  239. package/template/wall-e/agent-runners/claude-code.js +5 -0
  240. package/template/wall-e/agent.js +166 -22
  241. package/template/wall-e/api-walle.js +524 -70
  242. package/template/wall-e/auth/provider-flows.js +11 -1
  243. package/template/wall-e/bin/walle-mcp-stdio.js +341 -17
  244. package/template/wall-e/brain.js +1614 -141
  245. package/template/wall-e/chat/attachment-blocks.js +96 -0
  246. package/template/wall-e/chat/attachments.js +2 -1
  247. package/template/wall-e/chat/capability-resolver.js +7 -7
  248. package/template/wall-e/chat/context-messages.js +28 -0
  249. package/template/wall-e/chat/conversation-frame.js +630 -0
  250. package/template/wall-e/chat/provider-messages.js +125 -0
  251. package/template/wall-e/chat.js +1002 -233
  252. package/template/wall-e/coding/acceptance-contract.js +170 -0
  253. package/template/wall-e/coding/acp-adapter.js +1 -1
  254. package/template/wall-e/coding/agent-catalog.js +3 -0
  255. package/template/wall-e/coding/artifact-store.js +93 -0
  256. package/template/wall-e/coding/capability-router.js +120 -0
  257. package/template/wall-e/coding/coding-run-controller.js +423 -0
  258. package/template/wall-e/coding/compaction-service.js +157 -12
  259. package/template/wall-e/coding/frontend-verification.js +258 -0
  260. package/template/wall-e/coding/lifecycle-hooks.js +75 -0
  261. package/template/wall-e/coding/local-preview-contract.js +157 -0
  262. package/template/wall-e/coding/permission-service.js +57 -13
  263. package/template/wall-e/coding/prompt-bundle.js +19 -1
  264. package/template/wall-e/coding/prompt-section-registry.js +227 -0
  265. package/template/wall-e/coding/provider-compat.js +15 -0
  266. package/template/wall-e/coding/runtime-events.js +224 -0
  267. package/template/wall-e/coding/runtime-mode.js +3 -0
  268. package/template/wall-e/coding/side-git-snapshot.js +160 -4
  269. package/template/wall-e/coding/snapshot-service.js +143 -1
  270. package/template/wall-e/coding/stream-processor.js +388 -34
  271. package/template/wall-e/coding/task-tool.js +141 -4
  272. package/template/wall-e/coding/tool-execution-controller.js +365 -0
  273. package/template/wall-e/coding/tool-registry.js +43 -5
  274. package/template/wall-e/coding/user-hooks.js +217 -0
  275. package/template/wall-e/coding-orchestrator.js +1330 -221
  276. package/template/wall-e/coding-prompts.js +20 -4
  277. package/template/wall-e/context/context-builder.js +15 -2
  278. package/template/wall-e/decision/confidence.js +1 -1
  279. package/template/wall-e/docs/coding-acceptance-contract.md +41 -0
  280. package/template/wall-e/docs/external-action-controller.md +26 -6
  281. package/template/wall-e/docs/telemetry-lifecycle.md +8 -2
  282. package/template/wall-e/embeddings.js +591 -53
  283. package/template/wall-e/external-action-controller.js +12 -0
  284. package/template/wall-e/http/auth.js +1 -0
  285. package/template/wall-e/http/chat-api.js +46 -11
  286. package/template/wall-e/http/model-admin.js +836 -34
  287. package/template/wall-e/lib/boot-profile.js +88 -0
  288. package/template/wall-e/lib/event-loop-monitor.js +93 -0
  289. package/template/wall-e/lib/service-health.js +194 -0
  290. package/template/wall-e/llm/anthropic.js +130 -5
  291. package/template/wall-e/llm/client.js +266 -63
  292. package/template/wall-e/llm/default-fallback.js +382 -0
  293. package/template/wall-e/llm/health.js +19 -0
  294. package/template/wall-e/llm/message-guard.js +78 -0
  295. package/template/wall-e/llm/model-catalog.js +252 -1
  296. package/template/wall-e/llm/openai.js +26 -4
  297. package/template/wall-e/llm/portkey-sync.js +654 -0
  298. package/template/wall-e/llm/provider-error.js +30 -2
  299. package/template/wall-e/llm/registry.js +5 -1
  300. package/template/wall-e/llm/request-compat.js +67 -0
  301. package/template/wall-e/loops/backfill.js +79 -23
  302. package/template/wall-e/loops/brain-optimize.js +67 -0
  303. package/template/wall-e/loops/ingest.js +25 -10
  304. package/template/wall-e/loops/question-digest.js +160 -0
  305. package/template/wall-e/loops/reflect.js +6 -4
  306. package/template/wall-e/loops/think.js +39 -12
  307. package/template/wall-e/mcp-server.js +318 -36
  308. package/template/wall-e/memory/ctm-context-client.js +52 -14
  309. package/template/wall-e/memory/ctm-operational-context.js +237 -0
  310. package/template/wall-e/memory/ctm-prompt-executions-client.js +128 -0
  311. package/template/wall-e/memory/ctm-session-context.js +111 -63
  312. package/template/wall-e/prompts/coding/deepseek.txt +3 -0
  313. package/template/wall-e/prompts/coding/gemini.txt +6 -0
  314. package/template/wall-e/prompts/coding/gpt.txt +6 -0
  315. package/template/wall-e/prompts/coding/local.txt +7 -0
  316. package/template/wall-e/runtime/decision-hooks.js +115 -0
  317. package/template/wall-e/runtime/devbox-gateway.js +82 -8
  318. package/template/wall-e/runtime/prompt-manifest.js +86 -0
  319. package/template/wall-e/runtime/tool-executor.js +269 -0
  320. package/template/wall-e/runtime/tool-result-envelope.js +138 -0
  321. package/template/wall-e/runtime/transcript-projection.js +60 -0
  322. package/template/wall-e/runtime/walle-runtime.js +224 -0
  323. package/template/wall-e/scripts/db-optimize/migrate.js +162 -0
  324. package/template/wall-e/scripts/db-optimize/recall-eval.js +117 -0
  325. package/template/wall-e/server.js +15 -0
  326. package/template/wall-e/session-files.js +9 -0
  327. package/template/wall-e/skills/_bundled/google-calendar/run.js +1 -1
  328. package/template/wall-e/skills/_bundled/gws-workspace/run.js +1 -1
  329. package/template/wall-e/skills/_bundled/slack-mentions/run.js +76 -6
  330. package/template/wall-e/skills/claude-code-reader.js +7 -3
  331. package/template/wall-e/skills/script-skill-runner.js +10 -0
  332. package/template/wall-e/skills/skill-planner.js +38 -0
  333. package/template/wall-e/tools/builtin-middleware.js +19 -9
  334. package/template/wall-e/tools/local-tools.js +1428 -16
  335. package/template/wall-e/tools/permission-checker.js +73 -5
  336. package/template/wall-e/tools/question-manager.js +117 -7
  337. package/template/wall-e/training/harvester.js +12 -28
  338. package/template/wall-e/training/replay.js +25 -80
  339. package/template/website/index.html +10 -10
  340. package/template/wall-e/eval/ab-test.js +0 -203
  341. package/template/wall-e/eval/agent-runner.js +0 -772
  342. package/template/wall-e/eval/agent-scorer.js +0 -461
  343. package/template/wall-e/eval/aggregator.js +0 -414
  344. package/template/wall-e/eval/allowed-test-commands.js +0 -34
  345. package/template/wall-e/eval/benchmark-generator.js +0 -113
  346. package/template/wall-e/eval/benchmarks/chat-eval.json +0 -1662
  347. package/template/wall-e/eval/benchmarks/chat.json +0 -82
  348. package/template/wall-e/eval/benchmarks/coding-agent-real.json +0 -1
  349. package/template/wall-e/eval/benchmarks/coding-agent.json +0 -1581
  350. package/template/wall-e/eval/benchmarks/coding.json +0 -122
  351. package/template/wall-e/eval/benchmarks/memory-retrieval.json +0 -234
  352. package/template/wall-e/eval/benchmarks/reasoning.json +0 -82
  353. package/template/wall-e/eval/benchmarks/swebench-lite-30.json +0 -212
  354. package/template/wall-e/eval/benchmarks.js +0 -669
  355. package/template/wall-e/eval/cc-replay.js +0 -719
  356. package/template/wall-e/eval/chat-eval.js +0 -525
  357. package/template/wall-e/eval/check-keys.js +0 -15
  358. package/template/wall-e/eval/check-providers.js +0 -42
  359. package/template/wall-e/eval/codex-cli-baseline.js +0 -669
  360. package/template/wall-e/eval/coding-agent-real.js +0 -570
  361. package/template/wall-e/eval/context-compactor.js +0 -251
  362. package/template/wall-e/eval/debug-agent003.js +0 -68
  363. package/template/wall-e/eval/diagnostics.js +0 -216
  364. package/template/wall-e/eval/eval-orchestrator.js +0 -642
  365. package/template/wall-e/eval/evaluate.js +0 -202
  366. package/template/wall-e/eval/evaluator.js +0 -373
  367. package/template/wall-e/eval/exporter.js +0 -212
  368. package/template/wall-e/eval/fixtures/express-basic/package.json +0 -9
  369. package/template/wall-e/eval/fixtures/express-basic/server.js +0 -115
  370. package/template/wall-e/eval/fixtures/express-basic/test.js +0 -83
  371. package/template/wall-e/eval/fixtures/express-buggy/package.json +0 -9
  372. package/template/wall-e/eval/fixtures/express-buggy/server.js +0 -113
  373. package/template/wall-e/eval/fixtures/express-buggy/test.js +0 -83
  374. package/template/wall-e/eval/fixtures/express-buggy-items/package.json +0 -9
  375. package/template/wall-e/eval/fixtures/express-buggy-items/server.js +0 -112
  376. package/template/wall-e/eval/fixtures/express-buggy-items/test.js +0 -83
  377. package/template/wall-e/eval/fixtures/express-buggy-search/package.json +0 -9
  378. package/template/wall-e/eval/fixtures/express-buggy-search/server.js +0 -121
  379. package/template/wall-e/eval/fixtures/express-buggy-search/test.js +0 -83
  380. package/template/wall-e/eval/fixtures/express-rename-data/data.js +0 -34
  381. package/template/wall-e/eval/fixtures/express-rename-data/package.json +0 -9
  382. package/template/wall-e/eval/fixtures/express-rename-data/server.js +0 -97
  383. package/template/wall-e/eval/fixtures/express-rename-data/test.js +0 -88
  384. package/template/wall-e/eval/fixtures/express-xss/package.json +0 -12
  385. package/template/wall-e/eval/fixtures/express-xss/server.js +0 -90
  386. package/template/wall-e/eval/fixtures/express-xss/test.js +0 -67
  387. package/template/wall-e/eval/fixtures/express-xss/views/profile.ejs +0 -9
  388. package/template/wall-e/eval/fixtures/fullstack-app/config/default.js +0 -9
  389. package/template/wall-e/eval/fixtures/fullstack-app/config/test.js +0 -13
  390. package/template/wall-e/eval/fixtures/fullstack-app/package.json +0 -11
  391. package/template/wall-e/eval/fixtures/fullstack-app/public/css/style.css +0 -137
  392. package/template/wall-e/eval/fixtures/fullstack-app/public/index.html +0 -46
  393. package/template/wall-e/eval/fixtures/fullstack-app/public/js/app.js +0 -121
  394. package/template/wall-e/eval/fixtures/fullstack-app/public/js/auth.js +0 -71
  395. package/template/wall-e/eval/fixtures/fullstack-app/public/js/items.js +0 -80
  396. package/template/wall-e/eval/fixtures/fullstack-app/public/js/users.js +0 -46
  397. package/template/wall-e/eval/fixtures/fullstack-app/public/login.html +0 -45
  398. package/template/wall-e/eval/fixtures/fullstack-app/public/register.html +0 -38
  399. package/template/wall-e/eval/fixtures/fullstack-app/scripts/migrate.js +0 -23
  400. package/template/wall-e/eval/fixtures/fullstack-app/scripts/seed.js +0 -46
  401. package/template/wall-e/eval/fixtures/fullstack-app/server/db.js +0 -99
  402. package/template/wall-e/eval/fixtures/fullstack-app/server/index.js +0 -94
  403. package/template/wall-e/eval/fixtures/fullstack-app/server/middleware/auth.js +0 -19
  404. package/template/wall-e/eval/fixtures/fullstack-app/server/middleware/logger.js +0 -19
  405. package/template/wall-e/eval/fixtures/fullstack-app/server/router.js +0 -50
  406. package/template/wall-e/eval/fixtures/fullstack-app/server/routes/auth.js +0 -69
  407. package/template/wall-e/eval/fixtures/fullstack-app/server/routes/health.js +0 -23
  408. package/template/wall-e/eval/fixtures/fullstack-app/server/routes/items.js +0 -88
  409. package/template/wall-e/eval/fixtures/fullstack-app/server/routes/users.js +0 -75
  410. package/template/wall-e/eval/fixtures/fullstack-app/server/test.js +0 -198
  411. package/template/wall-e/eval/fixtures/fullstack-app/server/utils/response.js +0 -34
  412. package/template/wall-e/eval/fixtures/fullstack-app/server/utils/validate.js +0 -26
  413. package/template/wall-e/eval/fixtures/fullstack-app/server.js +0 -8
  414. package/template/wall-e/eval/fixtures/fullstack-app/test.js +0 -12
  415. package/template/wall-e/eval/fixtures/monorepo-basic/package.json +0 -8
  416. package/template/wall-e/eval/fixtures/monorepo-basic/packages/api/data.js +0 -58
  417. package/template/wall-e/eval/fixtures/monorepo-basic/packages/api/middleware.js +0 -46
  418. package/template/wall-e/eval/fixtures/monorepo-basic/packages/api/package.json +0 -8
  419. package/template/wall-e/eval/fixtures/monorepo-basic/packages/api/routes.js +0 -64
  420. package/template/wall-e/eval/fixtures/monorepo-basic/packages/api/server.js +0 -56
  421. package/template/wall-e/eval/fixtures/monorepo-basic/packages/api/test.js +0 -116
  422. package/template/wall-e/eval/fixtures/monorepo-basic/packages/cli/commands.js +0 -61
  423. package/template/wall-e/eval/fixtures/monorepo-basic/packages/cli/index.js +0 -62
  424. package/template/wall-e/eval/fixtures/monorepo-basic/packages/cli/output.js +0 -43
  425. package/template/wall-e/eval/fixtures/monorepo-basic/packages/cli/package.json +0 -11
  426. package/template/wall-e/eval/fixtures/monorepo-basic/packages/cli/test.js +0 -44
  427. package/template/wall-e/eval/fixtures/monorepo-basic/packages/shared/formatters.js +0 -43
  428. package/template/wall-e/eval/fixtures/monorepo-basic/packages/shared/index.js +0 -12
  429. package/template/wall-e/eval/fixtures/monorepo-basic/packages/shared/package.json +0 -5
  430. package/template/wall-e/eval/fixtures/monorepo-basic/packages/shared/test.js +0 -55
  431. package/template/wall-e/eval/fixtures/monorepo-basic/packages/shared/validators.js +0 -29
  432. package/template/wall-e/eval/fixtures/monorepo-basic/test.js +0 -46
  433. package/template/wall-e/eval/fixtures/node-cli/index.js +0 -78
  434. package/template/wall-e/eval/fixtures/node-cli/package.json +0 -10
  435. package/template/wall-e/eval/fixtures/node-cli/test.js +0 -57
  436. package/template/wall-e/eval/fixtures/node-typed/package.json +0 -8
  437. package/template/wall-e/eval/fixtures/node-typed/src/handlers.js +0 -31
  438. package/template/wall-e/eval/fixtures/node-typed/src/utils.js +0 -33
  439. package/template/wall-e/eval/fixtures/node-typed/test.js +0 -36
  440. package/template/wall-e/eval/fixtures/python-flask/app.py +0 -14
  441. package/template/wall-e/eval/fixtures/python-flask/requirements.txt +0 -2
  442. package/template/wall-e/eval/fixtures/python-flask/test_app.py +0 -25
  443. package/template/wall-e/eval/fixtures/wall-e-subset/brain.js +0 -105
  444. package/template/wall-e/eval/fixtures/wall-e-subset/eval/aggregator.js +0 -101
  445. package/template/wall-e/eval/fixtures/wall-e-subset/eval/benchmarks/chat.json +0 -20
  446. package/template/wall-e/eval/fixtures/wall-e-subset/eval/benchmarks/coding.json +0 -32
  447. package/template/wall-e/eval/fixtures/wall-e-subset/eval/benchmarks.js +0 -64
  448. package/template/wall-e/eval/fixtures/wall-e-subset/eval/fixtures/simple-project/package.json +0 -6
  449. package/template/wall-e/eval/fixtures/wall-e-subset/eval/fixtures/simple-project/server.js +0 -31
  450. package/template/wall-e/eval/fixtures/wall-e-subset/eval/fixtures/simple-project/test.js +0 -18
  451. package/template/wall-e/eval/fixtures/wall-e-subset/eval/fixtures/simple-project/utils.js +0 -34
  452. package/template/wall-e/eval/fixtures/wall-e-subset/eval/runner.js +0 -104
  453. package/template/wall-e/eval/fixtures/wall-e-subset/eval/scorer.js +0 -73
  454. package/template/wall-e/eval/fixtures/wall-e-subset/eval/test.js +0 -134
  455. package/template/wall-e/eval/fixtures/wall-e-subset/llm/client.js +0 -99
  456. package/template/wall-e/eval/fixtures/wall-e-subset/llm/providers.js +0 -63
  457. package/template/wall-e/eval/fixtures/wall-e-subset/llm/test.js +0 -70
  458. package/template/wall-e/eval/fixtures/wall-e-subset/package.json +0 -10
  459. package/template/wall-e/eval/fixtures/wall-e-subset/test.js +0 -86
  460. package/template/wall-e/eval/harvester.js +0 -685
  461. package/template/wall-e/eval/head-to-head.js +0 -388
  462. package/template/wall-e/eval/humaneval-adapter.js +0 -321
  463. package/template/wall-e/eval/list-models.js +0 -31
  464. package/template/wall-e/eval/livecodebench-adapter.js +0 -291
  465. package/template/wall-e/eval/mail-integration.js +0 -443
  466. package/template/wall-e/eval/manifest.js +0 -186
  467. package/template/wall-e/eval/meta-harness/adapters/coding-agent.js +0 -57
  468. package/template/wall-e/eval/meta-harness/bootstrap-snapshot.js +0 -149
  469. package/template/wall-e/eval/meta-harness/candidate-store.js +0 -117
  470. package/template/wall-e/eval/meta-harness/cli.js +0 -86
  471. package/template/wall-e/eval/meta-harness/domain-spec.js +0 -154
  472. package/template/wall-e/eval/meta-harness/domains/coding-agent.domain.json +0 -84
  473. package/template/wall-e/eval/meta-harness/examples/env-bootstrap-candidate.js +0 -29
  474. package/template/wall-e/eval/meta-harness/experience-store.js +0 -174
  475. package/template/wall-e/eval/meta-harness/frontier.js +0 -96
  476. package/template/wall-e/eval/meta-harness/harness-interface.js +0 -90
  477. package/template/wall-e/eval/meta-harness/leakage-guard.js +0 -80
  478. package/template/wall-e/eval/meta-harness/optimizer.js +0 -207
  479. package/template/wall-e/eval/meta-harness/proposer-runner.js +0 -110
  480. package/template/wall-e/eval/meta-harness/reporting.js +0 -58
  481. package/template/wall-e/eval/meta-harness/telemetry.js +0 -27
  482. package/template/wall-e/eval/meta-harness/validation.js +0 -81
  483. package/template/wall-e/eval/promoter.js +0 -228
  484. package/template/wall-e/eval/provider-normalizer.js +0 -33
  485. package/template/wall-e/eval/replay.js +0 -395
  486. package/template/wall-e/eval/run-agent-benchmarks.js +0 -386
  487. package/template/wall-e/eval/run-codex-cli-baseline.js +0 -177
  488. package/template/wall-e/eval/run-coding-agent-real.js +0 -187
  489. package/template/wall-e/eval/run-eval.js +0 -435
  490. package/template/wall-e/eval/run-model-comparison.js +0 -142
  491. package/template/wall-e/eval/session-evaluator.js +0 -187
  492. package/template/wall-e/eval/session-miner.js +0 -207
  493. package/template/wall-e/eval/session-retrieval-benchmark.js +0 -150
  494. package/template/wall-e/eval/session-transcripts.js +0 -509
  495. package/template/wall-e/eval/shadow.js +0 -161
  496. package/template/wall-e/eval/swebench-adapter.js +0 -345
  497. package/template/wall-e/eval/swebench-docker.js +0 -192
  498. package/template/wall-e/eval/train.py +0 -320
  499. package/template/wall-e/eval/trainer.js +0 -232
  500. package/template/wall-e/eval/weekly-eval-loop.js +0 -241
@@ -1,251 +0,0 @@
1
- 'use strict';
2
-
3
- /**
4
- * Smart context compaction for the coding agent.
5
- * Replaces blunt character truncation with semantic compaction.
6
- * Prioritizes: errors > test results > file reads > tool outputs > successes.
7
- */
8
-
9
/**
 * Retention score per message category: the higher the score, the longer a
 * message survives compaction. The table is exported and shared by every
 * caller, so it is frozen to keep one consumer from silently re-ranking
 * categories for all the others.
 */
const PRIORITY = Object.freeze({
  ERROR: 10,
  TEST_RESULT: 9,
  DIAGNOSTIC: 8,
  FILE_READ: 5,
  TOOL_OUTPUT: 4,
  PLAN: 6,
  SUCCESS_MSG: 2,
  ASSISTANT_TEXT: 3,
  DEFAULT: 3,
});
20
-
21
/**
 * Classify a message by its content to determine compaction priority.
 *
 * Checks run in a fixed order and the first match wins: error text, test
 * output from a shell tool, diagnostics, planning, file reads, any other
 * tool output, short success notes, assistant text, then the default.
 *
 * @param {object} msg - { role, content, toolCall?, tool_use? }. Non-string
 *   content is matched against its JSON serialisation.
 * @returns {number} Priority score (higher = keep longer). Malformed entries
 *   (null, undefined, primitives) get PRIORITY.DEFAULT instead of throwing.
 */
function classifyMessage(msg) {
  if (msg === null || typeof msg !== 'object') return PRIORITY.DEFAULT;

  // JSON.stringify returns undefined for unserialisable content (e.g. a
  // function); fall back to an empty string so the checks below stay safe.
  const content = typeof msg.content === 'string'
    ? msg.content
    : JSON.stringify(msg.content || '') ?? '';
  const toolName = msg.toolCall?.name || msg.tool_use?.name || '';

  // Errors are highest priority — always keep. With the `i` flag these three
  // alternatives also cover "FAIL", "failed", "TypeError", "SyntaxError", etc.
  if (/error|fail|exception/i.test(content)) {
    return PRIORITY.ERROR;
  }

  // Test results: only when the output came from a shell tool
  if (/test|PASS|FAIL|✓|✗|assertion|expect/i.test(content) && /run_shell/.test(toolName)) {
    return PRIORITY.TEST_RESULT;
  }

  // LSP diagnostics
  if (/diagnostic|lsp_diagnostics|warning.*line/i.test(content) || toolName === 'lsp_diagnostics') {
    return PRIORITY.DIAGNOSTIC;
  }

  // Planning
  if (toolName === 'update_todos' || /plan|step\s+\d|todo/i.test(content)) {
    return PRIORITY.PLAN;
  }

  // File reads
  if (/read_file|glob|grep_files|list_directory/.test(toolName)) {
    return PRIORITY.FILE_READ;
  }

  // General tool output
  if (toolName) {
    return PRIORITY.TOOL_OUTPUT;
  }

  // Success messages (low priority — can be summarized)
  if (/success|done|complete|finished|created|updated/i.test(content) && content.length < 200) {
    return PRIORITY.SUCCESS_MSG;
  }

  // Assistant text
  if (msg.role === 'assistant') {
    return PRIORITY.ASSISTANT_TEXT;
  }

  return PRIORITY.DEFAULT;
}
72
-
73
/**
 * Compact a message array to fit within a character budget.
 * Preserves high-priority messages and drops low-priority ones, leaving a
 * `[N messages compacted]` system marker wherever a run of messages was removed.
 *
 * Selection rules:
 *  - the first `keepFirst` and last `keepLast` messages are always kept, even
 *    if they alone exceed the budget;
 *  - the remaining messages are considered highest priority first (newest
 *    first among equals) and kept whole while they fit;
 *  - a message that does not fit but ranks at PRIORITY.PLAN or above is kept
 *    as a truncated excerpt, provided at least 200 characters of budget remain.
 *    Once less than that is left, nothing more is added, so the kept messages
 *    cannot grow past the budget (previously every remaining high-priority
 *    message contributed a 200-char excerpt regardless of the budget).
 *
 * Marker messages are not counted against the budget.
 *
 * @param {Array} messages - Array of { role, content, ... } messages
 * @param {number} budgetChars - Maximum total characters
 * @param {object} [options]
 * @param {number} [options.keepFirst] - Always keep first N messages (default: 2)
 * @param {number} [options.keepLast] - Always keep last N messages (default: 3)
 * @returns {Array} Compacted messages; the input array itself when it already fits
 */
function compactMessages(messages, budgetChars, options = {}) {
  const { keepFirst = 2, keepLast = 3 } = options;
  // Smallest excerpt still worth keeping when a message has to be truncated.
  const MIN_TRUNCATED_CHARS = 200;

  if (!messages || messages.length === 0) return [];

  const totalSize = messages.reduce((sum, m) => sum + messageSize(m), 0);
  if (totalSize <= budgetChars) return messages; // no compaction needed

  const tailStart = messages.length - keepLast;
  const scored = messages.map((msg, idx) => ({
    msg,
    idx,
    priority: classifyMessage(msg),
    size: messageSize(msg),
    protected: idx < keepFirst || idx >= tailStart,
  }));

  // idx -> message to emit (the original, or its truncated copy)
  const kept = new Map();
  let usedBudget = 0;
  for (const item of scored) {
    if (item.protected) {
      kept.set(item.idx, item.msg);
      usedBudget += item.size;
    }
  }

  // Highest priority first; among equals the most recent message wins.
  const candidates = scored
    .filter((item) => !item.protected)
    .sort((a, b) => b.priority - a.priority || b.idx - a.idx);

  for (const item of candidates) {
    const remaining = budgetChars - usedBudget;
    if (item.size <= remaining) {
      kept.set(item.idx, item.msg);
      usedBudget += item.size;
      continue;
    }
    // Only important messages earn a truncated excerpt, and only while there
    // is enough budget left for the excerpt to be meaningful.
    if (item.priority < PRIORITY.PLAN || remaining < MIN_TRUNCATED_CHARS) continue;

    const truncated = truncateMessage(item.msg, remaining);
    const truncatedSize = messageSize(truncated);
    if (truncatedSize <= remaining) {
      kept.set(item.idx, truncated);
      usedBudget += truncatedSize;
    }
  }

  const marker = (count) => ({
    role: 'system',
    content: `[${count} message${count > 1 ? 's' : ''} compacted]`,
  });

  // Rebuild in original order, inserting a marker for each run of dropped messages.
  const result = [];
  let droppedCount = 0;
  for (let i = 0; i < messages.length; i++) {
    if (!kept.has(i)) {
      droppedCount++;
      continue;
    }
    if (droppedCount > 0) {
      result.push(marker(droppedCount));
      droppedCount = 0;
    }
    result.push(kept.get(i));
  }
  if (droppedCount > 0) {
    result.push(marker(droppedCount));
  }

  return result;
}
160
-
161
/**
 * Compact a cumulative context string with semantic awareness.
 *
 * The context is split at `\n[Step N` markers. The last two sections are
 * always kept, then sections mentioning errors/failures, then any other
 * sections that still fit (newest first). Kept sections are re-joined in their
 * original order behind a `[N earlier steps compacted]` header. If the result
 * is still over budget (or the context has no step markers) it falls back to
 * keeping only the trailing `budgetChars` characters.
 *
 * @param {string} context - Raw cumulative context
 * @param {number} budgetChars - Max characters
 * @returns {string} Compacted context; an empty string when the budget is
 *   zero or negative and the context is non-empty
 */
function compactContext(context, budgetChars) {
  if (!context || context.length <= budgetChars) return context;

  // slice(-0) is slice(0) and would return the WHOLE string, and a negative
  // budget would slice from the front, so an exhausted budget is handled here.
  if (budgetChars <= 0) return '';

  // Split into sections by step markers
  const sections = context.split(/(?=\n\[Step \d+)/);
  if (sections.length <= 1) {
    return context.slice(-budgetChars);
  }

  const scored = sections.map((section, idx) => {
    let priority = PRIORITY.SUCCESS_MSG;
    if (/error|fail/i.test(section)) priority = PRIORITY.ERROR;
    else if (/test|PASS|FAIL/i.test(section)) priority = PRIORITY.TEST_RESULT;
    return { section, idx, priority, size: section.length };
  });

  const kept = new Set();
  let usedBudget = 0;
  const keep = (entry) => {
    kept.add(entry);
    usedBudget += entry.size;
  };

  // The last two sections are kept unconditionally, even when over budget;
  // the tail truncation at the end is the backstop for that case.
  scored.slice(-2).forEach(keep);

  const earlier = scored.slice(0, -2);

  // Error sections get first claim on the remaining budget
  for (const entry of earlier) {
    if (entry.priority >= PRIORITY.ERROR && usedBudget + entry.size <= budgetChars) {
      keep(entry);
    }
  }

  // Fill what is left with the other sections, newest first
  for (let i = earlier.length - 1; i >= 0; i--) {
    const entry = earlier[i];
    if (!kept.has(entry) && usedBudget + entry.size <= budgetChars) {
      keep(entry);
    }
  }

  const ordered = [...kept].sort((a, b) => a.idx - b.idx);
  const compactedCount = scored.length - ordered.length;
  let result = ordered.map((entry) => entry.section).join('');

  if (compactedCount > 0) {
    result = `[${compactedCount} earlier step${compactedCount > 1 ? 's' : ''} compacted]\n` + result;
  }

  // Final safety: if still over budget, fall back to tail truncation
  if (result.length > budgetChars) {
    result = result.slice(-budgetChars);
  }

  return result;
}
231
-
232
/**
 * Size of a message in characters. String content is measured directly;
 * anything else is measured by the length of its JSON serialisation (with
 * missing/falsy content serialised as an empty string literal).
 */
function messageSize(msg) {
  const { content } = msg;
  if (typeof content === 'string') {
    return content.length;
  }
  const serialized = JSON.stringify(content || '');
  return serialized.length;
}
236
-
237
/**
 * Return a copy of `msg` whose content is cut down to at most `maxChars`
 * characters and ends with a truncation marker.
 *
 * @param {object} msg - Message with a `content` field. Non-string content is
 *   serialised to JSON first, so a truncated copy always has string content.
 * @param {number} maxChars - Upper bound for the resulting content length
 * @returns {object} `msg` itself when it already fits, otherwise a shallow
 *   copy with shortened content (the input is never mutated)
 */
function truncateMessage(msg, maxChars) {
  const SUFFIX = '\n[...truncated]';
  // Room set aside for the marker; slightly larger than the marker itself.
  const RESERVED = 20;

  const content = typeof msg.content === 'string' ? msg.content : JSON.stringify(msg.content || '');
  if (content.length <= maxChars) return msg;

  // For maxChars <= RESERVED a plain slice(0, maxChars - RESERVED) would get a
  // negative end index and keep almost the entire content, so no head is kept.
  const head = maxChars > RESERVED ? content.slice(0, maxChars - RESERVED) : '';
  return {
    ...msg,
    // Clamp so very small limits are honoured even by the marker itself.
    content: (head + SUFFIX).slice(0, Math.max(0, maxChars)),
  };
}
245
-
246
- module.exports = {
247
- PRIORITY,
248
- classifyMessage,
249
- compactMessages,
250
- compactContext,
251
- };
@@ -1,68 +0,0 @@
1
- #!/usr/bin/env node
2
- 'use strict';
3
- const path = require('path');
4
process.chdir(path.join(__dirname, '..'));

/**
 * Parse the text of a `.env` file into [key, value] pairs.
 *
 * Accepts LF and CRLF line endings, skips blank lines and `#` comments,
 * allows digits and lowercase letters in keys, trims whitespace around the
 * key and value, and strips one pair of matching surrounding quotes.
 *
 * @param {string} text - Raw file contents
 * @returns {Array<[string, string]>} Pairs in file order
 */
function parseEnvFile(text) {
  const pairs = [];
  for (const rawLine of text.split(/\r?\n/)) {
    const line = rawLine.trim();
    if (line === '' || line.startsWith('#')) continue;

    const match = line.match(/^([A-Za-z_][A-Za-z0-9_]*)\s*=\s*(.*)$/);
    if (!match) continue;

    let value = match[2];
    const quote = value[0];
    if (value.length >= 2 && (quote === '"' || quote === "'") && value.endsWith(quote)) {
      value = value.slice(1, -1);
    }
    pairs.push([match[1], value]);
  }
  return pairs;
}

// Load .env (two levels above this file). Variables already present in the
// environment win over the file.
try {
  const envPath = path.resolve(__dirname, '..', '..', '.env');
  const envText = require('fs').readFileSync(envPath, 'utf8');
  for (const [key, value] of parseEnvFile(envText)) {
    if (!process.env[key]) process.env[key] = value;
  }
} catch (err) {
  // A missing .env is normal; any other failure is reported but not fatal.
  if (err.code !== 'ENOENT') {
    console.warn(`Could not load .env: ${err.message}`);
  }
}

const { setupSandbox, cleanupSandbox } = require('./agent-runner');
const { runAgentLoop } = require('../coding-orchestrator');
const benchmarks = require('./benchmarks/coding-agent.json');

// Usage: node <this script> [benchmarkId]   (defaults to agent-003)
const benchId = process.argv[2] || 'agent-003';
const bench = benchmarks.find(b => b.id === benchId);
if (!bench) { console.error('No benchmark:', benchId); process.exit(1); }

(async () => {
  const fixture = bench.agentExpectations?.projectFixture || 'express-basic';
  const dir = setupSandbox(fixture);
  console.log(`Benchmark: ${bench.id} | Fixture: ${fixture} | Sandbox: ${dir}`);

  try {
    const result = await runAgentLoop(bench.prompt, {
      cwd: dir,
      timeoutMs: 120000,
      mode: 'build',
      persistTranscript: false,
      onProgress: (e) => {
        if (e.message) {
          const detail = e.detail ? ` ${JSON.stringify(e.detail).slice(0, 120)}` : '';
          console.log(` [${e.phase}/${e.step}] ${e.message}${detail}`);
        }
      }
    });

    const turns = result.log || [];

    console.log('\n=== RESULT ===');
    console.log('Success:', result.success);
    console.log('Turns:', turns.length);

    for (const entry of turns) {
      const tools = (entry.toolCalls || []).map(t => t.name);
      console.log(` Turn ${entry.turn}: stop=${entry.stopReason} tools=[${tools.join(', ')}]`);
      if (entry.content) console.log(` "${(entry.content || '').slice(0, 150)}"`);
    }

    console.log('\nOutput:', (result.output || '').slice(0, 400));

    // Report what the agent changed in the sandbox repository
    const { execFileSync } = require('child_process');
    const diff = execFileSync('git', ['diff', '--name-only', 'HEAD'], { cwd: dir, encoding: 'utf8' });
    const untracked = execFileSync('git', ['ls-files', '--others', '--exclude-standard'], { cwd: dir, encoding: 'utf8' });
    console.log('Modified:', diff.trim() || 'none');
    console.log('Untracked:', untracked.trim() || 'none');

    // Any non-zero exit or timeout of the fixture's test.js counts as a failure
    try {
      execFileSync('node', ['test.js'], { cwd: dir, timeout: 10000, stdio: 'pipe' });
      console.log('Tests: PASS');
    } catch { console.log('Tests: FAIL'); }
  } finally {
    // Always remove the sandbox, even when the agent loop throws
    cleanupSandbox(dir);
  }
})().catch(e => { console.error(e.message); process.exit(1); });
@@ -1,216 +0,0 @@
1
- 'use strict';
2
-
3
- /**
4
- * Per-session failure analysis — auto-classifies root causes
5
- * when any scoring dimension falls below threshold.
6
- *
7
- * Root causes: "doom_loop" | "no_edits" | "timeout" | "test_failure" | "path_error"
8
- */
9
-
10
- const crypto = require('crypto');
11
-
12
// Dimension scores below this value are treated as failing.
const FAILING_THRESHOLD = 0.3;

// Matches "timeout", "TIMEOUT", "timed out" and Node-style "ETIMEDOUT".
const TIMEOUT_PATTERN = /timeout|timed\s*out/i;

/**
 * Flatten whatever was recorded as the run's error into searchable text.
 * Accepts strings, Error-like objects (code + message) and anything else
 * that can be stringified; returns '' for null/undefined.
 */
function describeError(error) {
  if (error == null) return '';
  if (typeof error === 'string') return error;
  const parts = [error.code, error.message].filter(Boolean);
  return parts.length > 0 ? parts.join(' ') : String(error);
}

/**
 * Root-cause detectors, listed in precedence order. Each entry has a `name`
 * and a `test(toolSequence, meta)` predicate returning a truthy value on match.
 *
 * The timeout detector no longer assumes `meta.error` is a string: calling
 * `.includes` on an Error object used to throw and abort classification.
 */
const ROOT_CAUSE_PATTERNS = [
  { name: 'timeout', test: (_seq, meta) => TIMEOUT_PATTERN.test(describeError(meta?.error)) },
  { name: 'doom_loop', test: (seq) => hasDoomLoop(seq) },
  { name: 'path_error', test: (seq) => seq.some(t => t.result?.includes?.('ENOENT') || t.result?.includes?.('no such file')) },
  { name: 'test_failure', test: (_seq, meta) => meta?.testsPassed === false },
  // Only meaningful when at least one tool was called
  { name: 'no_edits', test: (seq) => seq.length > 0 && !seq.some(t => /write_file|edit_file|Edit|Write/.test(t.name)) },
];
21
-
22
/**
 * Detect doom loops: a run of 2, 3 or 4 tool calls whose names repeat
 * back-to-back at least three times. Sequences shorter than six calls are
 * never reported.
 *
 * @param {Array<{name: string}>} toolSequence - Tool calls in execution order
 * @returns {boolean} true when such a repetition exists anywhere in the sequence
 */
function hasDoomLoop(toolSequence) {
  const total = toolSequence.length;
  if (total < 6) return false;

  // Comma-joined names of the `width` calls starting at `start`.
  const signature = (start, width) =>
    toolSequence
      .slice(start, start + width)
      .map((call) => call.name)
      .join(',');

  for (const width of [2, 3, 4]) {
    // Last start position that still leaves room for three full windows.
    const lastStart = total - width * 3;

    for (let start = 0; start <= lastStart; start++) {
      const reference = signature(start, width);
      let occurrences = 1;
      let cursor = start + width;

      // Walk forward window by window while the same signature keeps repeating.
      while (cursor <= total - width && signature(cursor, width) === reference) {
        occurrences++;
        cursor += width;
      }

      if (occurrences >= 3) return true;
    }
  }

  return false;
}
41
-
42
/**
 * Classify root cause from tool call sequence and metadata.
 * Detectors in ROOT_CAUSE_PATTERNS are tried in order and the first one that
 * matches decides the result.
 *
 * @param {Array} toolSequence - Tool calls made during the run
 * @param {Object} meta - Run metadata handed to each detector
 * @returns {string} Name of the first matching detector, or 'unknown'
 */
function classifyRootCause(toolSequence, meta) {
  const matched = ROOT_CAUSE_PATTERNS.find((candidate) => candidate.test(toolSequence, meta));
  return matched ? matched.name : 'unknown';
}
51
-
52
/**
 * Analyze a benchmark result and store diagnostics for any failing dimensions.
 *
 * A dimension fails when its score is a number that is not at or above
 * FAILING_THRESHOLD. The root cause depends only on the run as a whole (tool
 * sequence, error, test outcome), so it is classified once and shared by every
 * failing dimension; the insert statement is likewise prepared once.
 *
 * Persistence is best-effort: if the database or the table is unavailable the
 * entries are still returned, just not stored.
 *
 * @param {Object} brain - Brain module exposing getDb(); pass a falsy value to skip persistence
 * @param {Object} result - Benchmark result with score.dimensions and
 *   toolCallDetails (or actualToolCalls as a list of tool names)
 * @returns {Array} diagnostics entries created (empty when nothing failed)
 */
function analyzeBenchmarkResult(brain, result) {
  const dimensions = result.score?.dimensions || {};
  const failing = Object.entries(dimensions).filter(
    ([, score]) => typeof score === 'number' && !(score >= FAILING_THRESHOLD),
  );
  if (failing.length === 0) return [];

  const toolSequence = result.toolCallDetails || result.actualToolCalls?.map(name => ({ name })) || [];
  const meta = {
    error: result.error,
    testsPassed: result.testsPassed,
  };

  // Loop-invariant: identical for every failing dimension of this result.
  const rootCause = classifyRootCause(toolSequence, meta);
  const serializedSequence = JSON.stringify(toolSequence.map(t => t.name || t));

  const entries = failing.map(([dim, score]) => ({
    id: crypto.randomUUID(),
    sessionId: result.sessionId || null,
    benchmarkId: result.benchmarkId,
    failingDimension: dim,
    score,
    toolSequence: serializedSequence,
    rootCause,
  }));

  if (brain) {
    let insert = null;
    try {
      insert = brain.getDb().prepare(`
        INSERT OR IGNORE INTO eval_diagnostics (id, session_id, benchmark_id, failing_dimension, score, tool_sequence, root_cause)
        VALUES (?, ?, ?, ?, ?, ?, ?)
      `);
    } catch { /* non-fatal: diagnostics are still returned to the caller */ }

    if (insert) {
      for (const entry of entries) {
        // Each row is attempted independently so one bad row cannot block the rest.
        try {
          insert.run(entry.id, entry.sessionId, entry.benchmarkId, entry.failingDimension, entry.score, entry.toolSequence, entry.rootCause);
        } catch { /* non-fatal */ }
      }
    }
  }

  return entries;
}
96
-
97
/**
 * Query recent diagnostics, newest first (at most 200 rows).
 *
 * @param {Object} brain - Brain module
 * @param {Object} [opts]
 * @param {number} [opts.days=7] - Look-back window
 * @param {string} [opts.rootCause] - Filter by root cause
 * @param {string} [opts.benchmarkId] - Filter by benchmark
 * @returns {Array} diagnostic entries; an empty array if the query fails
 */
function queryDiagnostics(brain, opts = {}) {
  const { days = 7, rootCause, benchmarkId } = opts;
  const db = brain.getDb();

  // Each condition contributes one placeholder and one bound value, in step.
  const conditions = ["created_at >= datetime('now', ?)"];
  const bindings = [`-${days} days`];

  const addFilter = (column, value) => {
    if (!value) return;
    conditions.push(`${column} = ?`);
    bindings.push(value);
  };
  addFilter('root_cause', rootCause);
  addFilter('benchmark_id', benchmarkId);

  const sql = `SELECT * FROM eval_diagnostics WHERE ${conditions.join(' AND ')} ORDER BY created_at DESC LIMIT 200`;

  try {
    const statement = db.prepare(sql);
    return statement.all(...bindings);
  } catch {
    return [];
  }
}
130
-
131
/**
 * Get a summary of root cause distribution: one row per root cause with its
 * failure count, average score and the distinct benchmarks affected, most
 * frequent first.
 *
 * @param {Object} brain - Brain module
 * @param {number} [days=7] - Look-back window
 * @returns {Array} summary rows; an empty array if the query fails
 */
function getDiagnosticsSummary(brain, days = 7) {
  const db = brain.getDb();
  try {
    const statement = db.prepare(`
      SELECT root_cause, COUNT(*) as count, AVG(score) as avg_score,
        GROUP_CONCAT(DISTINCT benchmark_id) as benchmarks
      FROM eval_diagnostics
      WHERE created_at >= datetime('now', ?)
      GROUP BY root_cause
      ORDER BY count DESC
    `);
    const lookback = `-${days} days`;
    return statement.all(lookback);
  } catch {
    return [];
  }
}
149
-
150
/**
 * Ensure the eval_diagnostics table exists.
 *
 * An already-existing table is handled by `IF NOT EXISTS`, so reaching the
 * catch means the database itself could not be obtained or written to. The
 * call stays non-throwing for best-effort callers, but now reports the
 * outcome so they can tell the two cases apart.
 *
 * @param {Object} brain - Brain module exposing getDb()
 * @returns {boolean} true when the table is known to exist, false when the
 *   check or creation failed
 */
function ensureTable(brain) {
  try {
    brain.getDb().exec(`
      CREATE TABLE IF NOT EXISTS eval_diagnostics (
        id TEXT PRIMARY KEY,
        session_id TEXT,
        benchmark_id TEXT,
        failing_dimension TEXT,
        score REAL,
        tool_sequence TEXT,
        root_cause TEXT,
        created_at TEXT DEFAULT (datetime('now'))
      )
    `);
    return true;
  } catch {
    // Database unavailable or not writable; callers treat diagnostics as optional
    return false;
  }
}
167
-
168
// CLI: node eval/diagnostics.js [--days 7] [--root-cause doom_loop]
// Prints the root-cause distribution for the look-back window, followed by up
// to 20 of the most recent entries (optionally filtered by root cause).
if (require.main === module) {
  const args = process.argv.slice(2);

  const daysIdx = args.indexOf('--days');
  const days = daysIdx >= 0 ? Number.parseInt(args[daysIdx + 1], 10) : 7;
  // A missing or malformed value would otherwise become NaN, which produces a
  // date modifier that matches no rows and looks like "nothing found".
  if (!Number.isInteger(days) || days < 0) {
    console.error('--days expects a non-negative integer');
    process.exit(1);
  }

  const rcIdx = args.indexOf('--root-cause');
  const rootCause = rcIdx >= 0 ? args[rcIdx + 1] : undefined;

  // Scores come from a nullable column / an aggregate, so they may be NULL.
  const formatScore = (value) => (typeof value === 'number' ? value.toFixed(3) : 'n/a');

  let brain;
  try {
    brain = require('../brain');
    brain.initDb();
    ensureTable(brain);
  } catch (err) {
    console.error('Brain not available:', err.message);
    process.exit(1);
  }

  console.log(`=== Eval Diagnostics (last ${days} days) ===\n`);

  const summary = getDiagnosticsSummary(brain, days);
  if (summary.length === 0) {
    console.log('No diagnostics found.');
  } else {
    console.log('Root cause distribution:');
    for (const row of summary) {
      console.log(` ${row.root_cause}: ${row.count} failures (avg score ${formatScore(row.avg_score)})`);
      console.log(` Benchmarks: ${row.benchmarks}`);
    }
  }

  const entries = queryDiagnostics(brain, { days, rootCause });
  if (entries.length > 0) {
    console.log(`\nRecent entries (${entries.length}):`);
    for (const e of entries.slice(0, 20)) {
      console.log(` [${e.created_at}] ${e.benchmark_id} — ${e.failing_dimension}: ${formatScore(e.score)} (${e.root_cause})`);
    }
  }
}
207
-
208
- module.exports = {
209
- FAILING_THRESHOLD,
210
- classifyRootCause,
211
- hasDoomLoop,
212
- analyzeBenchmarkResult,
213
- queryDiagnostics,
214
- getDiagnosticsSummary,
215
- ensureTable,
216
- };