create-walle 0.9.21 → 0.9.23

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (500)
  1. package/README.md +27 -5
  2. package/package.json +2 -2
  3. package/template/CLAUDE.md +2 -2
  4. package/template/LICENSE +1 -1
  5. package/template/bin/ctm-dev-cleanup.js +24 -3
  6. package/template/bin/ctm-launch.sh +13 -0
  7. package/template/bin/dev.sh +156 -18
  8. package/template/bin/node-bin.sh +84 -0
  9. package/template/bin/pin-node.sh +51 -0
  10. package/template/claude-task-manager/api-prompts.js +1203 -182
  11. package/template/claude-task-manager/api-reviews.js +109 -15
  12. package/template/claude-task-manager/approval-agent.js +1360 -280
  13. package/template/claude-task-manager/bin/restart-ctm.sh +64 -23
  14. package/template/claude-task-manager/bin/storage-migration-supervisor.js +338 -0
  15. package/template/claude-task-manager/db.js +4417 -295
  16. package/template/claude-task-manager/docs/app-update-refresh-protocol.md +69 -0
  17. package/template/claude-task-manager/docs/approval-ai-refinement.md +138 -0
  18. package/template/claude-task-manager/docs/approval-rescue-loop.md +74 -0
  19. package/template/claude-task-manager/docs/codex-operational-warning-health.md +107 -0
  20. package/template/claude-task-manager/docs/codex-resume-state-guard-design.md +17 -12
  21. package/template/claude-task-manager/docs/codex-terminal-render-controller-handoff.md +311 -0
  22. package/template/claude-task-manager/docs/coding-agent-hooks-architecture.md +418 -0
  23. package/template/claude-task-manager/docs/conversation-import-freshness.md +20 -0
  24. package/template/claude-task-manager/docs/google-workspace-auth-health.md +77 -0
  25. package/template/claude-task-manager/docs/image-paste-ux.md +13 -0
  26. package/template/claude-task-manager/docs/ipad-web-preview.md +88 -0
  27. package/template/claude-task-manager/docs/main-loop-offload-architecture.md +66 -0
  28. package/template/claude-task-manager/docs/microsoft-dev-tunnel-phone-access-design.md +274 -519
  29. package/template/claude-task-manager/docs/mobile-live-streaming.md +27 -5
  30. package/template/claude-task-manager/docs/mobile-remote-submission-lifecycle.md +69 -0
  31. package/template/claude-task-manager/docs/phone-access-design.md +53 -15
  32. package/template/claude-task-manager/docs/phone-passkey-identity.md +122 -0
  33. package/template/claude-task-manager/docs/phone-setup.md +3 -0
  34. package/template/claude-task-manager/docs/prompt-editing-tree-design.md +25 -1
  35. package/template/claude-task-manager/docs/remote-desktop-access-design.md +268 -0
  36. package/template/claude-task-manager/docs/restart-lifecycle-architecture.md +95 -0
  37. package/template/claude-task-manager/docs/runtime-work-control-plane.md +53 -0
  38. package/template/claude-task-manager/docs/session-interactive-wait-surfaces.md +38 -0
  39. package/template/claude-task-manager/docs/session-needs-you-dismissal.md +84 -0
  40. package/template/claude-task-manager/docs/session-render-state-management-design.md +91 -3
  41. package/template/claude-task-manager/docs/session-standup-command-center-design.md +25 -1
  42. package/template/claude-task-manager/docs/session-title-authority.md +32 -0
  43. package/template/claude-task-manager/docs/session-workspace-binding.md +33 -0
  44. package/template/claude-task-manager/docs/skill-intent-resolution-design.md +72 -0
  45. package/template/claude-task-manager/docs/walle-mcp-supervisor-health.md +86 -0
  46. package/template/claude-task-manager/docs/walle-relay-phone-access-design.md +24 -15
  47. package/template/claude-task-manager/docs/walle-session-history-hydration.md +114 -0
  48. package/template/claude-task-manager/docs/walle-session-input-queue.md +104 -0
  49. package/template/claude-task-manager/docs/walle-session-model-catalog.md +90 -0
  50. package/template/claude-task-manager/docs/walle-session-model-preferences.md +15 -6
  51. package/template/claude-task-manager/git-utils.js +897 -27
  52. package/template/claude-task-manager/lib/agent-capabilities.js +33 -0
  53. package/template/claude-task-manager/lib/agent-cli-cache.js +37 -7
  54. package/template/claude-task-manager/lib/agent-hooks-installer.js +26 -2
  55. package/template/claude-task-manager/lib/agent-presets.js +17 -1
  56. package/template/claude-task-manager/lib/all-sessions-query.js +108 -0
  57. package/template/claude-task-manager/lib/approval-ai-refinement.js +488 -0
  58. package/template/claude-task-manager/lib/approval-self-adapt.js +168 -0
  59. package/template/claude-task-manager/lib/async-semaphore.js +44 -0
  60. package/template/claude-task-manager/lib/auth-context.js +5 -0
  61. package/template/claude-task-manager/lib/auth-rate-limit.js +47 -4
  62. package/template/claude-task-manager/lib/auth-rules.js +29 -2
  63. package/template/claude-task-manager/lib/auto-approval-verifier.js +129 -16
  64. package/template/claude-task-manager/lib/background-llm.js +144 -17
  65. package/template/claude-task-manager/lib/branch-inventory.js +212 -0
  66. package/template/claude-task-manager/lib/claude-desktop-sessions.js +15 -3
  67. package/template/claude-task-manager/lib/coalesce-sync-frames.js +151 -0
  68. package/template/claude-task-manager/lib/codex-launch-health.js +762 -0
  69. package/template/claude-task-manager/lib/codex-transcript-pager.js +51 -0
  70. package/template/claude-task-manager/lib/codex-zst.js +124 -0
  71. package/template/claude-task-manager/lib/coding-agent-models.js +233 -30
  72. package/template/claude-task-manager/lib/connection-health.js +232 -0
  73. package/template/claude-task-manager/lib/conversation-blob-parser.js +42 -0
  74. package/template/claude-task-manager/lib/conversation-tail-merge.js +89 -26
  75. package/template/claude-task-manager/lib/ctm-session-context-api.js +39 -10
  76. package/template/claude-task-manager/lib/cursor-conversation-store.js +354 -0
  77. package/template/claude-task-manager/lib/db-owner-worker-client.js +315 -0
  78. package/template/claude-task-manager/lib/document-review.js +141 -6
  79. package/template/claude-task-manager/lib/escalation-review.js +152 -0
  80. package/template/claude-task-manager/lib/graceful-shutdown.js +159 -0
  81. package/template/claude-task-manager/lib/headless-term-service.js +678 -0
  82. package/template/claude-task-manager/lib/heavy-worker-fallback.js +38 -0
  83. package/template/claude-task-manager/lib/jsonl-conversation-parser.js +542 -0
  84. package/template/claude-task-manager/lib/jsonl-range-reader.js +112 -0
  85. package/template/claude-task-manager/lib/main-db-census.js +216 -0
  86. package/template/claude-task-manager/lib/message-pagination.js +106 -4
  87. package/template/claude-task-manager/lib/microsoft-dev-tunnel-setup.js +750 -26
  88. package/template/claude-task-manager/lib/mobile-auth-api.js +274 -7
  89. package/template/claude-task-manager/lib/mobile-auth-store.js +592 -10
  90. package/template/claude-task-manager/lib/mobile-notification-dispatcher.js +15 -0
  91. package/template/claude-task-manager/lib/model-overview-brain-fallback.js +311 -0
  92. package/template/claude-task-manager/lib/model-overview-cache.js +141 -0
  93. package/template/claude-task-manager/lib/models-health-routing-notice.js +126 -0
  94. package/template/claude-task-manager/lib/node-pin-guard.js +93 -0
  95. package/template/claude-task-manager/lib/perf-tracker.js +242 -6
  96. package/template/claude-task-manager/lib/permission-match.js +76 -0
  97. package/template/claude-task-manager/lib/permission-sync.js +133 -20
  98. package/template/claude-task-manager/lib/process-title.js +35 -0
  99. package/template/claude-task-manager/lib/prompt-executions-query.js +25 -0
  100. package/template/claude-task-manager/lib/prompt-index-disk-cache.js +44 -0
  101. package/template/claude-task-manager/lib/prompt-intent.js +132 -0
  102. package/template/claude-task-manager/lib/provider-user-context.js +34 -0
  103. package/template/claude-task-manager/lib/read-pool-client.js +313 -0
  104. package/template/claude-task-manager/lib/readpool-breaker.js +31 -0
  105. package/template/claude-task-manager/lib/recent-sessions-breaker.js +12 -0
  106. package/template/claude-task-manager/lib/remote-feedback-client.js +72 -0
  107. package/template/claude-task-manager/lib/remote-relay-protocol.js +37 -4
  108. package/template/claude-task-manager/lib/remote-relay-store.js +159 -0
  109. package/template/claude-task-manager/lib/remote-submission-observer.js +278 -0
  110. package/template/claude-task-manager/lib/restart-guard.js +109 -0
  111. package/template/claude-task-manager/lib/restore-interruption-detector.js +439 -0
  112. package/template/claude-task-manager/lib/restore-policy.js +13 -0
  113. package/template/claude-task-manager/lib/restore-resume-batch.js +74 -0
  114. package/template/claude-task-manager/lib/restore-runtime.js +68 -0
  115. package/template/claude-task-manager/lib/restore-storm.js +34 -0
  116. package/template/claude-task-manager/lib/resume-cwd.js +36 -0
  117. package/template/claude-task-manager/lib/resume-preflight.js +313 -0
  118. package/template/claude-task-manager/lib/runtime-work-registry.js +444 -0
  119. package/template/claude-task-manager/lib/sanitize-openai-auth.js +31 -0
  120. package/template/claude-task-manager/lib/scheduler.js +21 -1
  121. package/template/claude-task-manager/lib/scrollback-snapshot-store.js +159 -0
  122. package/template/claude-task-manager/lib/serial-task-queue.js +64 -0
  123. package/template/claude-task-manager/lib/server-listeners.js +239 -0
  124. package/template/claude-task-manager/lib/session-capture.js +42 -7
  125. package/template/claude-task-manager/lib/session-content-backfill.js +131 -0
  126. package/template/claude-task-manager/lib/session-history.js +388 -43
  127. package/template/claude-task-manager/lib/session-host-manager.js +287 -0
  128. package/template/claude-task-manager/lib/session-image-refs.js +209 -0
  129. package/template/claude-task-manager/lib/session-jobs.js +399 -59
  130. package/template/claude-task-manager/lib/session-prompt-index.js +137 -0
  131. package/template/claude-task-manager/lib/session-restore.js +53 -0
  132. package/template/claude-task-manager/lib/session-standup.js +123 -23
  133. package/template/claude-task-manager/lib/session-state-bus.js +14 -0
  134. package/template/claude-task-manager/lib/session-stream.js +64 -16
  135. package/template/claude-task-manager/lib/session-timeline-summary.js +260 -0
  136. package/template/claude-task-manager/lib/session-token-usage.js +494 -0
  137. package/template/claude-task-manager/lib/session-workspace-binding.js +356 -0
  138. package/template/claude-task-manager/lib/setup-network-config.js +9 -0
  139. package/template/claude-task-manager/lib/size-cap.js +45 -0
  140. package/template/claude-task-manager/lib/size-cap.test.js +62 -0
  141. package/template/claude-task-manager/lib/skill-autocomplete.js +180 -1
  142. package/template/claude-task-manager/lib/skill-intent-resolver.js +304 -0
  143. package/template/claude-task-manager/lib/sqlite-driver.js +19 -3
  144. package/template/claude-task-manager/lib/standup-attention.js +7 -3
  145. package/template/claude-task-manager/lib/status-authority.js +39 -0
  146. package/template/claude-task-manager/lib/status-hooks.js +4 -0
  147. package/template/claude-task-manager/lib/storage-migration.js +235 -0
  148. package/template/claude-task-manager/lib/structured-capture.js +298 -0
  149. package/template/claude-task-manager/lib/sync-io-census.js +163 -0
  150. package/template/claude-task-manager/lib/tailscale-setup.js +6 -0
  151. package/template/claude-task-manager/lib/terminal-activity-evidence.js +33 -0
  152. package/template/claude-task-manager/lib/terminal-choice.js +364 -0
  153. package/template/claude-task-manager/lib/terminal-control-sanitize.js +17 -0
  154. package/template/claude-task-manager/lib/terminal-fingerprint.js +48 -0
  155. package/template/claude-task-manager/lib/terminal-output-flush.js +84 -0
  156. package/template/claude-task-manager/lib/timeline-order.js +122 -0
  157. package/template/claude-task-manager/lib/transcript-store.js +348 -43
  158. package/template/claude-task-manager/lib/transport-security.js +84 -1
  159. package/template/claude-task-manager/lib/wait-state.js +184 -0
  160. package/template/claude-task-manager/lib/walle-client.js +47 -5
  161. package/template/claude-task-manager/lib/walle-ctm-history.js +564 -4
  162. package/template/claude-task-manager/lib/walle-external-actions.js +135 -16
  163. package/template/claude-task-manager/lib/walle-history-hydration.js +46 -0
  164. package/template/claude-task-manager/lib/walle-native-health.js +403 -0
  165. package/template/claude-task-manager/lib/walle-repair.js +701 -0
  166. package/template/claude-task-manager/lib/walle-session-cache.js +109 -0
  167. package/template/claude-task-manager/lib/walle-session-context.js +57 -21
  168. package/template/claude-task-manager/lib/walle-session-model-catalog.js +34 -0
  169. package/template/claude-task-manager/lib/walle-supervisor.js +539 -63
  170. package/template/claude-task-manager/lib/walle-transcript.js +52 -0
  171. package/template/claude-task-manager/lib/worktree-active-sync.js +11 -7
  172. package/template/claude-task-manager/lib/worktree-cwd.js +32 -1
  173. package/template/claude-task-manager/package.json +1 -1
  174. package/template/claude-task-manager/prompt-harvest.js +89 -66
  175. package/template/claude-task-manager/providers/claude-code.js +51 -3
  176. package/template/claude-task-manager/providers/cursor.js +140 -45
  177. package/template/claude-task-manager/public/css/reviews.css +551 -61
  178. package/template/claude-task-manager/public/css/setup.css +191 -0
  179. package/template/claude-task-manager/public/css/walle-session.css +865 -10
  180. package/template/claude-task-manager/public/css/walle.css +154 -0
  181. package/template/claude-task-manager/public/designs/ai-providers-consolidation-v2.html +830 -0
  182. package/template/claude-task-manager/public/index.html +18516 -2058
  183. package/template/claude-task-manager/public/ipad.html +363 -0
  184. package/template/claude-task-manager/public/js/document-review-links.js +301 -0
  185. package/template/claude-task-manager/public/js/image-normalize.js +69 -36
  186. package/template/claude-task-manager/public/js/message-renderer.js +1265 -77
  187. package/template/claude-task-manager/public/js/prompts.js +66 -29
  188. package/template/claude-task-manager/public/js/reviews.js +901 -133
  189. package/template/claude-task-manager/public/js/session-activity-utils.js +11 -1
  190. package/template/claude-task-manager/public/js/session-search-utils.js +94 -10
  191. package/template/claude-task-manager/public/js/session-status-precedence.js +23 -5
  192. package/template/claude-task-manager/public/js/setup.js +1273 -176
  193. package/template/claude-task-manager/public/js/stream-view.js +691 -73
  194. package/template/claude-task-manager/public/js/terminal-reconciler.js +210 -0
  195. package/template/claude-task-manager/public/js/walle-session.js +2455 -158
  196. package/template/claude-task-manager/public/js/walle.js +455 -28
  197. package/template/claude-task-manager/public/m/app.css +2909 -262
  198. package/template/claude-task-manager/public/m/app.js +6601 -398
  199. package/template/claude-task-manager/public/m/claim.html +224 -17
  200. package/template/claude-task-manager/public/m/index.html +117 -21
  201. package/template/claude-task-manager/public/m/sw.js +3 -1
  202. package/template/claude-task-manager/public/manifest.json +2 -2
  203. package/template/claude-task-manager/public/prompts.html +30 -14
  204. package/template/claude-task-manager/queue-engine.js +507 -28
  205. package/template/claude-task-manager/scripts/repair-claude-session-images.js +27 -8
  206. package/template/claude-task-manager/server.js +14341 -2197
  207. package/template/claude-task-manager/session-integrity.js +160 -18
  208. package/template/claude-task-manager/session-search-ranking.js +1 -0
  209. package/template/claude-task-manager/session-utils.js +25 -5
  210. package/template/claude-task-manager/workers/approval-blocklist.js +96 -6
  211. package/template/claude-task-manager/workers/approval-widget-validator.js +14 -8
  212. package/template/claude-task-manager/workers/conversation-import-worker.js +11 -50
  213. package/template/claude-task-manager/workers/db-owner-worker.js +386 -0
  214. package/template/claude-task-manager/workers/harvest-worker.js +9 -55
  215. package/template/claude-task-manager/workers/headless-term-worker.js +9 -530
  216. package/template/claude-task-manager/workers/read-pool-worker.js +387 -0
  217. package/template/claude-task-manager/workers/scrollback-worker.js +11 -72
  218. package/template/claude-task-manager/workers/session-host-process.js +146 -0
  219. package/template/claude-task-manager/workers/session-integrity-worker.js +10 -54
  220. package/template/claude-task-manager/workers/state-detectors/base.js +18 -1
  221. package/template/claude-task-manager/workers/state-detectors/claude-code.js +182 -9
  222. package/template/claude-task-manager/workers/state-detectors/codex.js +150 -2
  223. package/template/claude-task-manager/workers/state-detectors/cursor.js +127 -0
  224. package/template/claude-task-manager/workers/state-detectors/gemini.js +21 -0
  225. package/template/claude-task-manager/workers/state-detectors/index.js +29 -0
  226. package/template/claude-task-manager/workers/state-detectors/opencode.js +103 -0
  227. package/template/docs/design/markdown-review-pane.md +206 -0
  228. package/template/docs/designs/2026-05-17-portkey-gateway-provider-ux.md +129 -38
  229. package/template/docs/designs/2026-05-20-mobile-worktree-finish-command.md +27 -0
  230. package/template/docs/designs/2026-05-22-ai-configuration-consolidation.md +248 -0
  231. package/template/docs/designs/ai-configuration-consolidation-mock.html +812 -0
  232. package/template/docs/private-memory-and-pii-policy.md +69 -0
  233. package/template/package.json +2 -1
  234. package/template/scripts/check-private-data.js +201 -0
  235. package/template/shared/sqlite-owner-guard.js +30 -0
  236. package/template/shared/sqlite-owner-write-queue.js +225 -0
  237. package/template/shared/sqlite-storage-policy.js +111 -0
  238. package/template/shared/sqlite-write-lock.js +428 -0
  239. package/template/wall-e/agent-runners/claude-code.js +5 -0
  240. package/template/wall-e/agent.js +166 -22
  241. package/template/wall-e/api-walle.js +524 -70
  242. package/template/wall-e/auth/provider-flows.js +11 -1
  243. package/template/wall-e/bin/walle-mcp-stdio.js +341 -17
  244. package/template/wall-e/brain.js +1614 -141
  245. package/template/wall-e/chat/attachment-blocks.js +96 -0
  246. package/template/wall-e/chat/attachments.js +2 -1
  247. package/template/wall-e/chat/capability-resolver.js +7 -7
  248. package/template/wall-e/chat/context-messages.js +28 -0
  249. package/template/wall-e/chat/conversation-frame.js +630 -0
  250. package/template/wall-e/chat/provider-messages.js +125 -0
  251. package/template/wall-e/chat.js +1002 -233
  252. package/template/wall-e/coding/acceptance-contract.js +170 -0
  253. package/template/wall-e/coding/acp-adapter.js +1 -1
  254. package/template/wall-e/coding/agent-catalog.js +3 -0
  255. package/template/wall-e/coding/artifact-store.js +93 -0
  256. package/template/wall-e/coding/capability-router.js +120 -0
  257. package/template/wall-e/coding/coding-run-controller.js +423 -0
  258. package/template/wall-e/coding/compaction-service.js +157 -12
  259. package/template/wall-e/coding/frontend-verification.js +258 -0
  260. package/template/wall-e/coding/lifecycle-hooks.js +75 -0
  261. package/template/wall-e/coding/local-preview-contract.js +157 -0
  262. package/template/wall-e/coding/permission-service.js +57 -13
  263. package/template/wall-e/coding/prompt-bundle.js +19 -1
  264. package/template/wall-e/coding/prompt-section-registry.js +227 -0
  265. package/template/wall-e/coding/provider-compat.js +15 -0
  266. package/template/wall-e/coding/runtime-events.js +224 -0
  267. package/template/wall-e/coding/runtime-mode.js +3 -0
  268. package/template/wall-e/coding/side-git-snapshot.js +160 -4
  269. package/template/wall-e/coding/snapshot-service.js +143 -1
  270. package/template/wall-e/coding/stream-processor.js +388 -34
  271. package/template/wall-e/coding/task-tool.js +141 -4
  272. package/template/wall-e/coding/tool-execution-controller.js +365 -0
  273. package/template/wall-e/coding/tool-registry.js +43 -5
  274. package/template/wall-e/coding/user-hooks.js +217 -0
  275. package/template/wall-e/coding-orchestrator.js +1330 -221
  276. package/template/wall-e/coding-prompts.js +20 -4
  277. package/template/wall-e/context/context-builder.js +15 -2
  278. package/template/wall-e/decision/confidence.js +1 -1
  279. package/template/wall-e/docs/coding-acceptance-contract.md +41 -0
  280. package/template/wall-e/docs/external-action-controller.md +26 -6
  281. package/template/wall-e/docs/telemetry-lifecycle.md +8 -2
  282. package/template/wall-e/embeddings.js +591 -53
  283. package/template/wall-e/external-action-controller.js +12 -0
  284. package/template/wall-e/http/auth.js +1 -0
  285. package/template/wall-e/http/chat-api.js +46 -11
  286. package/template/wall-e/http/model-admin.js +836 -34
  287. package/template/wall-e/lib/boot-profile.js +88 -0
  288. package/template/wall-e/lib/event-loop-monitor.js +93 -0
  289. package/template/wall-e/lib/service-health.js +194 -0
  290. package/template/wall-e/llm/anthropic.js +130 -5
  291. package/template/wall-e/llm/client.js +266 -63
  292. package/template/wall-e/llm/default-fallback.js +382 -0
  293. package/template/wall-e/llm/health.js +19 -0
  294. package/template/wall-e/llm/message-guard.js +78 -0
  295. package/template/wall-e/llm/model-catalog.js +252 -1
  296. package/template/wall-e/llm/openai.js +26 -4
  297. package/template/wall-e/llm/portkey-sync.js +654 -0
  298. package/template/wall-e/llm/provider-error.js +30 -2
  299. package/template/wall-e/llm/registry.js +5 -1
  300. package/template/wall-e/llm/request-compat.js +67 -0
  301. package/template/wall-e/loops/backfill.js +79 -23
  302. package/template/wall-e/loops/brain-optimize.js +67 -0
  303. package/template/wall-e/loops/ingest.js +25 -10
  304. package/template/wall-e/loops/question-digest.js +160 -0
  305. package/template/wall-e/loops/reflect.js +6 -4
  306. package/template/wall-e/loops/think.js +39 -12
  307. package/template/wall-e/mcp-server.js +318 -36
  308. package/template/wall-e/memory/ctm-context-client.js +52 -14
  309. package/template/wall-e/memory/ctm-operational-context.js +237 -0
  310. package/template/wall-e/memory/ctm-prompt-executions-client.js +128 -0
  311. package/template/wall-e/memory/ctm-session-context.js +111 -63
  312. package/template/wall-e/prompts/coding/deepseek.txt +3 -0
  313. package/template/wall-e/prompts/coding/gemini.txt +6 -0
  314. package/template/wall-e/prompts/coding/gpt.txt +6 -0
  315. package/template/wall-e/prompts/coding/local.txt +7 -0
  316. package/template/wall-e/runtime/decision-hooks.js +115 -0
  317. package/template/wall-e/runtime/devbox-gateway.js +82 -8
  318. package/template/wall-e/runtime/prompt-manifest.js +86 -0
  319. package/template/wall-e/runtime/tool-executor.js +269 -0
  320. package/template/wall-e/runtime/tool-result-envelope.js +138 -0
  321. package/template/wall-e/runtime/transcript-projection.js +60 -0
  322. package/template/wall-e/runtime/walle-runtime.js +224 -0
  323. package/template/wall-e/scripts/db-optimize/migrate.js +162 -0
  324. package/template/wall-e/scripts/db-optimize/recall-eval.js +117 -0
  325. package/template/wall-e/server.js +15 -0
  326. package/template/wall-e/session-files.js +9 -0
  327. package/template/wall-e/skills/_bundled/google-calendar/run.js +1 -1
  328. package/template/wall-e/skills/_bundled/gws-workspace/run.js +1 -1
  329. package/template/wall-e/skills/_bundled/slack-mentions/run.js +76 -6
  330. package/template/wall-e/skills/claude-code-reader.js +7 -3
  331. package/template/wall-e/skills/script-skill-runner.js +10 -0
  332. package/template/wall-e/skills/skill-planner.js +38 -0
  333. package/template/wall-e/tools/builtin-middleware.js +19 -9
  334. package/template/wall-e/tools/local-tools.js +1428 -16
  335. package/template/wall-e/tools/permission-checker.js +73 -5
  336. package/template/wall-e/tools/question-manager.js +117 -7
  337. package/template/wall-e/training/harvester.js +12 -28
  338. package/template/wall-e/training/replay.js +25 -80
  339. package/template/website/index.html +10 -10
  340. package/template/wall-e/eval/ab-test.js +0 -203
  341. package/template/wall-e/eval/agent-runner.js +0 -772
  342. package/template/wall-e/eval/agent-scorer.js +0 -461
  343. package/template/wall-e/eval/aggregator.js +0 -414
  344. package/template/wall-e/eval/allowed-test-commands.js +0 -34
  345. package/template/wall-e/eval/benchmark-generator.js +0 -113
  346. package/template/wall-e/eval/benchmarks/chat-eval.json +0 -1662
  347. package/template/wall-e/eval/benchmarks/chat.json +0 -82
  348. package/template/wall-e/eval/benchmarks/coding-agent-real.json +0 -1
  349. package/template/wall-e/eval/benchmarks/coding-agent.json +0 -1581
  350. package/template/wall-e/eval/benchmarks/coding.json +0 -122
  351. package/template/wall-e/eval/benchmarks/memory-retrieval.json +0 -234
  352. package/template/wall-e/eval/benchmarks/reasoning.json +0 -82
  353. package/template/wall-e/eval/benchmarks/swebench-lite-30.json +0 -212
  354. package/template/wall-e/eval/benchmarks.js +0 -669
  355. package/template/wall-e/eval/cc-replay.js +0 -719
  356. package/template/wall-e/eval/chat-eval.js +0 -525
  357. package/template/wall-e/eval/check-keys.js +0 -15
  358. package/template/wall-e/eval/check-providers.js +0 -42
  359. package/template/wall-e/eval/codex-cli-baseline.js +0 -669
  360. package/template/wall-e/eval/coding-agent-real.js +0 -570
  361. package/template/wall-e/eval/context-compactor.js +0 -251
  362. package/template/wall-e/eval/debug-agent003.js +0 -68
  363. package/template/wall-e/eval/diagnostics.js +0 -216
  364. package/template/wall-e/eval/eval-orchestrator.js +0 -642
  365. package/template/wall-e/eval/evaluate.js +0 -202
  366. package/template/wall-e/eval/evaluator.js +0 -373
  367. package/template/wall-e/eval/exporter.js +0 -212
  368. package/template/wall-e/eval/fixtures/express-basic/package.json +0 -9
  369. package/template/wall-e/eval/fixtures/express-basic/server.js +0 -115
  370. package/template/wall-e/eval/fixtures/express-basic/test.js +0 -83
  371. package/template/wall-e/eval/fixtures/express-buggy/package.json +0 -9
  372. package/template/wall-e/eval/fixtures/express-buggy/server.js +0 -113
  373. package/template/wall-e/eval/fixtures/express-buggy/test.js +0 -83
  374. package/template/wall-e/eval/fixtures/express-buggy-items/package.json +0 -9
  375. package/template/wall-e/eval/fixtures/express-buggy-items/server.js +0 -112
  376. package/template/wall-e/eval/fixtures/express-buggy-items/test.js +0 -83
  377. package/template/wall-e/eval/fixtures/express-buggy-search/package.json +0 -9
  378. package/template/wall-e/eval/fixtures/express-buggy-search/server.js +0 -121
  379. package/template/wall-e/eval/fixtures/express-buggy-search/test.js +0 -83
  380. package/template/wall-e/eval/fixtures/express-rename-data/data.js +0 -34
  381. package/template/wall-e/eval/fixtures/express-rename-data/package.json +0 -9
  382. package/template/wall-e/eval/fixtures/express-rename-data/server.js +0 -97
  383. package/template/wall-e/eval/fixtures/express-rename-data/test.js +0 -88
  384. package/template/wall-e/eval/fixtures/express-xss/package.json +0 -12
  385. package/template/wall-e/eval/fixtures/express-xss/server.js +0 -90
  386. package/template/wall-e/eval/fixtures/express-xss/test.js +0 -67
  387. package/template/wall-e/eval/fixtures/express-xss/views/profile.ejs +0 -9
  388. package/template/wall-e/eval/fixtures/fullstack-app/config/default.js +0 -9
  389. package/template/wall-e/eval/fixtures/fullstack-app/config/test.js +0 -13
  390. package/template/wall-e/eval/fixtures/fullstack-app/package.json +0 -11
  391. package/template/wall-e/eval/fixtures/fullstack-app/public/css/style.css +0 -137
  392. package/template/wall-e/eval/fixtures/fullstack-app/public/index.html +0 -46
  393. package/template/wall-e/eval/fixtures/fullstack-app/public/js/app.js +0 -121
  394. package/template/wall-e/eval/fixtures/fullstack-app/public/js/auth.js +0 -71
  395. package/template/wall-e/eval/fixtures/fullstack-app/public/js/items.js +0 -80
  396. package/template/wall-e/eval/fixtures/fullstack-app/public/js/users.js +0 -46
  397. package/template/wall-e/eval/fixtures/fullstack-app/public/login.html +0 -45
  398. package/template/wall-e/eval/fixtures/fullstack-app/public/register.html +0 -38
  399. package/template/wall-e/eval/fixtures/fullstack-app/scripts/migrate.js +0 -23
  400. package/template/wall-e/eval/fixtures/fullstack-app/scripts/seed.js +0 -46
  401. package/template/wall-e/eval/fixtures/fullstack-app/server/db.js +0 -99
  402. package/template/wall-e/eval/fixtures/fullstack-app/server/index.js +0 -94
  403. package/template/wall-e/eval/fixtures/fullstack-app/server/middleware/auth.js +0 -19
  404. package/template/wall-e/eval/fixtures/fullstack-app/server/middleware/logger.js +0 -19
  405. package/template/wall-e/eval/fixtures/fullstack-app/server/router.js +0 -50
  406. package/template/wall-e/eval/fixtures/fullstack-app/server/routes/auth.js +0 -69
  407. package/template/wall-e/eval/fixtures/fullstack-app/server/routes/health.js +0 -23
  408. package/template/wall-e/eval/fixtures/fullstack-app/server/routes/items.js +0 -88
  409. package/template/wall-e/eval/fixtures/fullstack-app/server/routes/users.js +0 -75
  410. package/template/wall-e/eval/fixtures/fullstack-app/server/test.js +0 -198
  411. package/template/wall-e/eval/fixtures/fullstack-app/server/utils/response.js +0 -34
  412. package/template/wall-e/eval/fixtures/fullstack-app/server/utils/validate.js +0 -26
  413. package/template/wall-e/eval/fixtures/fullstack-app/server.js +0 -8
  414. package/template/wall-e/eval/fixtures/fullstack-app/test.js +0 -12
  415. package/template/wall-e/eval/fixtures/monorepo-basic/package.json +0 -8
  416. package/template/wall-e/eval/fixtures/monorepo-basic/packages/api/data.js +0 -58
  417. package/template/wall-e/eval/fixtures/monorepo-basic/packages/api/middleware.js +0 -46
  418. package/template/wall-e/eval/fixtures/monorepo-basic/packages/api/package.json +0 -8
  419. package/template/wall-e/eval/fixtures/monorepo-basic/packages/api/routes.js +0 -64
  420. package/template/wall-e/eval/fixtures/monorepo-basic/packages/api/server.js +0 -56
  421. package/template/wall-e/eval/fixtures/monorepo-basic/packages/api/test.js +0 -116
  422. package/template/wall-e/eval/fixtures/monorepo-basic/packages/cli/commands.js +0 -61
  423. package/template/wall-e/eval/fixtures/monorepo-basic/packages/cli/index.js +0 -62
  424. package/template/wall-e/eval/fixtures/monorepo-basic/packages/cli/output.js +0 -43
  425. package/template/wall-e/eval/fixtures/monorepo-basic/packages/cli/package.json +0 -11
  426. package/template/wall-e/eval/fixtures/monorepo-basic/packages/cli/test.js +0 -44
  427. package/template/wall-e/eval/fixtures/monorepo-basic/packages/shared/formatters.js +0 -43
  428. package/template/wall-e/eval/fixtures/monorepo-basic/packages/shared/index.js +0 -12
  429. package/template/wall-e/eval/fixtures/monorepo-basic/packages/shared/package.json +0 -5
  430. package/template/wall-e/eval/fixtures/monorepo-basic/packages/shared/test.js +0 -55
  431. package/template/wall-e/eval/fixtures/monorepo-basic/packages/shared/validators.js +0 -29
  432. package/template/wall-e/eval/fixtures/monorepo-basic/test.js +0 -46
  433. package/template/wall-e/eval/fixtures/node-cli/index.js +0 -78
  434. package/template/wall-e/eval/fixtures/node-cli/package.json +0 -10
  435. package/template/wall-e/eval/fixtures/node-cli/test.js +0 -57
  436. package/template/wall-e/eval/fixtures/node-typed/package.json +0 -8
  437. package/template/wall-e/eval/fixtures/node-typed/src/handlers.js +0 -31
  438. package/template/wall-e/eval/fixtures/node-typed/src/utils.js +0 -33
  439. package/template/wall-e/eval/fixtures/node-typed/test.js +0 -36
  440. package/template/wall-e/eval/fixtures/python-flask/app.py +0 -14
  441. package/template/wall-e/eval/fixtures/python-flask/requirements.txt +0 -2
  442. package/template/wall-e/eval/fixtures/python-flask/test_app.py +0 -25
  443. package/template/wall-e/eval/fixtures/wall-e-subset/brain.js +0 -105
  444. package/template/wall-e/eval/fixtures/wall-e-subset/eval/aggregator.js +0 -101
  445. package/template/wall-e/eval/fixtures/wall-e-subset/eval/benchmarks/chat.json +0 -20
  446. package/template/wall-e/eval/fixtures/wall-e-subset/eval/benchmarks/coding.json +0 -32
  447. package/template/wall-e/eval/fixtures/wall-e-subset/eval/benchmarks.js +0 -64
  448. package/template/wall-e/eval/fixtures/wall-e-subset/eval/fixtures/simple-project/package.json +0 -6
  449. package/template/wall-e/eval/fixtures/wall-e-subset/eval/fixtures/simple-project/server.js +0 -31
  450. package/template/wall-e/eval/fixtures/wall-e-subset/eval/fixtures/simple-project/test.js +0 -18
  451. package/template/wall-e/eval/fixtures/wall-e-subset/eval/fixtures/simple-project/utils.js +0 -34
  452. package/template/wall-e/eval/fixtures/wall-e-subset/eval/runner.js +0 -104
  453. package/template/wall-e/eval/fixtures/wall-e-subset/eval/scorer.js +0 -73
  454. package/template/wall-e/eval/fixtures/wall-e-subset/eval/test.js +0 -134
  455. package/template/wall-e/eval/fixtures/wall-e-subset/llm/client.js +0 -99
  456. package/template/wall-e/eval/fixtures/wall-e-subset/llm/providers.js +0 -63
  457. package/template/wall-e/eval/fixtures/wall-e-subset/llm/test.js +0 -70
  458. package/template/wall-e/eval/fixtures/wall-e-subset/package.json +0 -10
  459. package/template/wall-e/eval/fixtures/wall-e-subset/test.js +0 -86
  460. package/template/wall-e/eval/harvester.js +0 -685
  461. package/template/wall-e/eval/head-to-head.js +0 -388
  462. package/template/wall-e/eval/humaneval-adapter.js +0 -321
  463. package/template/wall-e/eval/list-models.js +0 -31
  464. package/template/wall-e/eval/livecodebench-adapter.js +0 -291
  465. package/template/wall-e/eval/mail-integration.js +0 -443
  466. package/template/wall-e/eval/manifest.js +0 -186
  467. package/template/wall-e/eval/meta-harness/adapters/coding-agent.js +0 -57
  468. package/template/wall-e/eval/meta-harness/bootstrap-snapshot.js +0 -149
  469. package/template/wall-e/eval/meta-harness/candidate-store.js +0 -117
  470. package/template/wall-e/eval/meta-harness/cli.js +0 -86
  471. package/template/wall-e/eval/meta-harness/domain-spec.js +0 -154
  472. package/template/wall-e/eval/meta-harness/domains/coding-agent.domain.json +0 -84
  473. package/template/wall-e/eval/meta-harness/examples/env-bootstrap-candidate.js +0 -29
  474. package/template/wall-e/eval/meta-harness/experience-store.js +0 -174
  475. package/template/wall-e/eval/meta-harness/frontier.js +0 -96
  476. package/template/wall-e/eval/meta-harness/harness-interface.js +0 -90
  477. package/template/wall-e/eval/meta-harness/leakage-guard.js +0 -80
  478. package/template/wall-e/eval/meta-harness/optimizer.js +0 -207
  479. package/template/wall-e/eval/meta-harness/proposer-runner.js +0 -110
  480. package/template/wall-e/eval/meta-harness/reporting.js +0 -58
  481. package/template/wall-e/eval/meta-harness/telemetry.js +0 -27
  482. package/template/wall-e/eval/meta-harness/validation.js +0 -81
  483. package/template/wall-e/eval/promoter.js +0 -228
  484. package/template/wall-e/eval/provider-normalizer.js +0 -33
  485. package/template/wall-e/eval/replay.js +0 -395
  486. package/template/wall-e/eval/run-agent-benchmarks.js +0 -386
  487. package/template/wall-e/eval/run-codex-cli-baseline.js +0 -177
  488. package/template/wall-e/eval/run-coding-agent-real.js +0 -187
  489. package/template/wall-e/eval/run-eval.js +0 -435
  490. package/template/wall-e/eval/run-model-comparison.js +0 -142
  491. package/template/wall-e/eval/session-evaluator.js +0 -187
  492. package/template/wall-e/eval/session-miner.js +0 -207
  493. package/template/wall-e/eval/session-retrieval-benchmark.js +0 -150
  494. package/template/wall-e/eval/session-transcripts.js +0 -509
  495. package/template/wall-e/eval/shadow.js +0 -161
  496. package/template/wall-e/eval/swebench-adapter.js +0 -345
  497. package/template/wall-e/eval/swebench-docker.js +0 -192
  498. package/template/wall-e/eval/train.py +0 -320
  499. package/template/wall-e/eval/trainer.js +0 -232
  500. package/template/wall-e/eval/weekly-eval-loop.js +0 -241
@@ -1,772 +0,0 @@
1
- 'use strict';
2
- const fs = require('fs');
3
- const path = require('path');
4
- const os = require('os');
5
- const crypto = require('crypto');
6
- const { execFileSync, execFile } = require('child_process');
7
- const { promisify } = require('util');
8
- const execFileAsync = promisify(execFile);
9
- const { decorateBenchmarkResult, DEFAULT_SCORER_VERSION } = require('./manifest');
10
- const { testCommandAllowed } = require('./allowed-test-commands');
11
- const { createArtifactTranscript, exportBenchmarkArtifacts } = require('./meta-harness/trace-exporter');
12
-
13
- const DEFAULT_TIMEOUT_MS = 600_000; // 10 minutes — coding agents can take long
14
- const FIXTURES_DIR = path.join(__dirname, 'fixtures');
15
-
16
// Cached cost estimator; remains undefined until the first lookup.
let _estimateProviderCost;

/**
 * Return the provider cost estimator, resolving it on first use.
 *
 * The estimator is loaded lazily from ./head-to-head. Callers invoke the
 * returned value directly, so this function always hands back a callable:
 * if the module cannot be loaded, or it loads but does not export a function
 * named estimateProviderCost, a stub that reports a cost of 0 is cached and
 * returned instead.
 *
 * @returns {Function} The cost estimator, or a stub that returns 0.
 */
function getEstimateProviderCost() {
  if (typeof _estimateProviderCost !== 'function') {
    let loaded;
    try {
      loaded = require('./head-to-head').estimateProviderCost;
    } catch {
      // Cost estimation is optional; fall through to the zero-cost stub.
      loaded = undefined;
    }
    _estimateProviderCost = typeof loaded === 'function' ? loaded : () => 0;
  }
  return _estimateProviderCost;
}
25
-
26
/**
 * Set up a sandboxed project from a fixture.
 *
 * Copies the named fixture from FIXTURES_DIR into a freshly created directory
 * under <home>/.walle/bench-sandbox and then tries to turn that copy into a
 * git repository with a single initial commit.
 *
 * @param {string} fixtureName - Directory name of the fixture under FIXTURES_DIR.
 * @returns {string} Absolute path of the new sandbox directory.
 * @throws {Error} If the fixture directory does not exist, or if creating or
 *   populating the sandbox fails (the partial sandbox is removed first).
 */
function setupSandbox(fixtureName) {
  const fixtureDir = path.join(FIXTURES_DIR, fixtureName);
  if (!fs.existsSync(fixtureDir)) {
    throw new Error(`Fixture not found: ${fixtureName}`);
  }

  // Use HOME-based temp dir because local-tools restricts file access to $HOME
  const homeDir = process.env.HOME || os.homedir();
  const tmpDir = path.join(homeDir, '.walle', 'bench-sandbox', `bench-${crypto.randomUUID().slice(0, 8)}`);

  try {
    fs.mkdirSync(tmpDir, { recursive: true });
    // Copy fixture files recursively
    copyDirSync(fixtureDir, tmpDir);
  } catch (err) {
    // Do not leave a half-populated sandbox behind when the copy fails.
    try {
      fs.rmSync(tmpDir, { recursive: true, force: true });
    } catch { /* best-effort cleanup; the original error is what matters */ }
    throw err;
  }

  // Init git
  const gitOptions = { cwd: tmpDir, stdio: 'pipe' };
  const identityEnv = {
    ...process.env,
    GIT_AUTHOR_NAME: 'test',
    GIT_AUTHOR_EMAIL: 'test@test.com',
    GIT_COMMITTER_NAME: 'test',
    GIT_COMMITTER_EMAIL: 'test@test.com',
  };
  try {
    execFileSync('git', ['init'], gitOptions);
    execFileSync('git', ['add', '.'], gitOptions);
    execFileSync('git', ['commit', '-m', 'Initial fixture'], { ...gitOptions, env: identityEnv });
  } catch {
    // git init is best-effort
  }

  return tmpDir;
}
55
-
56
/**
 * Recursively copy the contents of one directory into another.
 *
 * The destination is created if needed. Sub-directories are descended into;
 * every other entry is copied with fs.copyFileSync.
 *
 * @param {string} src - Directory to read from.
 * @param {string} dest - Directory to write into.
 */
function copyDirSync(src, dest) {
  fs.mkdirSync(dest, { recursive: true });
  const children = fs.readdirSync(src, { withFileTypes: true });
  children.forEach((child) => {
    const from = path.join(src, child.name);
    const to = path.join(dest, child.name);
    if (!child.isDirectory()) {
      fs.copyFileSync(from, to);
      return;
    }
    copyDirSync(from, to);
  });
}
68
-
69
/**
 * Remove a sandbox directory and everything beneath it.
 * Any failure is ignored: cleanup is best-effort and never throws.
 *
 * @param {string} dir - Path of the sandbox directory to delete.
 */
function cleanupSandbox(dir) {
  const removal = { recursive: true, force: true };
  try {
    fs.rmSync(dir, removal);
  } catch {
    /* best-effort cleanup */
  }
}
74
-
75
/**
 * Run a single agent benchmark, retrying when the harness itself failed.
 *
 * Each attempt is delegated to runAgentBenchmarkAttempt. An attempt is retried
 * only while attempts remain and shouldRetryAgentBenchmarkResult says its
 * error looks like a harness failure. The returned result always carries
 * `attempts`; from the second attempt on it also carries `previousErrors` and
 * a `latencyMs` measured from the start of the first attempt.
 *
 * @param {object} benchmark - Benchmark entry with agentExpectations
 * @param {object} options - { runAgentLoop, brain, timeoutMs, provider, model }
 *   plus optional `retryHarnessFailures` (false disables retries) and
 *   `maxHarnessAttempts` (defaults to 2 when missing, zero or not a number).
 * @returns {Promise<object>} Result with scores and metadata
 */
async function runAgentBenchmark(benchmark, options = {}) {
  // A non-numeric value used to turn into NaN here, which skipped the loop
  // entirely and returned null; treat it like "not provided" instead.
  const requested = Number(options.maxHarnessAttempts);
  const configuredAttempts = Number.isFinite(requested) && requested !== 0 ? requested : 2;
  const maxAttempts = options.retryHarnessFailures === false
    ? 1
    : Math.max(1, configuredAttempts);

  const started = Date.now();
  const previousErrors = [];
  let attempt = 1;
  while (true) {
    const result = await runAgentBenchmarkAttempt(benchmark, { ...options, _attempt: attempt });
    result.attempts = attempt;
    if (attempt > 1) {
      // Report the wall-clock time of all attempts, not just the last one.
      result.latencyMs = Date.now() - started;
      result.previousErrors = previousErrors.slice();
    }
    if (attempt >= maxAttempts || !shouldRetryAgentBenchmarkResult(result, attempt, maxAttempts)) {
      return result;
    }
    previousErrors.push(result.error || 'retryable harness failure');
    attempt += 1;
  }
}
104
-
105
/**
 * Execute one attempt of a single-turn agent benchmark inside a fresh sandbox.
 *
 * Steps: create the sandbox from the benchmark's fixture, optionally open an
 * artifact transcript, count tests before the run, run the agent under a hard
 * timeout, collect tool calls / changed files / test outcome, score the run,
 * force the score to zero when the run errored, did no work or left tests
 * failing, optionally export artifacts, and always remove the sandbox.
 *
 * Failures after the argument check are not thrown; they are returned as a
 * result with `success: false`, a zero score and `error` set to the message.
 *
 * @param {object} benchmark - Benchmark entry; reads id, prompt and agentExpectations.
 * @param {object} [options] - { runAgentLoop, timeoutMs, provider, model, artifactDir }.
 * @returns {Promise<object>} Result record for this attempt.
 * @throws {Error} If options.runAgentLoop is missing.
 */
async function runAgentBenchmarkAttempt(benchmark, options = {}) {
  const { runAgentLoop, timeoutMs = DEFAULT_TIMEOUT_MS, provider, model, artifactDir } = options;
  if (!runAgentLoop) throw new Error('runAgentLoop function is required');

  const expectations = benchmark.agentExpectations || {};
  const fixtureName = expectations.projectFixture || 'express-basic';

  let sandboxDir;
  let artifactContext = null;
  const startTime = Date.now();
  const startedAt = new Date(startTime).toISOString();

  try {
    sandboxDir = setupSandbox(fixtureName);
    if (artifactDir) {
      artifactContext = createArtifactTranscript({
        artifactDir,
        cwd: sandboxDir,
        label: String(benchmark.prompt || '').slice(0, 160),
        modelId: model || '',
        modelProvider: provider?.type || String(provider || ''),
      });
    }

    // Count tests before agent run
    let testsBefore = null;
    let totalTests = null;
    if (testCommandAllowed(expectations.testCommand)) {
      const testCounts = countTests(sandboxDir, expectations.testCommand);
      testsBefore = testCounts.passed;
      totalTests = testCounts.total;
    }

    // Run the agent loop with hard timeout safety net
    const maxTurns = expectations.maxTurns || 20;
    const turnBudgetTimeout = maxTurns * 30000;
    const effectiveTimeout = Math.min(timeoutMs || turnBudgetTimeout, turnBudgetTimeout);
    const agentPromise = runAgentLoop(benchmark.prompt, {
      cwd: sandboxDir,
      timeoutMs: effectiveTimeout,
      maxTurns,
      provider,
      model,
      mode: 'build',
      benchmark: true,
      headless: true,
      headlessPolicy: 'allow',
      permissionTimeoutMs: 0,
      persistTranscript: artifactContext ? true : false,
      transcript: artifactContext?.transcript || null,
    });
    let timeoutHandle;
    const timeoutPromise = new Promise((_, reject) => {
      timeoutHandle = setTimeout(() => reject(new Error('Hard timeout exceeded')), effectiveTimeout + 60000); // +1min grace
      if (typeof timeoutHandle.unref === 'function') timeoutHandle.unref();
    });
    let result;
    try {
      result = await Promise.race([agentPromise, timeoutPromise]);
    } finally {
      // Disarm the safety-net timer on every exit path, including when the
      // agent itself rejects; otherwise it stays armed for the full budget.
      clearTimeout(timeoutHandle);
    }

    const latencyMs = Date.now() - startTime;

    // Estimate cost from LLM usage
    const usage = result.usage || {};
    const estimateCost = getEstimateProviderCost();
    const costDollars = estimateCost(usage, provider?.type || provider || 'anthropic', model);

    // Collect actual results
    const actualToolCalls = extractToolCalls(result);
    const toolCallDetails = extractToolCallDetails(result);
    const actualFileChanges = await getModifiedFiles(sandboxDir);
    const externalRunnerId = result.runnerId || result.fallback?.runnerId || null;
    // An external runner reports no tool calls; changed files are its evidence of work.
    const externalRunnerWork = Boolean(externalRunnerId && actualFileChanges.length > 0);
    const actualTurns = (result.log || []).length || actualToolCalls.length || (externalRunnerId ? 1 : 0);

    // Run test command if specified (validate against allowlist)
    let testsPassed = null;
    let testsAfter = null;
    if (testCommandAllowed(expectations.testCommand)) {
      try {
        execFileSync('sh', ['-c', expectations.testCommand], {
          cwd: sandboxDir,
          timeout: 30000,
          stdio: 'pipe',
        });
        testsPassed = true;
      } catch {
        testsPassed = false;
      }
      // Count tests after agent run
      const afterCounts = countTests(sandboxDir, expectations.testCommand);
      testsAfter = afterCounts.passed;
      if (totalTests === null) totalTests = afterCounts.total;
    }

    const inputTokens = usage.inputTokens ?? usage.input ?? 0;
    const expectedFileChanges = expectations.expectedFileChanges || [];
    const missingExpectedWork = expectedFileChanges.length > 0 && actualFileChanges.length === 0;
    const attemptedFileChange = actualToolCalls.some((call) => {
      const name = typeof call === 'string' ? call : call?.name;
      return /edit|write|patch|create|delete|modify/i.test(String(name || ''));
    });
    const testRegression = (expectations.testCommand && testsPassed === false);
    const rawError = result.stderr || result.error || null;
    const validatedByTests = Boolean(
      expectations.testCommand &&
      testsPassed === true &&
      actualFileChanges.length > 0
    );
    // Reported stderr/error output is not fatal when passing tests plus real
    // file changes already validate the run.
    const fatalError = rawError && !validatedByTests ? rawError : null;
    const noEffort = (actualToolCalls.length === 0 && !externalRunnerWork) ||
      (inputTokens === 0 && !externalRunnerWork) ||
      missingExpectedWork;
    const hadError = !!fatalError;
    const validatedSuccess = Boolean(result.success || validatedByTests || externalRunnerWork) && !hadError && !noEffort && !testRegression;

    // Score the result
    let score = scoreAgentResult(benchmark, {
      actualToolCalls,
      actualFileChanges,
      actualTurns,
      testsPassed,
      output: result.output || '',
      success: validatedSuccess,
      sandboxDir,
      costDollars,
      testsBefore,
      testsAfter,
      totalTests,
      toolCallDetails,
    });

    // Hard-zero floor: prevent runs that didn't actually do anything from
    // scoring above 0. Without these gates, an agent that hits a 401 / makes
    // zero tool calls / leaves tests broken still landed at composite ≈ 0.4
    // through process-metric weights (turnEconomy, errorHandling, costEfficiency).
    // That inflated past failure-investigation thresholds and reported FAIL as
    // PASS. Cap explicitly here.
    if (hadError || noEffort || testRegression) {
      let zeroReason;
      if (hadError) {
        zeroReason = 'error';
      } else if (testRegression) {
        zeroReason = 'tests_failed';
      } else if (missingExpectedWork) {
        zeroReason = attemptedFileChange ? 'missing_expected_changes' : 'no_file_changes';
      } else {
        zeroReason = 'no_effort';
      }
      score = {
        composite: 0,
        dimensions: { ...(score.dimensions || {}), _zeroed: true, _zeroReason: zeroReason },
      };
    }

    const finalResult = {
      benchmarkId: benchmark.id,
      success: validatedSuccess,
      score,
      latencyMs,
      actualToolCalls,
      actualFileChanges,
      actualTurns,
      testsPassed,
      costDollars,
      testsBefore,
      testsAfter,
      totalTests,
      inputTokens: usage.inputTokens ?? usage.input ?? null,
      outputTokens: usage.outputTokens ?? usage.output ?? null,
      dimensionsJson: JSON.stringify(score.dimensions || {}),
      output: (result.output || '').slice(0, 2000),
      error: fatalError,
    };
    if (artifactDir) {
      const exportedArtifact = exportBenchmarkArtifacts({
        artifactDir,
        sandboxDir,
        benchmark,
        result: { ...finalResult, sessionId: result.sessionId || null, rawResult: result },
        artifactContext,
        startedAt,
      });
      finalResult.artifactPath = exportedArtifact.artifactDir;
      finalResult.transcriptPath = exportedArtifact.transcriptPath;
    }
    return finalResult;
  } catch (err) {
    const errorResult = {
      benchmarkId: benchmark.id,
      success: false,
      score: { composite: 0, dimensions: {} },
      latencyMs: Date.now() - startTime,
      error: err.message,
    };
    // Artifacts can only be exported once a sandbox exists.
    if (artifactDir && sandboxDir) {
      const exportedArtifact = exportBenchmarkArtifacts({
        artifactDir,
        sandboxDir,
        benchmark,
        result: errorResult,
        artifactContext,
        startedAt,
      });
      errorResult.artifactPath = exportedArtifact.artifactDir;
      errorResult.transcriptPath = exportedArtifact.transcriptPath;
    }
    return errorResult;
  } finally {
    if (sandboxDir) cleanupSandbox(sandboxDir);
  }
}
314
-
315
/**
 * Decide whether a benchmark attempt should be run again.
 *
 * @param {object} result - Result of the attempt that just finished.
 * @param {number} attempt - 1-based number of that attempt.
 * @param {number} maxAttempts - Highest attempt number allowed.
 * @returns {boolean} True only for an unsuccessful result with attempts left
 *   whose error isRetryableHarnessFailure recognises.
 */
function shouldRetryAgentBenchmarkResult(result, attempt, maxAttempts) {
  if (!result) return false;
  if (result.success) return false;
  if (attempt >= maxAttempts) return false;
  return isRetryableHarnessFailure(result.error);
}
319
-
320
/**
 * Tell whether an error message looks like a transient harness failure
 * (aborts, timeouts, edit conflicts, SIGTERM) rather than a real agent failure.
 *
 * @param {*} error - Error value or message; falsy values count as empty text.
 * @returns {boolean} True when the text matches a known retryable pattern.
 */
function isRetryableHarnessFailure(error) {
  const retryablePattern = /operation was aborted|AbortError|Hard timeout exceeded|modified since last read|edit-conflict|edit_file no-op|Command failed: SIGTERM|timed out/i;
  const message = String(error || '');
  return retryablePattern.test(message);
}
324
-
325
/**
 * Extract the tool call names from an agent result.
 *
 * Looks at `result.toolCalls` first and, when that is missing or empty, walks
 * `result.log`, reading `toolCalls` (a list) or `toolCall` (a single call)
 * from each entry. This is the same lookup order extractToolCallDetails uses,
 * so names and details derived from one result describe the same calls.
 *
 * @param {object|null|undefined} result - Agent result; may be absent.
 * @returns {Array} One item per call: its `name`, or the call itself when it
 *   has no name (e.g. calls recorded as plain strings). Empty when nothing is found.
 */
function extractToolCalls(result) {
  if (!result) return [];

  const nameOf = (call) => call.name || call;

  if (Array.isArray(result.toolCalls) && result.toolCalls.length > 0) {
    return result.toolCalls.map(nameOf);
  }
  if (!result.log) return [];

  const calls = [];
  for (const entry of result.log) {
    if (entry.toolCalls) {
      for (const tc of entry.toolCalls) {
        calls.push(nameOf(tc));
      }
    } else if (entry.toolCall) {
      calls.push(nameOf(entry.toolCall));
    }
  }
  return calls;
}
343
-
344
/**
 * List the files that differ from HEAD in a git working tree.
 *
 * Combines `git diff --name-only HEAD` with the untracked, non-ignored files
 * reported by `git ls-files --others --exclude-standard`.
 *
 * @param {string} dir - Working directory to run git in.
 * @returns {Promise<string[]>} Paths as printed by git; an empty array when
 *   either git command fails.
 */
async function getModifiedFiles(dir) {
  const runGit = (args) => execFileAsync('git', args, { cwd: dir });
  const toLines = (text) => text.trim().split('\n');
  try {
    const changed = await runGit(['diff', '--name-only', 'HEAD']);
    const untracked = await runGit(['ls-files', '--others', '--exclude-standard']);
    // An empty git output yields [''], which the filter drops.
    return toLines(changed.stdout).concat(toLines(untracked.stdout)).filter(Boolean);
  } catch {
    return [];
  }
}
353
-
354
/**
 * Score one agent run against its benchmark's expectations.
 *
 * Builds the scorer input from the benchmark's agentExpectations and the
 * observed values in `actual`, filling in defaults for anything missing, and
 * hands it to computeAgentScore from ./agent-scorer so benchmarks and sessions
 * are scored by the same code.
 *
 * @param {object} benchmark - Benchmark entry; only agentExpectations is read.
 * @param {object} actual - Observed run data (tool calls, files, turns, tests, cost...).
 * @returns {*} Whatever computeAgentScore returns.
 */
function scoreAgentResult(benchmark, actual) {
  const expected = benchmark.agentExpectations || {};
  const scorer = require('./agent-scorer');

  const scorerInput = {
    actualToolCalls: actual.actualToolCalls || [],
    expectedToolCalls: expected.expectedToolCalls || [],
    forbiddenToolCalls: expected.forbiddenToolCalls || [],
    testsPassed: actual.testsPassed ?? null,
    success: actual.success || false,
    output: actual.output || '',
    actualFiles: actual.actualFileChanges || [],
    expectedFiles: expected.expectedFileChanges || [],
    actualTurns: actual.actualTurns || 0,
    maxTurns: expected.maxTurns || 20,
    consecutiveErrors: 0,
    // Enhanced dimensions
    sandboxDir: actual.sandboxDir || null,
    costDollars: actual.costDollars ?? null,
    testsBefore: actual.testsBefore ?? null,
    testsAfter: actual.testsAfter ?? null,
    totalTests: actual.totalTests ?? null,
    toolCallDetails: actual.toolCallDetails || null,
    shouldAskUser: expected.shouldAskUser || false,
  };
  return scorer.computeAgentScore(scorerInput);
}
384
-
385
/**
 * Tell whether a benchmark result is trustworthy: it must be an explicit
 * success, carry no error, and have its tests confirmed passing.
 *
 * @param {object} [result] - Benchmark result; defaults to an empty object.
 * @returns {boolean} True only when success === true, error is falsy and
 *   testsPassed === true.
 */
function isTrustedAgentResult(result = {}) {
  if (result.success !== true) return false;
  if (result.error) return false;
  return result.testsPassed === true;
}
388
-
389
/**
 * Run a multi-turn benchmark — sends each turn's prompt sequentially,
 * accumulating conversation context. Scores after the final turn.
 *
 * Tool calls, turn counts and token usage are summed over all turns; output,
 * success and error are taken from the last turn only. No test count is taken
 * before the run, so `testsBefore` is always null here.
 *
 * The score is forced to zero when the last turn reported an error, when no
 * work was observed, or when the test command fails. In those cases `success`
 * is also reported as false, so a zeroed run can never be recorded as a
 * successful (and therefore trusted) one.
 *
 * Failures after the argument check are returned as a result with
 * `success: false` and `error` set, not thrown. The sandbox is always removed.
 *
 * @param {object} benchmark - Benchmark entry; reads id, turns and agentExpectations.
 * @param {object} [options] - { runAgentLoop, timeoutMs, provider, model }.
 * @returns {Promise<object>} Result record with `multiTurn: true`.
 * @throws {Error} If options.runAgentLoop is missing.
 */
async function runMultiTurnBenchmark(benchmark, options = {}) {
  const { runAgentLoop, timeoutMs = DEFAULT_TIMEOUT_MS, provider, model } = options;
  if (!runAgentLoop) throw new Error('runAgentLoop function is required');

  const expectations = benchmark.agentExpectations || {};
  const fixtureName = expectations.projectFixture || 'express-basic';
  const turns = benchmark.turns || [];

  let sandboxDir;
  const startTime = Date.now();

  try {
    sandboxDir = setupSandbox(fixtureName);
    const messages = [];
    let lastResult = null;
    const allToolCalls = [];
    const allToolCallDetails = [];
    const totalUsage = { inputTokens: 0, outputTokens: 0 };
    let totalTurns = 0;

    for (const turn of turns) {
      messages.push({ role: 'user', content: turn.prompt });

      lastResult = await runAgentLoop(turn.prompt, {
        cwd: sandboxDir,
        timeoutMs: timeoutMs || (expectations.maxTurns || 20) * 30000,
        provider,
        model,
        mode: 'build',
        benchmark: true,
        headless: true,
        headlessPolicy: 'allow',
        permissionTimeoutMs: 0,
        persistTranscript: false,
        messages, // pass accumulated conversation
      });

      allToolCalls.push(...extractToolCalls(lastResult));
      allToolCallDetails.push(...extractToolCallDetails(lastResult));
      // A turn with no log entries still counts as one turn.
      totalTurns += (lastResult.log || []).length || 1;
      const turnUsage = lastResult.usage || {};
      totalUsage.inputTokens += turnUsage.inputTokens || turnUsage.input || 0;
      totalUsage.outputTokens += turnUsage.outputTokens || turnUsage.output || 0;

      if (lastResult.output) {
        messages.push({ role: 'assistant', content: lastResult.output });
      }
    }

    const latencyMs = Date.now() - startTime;
    const estimateCost = getEstimateProviderCost();
    const costDollars = estimateCost(totalUsage, provider?.type || provider || 'anthropic', model);

    const actualFileChanges = await getModifiedFiles(sandboxDir);
    const externalRunnerId = lastResult?.runnerId || lastResult?.fallback?.runnerId || null;
    const externalRunnerWork = Boolean(externalRunnerId && actualFileChanges.length > 0);
    const actualTurns = totalTurns || (externalRunnerId ? 1 : 0);

    let testsPassed = null;
    let testsAfter = null;
    const testsBefore = null;
    let totalTests = null;

    if (testCommandAllowed(expectations.testCommand)) {
      try {
        execFileSync('sh', ['-c', expectations.testCommand], { cwd: sandboxDir, timeout: 30000, stdio: 'pipe' });
        testsPassed = true;
      } catch {
        testsPassed = false;
      }
      const afterCounts = countTests(sandboxDir, expectations.testCommand);
      testsAfter = afterCounts.passed;
      totalTests = afterCounts.total;
    }

    const rawSuccess = lastResult?.success || false;
    let score = scoreAgentResult(benchmark, {
      actualToolCalls: allToolCalls,
      actualFileChanges,
      actualTurns,
      testsPassed,
      output: lastResult?.output || '',
      success: rawSuccess,
      sandboxDir,
      costDollars,
      testsBefore,
      testsAfter,
      totalTests,
      toolCallDetails: allToolCallDetails,
    });

    // Same hard-zero floor as single-turn — see runAgentBenchmark for rationale.
    const rawError = lastResult?.stderr || lastResult?.error || null;
    const inputTokens = totalUsage.inputTokens ?? 0;
    const hadError = !!rawError;
    const noEffort = (allToolCalls.length === 0 && !externalRunnerWork) ||
      (inputTokens === 0 && !externalRunnerWork);
    const testRegression = (expectations.testCommand && testsPassed === false);
    const zeroed = Boolean(hadError || noEffort || testRegression);
    if (zeroed) {
      score = {
        composite: 0,
        dimensions: { ...(score.dimensions || {}), _zeroed: true,
          _zeroReason: hadError ? 'error' : noEffort ? 'no_effort' : 'tests_failed' },
      };
    }

    return {
      benchmarkId: benchmark.id,
      multiTurn: true,
      turnsCompleted: turns.length,
      // A run whose score was zeroed must not be reported as a success.
      success: zeroed ? false : rawSuccess,
      score,
      latencyMs,
      actualToolCalls: allToolCalls,
      actualFileChanges,
      actualTurns,
      testsPassed,
      costDollars,
      testsBefore,
      testsAfter,
      totalTests,
      inputTokens: totalUsage.inputTokens ?? null,
      outputTokens: totalUsage.outputTokens ?? null,
      dimensionsJson: JSON.stringify(score.dimensions || {}),
      output: (lastResult?.output || '').slice(0, 2000),
      error: rawError,
    };
  } catch (err) {
    return {
      benchmarkId: benchmark.id,
      multiTurn: true,
      success: false,
      score: { composite: 0, dimensions: {} },
      latencyMs: Date.now() - startTime,
      error: err.message,
    };
  } finally {
    if (sandboxDir) cleanupSandbox(sandboxDir);
  }
}
531
-
532
/**
 * Resolve the actual model name that runAgentLoop will use.
 * Mirrors the resolution logic in coding-orchestrator.js.
 *
 * Precedence: the explicit argument, then the WALLE_MODEL_COMPLEX environment
 * variable, then WALLE_MODEL, then a built-in default. Empty values are skipped.
 *
 * @param {string} [model] - Explicitly requested model name.
 * @returns {string} The first non-empty candidate, or the default model name.
 */
function resolveModelName(model) {
  const candidates = [model, process.env.WALLE_MODEL_COMPLEX, process.env.WALLE_MODEL];
  const chosen = candidates.find(Boolean);
  return chosen || 'claude-haiku-4-5-20251001';
}
539
-
540
/**
 * Run the full coding-agent benchmark suite.
 *
 * Loads ./benchmarks/coding-agent.json and runs the entries one after another,
 * using runMultiTurnBenchmark for entries flagged `multiTurn` and
 * runAgentBenchmark for the rest. Every result is stamped with a shared run id
 * and a timestamp. When `brain.insertBenchmarkResult` exists, each result is
 * also stored through it; storage failures are ignored so they cannot abort
 * the suite. The loop stops early once `signal` is aborted.
 *
 * @param {object} [options] - { brain, runAgentLoop, provider, model, timeoutMs, signal }.
 * @returns {Promise<object>} { runId, suite, totalBenchmarks, completed, avgScore, results }.
 * @throws {Error} If the benchmark definitions cannot be loaded.
 */
async function runAgentBenchmarkSuite(options = {}) {
  const { brain, runAgentLoop, provider, model, timeoutMs, signal } = options;

  let benchmarks;
  try {
    benchmarks = require('./benchmarks/coding-agent.json');
  } catch (err) {
    throw new Error(`Failed to load coding-agent benchmarks: ${err.message}`);
  }

  const runId = crypto.randomUUID();
  const results = [];
  const canStore = Boolean(brain && typeof brain.insertBenchmarkResult === 'function');

  for (const benchmark of benchmarks) {
    if (signal?.aborted) break;

    const runner = benchmark.multiTurn ? runMultiTurnBenchmark : runAgentBenchmark;
    const result = await runner(benchmark, {
      runAgentLoop,
      brain,
      timeoutMs,
      provider,
      model,
    });

    result.runId = runId;
    result.timestamp = new Date().toISOString();
    results.push(result);

    // Store result
    if (canStore) {
      try {
        const scoringMethod = benchmark.agentExpectations?.testCommand
          ? 'agent-rubric+tests'
          : 'agent-rubric';
        brain.insertBenchmarkResult(decorateBenchmarkResult({
          runId,
          suite: 'coding-agent',
          promptId: benchmark.id,
          taskType: 'coding-agent',
          difficulty: benchmark.difficulty,
          provider: provider?.type || 'default',
          model: resolveModelName(model),
          prompt: benchmark.prompt,
          response: result.output || '',
          traitScore: null,
          matchedTraits: [],
          compositeScore: result.score?.composite || 0,
          latencyMs: result.latencyMs,
          error: result.error,
          timestamp: result.timestamp,
          // Enhanced metrics
          // `??` keeps a real cost of 0 instead of storing it as "unknown".
          costDollars: result.costDollars ?? null,
          testsBefore: result.testsBefore ?? null,
          testsAfter: result.testsAfter ?? null,
          totalTests: result.totalTests ?? null,
          dimensionsJson: result.dimensionsJson || null,
          inputTokens: result.inputTokens ?? null,
          outputTokens: result.outputTokens ?? null,
          scorerVersion: DEFAULT_SCORER_VERSION,
          scoringMethod,
          trusted: isTrustedAgentResult(result),
          runConfig: { timeoutMs, scoringMethod },
        }, {
          suite: 'coding-agent',
          benchmark,
          runId,
          provider: provider?.type || 'default',
          model: resolveModelName(model),
          scoringMethod,
          scorerVersion: DEFAULT_SCORER_VERSION,
          trusted: isTrustedAgentResult(result),
          runConfig: { timeoutMs, scoringMethod },
        }));
      } catch { /* non-fatal */ }
    }
  }

  // Compute summary
  const avgScore = results.length > 0
    ? results.reduce((sum, r) => sum + (r.score?.composite || 0), 0) / results.length
    : 0;

  return {
    runId,
    suite: 'coding-agent',
    totalBenchmarks: benchmarks.length,
    completed: results.length,
    avgScore,
    results,
  };
}
635
-
636
/**
 * Detect regressions by comparing against stored baselines.
 *
 * Fetches the last 30 days of 'coding-agent' results from `brain`, averages
 * the stored composite scores per benchmark id, and reports every current
 * result whose score fell more than `thresholdPct` percent below that average.
 * Benchmarks with fewer than three stored results, or whose stored average is
 * not positive (no relative drop can be computed), are skipped.
 *
 * @param {object} brain - Store exposing getBenchmarkResults({ suite, days }).
 * @param {Array<object>} currentResults - Results carrying benchmarkId and score.composite.
 * @param {object} [options]
 * @param {number} [options.thresholdPct=10] - Minimum percentage drop to report.
 * @returns {Array<object>} Entries of { benchmarkId, baselineAvg, currentScore, dropPct };
 *   empty when there is no usable store, history or current results.
 */
function detectRegressions(brain, currentResults, { thresholdPct = 10 } = {}) {
  if (!brain || typeof brain.getBenchmarkResults !== 'function') return [];
  if (!Array.isArray(currentResults)) return [];

  const historical = brain.getBenchmarkResults({ suite: 'coding-agent', days: 30 });
  if (!Array.isArray(historical) || historical.length === 0) return [];

  // Group historical by benchmark_id. Ids are compared by their string form;
  // a Map keeps ids such as "constructor" from colliding with object internals.
  const baselines = new Map();
  for (const h of historical) {
    const key = String(h.benchmark_id || h.promptId);
    if (!baselines.has(key)) baselines.set(key, []);
    baselines.get(key).push(h.composite_score ?? h.compositeScore ?? 0);
  }

  const regressions = [];
  for (const result of currentResults) {
    const baseline = baselines.get(String(result.benchmarkId));
    if (!baseline || baseline.length < 3) continue;

    const avgBaseline = baseline.reduce((a, b) => a + b, 0) / baseline.length;
    // A zero (or negative) baseline cannot regress in relative terms.
    if (!(avgBaseline > 0)) continue;

    const currentScore = result.score?.composite || 0;
    const dropPct = ((avgBaseline - currentScore) / avgBaseline) * 100;

    if (dropPct > thresholdPct) {
      regressions.push({
        benchmarkId: result.benchmarkId,
        baselineAvg: avgBaseline,
        currentScore,
        dropPct: Math.round(dropPct),
      });
    }
  }

  return regressions;
}
674
-
675
- /**
676
- * Count passing/total tests by running the test command and parsing output.
677
- * Best-effort — returns { passed: null, total: null } if parsing fails.
678
- */
679
- function countTests(cwd, testCommand) {
680
- try {
681
- const result = execFileSync('sh', ['-c', testCommand + ' 2>&1 || true'], {
682
- cwd,
683
- timeout: 30000,
684
- stdio: ['pipe', 'pipe', 'pipe'],
685
- });
686
- const output = result.toString();
687
-
688
- // Try to parse common test output formats
689
- // node:test: "# pass N" / "# tests N"
690
- const passMatch = output.match(/# pass\s+(\d+)/);
691
- const totalMatch = output.match(/# tests\s+(\d+)/);
692
- if (passMatch && totalMatch) {
693
- return { passed: parseInt(passMatch[1], 10), total: parseInt(totalMatch[1], 10) };
694
- }
695
-
696
- // jest/mocha: "N passing" / "N failing"
697
- const passingMatch = output.match(/(\d+)\s+passing/);
698
- const failingMatch = output.match(/(\d+)\s+failing/);
699
- if (passingMatch) {
700
- const passed = parseInt(passingMatch[1], 10);
701
- const failed = failingMatch ? parseInt(failingMatch[1], 10) : 0;
702
- return { passed, total: passed + failed };
703
- }
704
-
705
- // pytest: "N passed, M failed"
706
- const pytestMatch = output.match(/(\d+)\s+passed/);
707
- const pytestFail = output.match(/(\d+)\s+failed/);
708
- if (pytestMatch) {
709
- const passed = parseInt(pytestMatch[1], 10);
710
- const failed = pytestFail ? parseInt(pytestFail[1], 10) : 0;
711
- return { passed, total: passed + failed };
712
- }
713
-
714
- return { passed: null, total: null };
715
- } catch {
716
- return { passed: null, total: null };
717
- }
718
- }
719
-
720
/**
 * Extract detailed tool call info (name + args + result) from agent result.
 *
 * A non-empty `result.toolCalls` wins; otherwise `result.log` is walked and
 * each entry's `toolCalls` list or single `toolCall` is collected. Arguments
 * are read from `args` or `input`, results from `result` or `output`.
 *
 * @param {object|null|undefined} result - Agent result; may be absent.
 * @returns {Array<{name: string, args: *, result: *}>} One record per call,
 *   with `{}` / `''` standing in for missing args / result.
 */
function extractToolCallDetails(result) {
  if (!result) return [];

  const describe = (call, name) => ({
    name,
    args: call.args || call.input || {},
    result: call.result || call.output || '',
  });

  const direct = result.toolCalls;
  if (direct && direct.length > 0) {
    // Top-level calls may be plain strings; the string itself is then the name.
    return direct.map((call) => describe(call, call.name || (typeof call === 'string' ? call : '')));
  }

  if (!result.log) return [];

  const collected = [];
  for (const entry of result.log) {
    let batch = [];
    if (entry.toolCalls) {
      batch = entry.toolCalls;
    } else if (entry.toolCall) {
      batch = [entry.toolCall];
    }
    for (const call of batch) {
      collected.push(describe(call, call.name || ''));
    }
  }
  return collected;
}
755
-
756
- module.exports = {
757
- setupSandbox,
758
- cleanupSandbox,
759
- runAgentBenchmark,
760
- runMultiTurnBenchmark,
761
- runAgentBenchmarkSuite,
762
- scoreAgentResult,
763
- isTrustedAgentResult,
764
- extractToolCalls,
765
- extractToolCallDetails,
766
- countTests,
767
- detectRegressions,
768
- isRetryableHarnessFailure,
769
- testCommandAllowed,
770
- resolveModelName,
771
- FIXTURES_DIR,
772
- };