create-walle 0.9.21 → 0.9.23

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (500) hide show
  1. package/README.md +27 -5
  2. package/package.json +2 -2
  3. package/template/CLAUDE.md +2 -2
  4. package/template/LICENSE +1 -1
  5. package/template/bin/ctm-dev-cleanup.js +24 -3
  6. package/template/bin/ctm-launch.sh +13 -0
  7. package/template/bin/dev.sh +156 -18
  8. package/template/bin/node-bin.sh +84 -0
  9. package/template/bin/pin-node.sh +51 -0
  10. package/template/claude-task-manager/api-prompts.js +1203 -182
  11. package/template/claude-task-manager/api-reviews.js +109 -15
  12. package/template/claude-task-manager/approval-agent.js +1360 -280
  13. package/template/claude-task-manager/bin/restart-ctm.sh +64 -23
  14. package/template/claude-task-manager/bin/storage-migration-supervisor.js +338 -0
  15. package/template/claude-task-manager/db.js +4417 -295
  16. package/template/claude-task-manager/docs/app-update-refresh-protocol.md +69 -0
  17. package/template/claude-task-manager/docs/approval-ai-refinement.md +138 -0
  18. package/template/claude-task-manager/docs/approval-rescue-loop.md +74 -0
  19. package/template/claude-task-manager/docs/codex-operational-warning-health.md +107 -0
  20. package/template/claude-task-manager/docs/codex-resume-state-guard-design.md +17 -12
  21. package/template/claude-task-manager/docs/codex-terminal-render-controller-handoff.md +311 -0
  22. package/template/claude-task-manager/docs/coding-agent-hooks-architecture.md +418 -0
  23. package/template/claude-task-manager/docs/conversation-import-freshness.md +20 -0
  24. package/template/claude-task-manager/docs/google-workspace-auth-health.md +77 -0
  25. package/template/claude-task-manager/docs/image-paste-ux.md +13 -0
  26. package/template/claude-task-manager/docs/ipad-web-preview.md +88 -0
  27. package/template/claude-task-manager/docs/main-loop-offload-architecture.md +66 -0
  28. package/template/claude-task-manager/docs/microsoft-dev-tunnel-phone-access-design.md +274 -519
  29. package/template/claude-task-manager/docs/mobile-live-streaming.md +27 -5
  30. package/template/claude-task-manager/docs/mobile-remote-submission-lifecycle.md +69 -0
  31. package/template/claude-task-manager/docs/phone-access-design.md +53 -15
  32. package/template/claude-task-manager/docs/phone-passkey-identity.md +122 -0
  33. package/template/claude-task-manager/docs/phone-setup.md +3 -0
  34. package/template/claude-task-manager/docs/prompt-editing-tree-design.md +25 -1
  35. package/template/claude-task-manager/docs/remote-desktop-access-design.md +268 -0
  36. package/template/claude-task-manager/docs/restart-lifecycle-architecture.md +95 -0
  37. package/template/claude-task-manager/docs/runtime-work-control-plane.md +53 -0
  38. package/template/claude-task-manager/docs/session-interactive-wait-surfaces.md +38 -0
  39. package/template/claude-task-manager/docs/session-needs-you-dismissal.md +84 -0
  40. package/template/claude-task-manager/docs/session-render-state-management-design.md +91 -3
  41. package/template/claude-task-manager/docs/session-standup-command-center-design.md +25 -1
  42. package/template/claude-task-manager/docs/session-title-authority.md +32 -0
  43. package/template/claude-task-manager/docs/session-workspace-binding.md +33 -0
  44. package/template/claude-task-manager/docs/skill-intent-resolution-design.md +72 -0
  45. package/template/claude-task-manager/docs/walle-mcp-supervisor-health.md +86 -0
  46. package/template/claude-task-manager/docs/walle-relay-phone-access-design.md +24 -15
  47. package/template/claude-task-manager/docs/walle-session-history-hydration.md +114 -0
  48. package/template/claude-task-manager/docs/walle-session-input-queue.md +104 -0
  49. package/template/claude-task-manager/docs/walle-session-model-catalog.md +90 -0
  50. package/template/claude-task-manager/docs/walle-session-model-preferences.md +15 -6
  51. package/template/claude-task-manager/git-utils.js +897 -27
  52. package/template/claude-task-manager/lib/agent-capabilities.js +33 -0
  53. package/template/claude-task-manager/lib/agent-cli-cache.js +37 -7
  54. package/template/claude-task-manager/lib/agent-hooks-installer.js +26 -2
  55. package/template/claude-task-manager/lib/agent-presets.js +17 -1
  56. package/template/claude-task-manager/lib/all-sessions-query.js +108 -0
  57. package/template/claude-task-manager/lib/approval-ai-refinement.js +488 -0
  58. package/template/claude-task-manager/lib/approval-self-adapt.js +168 -0
  59. package/template/claude-task-manager/lib/async-semaphore.js +44 -0
  60. package/template/claude-task-manager/lib/auth-context.js +5 -0
  61. package/template/claude-task-manager/lib/auth-rate-limit.js +47 -4
  62. package/template/claude-task-manager/lib/auth-rules.js +29 -2
  63. package/template/claude-task-manager/lib/auto-approval-verifier.js +129 -16
  64. package/template/claude-task-manager/lib/background-llm.js +144 -17
  65. package/template/claude-task-manager/lib/branch-inventory.js +212 -0
  66. package/template/claude-task-manager/lib/claude-desktop-sessions.js +15 -3
  67. package/template/claude-task-manager/lib/coalesce-sync-frames.js +151 -0
  68. package/template/claude-task-manager/lib/codex-launch-health.js +762 -0
  69. package/template/claude-task-manager/lib/codex-transcript-pager.js +51 -0
  70. package/template/claude-task-manager/lib/codex-zst.js +124 -0
  71. package/template/claude-task-manager/lib/coding-agent-models.js +233 -30
  72. package/template/claude-task-manager/lib/connection-health.js +232 -0
  73. package/template/claude-task-manager/lib/conversation-blob-parser.js +42 -0
  74. package/template/claude-task-manager/lib/conversation-tail-merge.js +89 -26
  75. package/template/claude-task-manager/lib/ctm-session-context-api.js +39 -10
  76. package/template/claude-task-manager/lib/cursor-conversation-store.js +354 -0
  77. package/template/claude-task-manager/lib/db-owner-worker-client.js +315 -0
  78. package/template/claude-task-manager/lib/document-review.js +141 -6
  79. package/template/claude-task-manager/lib/escalation-review.js +152 -0
  80. package/template/claude-task-manager/lib/graceful-shutdown.js +159 -0
  81. package/template/claude-task-manager/lib/headless-term-service.js +678 -0
  82. package/template/claude-task-manager/lib/heavy-worker-fallback.js +38 -0
  83. package/template/claude-task-manager/lib/jsonl-conversation-parser.js +542 -0
  84. package/template/claude-task-manager/lib/jsonl-range-reader.js +112 -0
  85. package/template/claude-task-manager/lib/main-db-census.js +216 -0
  86. package/template/claude-task-manager/lib/message-pagination.js +106 -4
  87. package/template/claude-task-manager/lib/microsoft-dev-tunnel-setup.js +750 -26
  88. package/template/claude-task-manager/lib/mobile-auth-api.js +274 -7
  89. package/template/claude-task-manager/lib/mobile-auth-store.js +592 -10
  90. package/template/claude-task-manager/lib/mobile-notification-dispatcher.js +15 -0
  91. package/template/claude-task-manager/lib/model-overview-brain-fallback.js +311 -0
  92. package/template/claude-task-manager/lib/model-overview-cache.js +141 -0
  93. package/template/claude-task-manager/lib/models-health-routing-notice.js +126 -0
  94. package/template/claude-task-manager/lib/node-pin-guard.js +93 -0
  95. package/template/claude-task-manager/lib/perf-tracker.js +242 -6
  96. package/template/claude-task-manager/lib/permission-match.js +76 -0
  97. package/template/claude-task-manager/lib/permission-sync.js +133 -20
  98. package/template/claude-task-manager/lib/process-title.js +35 -0
  99. package/template/claude-task-manager/lib/prompt-executions-query.js +25 -0
  100. package/template/claude-task-manager/lib/prompt-index-disk-cache.js +44 -0
  101. package/template/claude-task-manager/lib/prompt-intent.js +132 -0
  102. package/template/claude-task-manager/lib/provider-user-context.js +34 -0
  103. package/template/claude-task-manager/lib/read-pool-client.js +313 -0
  104. package/template/claude-task-manager/lib/readpool-breaker.js +31 -0
  105. package/template/claude-task-manager/lib/recent-sessions-breaker.js +12 -0
  106. package/template/claude-task-manager/lib/remote-feedback-client.js +72 -0
  107. package/template/claude-task-manager/lib/remote-relay-protocol.js +37 -4
  108. package/template/claude-task-manager/lib/remote-relay-store.js +159 -0
  109. package/template/claude-task-manager/lib/remote-submission-observer.js +278 -0
  110. package/template/claude-task-manager/lib/restart-guard.js +109 -0
  111. package/template/claude-task-manager/lib/restore-interruption-detector.js +439 -0
  112. package/template/claude-task-manager/lib/restore-policy.js +13 -0
  113. package/template/claude-task-manager/lib/restore-resume-batch.js +74 -0
  114. package/template/claude-task-manager/lib/restore-runtime.js +68 -0
  115. package/template/claude-task-manager/lib/restore-storm.js +34 -0
  116. package/template/claude-task-manager/lib/resume-cwd.js +36 -0
  117. package/template/claude-task-manager/lib/resume-preflight.js +313 -0
  118. package/template/claude-task-manager/lib/runtime-work-registry.js +444 -0
  119. package/template/claude-task-manager/lib/sanitize-openai-auth.js +31 -0
  120. package/template/claude-task-manager/lib/scheduler.js +21 -1
  121. package/template/claude-task-manager/lib/scrollback-snapshot-store.js +159 -0
  122. package/template/claude-task-manager/lib/serial-task-queue.js +64 -0
  123. package/template/claude-task-manager/lib/server-listeners.js +239 -0
  124. package/template/claude-task-manager/lib/session-capture.js +42 -7
  125. package/template/claude-task-manager/lib/session-content-backfill.js +131 -0
  126. package/template/claude-task-manager/lib/session-history.js +388 -43
  127. package/template/claude-task-manager/lib/session-host-manager.js +287 -0
  128. package/template/claude-task-manager/lib/session-image-refs.js +209 -0
  129. package/template/claude-task-manager/lib/session-jobs.js +399 -59
  130. package/template/claude-task-manager/lib/session-prompt-index.js +137 -0
  131. package/template/claude-task-manager/lib/session-restore.js +53 -0
  132. package/template/claude-task-manager/lib/session-standup.js +123 -23
  133. package/template/claude-task-manager/lib/session-state-bus.js +14 -0
  134. package/template/claude-task-manager/lib/session-stream.js +64 -16
  135. package/template/claude-task-manager/lib/session-timeline-summary.js +260 -0
  136. package/template/claude-task-manager/lib/session-token-usage.js +494 -0
  137. package/template/claude-task-manager/lib/session-workspace-binding.js +356 -0
  138. package/template/claude-task-manager/lib/setup-network-config.js +9 -0
  139. package/template/claude-task-manager/lib/size-cap.js +45 -0
  140. package/template/claude-task-manager/lib/size-cap.test.js +62 -0
  141. package/template/claude-task-manager/lib/skill-autocomplete.js +180 -1
  142. package/template/claude-task-manager/lib/skill-intent-resolver.js +304 -0
  143. package/template/claude-task-manager/lib/sqlite-driver.js +19 -3
  144. package/template/claude-task-manager/lib/standup-attention.js +7 -3
  145. package/template/claude-task-manager/lib/status-authority.js +39 -0
  146. package/template/claude-task-manager/lib/status-hooks.js +4 -0
  147. package/template/claude-task-manager/lib/storage-migration.js +235 -0
  148. package/template/claude-task-manager/lib/structured-capture.js +298 -0
  149. package/template/claude-task-manager/lib/sync-io-census.js +163 -0
  150. package/template/claude-task-manager/lib/tailscale-setup.js +6 -0
  151. package/template/claude-task-manager/lib/terminal-activity-evidence.js +33 -0
  152. package/template/claude-task-manager/lib/terminal-choice.js +364 -0
  153. package/template/claude-task-manager/lib/terminal-control-sanitize.js +17 -0
  154. package/template/claude-task-manager/lib/terminal-fingerprint.js +48 -0
  155. package/template/claude-task-manager/lib/terminal-output-flush.js +84 -0
  156. package/template/claude-task-manager/lib/timeline-order.js +122 -0
  157. package/template/claude-task-manager/lib/transcript-store.js +348 -43
  158. package/template/claude-task-manager/lib/transport-security.js +84 -1
  159. package/template/claude-task-manager/lib/wait-state.js +184 -0
  160. package/template/claude-task-manager/lib/walle-client.js +47 -5
  161. package/template/claude-task-manager/lib/walle-ctm-history.js +564 -4
  162. package/template/claude-task-manager/lib/walle-external-actions.js +135 -16
  163. package/template/claude-task-manager/lib/walle-history-hydration.js +46 -0
  164. package/template/claude-task-manager/lib/walle-native-health.js +403 -0
  165. package/template/claude-task-manager/lib/walle-repair.js +701 -0
  166. package/template/claude-task-manager/lib/walle-session-cache.js +109 -0
  167. package/template/claude-task-manager/lib/walle-session-context.js +57 -21
  168. package/template/claude-task-manager/lib/walle-session-model-catalog.js +34 -0
  169. package/template/claude-task-manager/lib/walle-supervisor.js +539 -63
  170. package/template/claude-task-manager/lib/walle-transcript.js +52 -0
  171. package/template/claude-task-manager/lib/worktree-active-sync.js +11 -7
  172. package/template/claude-task-manager/lib/worktree-cwd.js +32 -1
  173. package/template/claude-task-manager/package.json +1 -1
  174. package/template/claude-task-manager/prompt-harvest.js +89 -66
  175. package/template/claude-task-manager/providers/claude-code.js +51 -3
  176. package/template/claude-task-manager/providers/cursor.js +140 -45
  177. package/template/claude-task-manager/public/css/reviews.css +551 -61
  178. package/template/claude-task-manager/public/css/setup.css +191 -0
  179. package/template/claude-task-manager/public/css/walle-session.css +865 -10
  180. package/template/claude-task-manager/public/css/walle.css +154 -0
  181. package/template/claude-task-manager/public/designs/ai-providers-consolidation-v2.html +830 -0
  182. package/template/claude-task-manager/public/index.html +18516 -2058
  183. package/template/claude-task-manager/public/ipad.html +363 -0
  184. package/template/claude-task-manager/public/js/document-review-links.js +301 -0
  185. package/template/claude-task-manager/public/js/image-normalize.js +69 -36
  186. package/template/claude-task-manager/public/js/message-renderer.js +1265 -77
  187. package/template/claude-task-manager/public/js/prompts.js +66 -29
  188. package/template/claude-task-manager/public/js/reviews.js +901 -133
  189. package/template/claude-task-manager/public/js/session-activity-utils.js +11 -1
  190. package/template/claude-task-manager/public/js/session-search-utils.js +94 -10
  191. package/template/claude-task-manager/public/js/session-status-precedence.js +23 -5
  192. package/template/claude-task-manager/public/js/setup.js +1273 -176
  193. package/template/claude-task-manager/public/js/stream-view.js +691 -73
  194. package/template/claude-task-manager/public/js/terminal-reconciler.js +210 -0
  195. package/template/claude-task-manager/public/js/walle-session.js +2455 -158
  196. package/template/claude-task-manager/public/js/walle.js +455 -28
  197. package/template/claude-task-manager/public/m/app.css +2909 -262
  198. package/template/claude-task-manager/public/m/app.js +6601 -398
  199. package/template/claude-task-manager/public/m/claim.html +224 -17
  200. package/template/claude-task-manager/public/m/index.html +117 -21
  201. package/template/claude-task-manager/public/m/sw.js +3 -1
  202. package/template/claude-task-manager/public/manifest.json +2 -2
  203. package/template/claude-task-manager/public/prompts.html +30 -14
  204. package/template/claude-task-manager/queue-engine.js +507 -28
  205. package/template/claude-task-manager/scripts/repair-claude-session-images.js +27 -8
  206. package/template/claude-task-manager/server.js +14341 -2197
  207. package/template/claude-task-manager/session-integrity.js +160 -18
  208. package/template/claude-task-manager/session-search-ranking.js +1 -0
  209. package/template/claude-task-manager/session-utils.js +25 -5
  210. package/template/claude-task-manager/workers/approval-blocklist.js +96 -6
  211. package/template/claude-task-manager/workers/approval-widget-validator.js +14 -8
  212. package/template/claude-task-manager/workers/conversation-import-worker.js +11 -50
  213. package/template/claude-task-manager/workers/db-owner-worker.js +386 -0
  214. package/template/claude-task-manager/workers/harvest-worker.js +9 -55
  215. package/template/claude-task-manager/workers/headless-term-worker.js +9 -530
  216. package/template/claude-task-manager/workers/read-pool-worker.js +387 -0
  217. package/template/claude-task-manager/workers/scrollback-worker.js +11 -72
  218. package/template/claude-task-manager/workers/session-host-process.js +146 -0
  219. package/template/claude-task-manager/workers/session-integrity-worker.js +10 -54
  220. package/template/claude-task-manager/workers/state-detectors/base.js +18 -1
  221. package/template/claude-task-manager/workers/state-detectors/claude-code.js +182 -9
  222. package/template/claude-task-manager/workers/state-detectors/codex.js +150 -2
  223. package/template/claude-task-manager/workers/state-detectors/cursor.js +127 -0
  224. package/template/claude-task-manager/workers/state-detectors/gemini.js +21 -0
  225. package/template/claude-task-manager/workers/state-detectors/index.js +29 -0
  226. package/template/claude-task-manager/workers/state-detectors/opencode.js +103 -0
  227. package/template/docs/design/markdown-review-pane.md +206 -0
  228. package/template/docs/designs/2026-05-17-portkey-gateway-provider-ux.md +129 -38
  229. package/template/docs/designs/2026-05-20-mobile-worktree-finish-command.md +27 -0
  230. package/template/docs/designs/2026-05-22-ai-configuration-consolidation.md +248 -0
  231. package/template/docs/designs/ai-configuration-consolidation-mock.html +812 -0
  232. package/template/docs/private-memory-and-pii-policy.md +69 -0
  233. package/template/package.json +2 -1
  234. package/template/scripts/check-private-data.js +201 -0
  235. package/template/shared/sqlite-owner-guard.js +30 -0
  236. package/template/shared/sqlite-owner-write-queue.js +225 -0
  237. package/template/shared/sqlite-storage-policy.js +111 -0
  238. package/template/shared/sqlite-write-lock.js +428 -0
  239. package/template/wall-e/agent-runners/claude-code.js +5 -0
  240. package/template/wall-e/agent.js +166 -22
  241. package/template/wall-e/api-walle.js +524 -70
  242. package/template/wall-e/auth/provider-flows.js +11 -1
  243. package/template/wall-e/bin/walle-mcp-stdio.js +341 -17
  244. package/template/wall-e/brain.js +1614 -141
  245. package/template/wall-e/chat/attachment-blocks.js +96 -0
  246. package/template/wall-e/chat/attachments.js +2 -1
  247. package/template/wall-e/chat/capability-resolver.js +7 -7
  248. package/template/wall-e/chat/context-messages.js +28 -0
  249. package/template/wall-e/chat/conversation-frame.js +630 -0
  250. package/template/wall-e/chat/provider-messages.js +125 -0
  251. package/template/wall-e/chat.js +1002 -233
  252. package/template/wall-e/coding/acceptance-contract.js +170 -0
  253. package/template/wall-e/coding/acp-adapter.js +1 -1
  254. package/template/wall-e/coding/agent-catalog.js +3 -0
  255. package/template/wall-e/coding/artifact-store.js +93 -0
  256. package/template/wall-e/coding/capability-router.js +120 -0
  257. package/template/wall-e/coding/coding-run-controller.js +423 -0
  258. package/template/wall-e/coding/compaction-service.js +157 -12
  259. package/template/wall-e/coding/frontend-verification.js +258 -0
  260. package/template/wall-e/coding/lifecycle-hooks.js +75 -0
  261. package/template/wall-e/coding/local-preview-contract.js +157 -0
  262. package/template/wall-e/coding/permission-service.js +57 -13
  263. package/template/wall-e/coding/prompt-bundle.js +19 -1
  264. package/template/wall-e/coding/prompt-section-registry.js +227 -0
  265. package/template/wall-e/coding/provider-compat.js +15 -0
  266. package/template/wall-e/coding/runtime-events.js +224 -0
  267. package/template/wall-e/coding/runtime-mode.js +3 -0
  268. package/template/wall-e/coding/side-git-snapshot.js +160 -4
  269. package/template/wall-e/coding/snapshot-service.js +143 -1
  270. package/template/wall-e/coding/stream-processor.js +388 -34
  271. package/template/wall-e/coding/task-tool.js +141 -4
  272. package/template/wall-e/coding/tool-execution-controller.js +365 -0
  273. package/template/wall-e/coding/tool-registry.js +43 -5
  274. package/template/wall-e/coding/user-hooks.js +217 -0
  275. package/template/wall-e/coding-orchestrator.js +1330 -221
  276. package/template/wall-e/coding-prompts.js +20 -4
  277. package/template/wall-e/context/context-builder.js +15 -2
  278. package/template/wall-e/decision/confidence.js +1 -1
  279. package/template/wall-e/docs/coding-acceptance-contract.md +41 -0
  280. package/template/wall-e/docs/external-action-controller.md +26 -6
  281. package/template/wall-e/docs/telemetry-lifecycle.md +8 -2
  282. package/template/wall-e/embeddings.js +591 -53
  283. package/template/wall-e/external-action-controller.js +12 -0
  284. package/template/wall-e/http/auth.js +1 -0
  285. package/template/wall-e/http/chat-api.js +46 -11
  286. package/template/wall-e/http/model-admin.js +836 -34
  287. package/template/wall-e/lib/boot-profile.js +88 -0
  288. package/template/wall-e/lib/event-loop-monitor.js +93 -0
  289. package/template/wall-e/lib/service-health.js +194 -0
  290. package/template/wall-e/llm/anthropic.js +130 -5
  291. package/template/wall-e/llm/client.js +266 -63
  292. package/template/wall-e/llm/default-fallback.js +382 -0
  293. package/template/wall-e/llm/health.js +19 -0
  294. package/template/wall-e/llm/message-guard.js +78 -0
  295. package/template/wall-e/llm/model-catalog.js +252 -1
  296. package/template/wall-e/llm/openai.js +26 -4
  297. package/template/wall-e/llm/portkey-sync.js +654 -0
  298. package/template/wall-e/llm/provider-error.js +30 -2
  299. package/template/wall-e/llm/registry.js +5 -1
  300. package/template/wall-e/llm/request-compat.js +67 -0
  301. package/template/wall-e/loops/backfill.js +79 -23
  302. package/template/wall-e/loops/brain-optimize.js +67 -0
  303. package/template/wall-e/loops/ingest.js +25 -10
  304. package/template/wall-e/loops/question-digest.js +160 -0
  305. package/template/wall-e/loops/reflect.js +6 -4
  306. package/template/wall-e/loops/think.js +39 -12
  307. package/template/wall-e/mcp-server.js +318 -36
  308. package/template/wall-e/memory/ctm-context-client.js +52 -14
  309. package/template/wall-e/memory/ctm-operational-context.js +237 -0
  310. package/template/wall-e/memory/ctm-prompt-executions-client.js +128 -0
  311. package/template/wall-e/memory/ctm-session-context.js +111 -63
  312. package/template/wall-e/prompts/coding/deepseek.txt +3 -0
  313. package/template/wall-e/prompts/coding/gemini.txt +6 -0
  314. package/template/wall-e/prompts/coding/gpt.txt +6 -0
  315. package/template/wall-e/prompts/coding/local.txt +7 -0
  316. package/template/wall-e/runtime/decision-hooks.js +115 -0
  317. package/template/wall-e/runtime/devbox-gateway.js +82 -8
  318. package/template/wall-e/runtime/prompt-manifest.js +86 -0
  319. package/template/wall-e/runtime/tool-executor.js +269 -0
  320. package/template/wall-e/runtime/tool-result-envelope.js +138 -0
  321. package/template/wall-e/runtime/transcript-projection.js +60 -0
  322. package/template/wall-e/runtime/walle-runtime.js +224 -0
  323. package/template/wall-e/scripts/db-optimize/migrate.js +162 -0
  324. package/template/wall-e/scripts/db-optimize/recall-eval.js +117 -0
  325. package/template/wall-e/server.js +15 -0
  326. package/template/wall-e/session-files.js +9 -0
  327. package/template/wall-e/skills/_bundled/google-calendar/run.js +1 -1
  328. package/template/wall-e/skills/_bundled/gws-workspace/run.js +1 -1
  329. package/template/wall-e/skills/_bundled/slack-mentions/run.js +76 -6
  330. package/template/wall-e/skills/claude-code-reader.js +7 -3
  331. package/template/wall-e/skills/script-skill-runner.js +10 -0
  332. package/template/wall-e/skills/skill-planner.js +38 -0
  333. package/template/wall-e/tools/builtin-middleware.js +19 -9
  334. package/template/wall-e/tools/local-tools.js +1428 -16
  335. package/template/wall-e/tools/permission-checker.js +73 -5
  336. package/template/wall-e/tools/question-manager.js +117 -7
  337. package/template/wall-e/training/harvester.js +12 -28
  338. package/template/wall-e/training/replay.js +25 -80
  339. package/template/website/index.html +10 -10
  340. package/template/wall-e/eval/ab-test.js +0 -203
  341. package/template/wall-e/eval/agent-runner.js +0 -772
  342. package/template/wall-e/eval/agent-scorer.js +0 -461
  343. package/template/wall-e/eval/aggregator.js +0 -414
  344. package/template/wall-e/eval/allowed-test-commands.js +0 -34
  345. package/template/wall-e/eval/benchmark-generator.js +0 -113
  346. package/template/wall-e/eval/benchmarks/chat-eval.json +0 -1662
  347. package/template/wall-e/eval/benchmarks/chat.json +0 -82
  348. package/template/wall-e/eval/benchmarks/coding-agent-real.json +0 -1
  349. package/template/wall-e/eval/benchmarks/coding-agent.json +0 -1581
  350. package/template/wall-e/eval/benchmarks/coding.json +0 -122
  351. package/template/wall-e/eval/benchmarks/memory-retrieval.json +0 -234
  352. package/template/wall-e/eval/benchmarks/reasoning.json +0 -82
  353. package/template/wall-e/eval/benchmarks/swebench-lite-30.json +0 -212
  354. package/template/wall-e/eval/benchmarks.js +0 -669
  355. package/template/wall-e/eval/cc-replay.js +0 -719
  356. package/template/wall-e/eval/chat-eval.js +0 -525
  357. package/template/wall-e/eval/check-keys.js +0 -15
  358. package/template/wall-e/eval/check-providers.js +0 -42
  359. package/template/wall-e/eval/codex-cli-baseline.js +0 -669
  360. package/template/wall-e/eval/coding-agent-real.js +0 -570
  361. package/template/wall-e/eval/context-compactor.js +0 -251
  362. package/template/wall-e/eval/debug-agent003.js +0 -68
  363. package/template/wall-e/eval/diagnostics.js +0 -216
  364. package/template/wall-e/eval/eval-orchestrator.js +0 -642
  365. package/template/wall-e/eval/evaluate.js +0 -202
  366. package/template/wall-e/eval/evaluator.js +0 -373
  367. package/template/wall-e/eval/exporter.js +0 -212
  368. package/template/wall-e/eval/fixtures/express-basic/package.json +0 -9
  369. package/template/wall-e/eval/fixtures/express-basic/server.js +0 -115
  370. package/template/wall-e/eval/fixtures/express-basic/test.js +0 -83
  371. package/template/wall-e/eval/fixtures/express-buggy/package.json +0 -9
  372. package/template/wall-e/eval/fixtures/express-buggy/server.js +0 -113
  373. package/template/wall-e/eval/fixtures/express-buggy/test.js +0 -83
  374. package/template/wall-e/eval/fixtures/express-buggy-items/package.json +0 -9
  375. package/template/wall-e/eval/fixtures/express-buggy-items/server.js +0 -112
  376. package/template/wall-e/eval/fixtures/express-buggy-items/test.js +0 -83
  377. package/template/wall-e/eval/fixtures/express-buggy-search/package.json +0 -9
  378. package/template/wall-e/eval/fixtures/express-buggy-search/server.js +0 -121
  379. package/template/wall-e/eval/fixtures/express-buggy-search/test.js +0 -83
  380. package/template/wall-e/eval/fixtures/express-rename-data/data.js +0 -34
  381. package/template/wall-e/eval/fixtures/express-rename-data/package.json +0 -9
  382. package/template/wall-e/eval/fixtures/express-rename-data/server.js +0 -97
  383. package/template/wall-e/eval/fixtures/express-rename-data/test.js +0 -88
  384. package/template/wall-e/eval/fixtures/express-xss/package.json +0 -12
  385. package/template/wall-e/eval/fixtures/express-xss/server.js +0 -90
  386. package/template/wall-e/eval/fixtures/express-xss/test.js +0 -67
  387. package/template/wall-e/eval/fixtures/express-xss/views/profile.ejs +0 -9
  388. package/template/wall-e/eval/fixtures/fullstack-app/config/default.js +0 -9
  389. package/template/wall-e/eval/fixtures/fullstack-app/config/test.js +0 -13
  390. package/template/wall-e/eval/fixtures/fullstack-app/package.json +0 -11
  391. package/template/wall-e/eval/fixtures/fullstack-app/public/css/style.css +0 -137
  392. package/template/wall-e/eval/fixtures/fullstack-app/public/index.html +0 -46
  393. package/template/wall-e/eval/fixtures/fullstack-app/public/js/app.js +0 -121
  394. package/template/wall-e/eval/fixtures/fullstack-app/public/js/auth.js +0 -71
  395. package/template/wall-e/eval/fixtures/fullstack-app/public/js/items.js +0 -80
  396. package/template/wall-e/eval/fixtures/fullstack-app/public/js/users.js +0 -46
  397. package/template/wall-e/eval/fixtures/fullstack-app/public/login.html +0 -45
  398. package/template/wall-e/eval/fixtures/fullstack-app/public/register.html +0 -38
  399. package/template/wall-e/eval/fixtures/fullstack-app/scripts/migrate.js +0 -23
  400. package/template/wall-e/eval/fixtures/fullstack-app/scripts/seed.js +0 -46
  401. package/template/wall-e/eval/fixtures/fullstack-app/server/db.js +0 -99
  402. package/template/wall-e/eval/fixtures/fullstack-app/server/index.js +0 -94
  403. package/template/wall-e/eval/fixtures/fullstack-app/server/middleware/auth.js +0 -19
  404. package/template/wall-e/eval/fixtures/fullstack-app/server/middleware/logger.js +0 -19
  405. package/template/wall-e/eval/fixtures/fullstack-app/server/router.js +0 -50
  406. package/template/wall-e/eval/fixtures/fullstack-app/server/routes/auth.js +0 -69
  407. package/template/wall-e/eval/fixtures/fullstack-app/server/routes/health.js +0 -23
  408. package/template/wall-e/eval/fixtures/fullstack-app/server/routes/items.js +0 -88
  409. package/template/wall-e/eval/fixtures/fullstack-app/server/routes/users.js +0 -75
  410. package/template/wall-e/eval/fixtures/fullstack-app/server/test.js +0 -198
  411. package/template/wall-e/eval/fixtures/fullstack-app/server/utils/response.js +0 -34
  412. package/template/wall-e/eval/fixtures/fullstack-app/server/utils/validate.js +0 -26
  413. package/template/wall-e/eval/fixtures/fullstack-app/server.js +0 -8
  414. package/template/wall-e/eval/fixtures/fullstack-app/test.js +0 -12
  415. package/template/wall-e/eval/fixtures/monorepo-basic/package.json +0 -8
  416. package/template/wall-e/eval/fixtures/monorepo-basic/packages/api/data.js +0 -58
  417. package/template/wall-e/eval/fixtures/monorepo-basic/packages/api/middleware.js +0 -46
  418. package/template/wall-e/eval/fixtures/monorepo-basic/packages/api/package.json +0 -8
  419. package/template/wall-e/eval/fixtures/monorepo-basic/packages/api/routes.js +0 -64
  420. package/template/wall-e/eval/fixtures/monorepo-basic/packages/api/server.js +0 -56
  421. package/template/wall-e/eval/fixtures/monorepo-basic/packages/api/test.js +0 -116
  422. package/template/wall-e/eval/fixtures/monorepo-basic/packages/cli/commands.js +0 -61
  423. package/template/wall-e/eval/fixtures/monorepo-basic/packages/cli/index.js +0 -62
  424. package/template/wall-e/eval/fixtures/monorepo-basic/packages/cli/output.js +0 -43
  425. package/template/wall-e/eval/fixtures/monorepo-basic/packages/cli/package.json +0 -11
  426. package/template/wall-e/eval/fixtures/monorepo-basic/packages/cli/test.js +0 -44
  427. package/template/wall-e/eval/fixtures/monorepo-basic/packages/shared/formatters.js +0 -43
  428. package/template/wall-e/eval/fixtures/monorepo-basic/packages/shared/index.js +0 -12
  429. package/template/wall-e/eval/fixtures/monorepo-basic/packages/shared/package.json +0 -5
  430. package/template/wall-e/eval/fixtures/monorepo-basic/packages/shared/test.js +0 -55
  431. package/template/wall-e/eval/fixtures/monorepo-basic/packages/shared/validators.js +0 -29
  432. package/template/wall-e/eval/fixtures/monorepo-basic/test.js +0 -46
  433. package/template/wall-e/eval/fixtures/node-cli/index.js +0 -78
  434. package/template/wall-e/eval/fixtures/node-cli/package.json +0 -10
  435. package/template/wall-e/eval/fixtures/node-cli/test.js +0 -57
  436. package/template/wall-e/eval/fixtures/node-typed/package.json +0 -8
  437. package/template/wall-e/eval/fixtures/node-typed/src/handlers.js +0 -31
  438. package/template/wall-e/eval/fixtures/node-typed/src/utils.js +0 -33
  439. package/template/wall-e/eval/fixtures/node-typed/test.js +0 -36
  440. package/template/wall-e/eval/fixtures/python-flask/app.py +0 -14
  441. package/template/wall-e/eval/fixtures/python-flask/requirements.txt +0 -2
  442. package/template/wall-e/eval/fixtures/python-flask/test_app.py +0 -25
  443. package/template/wall-e/eval/fixtures/wall-e-subset/brain.js +0 -105
  444. package/template/wall-e/eval/fixtures/wall-e-subset/eval/aggregator.js +0 -101
  445. package/template/wall-e/eval/fixtures/wall-e-subset/eval/benchmarks/chat.json +0 -20
  446. package/template/wall-e/eval/fixtures/wall-e-subset/eval/benchmarks/coding.json +0 -32
  447. package/template/wall-e/eval/fixtures/wall-e-subset/eval/benchmarks.js +0 -64
  448. package/template/wall-e/eval/fixtures/wall-e-subset/eval/fixtures/simple-project/package.json +0 -6
  449. package/template/wall-e/eval/fixtures/wall-e-subset/eval/fixtures/simple-project/server.js +0 -31
  450. package/template/wall-e/eval/fixtures/wall-e-subset/eval/fixtures/simple-project/test.js +0 -18
  451. package/template/wall-e/eval/fixtures/wall-e-subset/eval/fixtures/simple-project/utils.js +0 -34
  452. package/template/wall-e/eval/fixtures/wall-e-subset/eval/runner.js +0 -104
  453. package/template/wall-e/eval/fixtures/wall-e-subset/eval/scorer.js +0 -73
  454. package/template/wall-e/eval/fixtures/wall-e-subset/eval/test.js +0 -134
  455. package/template/wall-e/eval/fixtures/wall-e-subset/llm/client.js +0 -99
  456. package/template/wall-e/eval/fixtures/wall-e-subset/llm/providers.js +0 -63
  457. package/template/wall-e/eval/fixtures/wall-e-subset/llm/test.js +0 -70
  458. package/template/wall-e/eval/fixtures/wall-e-subset/package.json +0 -10
  459. package/template/wall-e/eval/fixtures/wall-e-subset/test.js +0 -86
  460. package/template/wall-e/eval/harvester.js +0 -685
  461. package/template/wall-e/eval/head-to-head.js +0 -388
  462. package/template/wall-e/eval/humaneval-adapter.js +0 -321
  463. package/template/wall-e/eval/list-models.js +0 -31
  464. package/template/wall-e/eval/livecodebench-adapter.js +0 -291
  465. package/template/wall-e/eval/mail-integration.js +0 -443
  466. package/template/wall-e/eval/manifest.js +0 -186
  467. package/template/wall-e/eval/meta-harness/adapters/coding-agent.js +0 -57
  468. package/template/wall-e/eval/meta-harness/bootstrap-snapshot.js +0 -149
  469. package/template/wall-e/eval/meta-harness/candidate-store.js +0 -117
  470. package/template/wall-e/eval/meta-harness/cli.js +0 -86
  471. package/template/wall-e/eval/meta-harness/domain-spec.js +0 -154
  472. package/template/wall-e/eval/meta-harness/domains/coding-agent.domain.json +0 -84
  473. package/template/wall-e/eval/meta-harness/examples/env-bootstrap-candidate.js +0 -29
  474. package/template/wall-e/eval/meta-harness/experience-store.js +0 -174
  475. package/template/wall-e/eval/meta-harness/frontier.js +0 -96
  476. package/template/wall-e/eval/meta-harness/harness-interface.js +0 -90
  477. package/template/wall-e/eval/meta-harness/leakage-guard.js +0 -80
  478. package/template/wall-e/eval/meta-harness/optimizer.js +0 -207
  479. package/template/wall-e/eval/meta-harness/proposer-runner.js +0 -110
  480. package/template/wall-e/eval/meta-harness/reporting.js +0 -58
  481. package/template/wall-e/eval/meta-harness/telemetry.js +0 -27
  482. package/template/wall-e/eval/meta-harness/validation.js +0 -81
  483. package/template/wall-e/eval/promoter.js +0 -228
  484. package/template/wall-e/eval/provider-normalizer.js +0 -33
  485. package/template/wall-e/eval/replay.js +0 -395
  486. package/template/wall-e/eval/run-agent-benchmarks.js +0 -386
  487. package/template/wall-e/eval/run-codex-cli-baseline.js +0 -177
  488. package/template/wall-e/eval/run-coding-agent-real.js +0 -187
  489. package/template/wall-e/eval/run-eval.js +0 -435
  490. package/template/wall-e/eval/run-model-comparison.js +0 -142
  491. package/template/wall-e/eval/session-evaluator.js +0 -187
  492. package/template/wall-e/eval/session-miner.js +0 -207
  493. package/template/wall-e/eval/session-retrieval-benchmark.js +0 -150
  494. package/template/wall-e/eval/session-transcripts.js +0 -509
  495. package/template/wall-e/eval/shadow.js +0 -161
  496. package/template/wall-e/eval/swebench-adapter.js +0 -345
  497. package/template/wall-e/eval/swebench-docker.js +0 -192
  498. package/template/wall-e/eval/train.py +0 -320
  499. package/template/wall-e/eval/trainer.js +0 -232
  500. package/template/wall-e/eval/weekly-eval-loop.js +0 -241
@@ -1,187 +0,0 @@
1
- #!/usr/bin/env node
2
- 'use strict';
3
-
4
- try {
5
- const envPath = require('path').resolve(__dirname, '..', '..', '.env');
6
- const lines = require('fs').readFileSync(envPath, 'utf8').split('\n');
7
- for (const line of lines) {
8
- const match = line.match(/^([A-Z_]+)=(.*)$/);
9
- if (match && !process.env[match[1]]) process.env[match[1]] = match[2];
10
- }
11
- } catch {}
12
-
13
- const path = require('path');
14
- process.chdir(path.join(__dirname, '..'));
15
-
16
- const {
17
- DEFAULT_REAL_CATALOG_PATH,
18
- buildReplayCatalog,
19
- createProviderClient,
20
- defaultReplayTimeoutMs,
21
- loadCatalog,
22
- preflightProvider,
23
- runReplayEntry,
24
- summarizeResults,
25
- } = require('./coding-agent-real');
26
-
27
- async function main() {
28
- const args = parseArgs(process.argv.slice(2));
29
- const catalogPath = path.resolve(args.catalog || DEFAULT_REAL_CATALOG_PATH);
30
- const dryRun = !!args['dry-run'];
31
- const limit = args.limit ? Number(args.limit) : 20;
32
-
33
- if (args.help) {
34
- printHelp();
35
- return;
36
- }
37
-
38
- if (args.reap) {
39
- const repoPath = path.resolve(args.repo || process.cwd());
40
- const result = buildReplayCatalog({
41
- repoPath,
42
- source: args.source || 'all',
43
- roots: args.root,
44
- sinceDays: args['since-days'] ? Number(args['since-days']) : 14,
45
- limit,
46
- catalogPath,
47
- requireEdits: args['require-edits'] !== 'false',
48
- requireCodingIntent: args['require-coding-intent'] !== 'false',
49
- });
50
- console.log(`Reaped ${result.added.length} real replay entries into ${result.catalogPath}`);
51
- console.log(`Scanned: ${result.scanned}; catalog total: ${result.total}`);
52
- console.log(`Skipped: ${JSON.stringify(result.skipped)}`);
53
- if (!dryRun && !args.run) return;
54
- }
55
-
56
- const catalog = loadCatalog(catalogPath);
57
- const selected = selectEntries(catalog, {
58
- id: args.id,
59
- limit,
60
- source: args.source,
61
- });
62
-
63
- if (!selected.length) {
64
- console.error(`No real replay entries selected from ${catalogPath}`);
65
- process.exitCode = 1;
66
- return;
67
- }
68
-
69
- const providerType = args.provider || null;
70
- initBrain();
71
- const providerCheck = preflightProvider(providerType, { dryRun });
72
- console.log(`Catalog: ${catalogPath}`);
73
- console.log(`Entries: ${selected.length}`);
74
- console.log(`Mode: ${dryRun ? 'dry-run' : 'real model run'}`);
75
- console.log(`Provider preflight: ${providerCheck.status}${providerCheck.providerType ? ` (${providerCheck.providerType})` : ''}`);
76
- if (!providerCheck.ok) {
77
- console.error(`Provider is not ready: ${providerCheck.status}`);
78
- process.exitCode = 1;
79
- return;
80
- }
81
-
82
- let runAgentLoop = null;
83
- let provider = null;
84
- if (!dryRun) {
85
- const orchestrator = require('../coding-orchestrator');
86
- runAgentLoop = orchestrator.runAgentLoop;
87
- provider = createProviderClient(providerType);
88
- }
89
-
90
- const results = [];
91
- for (const entry of selected) {
92
- console.log(`\n--- ${entry.id} (${entry.difficulty || 'unknown'}) ---`);
93
- console.log(`Prompt: ${(entry.prompt || '').replace(/\s+/g, ' ').slice(0, 140)}`);
94
- const result = await runReplayEntry(entry, {
95
- runAgentLoop,
96
- provider,
97
- model: args.model || null,
98
- dryRun,
99
- record: !!args.record,
100
- resultsDir: args['results-dir'],
101
- keepFailures: !!args['keep-failures'],
102
- timeoutMs: args.timeout ? Number(args.timeout) : defaultReplayTimeoutMs(entry),
103
- });
104
- results.push(result);
105
- const score = result.replay?.scores?.composite;
106
- console.log(`Success: ${result.success}`);
107
- console.log(`Preflight: ${result.preflight.status}`);
108
- if (typeof score === 'number') console.log(`Score: ${score.toFixed(3)}`);
109
- if (result.failureType) console.log(`Failure: ${result.failureType}`);
110
- if (result.artifactPath) console.log(`Artifact: ${result.artifactPath}`);
111
- }
112
-
113
- const summary = summarizeResults(results);
114
- console.log('\n=== Summary ===');
115
- console.log(`Passed: ${summary.passed}/${summary.total}`);
116
- if (summary.avgComposite != null) console.log(`Avg composite: ${summary.avgComposite.toFixed(3)}`);
117
- console.log(`Failures: ${JSON.stringify(summary.failureCounts)}`);
118
- if (summary.failed > 0) process.exitCode = 1;
119
- }
120
-
121
- function initBrain() {
122
- try {
123
- const brain = require('../brain');
124
- brain.initDb();
125
- return brain;
126
- } catch (err) {
127
- console.warn(`Brain not available: ${err.message}`);
128
- return null;
129
- }
130
- }
131
-
132
- function selectEntries(catalog, { id, limit, source } = {}) {
133
- let entries = catalog;
134
- if (id) entries = entries.filter((entry) => entry.id === id);
135
- if (source && source !== 'all') entries = entries.filter((entry) => entry.realReplay?.source === source);
136
- return entries.slice(0, limit || entries.length);
137
- }
138
-
139
- function parseArgs(argv) {
140
- const out = {};
141
- for (let i = 0; i < argv.length; i += 1) {
142
- const arg = argv[i];
143
- if (!arg.startsWith('--')) continue;
144
- const key = arg.slice(2);
145
- if (['reap', 'dry-run', 'record', 'keep-failures', 'run', 'help'].includes(key)) {
146
- out[key] = true;
147
- } else {
148
- out[key] = argv[i + 1];
149
- i += 1;
150
- }
151
- }
152
- return out;
153
- }
154
-
155
- function printHelp() {
156
- console.log(`Usage:
157
- node eval/run-coding-agent-real.js --reap --repo /path/to/repo --source claude|codex|all
158
- node eval/run-coding-agent-real.js --dry-run --catalog eval/benchmarks/coding-agent-real.json
159
- node eval/run-coding-agent-real.js --id agent-real-codex-... --provider openai --model gpt-5.4-mini --record
160
-
161
- Options:
162
- --reap Harvest Claude/Codex JSONL sessions into the real catalog
163
- --run After --reap, run the selected entries too
164
- --repo <path> Repo path used to filter transcript cwd values
165
- --root <path> Transcript root override
166
- --source <value> claude, codex, or all
167
- --since-days <n> Transcript mtime window for reap
168
- --limit <n> Entry limit
169
- --catalog <path> Catalog JSON path
170
- --dry-run Verify catalog preflight and sandbox creation only
171
- --provider <type> openai, deepseek, moonshot, anthropic, google, ollama, mlx, claude-cli, codex-cli
172
- --model <id> Model override for runAgentLoop
173
- --record Write result JSON artifacts
174
- --results-dir <path> Artifact directory
175
- --keep-failures Leave failed replay sandbox worktrees for inspection
176
- --require-edits false Allow non-editing sessions into the catalog
177
- --require-coding-intent false
178
- Allow non-standalone prompts into the catalog
179
- `);
180
- }
181
-
182
- main()
183
- .then(() => process.exit(process.exitCode || 0))
184
- .catch((err) => {
185
- console.error(err.stack || err.message);
186
- process.exit(1);
187
- });
@@ -1,435 +0,0 @@
1
- 'use strict';
2
-
3
- /**
4
- * CLI entry point for running the full eval pipeline across all models.
5
- *
6
- * Usage:
7
- * node eval/run-eval.js # all models, all benchmarks
8
- * node eval/run-eval.js --suite coding-agent # specific suite
9
- * node eval/run-eval.js --models gemma4:e4b,gemma4:26b # specific models
10
- * node eval/run-eval.js --id agent-001 # single benchmark
11
- * node eval/run-eval.js --budget 5.0 # cost cap in dollars
12
- * node eval/run-eval.js --timeout 300000 # per-benchmark timeout ms
13
- * node eval/run-eval.js --concurrency 1 # parallel benchmarks
14
- * node eval/run-eval.js --dry-run # list work items without running
15
- * node eval/run-eval.js --check-providers # test provider health and exit
16
- */
17
-
18
- const fs = require('fs');
19
- const path = require('path');
20
- const { EvalOrchestrator } = require('./eval-orchestrator');
21
-
22
- // ── Default model roster ──
23
- const DEFAULT_MODELS = [
24
- // Frontier
25
- 'claude-opus-4-7',
26
- 'gpt-5.5',
27
- 'deepseek-v4-pro',
28
- 'kimi-k2.6',
29
- // Daily
30
- 'claude-sonnet-4-6',
31
- 'gpt-5.4',
32
- 'gemini-2.5-pro',
33
- 'kimi-k2.5',
34
- // Budget
35
- 'claude-haiku-4-5-20251001',
36
- 'gpt-5.4-mini',
37
- 'gemini-2.5-flash',
38
- 'deepseek-v4-flash',
39
- // Local (Ollama)
40
- 'gemma4:e4b',
41
- 'gemma4:26b',
42
- ];
43
-
44
- // ── Arg parsing ──
45
- const args = process.argv.slice(2);
46
- function getArg(flag, fallback) {
47
- const i = args.indexOf(flag);
48
- return i >= 0 && args[i + 1] ? args[i + 1] : fallback;
49
- }
50
- const hasFlag = (flag) => args.includes(flag);
51
-
52
- const suite = getArg('--suite', 'coding-agent');
53
- const modelArg = getArg('--models', null);
54
- const models = modelArg ? modelArg.split(',').map(s => s.trim()) : DEFAULT_MODELS;
55
- const benchmarkId = getArg('--id', null);
56
- const budget = parseFloat(getArg('--budget', '10.0'));
57
- const timeout = parseInt(getArg('--timeout', '600000'), 10);
58
- const concurrency = parseInt(getArg('--concurrency', '2'), 10);
59
- const dryRun = hasFlag('--dry-run');
60
- const resumeId = getArg('--resume', null);
61
- const checkProviders = hasFlag('--check-providers');
62
-
63
- /**
64
- * Pre-flight check: test each provider with a minimal API call.
65
- * Returns map of providerType -> { ok: boolean, error?: string, model?: string }
66
- */
67
- async function checkProviderHealth(brain) {
68
- const results = {};
69
- const { createClient } = require('../llm/client');
70
- const { createAnthropicFromEnv } = require('../llm/anthropic');
71
-
72
- const providers = [
73
- { type: 'anthropic', testModel: 'claude-haiku-4-5-20251001', envKeys: ['ANTHROPIC_API_KEY', 'ANTHROPIC_AUTH_TOKEN', 'PORTKEY_API_KEY'] },
74
- { type: 'openai', testModel: 'gpt-5.4-mini', envKeys: ['OPENAI_API_KEY'] },
75
- { type: 'google', testModel: 'gemini-2.5-flash', envKeys: ['GOOGLE_API_KEY', 'GEMINI_API_KEY'] },
76
- { type: 'deepseek', testModel: 'deepseek-v4-flash', envKeys: ['DEEPSEEK_API_KEY'] },
77
- { type: 'moonshot', testModel: 'kimi-k2.6', envKeys: ['MOONSHOT_API_KEY'] },
78
- ];
79
-
80
- // Check cloud providers
81
- for (const prov of providers) {
82
- let hasKey = false;
83
- let apiKey = null;
84
-
85
- // Check brain DB for stored key
86
- if (brain) {
87
- try {
88
- const row = brain.getDb().prepare(
89
- 'SELECT api_key_encrypted FROM model_providers WHERE type = ? AND enabled = 1 AND api_key_encrypted IS NOT NULL LIMIT 1'
90
- ).get(prov.type);
91
- if (row && row.api_key_encrypted) {
92
- hasKey = true;
93
- try { apiKey = brain.decrypt(row.api_key_encrypted); } catch {}
94
- if (!apiKey) apiKey = row.api_key_encrypted; // plaintext fallback
95
- }
96
- } catch {}
97
- }
98
-
99
- // Check env vars
100
- if (!hasKey) {
101
- for (const key of prov.envKeys) {
102
- if (process.env[key]) {
103
- hasKey = true;
104
- apiKey = process.env[key];
105
- break;
106
- }
107
- }
108
- }
109
-
110
- if (!hasKey) {
111
- results[prov.type] = { ok: false, error: 'No API key configured' };
112
- continue;
113
- }
114
-
115
- // Make a minimal test call
116
- try {
117
- let client;
118
- if (prov.type === 'anthropic') {
119
- client = createAnthropicFromEnv();
120
- } else {
121
- client = createClient(prov.type, { apiKey });
122
- }
123
-
124
- await client.chat({
125
- model: prov.testModel,
126
- messages: [{ role: 'user', content: 'hi' }],
127
- maxTokens: 1,
128
- });
129
-
130
- results[prov.type] = { ok: true, model: prov.testModel };
131
- } catch (err) {
132
- const msg = err.message || String(err);
133
- let diagnosis = msg;
134
-
135
- // Gemini CLI OAuth tokens (ya29.*) use Google's private Code Assist API
136
- // (cloudcode-pa.googleapis.com) and cannot auth with the public Gemini API.
137
- if (apiKey && apiKey.startsWith('ya29.')) {
138
- diagnosis = `Stored key is a Gemini CLI OAuth token (ya29.*). These tokens use ` +
139
- `Google's private Code Assist API and don't work with the public Gemini API. ` +
140
- `Get a Gemini API key from https://aistudio.google.com/apikey`;
141
- } else if (msg.includes('429') || msg.includes('quota') || msg.includes('rate')) {
142
- const url = prov.type === 'openai'
143
- ? 'https://platform.openai.com/account/billing'
144
- : prov.type === 'google'
145
- ? 'https://aistudio.google.com/apikey'
146
- : prov.type === 'moonshot'
147
- ? 'https://platform.kimi.ai/console'
148
- : '';
149
- diagnosis = `Billing quota exceeded (HTTP 429).${url ? ` Check billing at ${url}` : ''}`;
150
- } else if (msg.includes('401') || msg.includes('unauthorized') || msg.includes('invalid_api_key')) {
151
- diagnosis = `Invalid API key (HTTP 401). Check your API key configuration.`;
152
- } else if (msg.includes('API_KEY_INVALID') || msg.includes('INVALID_ARGUMENT')) {
153
- diagnosis = `Invalid API key. Get a Gemini API key from https://aistudio.google.com/apikey`;
154
- }
155
-
156
- results[prov.type] = { ok: false, error: diagnosis };
157
- }
158
- }
159
-
160
- // Check Ollama
161
- try {
162
- const resp = await fetch('http://localhost:11434/api/tags', { signal: AbortSignal.timeout(3000) });
163
- if (resp.ok) {
164
- const data = await resp.json();
165
- const modelCount = (data.models || []).length;
166
- results['ollama'] = { ok: true, model: `${modelCount} models available` };
167
- } else {
168
- results['ollama'] = { ok: false, error: `Ollama returned ${resp.status}` };
169
- }
170
- } catch {
171
- results['ollama'] = { ok: false, error: 'Ollama not running (localhost:11434)' };
172
- }
173
-
174
- return results;
175
- }
176
-
177
- (async () => {
178
- // Load brain
179
- let brain = null;
180
- try {
181
- brain = require('../brain');
182
- brain.initDb();
183
- console.log('[eval] Brain loaded');
184
- } catch (err) {
185
- console.warn(`[eval] Brain not available: ${err.message}`);
186
- }
187
-
188
- // Handle --check-providers
189
- if (checkProviders) {
190
- console.log('\n[eval] Checking provider health...\n');
191
- const health = await checkProviderHealth(brain);
192
-
193
- for (const [provider, status] of Object.entries(health)) {
194
- const icon = status.ok ? 'OK' : 'FAIL';
195
- const detail = status.ok ? `(${status.model})` : `-- ${status.error}`;
196
- const label = provider.charAt(0).toUpperCase() + provider.slice(1);
197
- console.log(` ${label}: ${icon} ${detail}`);
198
- }
199
-
200
- const okCount = Object.values(health).filter(s => s.ok).length;
201
- const totalCount = Object.keys(health).length;
202
- console.log(`\n[eval] ${okCount}/${totalCount} providers healthy`);
203
-
204
- if (brain) brain.closeDb(true);
205
- process.exit(okCount > 0 ? 0 : 1);
206
- }
207
-
208
- // Load coding orchestrator for runAgentLoop
209
- let runAgentLoop;
210
- try {
211
- const codingOrch = require('../coding-orchestrator');
212
- runAgentLoop = codingOrch.runAgentLoop;
213
- console.log('[eval] Coding orchestrator loaded');
214
- } catch (err) {
215
- console.error(`[eval] Failed to load coding-orchestrator: ${err.message}`);
216
- process.exit(1);
217
- }
218
-
219
- // Check which models are actually available
220
- console.log(`\n[eval] Model roster (${models.length}):`);
221
- const availableModels = [];
222
-
223
- // Cache Ollama model list (one fetch instead of per-model)
224
- let ollamaModels = null;
225
- try {
226
- const resp = await fetch('http://localhost:11434/api/tags', { signal: AbortSignal.timeout(3000) });
227
- if (resp.ok) {
228
- const data = await resp.json();
229
- ollamaModels = new Set((data.models || []).map(m => m.name));
230
- }
231
- } catch { /* Ollama not running */ }
232
-
233
- for (const model of models) {
234
- const isOllama = model.includes(':');
235
- if (isOllama) {
236
- const found = ollamaModels && ollamaModels.has(model);
237
- const reason = ollamaModels ? (found ? '' : ' — NOT FOUND') : ' — NOT RUNNING';
238
- console.log(` ${found ? '+' : '-'} ${model} (ollama${reason})`);
239
- if (found) availableModels.push(model);
240
- } else {
241
- // Cloud model — check if brain has API key for its provider
242
- let providerType = 'anthropic';
243
- if (model.startsWith('gpt-')) providerType = 'openai';
244
- else if (model.startsWith('gemini-')) providerType = 'google';
245
- else if (model.startsWith('deepseek-')) providerType = 'deepseek';
246
- else if (model.startsWith('kimi-') || model.startsWith('moonshot-')) providerType = 'moonshot';
247
-
248
- let hasKey = false;
249
- if (brain) {
250
- try {
251
- const row = brain.getDb().prepare(
252
- 'SELECT id FROM model_providers WHERE type = ? AND enabled = 1 AND api_key_encrypted IS NOT NULL LIMIT 1'
253
- ).get(providerType);
254
- hasKey = !!row;
255
- } catch {}
256
- }
257
- // Also check env vars (including Portkey/gateway for Anthropic)
258
- if (!hasKey) {
259
- const envMap = {
260
- anthropic: ['ANTHROPIC_API_KEY', 'ANTHROPIC_AUTH_TOKEN', 'PORTKEY_API_KEY'],
261
- openai: ['OPENAI_API_KEY'],
262
- google: ['GOOGLE_API_KEY', 'GEMINI_API_KEY'],
263
- deepseek: ['DEEPSEEK_API_KEY'],
264
- moonshot: ['MOONSHOT_API_KEY'],
265
- };
266
- hasKey = (envMap[providerType] || []).some(k => !!process.env[k]);
267
- }
268
-
269
- console.log(` ${hasKey ? '+' : '-'} ${model} (${providerType}${hasKey ? '' : ' — NO API KEY'})`);
270
- if (hasKey) availableModels.push(model);
271
- }
272
- }
273
-
274
- if (availableModels.length === 0) {
275
- console.error('\n[eval] No models available. Configure API keys in setup or start Ollama.');
276
- process.exit(1);
277
- }
278
-
279
- console.log(`\n[eval] Available: ${availableModels.length}/${models.length} models`);
280
-
281
- // Load benchmarks to show work items
282
- const BENCHMARKS_DIR = path.join(__dirname, 'benchmarks');
283
- let benchmarks;
284
- if (suite === 'all') {
285
- benchmarks = [];
286
- for (const fname of fs.readdirSync(BENCHMARKS_DIR).filter(n => n.endsWith('.json'))) {
287
- try {
288
- const items = JSON.parse(fs.readFileSync(path.join(BENCHMARKS_DIR, fname), 'utf8'));
289
- benchmarks.push(...items);
290
- } catch {}
291
- }
292
- } else {
293
- const file = path.join(BENCHMARKS_DIR, `${suite}.json`);
294
- if (!fs.existsSync(file)) {
295
- console.error(`[eval] Suite not found: ${suite} (looked in ${file})`);
296
- process.exit(1);
297
- }
298
- benchmarks = JSON.parse(fs.readFileSync(file, 'utf8'));
299
- }
300
-
301
- if (benchmarkId) {
302
- benchmarks = benchmarks.filter(b => b.id === benchmarkId);
303
- }
304
-
305
- const totalWork = availableModels.length * benchmarks.length;
306
- console.log(`[eval] Suite: ${suite} (${benchmarks.length} benchmarks)`);
307
- console.log(`[eval] Total work items: ${totalWork} (${availableModels.length} models x ${benchmarks.length} benchmarks)`);
308
- console.log(`[eval] Budget: $${budget.toFixed(2)}, Timeout: ${timeout / 1000}s/benchmark, Concurrency: ${concurrency}`);
309
-
310
- if (dryRun) {
311
- console.log('\n[eval] DRY RUN — would execute:');
312
- for (const model of availableModels) {
313
- for (const b of benchmarks) {
314
- console.log(` ${model} x ${b.id} (${b.difficulty})`);
315
- }
316
- }
317
- process.exit(0);
318
- }
319
-
320
- // Pre-flight: warn about unhealthy cloud providers
321
- const cloudModels = availableModels.filter(m => !m.includes(':'));
322
- if (cloudModels.length > 0) {
323
- const health = await checkProviderHealth(brain);
324
- const unhealthy = Object.entries(health).filter(([, s]) => !s.ok);
325
- if (unhealthy.length > 0) {
326
- console.log('\n[eval] Provider warnings:');
327
- for (const [provider, status] of unhealthy) {
328
- const affectedModels = availableModels.filter(m => {
329
- if (provider === 'anthropic') return m.startsWith('claude-');
330
- if (provider === 'openai') return m.startsWith('gpt-') || m.startsWith('o1') || m.startsWith('o3') || m.startsWith('o4');
331
- if (provider === 'google') return m.startsWith('gemini-');
332
- if (provider === 'deepseek') return m.startsWith('deepseek-');
333
- if (provider === 'moonshot') return m.startsWith('kimi-') || m.startsWith('moonshot-');
334
- return false;
335
- });
336
- if (affectedModels.length > 0) {
337
- console.warn(` ${provider}: ${status.error} (skipping ${affectedModels.length} models: ${affectedModels.join(', ')})`);
338
- // Remove unhealthy models from availableModels
339
- for (const m of affectedModels) {
340
- const idx = availableModels.indexOf(m);
341
- if (idx >= 0) availableModels.splice(idx, 1);
342
- }
343
- }
344
- }
345
- if (availableModels.length === 0) {
346
- console.error('\n[eval] No healthy models remaining after pre-flight check.');
347
- if (brain) brain.closeDb(true);
348
- process.exit(1);
349
- }
350
- console.log(`[eval] Proceeding with ${availableModels.length} healthy models\n`);
351
- }
352
- }
353
-
354
- // Create and run orchestrator
355
- const orch = new EvalOrchestrator({
356
- concurrency,
357
- budgetDollars: budget,
358
- timeoutMs: timeout,
359
- brain,
360
- runId: resumeId || undefined,
361
- });
362
-
363
- // Wire up events for live progress
364
- orch.on('benchmark-start', ({ benchmarkId: bid, model }) => {
365
- console.log(`\n[START] ${model} x ${bid}`);
366
- });
367
-
368
- orch.on('benchmark-complete', ({ benchmarkId: bid, model, composite, costDollars, elapsed }) => {
369
- console.log(`[DONE] ${model} x ${bid}: score=${composite.toFixed(3)} cost=$${(costDollars || 0).toFixed(6)} time=${(elapsed / 1000).toFixed(1)}s`);
370
- });
371
-
372
- orch.on('model-complete', ({ model, avgScore, totalCost, benchmarksRun }) => {
373
- console.log(`\n[MODEL] ${model}: avg=${avgScore.toFixed(3)} cost=$${totalCost.toFixed(6)} runs=${benchmarksRun}`);
374
- });
375
-
376
- orch.on('budget-warning', ({ spent, budget: bgt, remaining, model }) => {
377
- const ctx = model ? ` (${model})` : '';
378
- console.warn(`\n[BUDGET] $${spent.toFixed(4)} / $${bgt.toFixed(2)} spent${ctx}, remaining: $${(remaining || 0).toFixed(4)}`);
379
- });
380
-
381
- orch.on('error', ({ benchmarkId: bid, model, error }) => {
382
- console.error(`[ERROR] ${model || '?'} x ${bid || '?'}: ${error}`);
383
- });
384
-
385
- // Handle Ctrl+C gracefully
386
- process.on('SIGINT', () => {
387
- console.log('\n[eval] Aborting... (run can be resumed with --resume ' + orch.runId + ')');
388
- orch.abort();
389
- });
390
-
391
- console.log(`\n[eval] Starting run ${orch.runId}...\n`);
392
- const startTime = Date.now();
393
-
394
- const summary = await orch.run({
395
- suite,
396
- models: availableModels,
397
- benchmarkIds: benchmarkId ? [benchmarkId] : undefined,
398
- runAgentLoop,
399
- });
400
-
401
- const totalElapsed = ((Date.now() - startTime) / 1000).toFixed(0);
402
-
403
- console.log('\n' + '='.repeat(60));
404
- console.log('EVAL RUN COMPLETE');
405
- console.log('='.repeat(60));
406
- console.log(`Run ID: ${summary.runId}`);
407
- console.log(`Status: ${summary.status}`);
408
- console.log(`Benchmarks: ${summary.totalBenchmarks}`);
409
- console.log(`Total cost: $${summary.totalSpent.toFixed(6)}`);
410
- console.log(`Elapsed: ${totalElapsed}s`);
411
- console.log('');
412
-
413
- // Leaderboard
414
- const sorted = Object.entries(summary.models)
415
- .sort(([, a], [, b]) => b.avgScore - a.avgScore);
416
-
417
- console.log('LEADERBOARD:');
418
- console.log('-'.repeat(60));
419
- console.log('Rank Model Avg Score Cost Errors');
420
- console.log('-'.repeat(60));
421
- sorted.forEach(([model, stats], i) => {
422
- const name = model.padEnd(30);
423
- const score = stats.avgScore.toFixed(3).padStart(9);
424
- const cost = ('$' + stats.totalCost.toFixed(4)).padStart(9);
425
- const errors = String(stats.errors).padStart(6);
426
- console.log(`${String(i + 1).padStart(4)} ${name} ${score} ${cost} ${errors}`);
427
- });
428
- console.log('-'.repeat(60));
429
-
430
- if (summary.status === 'aborted') {
431
- console.log(`\nResume with: node eval/run-eval.js --resume ${summary.runId}`);
432
- }
433
-
434
- if (brain) brain.closeDb(true);
435
- })();