create-walle 0.9.21 → 0.9.23

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (500)
  1. package/README.md +27 -5
  2. package/package.json +2 -2
  3. package/template/CLAUDE.md +2 -2
  4. package/template/LICENSE +1 -1
  5. package/template/bin/ctm-dev-cleanup.js +24 -3
  6. package/template/bin/ctm-launch.sh +13 -0
  7. package/template/bin/dev.sh +156 -18
  8. package/template/bin/node-bin.sh +84 -0
  9. package/template/bin/pin-node.sh +51 -0
  10. package/template/claude-task-manager/api-prompts.js +1203 -182
  11. package/template/claude-task-manager/api-reviews.js +109 -15
  12. package/template/claude-task-manager/approval-agent.js +1360 -280
  13. package/template/claude-task-manager/bin/restart-ctm.sh +64 -23
  14. package/template/claude-task-manager/bin/storage-migration-supervisor.js +338 -0
  15. package/template/claude-task-manager/db.js +4417 -295
  16. package/template/claude-task-manager/docs/app-update-refresh-protocol.md +69 -0
  17. package/template/claude-task-manager/docs/approval-ai-refinement.md +138 -0
  18. package/template/claude-task-manager/docs/approval-rescue-loop.md +74 -0
  19. package/template/claude-task-manager/docs/codex-operational-warning-health.md +107 -0
  20. package/template/claude-task-manager/docs/codex-resume-state-guard-design.md +17 -12
  21. package/template/claude-task-manager/docs/codex-terminal-render-controller-handoff.md +311 -0
  22. package/template/claude-task-manager/docs/coding-agent-hooks-architecture.md +418 -0
  23. package/template/claude-task-manager/docs/conversation-import-freshness.md +20 -0
  24. package/template/claude-task-manager/docs/google-workspace-auth-health.md +77 -0
  25. package/template/claude-task-manager/docs/image-paste-ux.md +13 -0
  26. package/template/claude-task-manager/docs/ipad-web-preview.md +88 -0
  27. package/template/claude-task-manager/docs/main-loop-offload-architecture.md +66 -0
  28. package/template/claude-task-manager/docs/microsoft-dev-tunnel-phone-access-design.md +274 -519
  29. package/template/claude-task-manager/docs/mobile-live-streaming.md +27 -5
  30. package/template/claude-task-manager/docs/mobile-remote-submission-lifecycle.md +69 -0
  31. package/template/claude-task-manager/docs/phone-access-design.md +53 -15
  32. package/template/claude-task-manager/docs/phone-passkey-identity.md +122 -0
  33. package/template/claude-task-manager/docs/phone-setup.md +3 -0
  34. package/template/claude-task-manager/docs/prompt-editing-tree-design.md +25 -1
  35. package/template/claude-task-manager/docs/remote-desktop-access-design.md +268 -0
  36. package/template/claude-task-manager/docs/restart-lifecycle-architecture.md +95 -0
  37. package/template/claude-task-manager/docs/runtime-work-control-plane.md +53 -0
  38. package/template/claude-task-manager/docs/session-interactive-wait-surfaces.md +38 -0
  39. package/template/claude-task-manager/docs/session-needs-you-dismissal.md +84 -0
  40. package/template/claude-task-manager/docs/session-render-state-management-design.md +91 -3
  41. package/template/claude-task-manager/docs/session-standup-command-center-design.md +25 -1
  42. package/template/claude-task-manager/docs/session-title-authority.md +32 -0
  43. package/template/claude-task-manager/docs/session-workspace-binding.md +33 -0
  44. package/template/claude-task-manager/docs/skill-intent-resolution-design.md +72 -0
  45. package/template/claude-task-manager/docs/walle-mcp-supervisor-health.md +86 -0
  46. package/template/claude-task-manager/docs/walle-relay-phone-access-design.md +24 -15
  47. package/template/claude-task-manager/docs/walle-session-history-hydration.md +114 -0
  48. package/template/claude-task-manager/docs/walle-session-input-queue.md +104 -0
  49. package/template/claude-task-manager/docs/walle-session-model-catalog.md +90 -0
  50. package/template/claude-task-manager/docs/walle-session-model-preferences.md +15 -6
  51. package/template/claude-task-manager/git-utils.js +897 -27
  52. package/template/claude-task-manager/lib/agent-capabilities.js +33 -0
  53. package/template/claude-task-manager/lib/agent-cli-cache.js +37 -7
  54. package/template/claude-task-manager/lib/agent-hooks-installer.js +26 -2
  55. package/template/claude-task-manager/lib/agent-presets.js +17 -1
  56. package/template/claude-task-manager/lib/all-sessions-query.js +108 -0
  57. package/template/claude-task-manager/lib/approval-ai-refinement.js +488 -0
  58. package/template/claude-task-manager/lib/approval-self-adapt.js +168 -0
  59. package/template/claude-task-manager/lib/async-semaphore.js +44 -0
  60. package/template/claude-task-manager/lib/auth-context.js +5 -0
  61. package/template/claude-task-manager/lib/auth-rate-limit.js +47 -4
  62. package/template/claude-task-manager/lib/auth-rules.js +29 -2
  63. package/template/claude-task-manager/lib/auto-approval-verifier.js +129 -16
  64. package/template/claude-task-manager/lib/background-llm.js +144 -17
  65. package/template/claude-task-manager/lib/branch-inventory.js +212 -0
  66. package/template/claude-task-manager/lib/claude-desktop-sessions.js +15 -3
  67. package/template/claude-task-manager/lib/coalesce-sync-frames.js +151 -0
  68. package/template/claude-task-manager/lib/codex-launch-health.js +762 -0
  69. package/template/claude-task-manager/lib/codex-transcript-pager.js +51 -0
  70. package/template/claude-task-manager/lib/codex-zst.js +124 -0
  71. package/template/claude-task-manager/lib/coding-agent-models.js +233 -30
  72. package/template/claude-task-manager/lib/connection-health.js +232 -0
  73. package/template/claude-task-manager/lib/conversation-blob-parser.js +42 -0
  74. package/template/claude-task-manager/lib/conversation-tail-merge.js +89 -26
  75. package/template/claude-task-manager/lib/ctm-session-context-api.js +39 -10
  76. package/template/claude-task-manager/lib/cursor-conversation-store.js +354 -0
  77. package/template/claude-task-manager/lib/db-owner-worker-client.js +315 -0
  78. package/template/claude-task-manager/lib/document-review.js +141 -6
  79. package/template/claude-task-manager/lib/escalation-review.js +152 -0
  80. package/template/claude-task-manager/lib/graceful-shutdown.js +159 -0
  81. package/template/claude-task-manager/lib/headless-term-service.js +678 -0
  82. package/template/claude-task-manager/lib/heavy-worker-fallback.js +38 -0
  83. package/template/claude-task-manager/lib/jsonl-conversation-parser.js +542 -0
  84. package/template/claude-task-manager/lib/jsonl-range-reader.js +112 -0
  85. package/template/claude-task-manager/lib/main-db-census.js +216 -0
  86. package/template/claude-task-manager/lib/message-pagination.js +106 -4
  87. package/template/claude-task-manager/lib/microsoft-dev-tunnel-setup.js +750 -26
  88. package/template/claude-task-manager/lib/mobile-auth-api.js +274 -7
  89. package/template/claude-task-manager/lib/mobile-auth-store.js +592 -10
  90. package/template/claude-task-manager/lib/mobile-notification-dispatcher.js +15 -0
  91. package/template/claude-task-manager/lib/model-overview-brain-fallback.js +311 -0
  92. package/template/claude-task-manager/lib/model-overview-cache.js +141 -0
  93. package/template/claude-task-manager/lib/models-health-routing-notice.js +126 -0
  94. package/template/claude-task-manager/lib/node-pin-guard.js +93 -0
  95. package/template/claude-task-manager/lib/perf-tracker.js +242 -6
  96. package/template/claude-task-manager/lib/permission-match.js +76 -0
  97. package/template/claude-task-manager/lib/permission-sync.js +133 -20
  98. package/template/claude-task-manager/lib/process-title.js +35 -0
  99. package/template/claude-task-manager/lib/prompt-executions-query.js +25 -0
  100. package/template/claude-task-manager/lib/prompt-index-disk-cache.js +44 -0
  101. package/template/claude-task-manager/lib/prompt-intent.js +132 -0
  102. package/template/claude-task-manager/lib/provider-user-context.js +34 -0
  103. package/template/claude-task-manager/lib/read-pool-client.js +313 -0
  104. package/template/claude-task-manager/lib/readpool-breaker.js +31 -0
  105. package/template/claude-task-manager/lib/recent-sessions-breaker.js +12 -0
  106. package/template/claude-task-manager/lib/remote-feedback-client.js +72 -0
  107. package/template/claude-task-manager/lib/remote-relay-protocol.js +37 -4
  108. package/template/claude-task-manager/lib/remote-relay-store.js +159 -0
  109. package/template/claude-task-manager/lib/remote-submission-observer.js +278 -0
  110. package/template/claude-task-manager/lib/restart-guard.js +109 -0
  111. package/template/claude-task-manager/lib/restore-interruption-detector.js +439 -0
  112. package/template/claude-task-manager/lib/restore-policy.js +13 -0
  113. package/template/claude-task-manager/lib/restore-resume-batch.js +74 -0
  114. package/template/claude-task-manager/lib/restore-runtime.js +68 -0
  115. package/template/claude-task-manager/lib/restore-storm.js +34 -0
  116. package/template/claude-task-manager/lib/resume-cwd.js +36 -0
  117. package/template/claude-task-manager/lib/resume-preflight.js +313 -0
  118. package/template/claude-task-manager/lib/runtime-work-registry.js +444 -0
  119. package/template/claude-task-manager/lib/sanitize-openai-auth.js +31 -0
  120. package/template/claude-task-manager/lib/scheduler.js +21 -1
  121. package/template/claude-task-manager/lib/scrollback-snapshot-store.js +159 -0
  122. package/template/claude-task-manager/lib/serial-task-queue.js +64 -0
  123. package/template/claude-task-manager/lib/server-listeners.js +239 -0
  124. package/template/claude-task-manager/lib/session-capture.js +42 -7
  125. package/template/claude-task-manager/lib/session-content-backfill.js +131 -0
  126. package/template/claude-task-manager/lib/session-history.js +388 -43
  127. package/template/claude-task-manager/lib/session-host-manager.js +287 -0
  128. package/template/claude-task-manager/lib/session-image-refs.js +209 -0
  129. package/template/claude-task-manager/lib/session-jobs.js +399 -59
  130. package/template/claude-task-manager/lib/session-prompt-index.js +137 -0
  131. package/template/claude-task-manager/lib/session-restore.js +53 -0
  132. package/template/claude-task-manager/lib/session-standup.js +123 -23
  133. package/template/claude-task-manager/lib/session-state-bus.js +14 -0
  134. package/template/claude-task-manager/lib/session-stream.js +64 -16
  135. package/template/claude-task-manager/lib/session-timeline-summary.js +260 -0
  136. package/template/claude-task-manager/lib/session-token-usage.js +494 -0
  137. package/template/claude-task-manager/lib/session-workspace-binding.js +356 -0
  138. package/template/claude-task-manager/lib/setup-network-config.js +9 -0
  139. package/template/claude-task-manager/lib/size-cap.js +45 -0
  140. package/template/claude-task-manager/lib/size-cap.test.js +62 -0
  141. package/template/claude-task-manager/lib/skill-autocomplete.js +180 -1
  142. package/template/claude-task-manager/lib/skill-intent-resolver.js +304 -0
  143. package/template/claude-task-manager/lib/sqlite-driver.js +19 -3
  144. package/template/claude-task-manager/lib/standup-attention.js +7 -3
  145. package/template/claude-task-manager/lib/status-authority.js +39 -0
  146. package/template/claude-task-manager/lib/status-hooks.js +4 -0
  147. package/template/claude-task-manager/lib/storage-migration.js +235 -0
  148. package/template/claude-task-manager/lib/structured-capture.js +298 -0
  149. package/template/claude-task-manager/lib/sync-io-census.js +163 -0
  150. package/template/claude-task-manager/lib/tailscale-setup.js +6 -0
  151. package/template/claude-task-manager/lib/terminal-activity-evidence.js +33 -0
  152. package/template/claude-task-manager/lib/terminal-choice.js +364 -0
  153. package/template/claude-task-manager/lib/terminal-control-sanitize.js +17 -0
  154. package/template/claude-task-manager/lib/terminal-fingerprint.js +48 -0
  155. package/template/claude-task-manager/lib/terminal-output-flush.js +84 -0
  156. package/template/claude-task-manager/lib/timeline-order.js +122 -0
  157. package/template/claude-task-manager/lib/transcript-store.js +348 -43
  158. package/template/claude-task-manager/lib/transport-security.js +84 -1
  159. package/template/claude-task-manager/lib/wait-state.js +184 -0
  160. package/template/claude-task-manager/lib/walle-client.js +47 -5
  161. package/template/claude-task-manager/lib/walle-ctm-history.js +564 -4
  162. package/template/claude-task-manager/lib/walle-external-actions.js +135 -16
  163. package/template/claude-task-manager/lib/walle-history-hydration.js +46 -0
  164. package/template/claude-task-manager/lib/walle-native-health.js +403 -0
  165. package/template/claude-task-manager/lib/walle-repair.js +701 -0
  166. package/template/claude-task-manager/lib/walle-session-cache.js +109 -0
  167. package/template/claude-task-manager/lib/walle-session-context.js +57 -21
  168. package/template/claude-task-manager/lib/walle-session-model-catalog.js +34 -0
  169. package/template/claude-task-manager/lib/walle-supervisor.js +539 -63
  170. package/template/claude-task-manager/lib/walle-transcript.js +52 -0
  171. package/template/claude-task-manager/lib/worktree-active-sync.js +11 -7
  172. package/template/claude-task-manager/lib/worktree-cwd.js +32 -1
  173. package/template/claude-task-manager/package.json +1 -1
  174. package/template/claude-task-manager/prompt-harvest.js +89 -66
  175. package/template/claude-task-manager/providers/claude-code.js +51 -3
  176. package/template/claude-task-manager/providers/cursor.js +140 -45
  177. package/template/claude-task-manager/public/css/reviews.css +551 -61
  178. package/template/claude-task-manager/public/css/setup.css +191 -0
  179. package/template/claude-task-manager/public/css/walle-session.css +865 -10
  180. package/template/claude-task-manager/public/css/walle.css +154 -0
  181. package/template/claude-task-manager/public/designs/ai-providers-consolidation-v2.html +830 -0
  182. package/template/claude-task-manager/public/index.html +18516 -2058
  183. package/template/claude-task-manager/public/ipad.html +363 -0
  184. package/template/claude-task-manager/public/js/document-review-links.js +301 -0
  185. package/template/claude-task-manager/public/js/image-normalize.js +69 -36
  186. package/template/claude-task-manager/public/js/message-renderer.js +1265 -77
  187. package/template/claude-task-manager/public/js/prompts.js +66 -29
  188. package/template/claude-task-manager/public/js/reviews.js +901 -133
  189. package/template/claude-task-manager/public/js/session-activity-utils.js +11 -1
  190. package/template/claude-task-manager/public/js/session-search-utils.js +94 -10
  191. package/template/claude-task-manager/public/js/session-status-precedence.js +23 -5
  192. package/template/claude-task-manager/public/js/setup.js +1273 -176
  193. package/template/claude-task-manager/public/js/stream-view.js +691 -73
  194. package/template/claude-task-manager/public/js/terminal-reconciler.js +210 -0
  195. package/template/claude-task-manager/public/js/walle-session.js +2455 -158
  196. package/template/claude-task-manager/public/js/walle.js +455 -28
  197. package/template/claude-task-manager/public/m/app.css +2909 -262
  198. package/template/claude-task-manager/public/m/app.js +6601 -398
  199. package/template/claude-task-manager/public/m/claim.html +224 -17
  200. package/template/claude-task-manager/public/m/index.html +117 -21
  201. package/template/claude-task-manager/public/m/sw.js +3 -1
  202. package/template/claude-task-manager/public/manifest.json +2 -2
  203. package/template/claude-task-manager/public/prompts.html +30 -14
  204. package/template/claude-task-manager/queue-engine.js +507 -28
  205. package/template/claude-task-manager/scripts/repair-claude-session-images.js +27 -8
  206. package/template/claude-task-manager/server.js +14341 -2197
  207. package/template/claude-task-manager/session-integrity.js +160 -18
  208. package/template/claude-task-manager/session-search-ranking.js +1 -0
  209. package/template/claude-task-manager/session-utils.js +25 -5
  210. package/template/claude-task-manager/workers/approval-blocklist.js +96 -6
  211. package/template/claude-task-manager/workers/approval-widget-validator.js +14 -8
  212. package/template/claude-task-manager/workers/conversation-import-worker.js +11 -50
  213. package/template/claude-task-manager/workers/db-owner-worker.js +386 -0
  214. package/template/claude-task-manager/workers/harvest-worker.js +9 -55
  215. package/template/claude-task-manager/workers/headless-term-worker.js +9 -530
  216. package/template/claude-task-manager/workers/read-pool-worker.js +387 -0
  217. package/template/claude-task-manager/workers/scrollback-worker.js +11 -72
  218. package/template/claude-task-manager/workers/session-host-process.js +146 -0
  219. package/template/claude-task-manager/workers/session-integrity-worker.js +10 -54
  220. package/template/claude-task-manager/workers/state-detectors/base.js +18 -1
  221. package/template/claude-task-manager/workers/state-detectors/claude-code.js +182 -9
  222. package/template/claude-task-manager/workers/state-detectors/codex.js +150 -2
  223. package/template/claude-task-manager/workers/state-detectors/cursor.js +127 -0
  224. package/template/claude-task-manager/workers/state-detectors/gemini.js +21 -0
  225. package/template/claude-task-manager/workers/state-detectors/index.js +29 -0
  226. package/template/claude-task-manager/workers/state-detectors/opencode.js +103 -0
  227. package/template/docs/design/markdown-review-pane.md +206 -0
  228. package/template/docs/designs/2026-05-17-portkey-gateway-provider-ux.md +129 -38
  229. package/template/docs/designs/2026-05-20-mobile-worktree-finish-command.md +27 -0
  230. package/template/docs/designs/2026-05-22-ai-configuration-consolidation.md +248 -0
  231. package/template/docs/designs/ai-configuration-consolidation-mock.html +812 -0
  232. package/template/docs/private-memory-and-pii-policy.md +69 -0
  233. package/template/package.json +2 -1
  234. package/template/scripts/check-private-data.js +201 -0
  235. package/template/shared/sqlite-owner-guard.js +30 -0
  236. package/template/shared/sqlite-owner-write-queue.js +225 -0
  237. package/template/shared/sqlite-storage-policy.js +111 -0
  238. package/template/shared/sqlite-write-lock.js +428 -0
  239. package/template/wall-e/agent-runners/claude-code.js +5 -0
  240. package/template/wall-e/agent.js +166 -22
  241. package/template/wall-e/api-walle.js +524 -70
  242. package/template/wall-e/auth/provider-flows.js +11 -1
  243. package/template/wall-e/bin/walle-mcp-stdio.js +341 -17
  244. package/template/wall-e/brain.js +1614 -141
  245. package/template/wall-e/chat/attachment-blocks.js +96 -0
  246. package/template/wall-e/chat/attachments.js +2 -1
  247. package/template/wall-e/chat/capability-resolver.js +7 -7
  248. package/template/wall-e/chat/context-messages.js +28 -0
  249. package/template/wall-e/chat/conversation-frame.js +630 -0
  250. package/template/wall-e/chat/provider-messages.js +125 -0
  251. package/template/wall-e/chat.js +1002 -233
  252. package/template/wall-e/coding/acceptance-contract.js +170 -0
  253. package/template/wall-e/coding/acp-adapter.js +1 -1
  254. package/template/wall-e/coding/agent-catalog.js +3 -0
  255. package/template/wall-e/coding/artifact-store.js +93 -0
  256. package/template/wall-e/coding/capability-router.js +120 -0
  257. package/template/wall-e/coding/coding-run-controller.js +423 -0
  258. package/template/wall-e/coding/compaction-service.js +157 -12
  259. package/template/wall-e/coding/frontend-verification.js +258 -0
  260. package/template/wall-e/coding/lifecycle-hooks.js +75 -0
  261. package/template/wall-e/coding/local-preview-contract.js +157 -0
  262. package/template/wall-e/coding/permission-service.js +57 -13
  263. package/template/wall-e/coding/prompt-bundle.js +19 -1
  264. package/template/wall-e/coding/prompt-section-registry.js +227 -0
  265. package/template/wall-e/coding/provider-compat.js +15 -0
  266. package/template/wall-e/coding/runtime-events.js +224 -0
  267. package/template/wall-e/coding/runtime-mode.js +3 -0
  268. package/template/wall-e/coding/side-git-snapshot.js +160 -4
  269. package/template/wall-e/coding/snapshot-service.js +143 -1
  270. package/template/wall-e/coding/stream-processor.js +388 -34
  271. package/template/wall-e/coding/task-tool.js +141 -4
  272. package/template/wall-e/coding/tool-execution-controller.js +365 -0
  273. package/template/wall-e/coding/tool-registry.js +43 -5
  274. package/template/wall-e/coding/user-hooks.js +217 -0
  275. package/template/wall-e/coding-orchestrator.js +1330 -221
  276. package/template/wall-e/coding-prompts.js +20 -4
  277. package/template/wall-e/context/context-builder.js +15 -2
  278. package/template/wall-e/decision/confidence.js +1 -1
  279. package/template/wall-e/docs/coding-acceptance-contract.md +41 -0
  280. package/template/wall-e/docs/external-action-controller.md +26 -6
  281. package/template/wall-e/docs/telemetry-lifecycle.md +8 -2
  282. package/template/wall-e/embeddings.js +591 -53
  283. package/template/wall-e/external-action-controller.js +12 -0
  284. package/template/wall-e/http/auth.js +1 -0
  285. package/template/wall-e/http/chat-api.js +46 -11
  286. package/template/wall-e/http/model-admin.js +836 -34
  287. package/template/wall-e/lib/boot-profile.js +88 -0
  288. package/template/wall-e/lib/event-loop-monitor.js +93 -0
  289. package/template/wall-e/lib/service-health.js +194 -0
  290. package/template/wall-e/llm/anthropic.js +130 -5
  291. package/template/wall-e/llm/client.js +266 -63
  292. package/template/wall-e/llm/default-fallback.js +382 -0
  293. package/template/wall-e/llm/health.js +19 -0
  294. package/template/wall-e/llm/message-guard.js +78 -0
  295. package/template/wall-e/llm/model-catalog.js +252 -1
  296. package/template/wall-e/llm/openai.js +26 -4
  297. package/template/wall-e/llm/portkey-sync.js +654 -0
  298. package/template/wall-e/llm/provider-error.js +30 -2
  299. package/template/wall-e/llm/registry.js +5 -1
  300. package/template/wall-e/llm/request-compat.js +67 -0
  301. package/template/wall-e/loops/backfill.js +79 -23
  302. package/template/wall-e/loops/brain-optimize.js +67 -0
  303. package/template/wall-e/loops/ingest.js +25 -10
  304. package/template/wall-e/loops/question-digest.js +160 -0
  305. package/template/wall-e/loops/reflect.js +6 -4
  306. package/template/wall-e/loops/think.js +39 -12
  307. package/template/wall-e/mcp-server.js +318 -36
  308. package/template/wall-e/memory/ctm-context-client.js +52 -14
  309. package/template/wall-e/memory/ctm-operational-context.js +237 -0
  310. package/template/wall-e/memory/ctm-prompt-executions-client.js +128 -0
  311. package/template/wall-e/memory/ctm-session-context.js +111 -63
  312. package/template/wall-e/prompts/coding/deepseek.txt +3 -0
  313. package/template/wall-e/prompts/coding/gemini.txt +6 -0
  314. package/template/wall-e/prompts/coding/gpt.txt +6 -0
  315. package/template/wall-e/prompts/coding/local.txt +7 -0
  316. package/template/wall-e/runtime/decision-hooks.js +115 -0
  317. package/template/wall-e/runtime/devbox-gateway.js +82 -8
  318. package/template/wall-e/runtime/prompt-manifest.js +86 -0
  319. package/template/wall-e/runtime/tool-executor.js +269 -0
  320. package/template/wall-e/runtime/tool-result-envelope.js +138 -0
  321. package/template/wall-e/runtime/transcript-projection.js +60 -0
  322. package/template/wall-e/runtime/walle-runtime.js +224 -0
  323. package/template/wall-e/scripts/db-optimize/migrate.js +162 -0
  324. package/template/wall-e/scripts/db-optimize/recall-eval.js +117 -0
  325. package/template/wall-e/server.js +15 -0
  326. package/template/wall-e/session-files.js +9 -0
  327. package/template/wall-e/skills/_bundled/google-calendar/run.js +1 -1
  328. package/template/wall-e/skills/_bundled/gws-workspace/run.js +1 -1
  329. package/template/wall-e/skills/_bundled/slack-mentions/run.js +76 -6
  330. package/template/wall-e/skills/claude-code-reader.js +7 -3
  331. package/template/wall-e/skills/script-skill-runner.js +10 -0
  332. package/template/wall-e/skills/skill-planner.js +38 -0
  333. package/template/wall-e/tools/builtin-middleware.js +19 -9
  334. package/template/wall-e/tools/local-tools.js +1428 -16
  335. package/template/wall-e/tools/permission-checker.js +73 -5
  336. package/template/wall-e/tools/question-manager.js +117 -7
  337. package/template/wall-e/training/harvester.js +12 -28
  338. package/template/wall-e/training/replay.js +25 -80
  339. package/template/website/index.html +10 -10
  340. package/template/wall-e/eval/ab-test.js +0 -203
  341. package/template/wall-e/eval/agent-runner.js +0 -772
  342. package/template/wall-e/eval/agent-scorer.js +0 -461
  343. package/template/wall-e/eval/aggregator.js +0 -414
  344. package/template/wall-e/eval/allowed-test-commands.js +0 -34
  345. package/template/wall-e/eval/benchmark-generator.js +0 -113
  346. package/template/wall-e/eval/benchmarks/chat-eval.json +0 -1662
  347. package/template/wall-e/eval/benchmarks/chat.json +0 -82
  348. package/template/wall-e/eval/benchmarks/coding-agent-real.json +0 -1
  349. package/template/wall-e/eval/benchmarks/coding-agent.json +0 -1581
  350. package/template/wall-e/eval/benchmarks/coding.json +0 -122
  351. package/template/wall-e/eval/benchmarks/memory-retrieval.json +0 -234
  352. package/template/wall-e/eval/benchmarks/reasoning.json +0 -82
  353. package/template/wall-e/eval/benchmarks/swebench-lite-30.json +0 -212
  354. package/template/wall-e/eval/benchmarks.js +0 -669
  355. package/template/wall-e/eval/cc-replay.js +0 -719
  356. package/template/wall-e/eval/chat-eval.js +0 -525
  357. package/template/wall-e/eval/check-keys.js +0 -15
  358. package/template/wall-e/eval/check-providers.js +0 -42
  359. package/template/wall-e/eval/codex-cli-baseline.js +0 -669
  360. package/template/wall-e/eval/coding-agent-real.js +0 -570
  361. package/template/wall-e/eval/context-compactor.js +0 -251
  362. package/template/wall-e/eval/debug-agent003.js +0 -68
  363. package/template/wall-e/eval/diagnostics.js +0 -216
  364. package/template/wall-e/eval/eval-orchestrator.js +0 -642
  365. package/template/wall-e/eval/evaluate.js +0 -202
  366. package/template/wall-e/eval/evaluator.js +0 -373
  367. package/template/wall-e/eval/exporter.js +0 -212
  368. package/template/wall-e/eval/fixtures/express-basic/package.json +0 -9
  369. package/template/wall-e/eval/fixtures/express-basic/server.js +0 -115
  370. package/template/wall-e/eval/fixtures/express-basic/test.js +0 -83
  371. package/template/wall-e/eval/fixtures/express-buggy/package.json +0 -9
  372. package/template/wall-e/eval/fixtures/express-buggy/server.js +0 -113
  373. package/template/wall-e/eval/fixtures/express-buggy/test.js +0 -83
  374. package/template/wall-e/eval/fixtures/express-buggy-items/package.json +0 -9
  375. package/template/wall-e/eval/fixtures/express-buggy-items/server.js +0 -112
  376. package/template/wall-e/eval/fixtures/express-buggy-items/test.js +0 -83
  377. package/template/wall-e/eval/fixtures/express-buggy-search/package.json +0 -9
  378. package/template/wall-e/eval/fixtures/express-buggy-search/server.js +0 -121
  379. package/template/wall-e/eval/fixtures/express-buggy-search/test.js +0 -83
  380. package/template/wall-e/eval/fixtures/express-rename-data/data.js +0 -34
  381. package/template/wall-e/eval/fixtures/express-rename-data/package.json +0 -9
  382. package/template/wall-e/eval/fixtures/express-rename-data/server.js +0 -97
  383. package/template/wall-e/eval/fixtures/express-rename-data/test.js +0 -88
  384. package/template/wall-e/eval/fixtures/express-xss/package.json +0 -12
  385. package/template/wall-e/eval/fixtures/express-xss/server.js +0 -90
  386. package/template/wall-e/eval/fixtures/express-xss/test.js +0 -67
  387. package/template/wall-e/eval/fixtures/express-xss/views/profile.ejs +0 -9
  388. package/template/wall-e/eval/fixtures/fullstack-app/config/default.js +0 -9
  389. package/template/wall-e/eval/fixtures/fullstack-app/config/test.js +0 -13
  390. package/template/wall-e/eval/fixtures/fullstack-app/package.json +0 -11
  391. package/template/wall-e/eval/fixtures/fullstack-app/public/css/style.css +0 -137
  392. package/template/wall-e/eval/fixtures/fullstack-app/public/index.html +0 -46
  393. package/template/wall-e/eval/fixtures/fullstack-app/public/js/app.js +0 -121
  394. package/template/wall-e/eval/fixtures/fullstack-app/public/js/auth.js +0 -71
  395. package/template/wall-e/eval/fixtures/fullstack-app/public/js/items.js +0 -80
  396. package/template/wall-e/eval/fixtures/fullstack-app/public/js/users.js +0 -46
  397. package/template/wall-e/eval/fixtures/fullstack-app/public/login.html +0 -45
  398. package/template/wall-e/eval/fixtures/fullstack-app/public/register.html +0 -38
  399. package/template/wall-e/eval/fixtures/fullstack-app/scripts/migrate.js +0 -23
  400. package/template/wall-e/eval/fixtures/fullstack-app/scripts/seed.js +0 -46
  401. package/template/wall-e/eval/fixtures/fullstack-app/server/db.js +0 -99
  402. package/template/wall-e/eval/fixtures/fullstack-app/server/index.js +0 -94
  403. package/template/wall-e/eval/fixtures/fullstack-app/server/middleware/auth.js +0 -19
  404. package/template/wall-e/eval/fixtures/fullstack-app/server/middleware/logger.js +0 -19
  405. package/template/wall-e/eval/fixtures/fullstack-app/server/router.js +0 -50
  406. package/template/wall-e/eval/fixtures/fullstack-app/server/routes/auth.js +0 -69
  407. package/template/wall-e/eval/fixtures/fullstack-app/server/routes/health.js +0 -23
  408. package/template/wall-e/eval/fixtures/fullstack-app/server/routes/items.js +0 -88
  409. package/template/wall-e/eval/fixtures/fullstack-app/server/routes/users.js +0 -75
  410. package/template/wall-e/eval/fixtures/fullstack-app/server/test.js +0 -198
  411. package/template/wall-e/eval/fixtures/fullstack-app/server/utils/response.js +0 -34
  412. package/template/wall-e/eval/fixtures/fullstack-app/server/utils/validate.js +0 -26
  413. package/template/wall-e/eval/fixtures/fullstack-app/server.js +0 -8
  414. package/template/wall-e/eval/fixtures/fullstack-app/test.js +0 -12
  415. package/template/wall-e/eval/fixtures/monorepo-basic/package.json +0 -8
  416. package/template/wall-e/eval/fixtures/monorepo-basic/packages/api/data.js +0 -58
  417. package/template/wall-e/eval/fixtures/monorepo-basic/packages/api/middleware.js +0 -46
  418. package/template/wall-e/eval/fixtures/monorepo-basic/packages/api/package.json +0 -8
  419. package/template/wall-e/eval/fixtures/monorepo-basic/packages/api/routes.js +0 -64
  420. package/template/wall-e/eval/fixtures/monorepo-basic/packages/api/server.js +0 -56
  421. package/template/wall-e/eval/fixtures/monorepo-basic/packages/api/test.js +0 -116
  422. package/template/wall-e/eval/fixtures/monorepo-basic/packages/cli/commands.js +0 -61
  423. package/template/wall-e/eval/fixtures/monorepo-basic/packages/cli/index.js +0 -62
  424. package/template/wall-e/eval/fixtures/monorepo-basic/packages/cli/output.js +0 -43
  425. package/template/wall-e/eval/fixtures/monorepo-basic/packages/cli/package.json +0 -11
  426. package/template/wall-e/eval/fixtures/monorepo-basic/packages/cli/test.js +0 -44
  427. package/template/wall-e/eval/fixtures/monorepo-basic/packages/shared/formatters.js +0 -43
  428. package/template/wall-e/eval/fixtures/monorepo-basic/packages/shared/index.js +0 -12
  429. package/template/wall-e/eval/fixtures/monorepo-basic/packages/shared/package.json +0 -5
  430. package/template/wall-e/eval/fixtures/monorepo-basic/packages/shared/test.js +0 -55
  431. package/template/wall-e/eval/fixtures/monorepo-basic/packages/shared/validators.js +0 -29
  432. package/template/wall-e/eval/fixtures/monorepo-basic/test.js +0 -46
  433. package/template/wall-e/eval/fixtures/node-cli/index.js +0 -78
  434. package/template/wall-e/eval/fixtures/node-cli/package.json +0 -10
  435. package/template/wall-e/eval/fixtures/node-cli/test.js +0 -57
  436. package/template/wall-e/eval/fixtures/node-typed/package.json +0 -8
  437. package/template/wall-e/eval/fixtures/node-typed/src/handlers.js +0 -31
  438. package/template/wall-e/eval/fixtures/node-typed/src/utils.js +0 -33
  439. package/template/wall-e/eval/fixtures/node-typed/test.js +0 -36
  440. package/template/wall-e/eval/fixtures/python-flask/app.py +0 -14
  441. package/template/wall-e/eval/fixtures/python-flask/requirements.txt +0 -2
  442. package/template/wall-e/eval/fixtures/python-flask/test_app.py +0 -25
  443. package/template/wall-e/eval/fixtures/wall-e-subset/brain.js +0 -105
  444. package/template/wall-e/eval/fixtures/wall-e-subset/eval/aggregator.js +0 -101
  445. package/template/wall-e/eval/fixtures/wall-e-subset/eval/benchmarks/chat.json +0 -20
  446. package/template/wall-e/eval/fixtures/wall-e-subset/eval/benchmarks/coding.json +0 -32
  447. package/template/wall-e/eval/fixtures/wall-e-subset/eval/benchmarks.js +0 -64
  448. package/template/wall-e/eval/fixtures/wall-e-subset/eval/fixtures/simple-project/package.json +0 -6
  449. package/template/wall-e/eval/fixtures/wall-e-subset/eval/fixtures/simple-project/server.js +0 -31
  450. package/template/wall-e/eval/fixtures/wall-e-subset/eval/fixtures/simple-project/test.js +0 -18
  451. package/template/wall-e/eval/fixtures/wall-e-subset/eval/fixtures/simple-project/utils.js +0 -34
  452. package/template/wall-e/eval/fixtures/wall-e-subset/eval/runner.js +0 -104
  453. package/template/wall-e/eval/fixtures/wall-e-subset/eval/scorer.js +0 -73
  454. package/template/wall-e/eval/fixtures/wall-e-subset/eval/test.js +0 -134
  455. package/template/wall-e/eval/fixtures/wall-e-subset/llm/client.js +0 -99
  456. package/template/wall-e/eval/fixtures/wall-e-subset/llm/providers.js +0 -63
  457. package/template/wall-e/eval/fixtures/wall-e-subset/llm/test.js +0 -70
  458. package/template/wall-e/eval/fixtures/wall-e-subset/package.json +0 -10
  459. package/template/wall-e/eval/fixtures/wall-e-subset/test.js +0 -86
  460. package/template/wall-e/eval/harvester.js +0 -685
  461. package/template/wall-e/eval/head-to-head.js +0 -388
  462. package/template/wall-e/eval/humaneval-adapter.js +0 -321
  463. package/template/wall-e/eval/list-models.js +0 -31
  464. package/template/wall-e/eval/livecodebench-adapter.js +0 -291
  465. package/template/wall-e/eval/mail-integration.js +0 -443
  466. package/template/wall-e/eval/manifest.js +0 -186
  467. package/template/wall-e/eval/meta-harness/adapters/coding-agent.js +0 -57
  468. package/template/wall-e/eval/meta-harness/bootstrap-snapshot.js +0 -149
  469. package/template/wall-e/eval/meta-harness/candidate-store.js +0 -117
  470. package/template/wall-e/eval/meta-harness/cli.js +0 -86
  471. package/template/wall-e/eval/meta-harness/domain-spec.js +0 -154
  472. package/template/wall-e/eval/meta-harness/domains/coding-agent.domain.json +0 -84
  473. package/template/wall-e/eval/meta-harness/examples/env-bootstrap-candidate.js +0 -29
  474. package/template/wall-e/eval/meta-harness/experience-store.js +0 -174
  475. package/template/wall-e/eval/meta-harness/frontier.js +0 -96
  476. package/template/wall-e/eval/meta-harness/harness-interface.js +0 -90
  477. package/template/wall-e/eval/meta-harness/leakage-guard.js +0 -80
  478. package/template/wall-e/eval/meta-harness/optimizer.js +0 -207
  479. package/template/wall-e/eval/meta-harness/proposer-runner.js +0 -110
  480. package/template/wall-e/eval/meta-harness/reporting.js +0 -58
  481. package/template/wall-e/eval/meta-harness/telemetry.js +0 -27
  482. package/template/wall-e/eval/meta-harness/validation.js +0 -81
  483. package/template/wall-e/eval/promoter.js +0 -228
  484. package/template/wall-e/eval/provider-normalizer.js +0 -33
  485. package/template/wall-e/eval/replay.js +0 -395
  486. package/template/wall-e/eval/run-agent-benchmarks.js +0 -386
  487. package/template/wall-e/eval/run-codex-cli-baseline.js +0 -177
  488. package/template/wall-e/eval/run-coding-agent-real.js +0 -187
  489. package/template/wall-e/eval/run-eval.js +0 -435
  490. package/template/wall-e/eval/run-model-comparison.js +0 -142
  491. package/template/wall-e/eval/session-evaluator.js +0 -187
  492. package/template/wall-e/eval/session-miner.js +0 -207
  493. package/template/wall-e/eval/session-retrieval-benchmark.js +0 -150
  494. package/template/wall-e/eval/session-transcripts.js +0 -509
  495. package/template/wall-e/eval/shadow.js +0 -161
  496. package/template/wall-e/eval/swebench-adapter.js +0 -345
  497. package/template/wall-e/eval/swebench-docker.js +0 -192
  498. package/template/wall-e/eval/train.py +0 -320
  499. package/template/wall-e/eval/trainer.js +0 -232
  500. package/template/wall-e/eval/weekly-eval-loop.js +0 -241
@@ -1,388 +0,0 @@
1
- 'use strict';
2
-
3
- const { randomUUID } = require('node:crypto');
4
- const { createClient } = require('../llm/client');
5
- const { heuristicScore } = require('./evaluator');
6
- const { scoreTraitsDetailed, TRAIT_MATCHERS, UNSCORABLE_TRAITS } = require('./benchmarks');
7
-
8
- // ============================================================
9
- // Concurrency limiter (inline, no external dependency)
10
- // ============================================================
11
-
12
- function pLimit(concurrency) {
13
- let active = 0;
14
- const queue = [];
15
- function next() {
16
- if (queue.length > 0 && active < concurrency) queue.shift()();
17
- }
18
- return function limit(fn) {
19
- return new Promise((resolve, reject) => {
20
- queue.push(() => {
21
- active++;
22
- fn().then(resolve, reject).finally(() => {
23
- active--;
24
- next();
25
- });
26
- });
27
- next();
28
- });
29
- };
30
- }
31
-
32
- // ============================================================
33
- // Cost estimation
34
- // ============================================================
35
-
36
- const COST_TABLE = {
37
- // Per-model costs ($/1M tokens)
38
- 'claude-opus-4-6': { input: 5.0, output: 25.0 },
39
- 'claude-sonnet-4-6': { input: 3.0, output: 15.0 },
40
- 'claude-haiku-4-5-20251001': { input: 0.25, output: 1.25 },
41
- 'claude-haiku-4-5': { input: 0.25, output: 1.25 },
42
- // OpenAI GPT-5.x uses the provider-level fallback until the pricing sync
43
- // records exact per-model rates.
44
- 'gemini-2.5-pro': { input: 2.0, output: 12.0 },
45
- 'gemini-2.5-flash': { input: 0.08, output: 0.30 },
46
- // DeepSeek V4 - OpenAI-compatible API, $/1M tokens.
47
- 'deepseek-v4-flash': { input: 0.14, output: 0.28 },
48
- 'deepseek-v4-pro': { input: 1.74, output: 3.48 },
49
- // Moonshot / Kimi - OpenAI-compatible API, $/1M tokens.
50
- 'kimi-k2.6': { input: 0.95, output: 4.00 },
51
- 'kimi-k2.5': { input: 0.60, output: 3.00 },
52
- 'kimi-k2-0905-preview': { input: 0.60, output: 2.50 },
53
- // Local models = free
54
- 'gemma4:e4b': { input: 0, output: 0 },
55
- 'gemma4:26b': { input: 0, output: 0 },
56
- 'qwen2.5:7b-instruct-q4_K_M': { input: 0, output: 0 },
57
- // Provider-level fallbacks (for backward compat)
58
- anthropic: { input: 3.0, output: 15.0 },
59
- openai: { input: 2.0, output: 10.0 },
60
- google: { input: 1.5, output: 8.0 },
61
- deepseek: { input: 0.30, output: 0.60 },
62
- moonshot: { input: 0.95, output: 4.00 },
63
- ollama: { input: 0, output: 0 },
64
- mlx: { input: 0, output: 0 },
65
- };
66
-
67
- /**
68
- * Estimate cost in dollars for a provider call.
69
- * @param {{ inputTokens: number, outputTokens: number }} usage
70
- * @param {string} providerType
71
- * @param {string} [model] - Optional model ID for per-model cost lookup
72
- * @returns {number}
73
- */
74
- function estimateProviderCost(usage, providerType, model) {
75
- if (!usage) return 0;
76
- // Check model-specific rates first, then fall back to provider type
77
- const rates = (model && COST_TABLE[model]) || COST_TABLE[providerType] || COST_TABLE.anthropic;
78
- const inputCost = ((usage.inputTokens || 0) / 1_000_000) * rates.input;
79
- const outputCost = ((usage.outputTokens || 0) / 1_000_000) * rates.output;
80
- return Math.round((inputCost + outputCost) * 1_000_000) / 1_000_000;
81
- }
82
-
83
- // ============================================================
84
- // Provider discovery
85
- // ============================================================
86
-
87
- /**
88
- * Discover all configured and credentialed providers from brain's model registry.
89
- * @param {Object} brain
90
- * @returns {Array<{ provider: string, model: string, apiKey: string|null, baseUrl: string|null, registryId: string }>}
91
- */
92
- function getAvailableProviders(brain) {
93
- const models = brain.listAllModels();
94
- const results = [];
95
- const seen = new Set();
96
-
97
- for (const m of models) {
98
- if (!m.enabled) continue;
99
- const provider = brain.getModelProvider(m.provider_id);
100
- if (!provider || !provider.enabled) continue;
101
-
102
- // Ollama/mlx don't need API keys
103
- const needsKey = provider.type !== 'ollama' && provider.type !== 'mlx';
104
- if (needsKey && !provider.api_key_encrypted) continue;
105
-
106
- const key = `${provider.type}:${m.model_id}`;
107
- if (seen.has(key)) continue;
108
- seen.add(key);
109
-
110
- results.push({
111
- provider: provider.type,
112
- model: m.model_id,
113
- apiKey: provider.api_key_encrypted || null,
114
- baseUrl: provider.base_url || null,
115
- registryId: m.id,
116
- });
117
- }
118
-
119
- return results;
120
- }
121
-
122
- // ============================================================
123
- // Trait matching
124
- // ============================================================
125
-
126
- function traitScore(response, expectedTraits) {
127
- if (!expectedTraits || expectedTraits.length === 0) return null;
128
- const text = String(response || '').toLowerCase();
129
- let matched = 0;
130
- let scored = 0;
131
-
132
- const knownTraits = [];
133
- for (const trait of expectedTraits) {
134
- if (UNSCORABLE_TRAITS.has(trait)) continue;
135
- if (TRAIT_MATCHERS[trait]) {
136
- knownTraits.push(trait);
137
- continue;
138
- }
139
- scored++;
140
- if (text.includes(String(trait).toLowerCase())) matched++;
141
- }
142
-
143
- if (knownTraits.length > 0) {
144
- const detail = scoreTraitsDetailed(response, knownTraits);
145
- matched += detail.matched.length;
146
- scored += detail.scoredCount;
147
- }
148
-
149
- return scored > 0 ? matched / scored : 0;
150
- }
151
-
152
- // ============================================================
153
- // Head-to-head runner
154
- // ============================================================
155
-
156
- /**
157
- * Run head-to-head evaluation across multiple providers.
158
- *
159
- * @param {Object} brain - Brain module for DB access
160
- * @param {Object} options
161
- * @param {Array|string} options.prompts - Array of { prompt, taskType, expectedTraits? } or single prompt string
162
- * @param {Array} [options.providers] - Provider filter; if omitted, use all from getAvailableProviders
163
- * @param {Function} [options.judgeFn] - LLM judge: (prompt, response, taskType) => { score, reasoning }
164
- * @param {number} [options.concurrency=3] - Max parallel provider calls
165
- * @param {Function} [options.clientFactory] - Override createClient (for testing)
166
- * @returns {Promise<{ runId: string, results: Array, leaderboard: Array }>}
167
- */
168
- async function runHeadToHead(brain, options = {}) {
169
- const runId = randomUUID();
170
- const concurrency = options.concurrency || 3;
171
- const limit = pLimit(concurrency);
172
- const factory = options.clientFactory || createClient;
173
-
174
- // Normalize prompts
175
- let prompts = options.prompts;
176
- if (typeof prompts === 'string') {
177
- prompts = [{ prompt: prompts, taskType: 'chat' }];
178
- }
179
- if (!Array.isArray(prompts) || prompts.length === 0) {
180
- return { runId, results: [], leaderboard: [] };
181
- }
182
-
183
- // Resolve providers
184
- const providers = options.providers || getAvailableProviders(brain);
185
- if (providers.length === 0) {
186
- return { runId, results: [], leaderboard: [] };
187
- }
188
-
189
- const allResults = [];
190
-
191
- for (const item of prompts) {
192
- const prompt = typeof item === 'string' ? item : item.prompt;
193
- const taskType = (typeof item === 'object' && item.taskType) || 'chat';
194
- const expectedTraits = (typeof item === 'object' && item.expectedTraits) || null;
195
-
196
- const providerPromises = providers.map((prov) =>
197
- limit(async () => {
198
- const start = Date.now();
199
- try {
200
- const client = factory(prov.provider, {
201
- apiKey: prov.apiKey,
202
- baseUrl: prov.baseUrl,
203
- });
204
-
205
- const response = await client.chat({
206
- model: prov.model,
207
- messages: [{ role: 'user', content: prompt }],
208
- maxTokens: 4096,
209
- });
210
-
211
- const latencyMs = Date.now() - start;
212
- const content = typeof response.content === 'string'
213
- ? response.content
214
- : (response.text || JSON.stringify(response.content));
215
-
216
- const usage = response.usage || {};
217
-
218
- // Score independently (no reference/primary response)
219
- const hScore = heuristicScore(null, content, taskType);
220
- const tScore = traitScore(content, expectedTraits);
221
- let judgeResult = null;
222
- if (options.judgeFn) {
223
- try {
224
- judgeResult = await options.judgeFn(prompt, content, taskType);
225
- } catch (_) {
226
- judgeResult = null;
227
- }
228
- }
229
-
230
- const judgeScore = judgeResult ? Math.min(1, (judgeResult.score || 0) / 10) : null;
231
-
232
- // Composite: weighted average of available scores
233
- const components = [];
234
- components.push({ score: hScore, weight: 1.0 });
235
- if (tScore !== null) components.push({ score: tScore, weight: 1.5 });
236
- if (judgeScore !== null) components.push({ score: judgeScore, weight: 2.0 });
237
-
238
- const totalWeight = components.reduce((s, c) => s + c.weight, 0);
239
- const composite = Math.round(
240
- (components.reduce((s, c) => s + c.score * c.weight, 0) / totalWeight) * 1000
241
- ) / 1000;
242
-
243
- const inputTokens = usage.input_tokens || usage.inputTokens || usage.input || 0;
244
- const outputTokens = usage.output_tokens || usage.outputTokens || usage.output || 0;
245
-
246
- const cost = estimateProviderCost(
247
- { inputTokens, outputTokens },
248
- prov.provider,
249
- prov.model
250
- );
251
-
252
- // Store in brain if insertModelEvaluation is available
253
- if (brain.insertModelEvaluation && prov.registryId) {
254
- try {
255
- brain.insertModelEvaluation({
256
- modelRegistryId: prov.registryId,
257
- taskType,
258
- qualityScore: composite,
259
- latencyMs,
260
- inputTokens,
261
- outputTokens,
262
- costEstimate: cost,
263
- wasSelected: false,
264
- quorumId: runId,
265
- });
266
- } catch (_) {
267
- // Non-critical — don't fail the run
268
- }
269
- }
270
-
271
- return {
272
- runId,
273
- provider: prov.provider,
274
- model: prov.model,
275
- registryId: prov.registryId,
276
- prompt,
277
- taskType,
278
- content,
279
- scores: {
280
- heuristic: hScore,
281
- trait: tScore,
282
- judge: judgeScore,
283
- composite,
284
- },
285
- latencyMs,
286
- cost,
287
- genTokPerSec: usage.genTokPerSec || null,
288
- error: null,
289
- };
290
- } catch (err) {
291
- return {
292
- runId,
293
- provider: prov.provider,
294
- model: prov.model,
295
- registryId: prov.registryId,
296
- prompt,
297
- taskType,
298
- content: null,
299
- scores: { heuristic: 0, trait: null, judge: null, composite: 0 },
300
- latencyMs: Date.now() - start,
301
- cost: 0,
302
- error: err.message,
303
- };
304
- }
305
- })
306
- );
307
-
308
- const promptResults = await Promise.all(providerPromises);
309
- allResults.push(...promptResults);
310
- }
311
-
312
- const leaderboard = buildLeaderboard(allResults);
313
- return { runId, results: allResults, leaderboard };
314
- }
315
-
316
- // ============================================================
317
- // Leaderboard
318
- // ============================================================
319
-
320
- /**
321
- * Build a leaderboard from head-to-head results.
322
- * @param {Array} results - Per-prompt per-provider results from runHeadToHead
323
- * @returns {Array<{ provider: string, model: string, avgComposite: number, avgLatencyMs: number, totalCost: number, runs: number, errors: number, rank: number }>}
324
- */
325
- function buildLeaderboard(results) {
326
- const byKey = {};
327
-
328
- for (const r of results) {
329
- const key = `${r.provider}:${r.model}`;
330
- if (!byKey[key]) {
331
- byKey[key] = {
332
- provider: r.provider,
333
- model: r.model,
334
- totalComposite: 0,
335
- totalLatency: 0,
336
- totalCost: 0,
337
- runs: 0,
338
- errors: 0,
339
- };
340
- }
341
- const entry = byKey[key];
342
- entry.runs++;
343
- if (r.error) {
344
- entry.errors++;
345
- } else {
346
- entry.totalComposite += r.scores.composite;
347
- entry.totalLatency += r.latencyMs;
348
- }
349
- entry.totalCost += r.cost;
350
- }
351
-
352
- const entries = Object.values(byKey).map((e) => {
353
- const successRuns = e.runs - e.errors;
354
- return {
355
- provider: e.provider,
356
- model: e.model,
357
- avgComposite: successRuns > 0
358
- ? Math.round((e.totalComposite / successRuns) * 1000) / 1000
359
- : 0,
360
- avgLatencyMs: successRuns > 0
361
- ? Math.round(e.totalLatency / successRuns)
362
- : 0,
363
- totalCost: Math.round(e.totalCost * 1_000_000) / 1_000_000,
364
- runs: e.runs,
365
- errors: e.errors,
366
- };
367
- });
368
-
369
- // Sort by composite score descending, then by latency ascending as tiebreaker
370
- entries.sort((a, b) => {
371
- if (b.avgComposite !== a.avgComposite) return b.avgComposite - a.avgComposite;
372
- return a.avgLatencyMs - b.avgLatencyMs;
373
- });
374
-
375
- // Assign ranks
376
- entries.forEach((e, i) => { e.rank = i + 1; });
377
-
378
- return entries;
379
- }
380
-
381
- module.exports = {
382
- pLimit,
383
- estimateProviderCost,
384
- getAvailableProviders,
385
- traitScore,
386
- runHeadToHead,
387
- buildLeaderboard,
388
- };
@@ -1,321 +0,0 @@
1
- 'use strict';
2
- const fs = require('fs');
3
- const path = require('path');
4
- const os = require('os');
5
- const crypto = require('crypto');
6
- const { execFileSync } = require('child_process');
7
- const { createClient } = require('../llm/client');
8
- const { resolveModelName } = require('./agent-runner');
9
- const { decorateBenchmarkResult, DEFAULT_SCORER_VERSION } = require('./manifest');
10
-
11
- const SUITE_NAME = 'humaneval-plus';
12
- const CACHE_DIR = path.join(os.homedir(), '.walle', 'eval-cache');
13
- const CACHE_FILE = path.join(CACHE_DIR, 'humaneval-plus.json');
14
- const CACHE_MAX_AGE_MS = 7 * 24 * 60 * 60 * 1000; // 7 days
15
-
16
- // EvalPlus HumanEval+ dataset URL (JSON format)
17
- const DATASET_URL = 'https://raw.githubusercontent.com/evalplus/evalplus/master/evalplus/data/humaneval.json';
18
-
19
- /**
20
- * Download and cache the HumanEval+ dataset.
21
- * @returns {Promise<Array>} Array of HumanEval tasks
22
- */
23
- async function loadHumanEvalDataset() {
24
- fs.mkdirSync(CACHE_DIR, { recursive: true });
25
-
26
- // Check cache freshness
27
- if (fs.existsSync(CACHE_FILE)) {
28
- const stat = fs.statSync(CACHE_FILE);
29
- if (Date.now() - stat.mtimeMs < CACHE_MAX_AGE_MS) {
30
- try {
31
- const data = JSON.parse(fs.readFileSync(CACHE_FILE, 'utf8'));
32
- if (Array.isArray(data) && data.length > 0) return data;
33
- } catch { /* re-download on parse error */ }
34
- }
35
- }
36
-
37
- // Download dataset
38
- console.log('[humaneval] Downloading HumanEval+ dataset...');
39
- const resp = await fetch(DATASET_URL, { signal: AbortSignal.timeout(30000) });
40
- if (!resp.ok) {
41
- throw new Error(`Failed to download HumanEval+ dataset: ${resp.status} ${resp.statusText}`);
42
- }
43
-
44
- const raw = await resp.json();
45
-
46
- // Dataset can be object keyed by task_id or array
47
- let tasks;
48
- if (Array.isArray(raw)) {
49
- tasks = raw;
50
- } else if (typeof raw === 'object') {
51
- tasks = Object.entries(raw).map(([id, task]) => ({
52
- task_id: id,
53
- ...task,
54
- }));
55
- } else {
56
- throw new Error('Unexpected HumanEval+ dataset format');
57
- }
58
-
59
- // Cache
60
- fs.writeFileSync(CACHE_FILE, JSON.stringify(tasks, null, 2));
61
- console.log(`[humaneval] Cached ${tasks.length} tasks`);
62
-
63
- return tasks;
64
- }
65
-
66
- /**
67
- * Extract code from an LLM response, stripping markdown fences.
68
- */
69
- function extractCode(response) {
70
- if (!response) return '';
71
-
72
- // Try to extract from markdown code block
73
- const fenceMatch = response.match(/```(?:python)?\s*\n([\s\S]*?)```/);
74
- if (fenceMatch) return fenceMatch[1].trim();
75
-
76
- // If no fence, try to find a function definition
77
- const funcMatch = response.match(/((?:def|class)\s+[\s\S]*)/);
78
- if (funcMatch) return funcMatch[1].trim();
79
-
80
- // Return as-is
81
- return response.trim();
82
- }
83
-
84
- /**
85
- * Map difficulty based on task index (rough heuristic).
86
- */
87
- function taskDifficulty(taskId) {
88
- const num = parseInt((taskId || '').replace(/\D/g, ''), 10);
89
- if (num < 50) return 'easy';
90
- if (num < 120) return 'medium';
91
- return 'hard';
92
- }
93
-
94
- /**
95
- * Run a single HumanEval task.
96
- * @param {object} task - HumanEval task object
97
- * @param {object} options - { provider (client instance), model, providerType, config }
98
- * @returns {Promise<object>} Result with score and metadata
99
- */
100
- async function runHumanEvalTask(task, options = {}) {
101
- const { provider, model, providerType, config } = options;
102
- const startTime = Date.now();
103
-
104
- let client = provider;
105
- if (!client && providerType) {
106
- client = createClient(providerType, config || {});
107
- }
108
- if (!client) throw new Error('provider or providerType is required');
109
-
110
- const taskPrompt = `Complete the following Python function. Return ONLY the complete function implementation, nothing else.\n\n${task.prompt}`;
111
-
112
- let response = '';
113
- let error = null;
114
- let usage = null;
115
-
116
- try {
117
- const result = await client.chat({
118
- model: model || 'claude-haiku-4-5-20251001',
119
- messages: [{ role: 'user', content: taskPrompt }],
120
- maxTokens: 1024,
121
- });
122
- response = result.content || '';
123
- usage = result.usage || null;
124
- } catch (err) {
125
- error = err.message;
126
- return {
127
- taskId: task.task_id,
128
- passed: false,
129
- score: { composite: 0, dimensions: {} },
130
- latencyMs: Date.now() - startTime,
131
- error,
132
- response: '',
133
- };
134
- }
135
-
136
- const latencyMs = Date.now() - startTime;
137
- const code = extractCode(response);
138
-
139
- // Write code + tests to temp file and run
140
- let passed = false;
141
- let testError = null;
142
-
143
- if (code) {
144
- const tmpDir = path.join(os.tmpdir(), `humaneval-${crypto.randomUUID().slice(0, 8)}`);
145
- fs.mkdirSync(tmpDir, { recursive: true });
146
- const tmpFile = path.join(tmpDir, 'solution.py');
147
-
148
- try {
149
- // Build test file: generated code + test harness
150
- const testCode = task.test || '';
151
- const entryPoint = task.entry_point || '';
152
-
153
- // Combine: generated function + test code + check call
154
- let fullCode = code + '\n\n' + testCode;
155
- if (entryPoint && testCode.includes('check(')) {
156
- fullCode += `\ncheck(${entryPoint})\n`;
157
- }
158
-
159
- fs.writeFileSync(tmpFile, fullCode);
160
-
161
- execFileSync('python3', [tmpFile], {
162
- timeout: 30000,
163
- stdio: 'pipe',
164
- cwd: tmpDir,
165
- });
166
- passed = true;
167
- } catch (err) {
168
- testError = (err.stderr ? err.stderr.toString() : err.message).slice(0, 500);
169
- } finally {
170
- try { fs.rmSync(tmpDir, { recursive: true, force: true }); } catch {}
171
- }
172
- }
173
-
174
- // Score dimensions
175
- const dimensions = {
176
- correctness: passed ? 1.0 : 0.0,
177
- codeQuality: scoreHumanEvalQuality(code),
178
- };
179
- const composite = dimensions.correctness * 0.8 + dimensions.codeQuality * 0.2;
180
-
181
- // Estimate cost
182
- let costDollars = 0;
183
- try {
184
- const { estimateProviderCost } = require('./head-to-head');
185
- costDollars = estimateProviderCost(usage || {}, providerType || 'anthropic');
186
- } catch {}
187
-
188
- return {
189
- taskId: task.task_id,
190
- passed,
191
- score: { composite, dimensions },
192
- latencyMs,
193
- costDollars,
194
- response: response.slice(0, 2000),
195
- code: code.slice(0, 2000),
196
- error: error || null,
197
- testError: testError || null,
198
- usage,
199
- };
200
- }
201
-
202
- /**
203
- * Score code quality heuristically.
204
- */
205
- function scoreHumanEvalQuality(code) {
206
- if (!code) return 0;
207
- let score = 0.5;
208
- if (/"""[\s\S]*?"""|'''[\s\S]*?'''/.test(code)) score += 0.15; // has docstring
209
- if (/if\s+.*(?:None|not\s|len\(|==\s*0)/.test(code)) score += 0.15; // edge case handling
210
- if (code.split('\n').length > 2) score += 0.1; // non-trivial
211
- if (!/\bprint\b/.test(code)) score += 0.1; // no debug prints
212
- return Math.min(1, score);
213
- }
214
-
215
- /**
216
- * Run the full HumanEval+ suite.
217
- * @param {object} options
218
- * @param {object} [options.brain] - Brain instance for storing results
219
- * @param {string} options.providerType - Provider type (anthropic, openai, etc.)
220
- * @param {object} [options.config] - Provider config (apiKey, baseUrl)
221
- * @param {string} options.model - Model ID
222
- * @param {number} [options.maxTasks] - Limit number of tasks (default: all)
223
- * @param {AbortSignal} [options.signal] - Abort signal
224
- * @returns {Promise<object>} Suite results
225
- */
226
- async function runHumanEvalSuite(options = {}) {
227
- const { brain, providerType, config, model, maxTasks, signal, runId: providedRunId } = options;
228
-
229
- const allTasks = await loadHumanEvalDataset();
230
- const tasks = maxTasks ? allTasks.slice(0, maxTasks) : allTasks;
231
-
232
- const client = createClient(providerType || 'anthropic', config || {});
233
- const runId = providedRunId || crypto.randomUUID();
234
- const results = [];
235
- let totalPassed = 0;
236
-
237
- console.log(`[humaneval] Running ${tasks.length} tasks with ${model || 'default'}...`);
238
-
239
- for (const task of tasks) {
240
- if (signal?.aborted) break;
241
-
242
- const result = await runHumanEvalTask(task, { provider: client, model, providerType });
243
- results.push(result);
244
-
245
- if (result.passed) totalPassed++;
246
-
247
- console.log(` ${result.passed ? 'PASS' : 'FAIL'} ${task.task_id} (${result.latencyMs}ms)`);
248
-
249
- // Store in brain
250
- if (brain && typeof brain.insertBenchmarkResult === 'function') {
251
- try {
252
- const scoringMethod = 'executable-tests';
253
- brain.insertBenchmarkResult(decorateBenchmarkResult({
254
- runId,
255
- suite: SUITE_NAME,
256
- promptId: task.task_id,
257
- taskType: 'coding',
258
- difficulty: taskDifficulty(task.task_id),
259
- provider: providerType || 'unknown',
260
- model: resolveModelName(model),
261
- prompt: task.prompt,
262
- response: result.response || '',
263
- traitScore: null,
264
- compositeScore: result.score.composite,
265
- latencyMs: result.latencyMs,
266
- error: result.error || null,
267
- costDollars: result.costDollars || null,
268
- testsBefore: null,
269
- testsAfter: result.passed ? 1 : 0,
270
- totalTests: 1,
271
- dimensionsJson: JSON.stringify(result.score.dimensions),
272
- modelMetadataJson: JSON.stringify({ testError: result.testError || null }),
273
- datasetVersion: 'humaneval-plus:evalplus-master',
274
- scorerVersion: DEFAULT_SCORER_VERSION,
275
- scoringMethod,
276
- trusted: !result.error,
277
- runConfig: { maxTasks, scoringMethod },
278
- }, {
279
- suite: SUITE_NAME,
280
- benchmark: {
281
- id: task.task_id,
282
- prompt: task.prompt,
283
- taskType: 'coding',
284
- difficulty: taskDifficulty(task.task_id),
285
- datasetVersion: 'humaneval-plus:evalplus-master',
286
- },
287
- runId,
288
- provider: providerType || 'unknown',
289
- model: resolveModelName(model),
290
- scorerVersion: DEFAULT_SCORER_VERSION,
291
- scoringMethod,
292
- trusted: !result.error,
293
- runConfig: { maxTasks, scoringMethod },
294
- }));
295
- } catch {}
296
- }
297
- }
298
-
299
- const passAt1 = tasks.length > 0 ? totalPassed / tasks.length : 0;
300
-
301
- return {
302
- runId,
303
- suite: SUITE_NAME,
304
- model: resolveModelName(model),
305
- totalTasks: tasks.length,
306
- passed: totalPassed,
307
- passAt1,
308
- avgScore: results.reduce((s, r) => s + r.score.composite, 0) / Math.max(results.length, 1),
309
- totalCost: results.reduce((s, r) => s + (r.costDollars || 0), 0),
310
- results,
311
- };
312
- }
313
-
314
- module.exports = {
315
- SUITE_NAME,
316
- loadHumanEvalDataset,
317
- extractCode,
318
- runHumanEvalTask,
319
- runHumanEvalSuite,
320
- scoreHumanEvalQuality,
321
- };