create-walle 0.9.21 → 0.9.23

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (500):
  1. package/README.md +27 -5
  2. package/package.json +2 -2
  3. package/template/CLAUDE.md +2 -2
  4. package/template/LICENSE +1 -1
  5. package/template/bin/ctm-dev-cleanup.js +24 -3
  6. package/template/bin/ctm-launch.sh +13 -0
  7. package/template/bin/dev.sh +156 -18
  8. package/template/bin/node-bin.sh +84 -0
  9. package/template/bin/pin-node.sh +51 -0
  10. package/template/claude-task-manager/api-prompts.js +1203 -182
  11. package/template/claude-task-manager/api-reviews.js +109 -15
  12. package/template/claude-task-manager/approval-agent.js +1360 -280
  13. package/template/claude-task-manager/bin/restart-ctm.sh +64 -23
  14. package/template/claude-task-manager/bin/storage-migration-supervisor.js +338 -0
  15. package/template/claude-task-manager/db.js +4417 -295
  16. package/template/claude-task-manager/docs/app-update-refresh-protocol.md +69 -0
  17. package/template/claude-task-manager/docs/approval-ai-refinement.md +138 -0
  18. package/template/claude-task-manager/docs/approval-rescue-loop.md +74 -0
  19. package/template/claude-task-manager/docs/codex-operational-warning-health.md +107 -0
  20. package/template/claude-task-manager/docs/codex-resume-state-guard-design.md +17 -12
  21. package/template/claude-task-manager/docs/codex-terminal-render-controller-handoff.md +311 -0
  22. package/template/claude-task-manager/docs/coding-agent-hooks-architecture.md +418 -0
  23. package/template/claude-task-manager/docs/conversation-import-freshness.md +20 -0
  24. package/template/claude-task-manager/docs/google-workspace-auth-health.md +77 -0
  25. package/template/claude-task-manager/docs/image-paste-ux.md +13 -0
  26. package/template/claude-task-manager/docs/ipad-web-preview.md +88 -0
  27. package/template/claude-task-manager/docs/main-loop-offload-architecture.md +66 -0
  28. package/template/claude-task-manager/docs/microsoft-dev-tunnel-phone-access-design.md +274 -519
  29. package/template/claude-task-manager/docs/mobile-live-streaming.md +27 -5
  30. package/template/claude-task-manager/docs/mobile-remote-submission-lifecycle.md +69 -0
  31. package/template/claude-task-manager/docs/phone-access-design.md +53 -15
  32. package/template/claude-task-manager/docs/phone-passkey-identity.md +122 -0
  33. package/template/claude-task-manager/docs/phone-setup.md +3 -0
  34. package/template/claude-task-manager/docs/prompt-editing-tree-design.md +25 -1
  35. package/template/claude-task-manager/docs/remote-desktop-access-design.md +268 -0
  36. package/template/claude-task-manager/docs/restart-lifecycle-architecture.md +95 -0
  37. package/template/claude-task-manager/docs/runtime-work-control-plane.md +53 -0
  38. package/template/claude-task-manager/docs/session-interactive-wait-surfaces.md +38 -0
  39. package/template/claude-task-manager/docs/session-needs-you-dismissal.md +84 -0
  40. package/template/claude-task-manager/docs/session-render-state-management-design.md +91 -3
  41. package/template/claude-task-manager/docs/session-standup-command-center-design.md +25 -1
  42. package/template/claude-task-manager/docs/session-title-authority.md +32 -0
  43. package/template/claude-task-manager/docs/session-workspace-binding.md +33 -0
  44. package/template/claude-task-manager/docs/skill-intent-resolution-design.md +72 -0
  45. package/template/claude-task-manager/docs/walle-mcp-supervisor-health.md +86 -0
  46. package/template/claude-task-manager/docs/walle-relay-phone-access-design.md +24 -15
  47. package/template/claude-task-manager/docs/walle-session-history-hydration.md +114 -0
  48. package/template/claude-task-manager/docs/walle-session-input-queue.md +104 -0
  49. package/template/claude-task-manager/docs/walle-session-model-catalog.md +90 -0
  50. package/template/claude-task-manager/docs/walle-session-model-preferences.md +15 -6
  51. package/template/claude-task-manager/git-utils.js +897 -27
  52. package/template/claude-task-manager/lib/agent-capabilities.js +33 -0
  53. package/template/claude-task-manager/lib/agent-cli-cache.js +37 -7
  54. package/template/claude-task-manager/lib/agent-hooks-installer.js +26 -2
  55. package/template/claude-task-manager/lib/agent-presets.js +17 -1
  56. package/template/claude-task-manager/lib/all-sessions-query.js +108 -0
  57. package/template/claude-task-manager/lib/approval-ai-refinement.js +488 -0
  58. package/template/claude-task-manager/lib/approval-self-adapt.js +168 -0
  59. package/template/claude-task-manager/lib/async-semaphore.js +44 -0
  60. package/template/claude-task-manager/lib/auth-context.js +5 -0
  61. package/template/claude-task-manager/lib/auth-rate-limit.js +47 -4
  62. package/template/claude-task-manager/lib/auth-rules.js +29 -2
  63. package/template/claude-task-manager/lib/auto-approval-verifier.js +129 -16
  64. package/template/claude-task-manager/lib/background-llm.js +144 -17
  65. package/template/claude-task-manager/lib/branch-inventory.js +212 -0
  66. package/template/claude-task-manager/lib/claude-desktop-sessions.js +15 -3
  67. package/template/claude-task-manager/lib/coalesce-sync-frames.js +151 -0
  68. package/template/claude-task-manager/lib/codex-launch-health.js +762 -0
  69. package/template/claude-task-manager/lib/codex-transcript-pager.js +51 -0
  70. package/template/claude-task-manager/lib/codex-zst.js +124 -0
  71. package/template/claude-task-manager/lib/coding-agent-models.js +233 -30
  72. package/template/claude-task-manager/lib/connection-health.js +232 -0
  73. package/template/claude-task-manager/lib/conversation-blob-parser.js +42 -0
  74. package/template/claude-task-manager/lib/conversation-tail-merge.js +89 -26
  75. package/template/claude-task-manager/lib/ctm-session-context-api.js +39 -10
  76. package/template/claude-task-manager/lib/cursor-conversation-store.js +354 -0
  77. package/template/claude-task-manager/lib/db-owner-worker-client.js +315 -0
  78. package/template/claude-task-manager/lib/document-review.js +141 -6
  79. package/template/claude-task-manager/lib/escalation-review.js +152 -0
  80. package/template/claude-task-manager/lib/graceful-shutdown.js +159 -0
  81. package/template/claude-task-manager/lib/headless-term-service.js +678 -0
  82. package/template/claude-task-manager/lib/heavy-worker-fallback.js +38 -0
  83. package/template/claude-task-manager/lib/jsonl-conversation-parser.js +542 -0
  84. package/template/claude-task-manager/lib/jsonl-range-reader.js +112 -0
  85. package/template/claude-task-manager/lib/main-db-census.js +216 -0
  86. package/template/claude-task-manager/lib/message-pagination.js +106 -4
  87. package/template/claude-task-manager/lib/microsoft-dev-tunnel-setup.js +750 -26
  88. package/template/claude-task-manager/lib/mobile-auth-api.js +274 -7
  89. package/template/claude-task-manager/lib/mobile-auth-store.js +592 -10
  90. package/template/claude-task-manager/lib/mobile-notification-dispatcher.js +15 -0
  91. package/template/claude-task-manager/lib/model-overview-brain-fallback.js +311 -0
  92. package/template/claude-task-manager/lib/model-overview-cache.js +141 -0
  93. package/template/claude-task-manager/lib/models-health-routing-notice.js +126 -0
  94. package/template/claude-task-manager/lib/node-pin-guard.js +93 -0
  95. package/template/claude-task-manager/lib/perf-tracker.js +242 -6
  96. package/template/claude-task-manager/lib/permission-match.js +76 -0
  97. package/template/claude-task-manager/lib/permission-sync.js +133 -20
  98. package/template/claude-task-manager/lib/process-title.js +35 -0
  99. package/template/claude-task-manager/lib/prompt-executions-query.js +25 -0
  100. package/template/claude-task-manager/lib/prompt-index-disk-cache.js +44 -0
  101. package/template/claude-task-manager/lib/prompt-intent.js +132 -0
  102. package/template/claude-task-manager/lib/provider-user-context.js +34 -0
  103. package/template/claude-task-manager/lib/read-pool-client.js +313 -0
  104. package/template/claude-task-manager/lib/readpool-breaker.js +31 -0
  105. package/template/claude-task-manager/lib/recent-sessions-breaker.js +12 -0
  106. package/template/claude-task-manager/lib/remote-feedback-client.js +72 -0
  107. package/template/claude-task-manager/lib/remote-relay-protocol.js +37 -4
  108. package/template/claude-task-manager/lib/remote-relay-store.js +159 -0
  109. package/template/claude-task-manager/lib/remote-submission-observer.js +278 -0
  110. package/template/claude-task-manager/lib/restart-guard.js +109 -0
  111. package/template/claude-task-manager/lib/restore-interruption-detector.js +439 -0
  112. package/template/claude-task-manager/lib/restore-policy.js +13 -0
  113. package/template/claude-task-manager/lib/restore-resume-batch.js +74 -0
  114. package/template/claude-task-manager/lib/restore-runtime.js +68 -0
  115. package/template/claude-task-manager/lib/restore-storm.js +34 -0
  116. package/template/claude-task-manager/lib/resume-cwd.js +36 -0
  117. package/template/claude-task-manager/lib/resume-preflight.js +313 -0
  118. package/template/claude-task-manager/lib/runtime-work-registry.js +444 -0
  119. package/template/claude-task-manager/lib/sanitize-openai-auth.js +31 -0
  120. package/template/claude-task-manager/lib/scheduler.js +21 -1
  121. package/template/claude-task-manager/lib/scrollback-snapshot-store.js +159 -0
  122. package/template/claude-task-manager/lib/serial-task-queue.js +64 -0
  123. package/template/claude-task-manager/lib/server-listeners.js +239 -0
  124. package/template/claude-task-manager/lib/session-capture.js +42 -7
  125. package/template/claude-task-manager/lib/session-content-backfill.js +131 -0
  126. package/template/claude-task-manager/lib/session-history.js +388 -43
  127. package/template/claude-task-manager/lib/session-host-manager.js +287 -0
  128. package/template/claude-task-manager/lib/session-image-refs.js +209 -0
  129. package/template/claude-task-manager/lib/session-jobs.js +399 -59
  130. package/template/claude-task-manager/lib/session-prompt-index.js +137 -0
  131. package/template/claude-task-manager/lib/session-restore.js +53 -0
  132. package/template/claude-task-manager/lib/session-standup.js +123 -23
  133. package/template/claude-task-manager/lib/session-state-bus.js +14 -0
  134. package/template/claude-task-manager/lib/session-stream.js +64 -16
  135. package/template/claude-task-manager/lib/session-timeline-summary.js +260 -0
  136. package/template/claude-task-manager/lib/session-token-usage.js +494 -0
  137. package/template/claude-task-manager/lib/session-workspace-binding.js +356 -0
  138. package/template/claude-task-manager/lib/setup-network-config.js +9 -0
  139. package/template/claude-task-manager/lib/size-cap.js +45 -0
  140. package/template/claude-task-manager/lib/size-cap.test.js +62 -0
  141. package/template/claude-task-manager/lib/skill-autocomplete.js +180 -1
  142. package/template/claude-task-manager/lib/skill-intent-resolver.js +304 -0
  143. package/template/claude-task-manager/lib/sqlite-driver.js +19 -3
  144. package/template/claude-task-manager/lib/standup-attention.js +7 -3
  145. package/template/claude-task-manager/lib/status-authority.js +39 -0
  146. package/template/claude-task-manager/lib/status-hooks.js +4 -0
  147. package/template/claude-task-manager/lib/storage-migration.js +235 -0
  148. package/template/claude-task-manager/lib/structured-capture.js +298 -0
  149. package/template/claude-task-manager/lib/sync-io-census.js +163 -0
  150. package/template/claude-task-manager/lib/tailscale-setup.js +6 -0
  151. package/template/claude-task-manager/lib/terminal-activity-evidence.js +33 -0
  152. package/template/claude-task-manager/lib/terminal-choice.js +364 -0
  153. package/template/claude-task-manager/lib/terminal-control-sanitize.js +17 -0
  154. package/template/claude-task-manager/lib/terminal-fingerprint.js +48 -0
  155. package/template/claude-task-manager/lib/terminal-output-flush.js +84 -0
  156. package/template/claude-task-manager/lib/timeline-order.js +122 -0
  157. package/template/claude-task-manager/lib/transcript-store.js +348 -43
  158. package/template/claude-task-manager/lib/transport-security.js +84 -1
  159. package/template/claude-task-manager/lib/wait-state.js +184 -0
  160. package/template/claude-task-manager/lib/walle-client.js +47 -5
  161. package/template/claude-task-manager/lib/walle-ctm-history.js +564 -4
  162. package/template/claude-task-manager/lib/walle-external-actions.js +135 -16
  163. package/template/claude-task-manager/lib/walle-history-hydration.js +46 -0
  164. package/template/claude-task-manager/lib/walle-native-health.js +403 -0
  165. package/template/claude-task-manager/lib/walle-repair.js +701 -0
  166. package/template/claude-task-manager/lib/walle-session-cache.js +109 -0
  167. package/template/claude-task-manager/lib/walle-session-context.js +57 -21
  168. package/template/claude-task-manager/lib/walle-session-model-catalog.js +34 -0
  169. package/template/claude-task-manager/lib/walle-supervisor.js +539 -63
  170. package/template/claude-task-manager/lib/walle-transcript.js +52 -0
  171. package/template/claude-task-manager/lib/worktree-active-sync.js +11 -7
  172. package/template/claude-task-manager/lib/worktree-cwd.js +32 -1
  173. package/template/claude-task-manager/package.json +1 -1
  174. package/template/claude-task-manager/prompt-harvest.js +89 -66
  175. package/template/claude-task-manager/providers/claude-code.js +51 -3
  176. package/template/claude-task-manager/providers/cursor.js +140 -45
  177. package/template/claude-task-manager/public/css/reviews.css +551 -61
  178. package/template/claude-task-manager/public/css/setup.css +191 -0
  179. package/template/claude-task-manager/public/css/walle-session.css +865 -10
  180. package/template/claude-task-manager/public/css/walle.css +154 -0
  181. package/template/claude-task-manager/public/designs/ai-providers-consolidation-v2.html +830 -0
  182. package/template/claude-task-manager/public/index.html +18516 -2058
  183. package/template/claude-task-manager/public/ipad.html +363 -0
  184. package/template/claude-task-manager/public/js/document-review-links.js +301 -0
  185. package/template/claude-task-manager/public/js/image-normalize.js +69 -36
  186. package/template/claude-task-manager/public/js/message-renderer.js +1265 -77
  187. package/template/claude-task-manager/public/js/prompts.js +66 -29
  188. package/template/claude-task-manager/public/js/reviews.js +901 -133
  189. package/template/claude-task-manager/public/js/session-activity-utils.js +11 -1
  190. package/template/claude-task-manager/public/js/session-search-utils.js +94 -10
  191. package/template/claude-task-manager/public/js/session-status-precedence.js +23 -5
  192. package/template/claude-task-manager/public/js/setup.js +1273 -176
  193. package/template/claude-task-manager/public/js/stream-view.js +691 -73
  194. package/template/claude-task-manager/public/js/terminal-reconciler.js +210 -0
  195. package/template/claude-task-manager/public/js/walle-session.js +2455 -158
  196. package/template/claude-task-manager/public/js/walle.js +455 -28
  197. package/template/claude-task-manager/public/m/app.css +2909 -262
  198. package/template/claude-task-manager/public/m/app.js +6601 -398
  199. package/template/claude-task-manager/public/m/claim.html +224 -17
  200. package/template/claude-task-manager/public/m/index.html +117 -21
  201. package/template/claude-task-manager/public/m/sw.js +3 -1
  202. package/template/claude-task-manager/public/manifest.json +2 -2
  203. package/template/claude-task-manager/public/prompts.html +30 -14
  204. package/template/claude-task-manager/queue-engine.js +507 -28
  205. package/template/claude-task-manager/scripts/repair-claude-session-images.js +27 -8
  206. package/template/claude-task-manager/server.js +14341 -2197
  207. package/template/claude-task-manager/session-integrity.js +160 -18
  208. package/template/claude-task-manager/session-search-ranking.js +1 -0
  209. package/template/claude-task-manager/session-utils.js +25 -5
  210. package/template/claude-task-manager/workers/approval-blocklist.js +96 -6
  211. package/template/claude-task-manager/workers/approval-widget-validator.js +14 -8
  212. package/template/claude-task-manager/workers/conversation-import-worker.js +11 -50
  213. package/template/claude-task-manager/workers/db-owner-worker.js +386 -0
  214. package/template/claude-task-manager/workers/harvest-worker.js +9 -55
  215. package/template/claude-task-manager/workers/headless-term-worker.js +9 -530
  216. package/template/claude-task-manager/workers/read-pool-worker.js +387 -0
  217. package/template/claude-task-manager/workers/scrollback-worker.js +11 -72
  218. package/template/claude-task-manager/workers/session-host-process.js +146 -0
  219. package/template/claude-task-manager/workers/session-integrity-worker.js +10 -54
  220. package/template/claude-task-manager/workers/state-detectors/base.js +18 -1
  221. package/template/claude-task-manager/workers/state-detectors/claude-code.js +182 -9
  222. package/template/claude-task-manager/workers/state-detectors/codex.js +150 -2
  223. package/template/claude-task-manager/workers/state-detectors/cursor.js +127 -0
  224. package/template/claude-task-manager/workers/state-detectors/gemini.js +21 -0
  225. package/template/claude-task-manager/workers/state-detectors/index.js +29 -0
  226. package/template/claude-task-manager/workers/state-detectors/opencode.js +103 -0
  227. package/template/docs/design/markdown-review-pane.md +206 -0
  228. package/template/docs/designs/2026-05-17-portkey-gateway-provider-ux.md +129 -38
  229. package/template/docs/designs/2026-05-20-mobile-worktree-finish-command.md +27 -0
  230. package/template/docs/designs/2026-05-22-ai-configuration-consolidation.md +248 -0
  231. package/template/docs/designs/ai-configuration-consolidation-mock.html +812 -0
  232. package/template/docs/private-memory-and-pii-policy.md +69 -0
  233. package/template/package.json +2 -1
  234. package/template/scripts/check-private-data.js +201 -0
  235. package/template/shared/sqlite-owner-guard.js +30 -0
  236. package/template/shared/sqlite-owner-write-queue.js +225 -0
  237. package/template/shared/sqlite-storage-policy.js +111 -0
  238. package/template/shared/sqlite-write-lock.js +428 -0
  239. package/template/wall-e/agent-runners/claude-code.js +5 -0
  240. package/template/wall-e/agent.js +166 -22
  241. package/template/wall-e/api-walle.js +524 -70
  242. package/template/wall-e/auth/provider-flows.js +11 -1
  243. package/template/wall-e/bin/walle-mcp-stdio.js +341 -17
  244. package/template/wall-e/brain.js +1614 -141
  245. package/template/wall-e/chat/attachment-blocks.js +96 -0
  246. package/template/wall-e/chat/attachments.js +2 -1
  247. package/template/wall-e/chat/capability-resolver.js +7 -7
  248. package/template/wall-e/chat/context-messages.js +28 -0
  249. package/template/wall-e/chat/conversation-frame.js +630 -0
  250. package/template/wall-e/chat/provider-messages.js +125 -0
  251. package/template/wall-e/chat.js +1002 -233
  252. package/template/wall-e/coding/acceptance-contract.js +170 -0
  253. package/template/wall-e/coding/acp-adapter.js +1 -1
  254. package/template/wall-e/coding/agent-catalog.js +3 -0
  255. package/template/wall-e/coding/artifact-store.js +93 -0
  256. package/template/wall-e/coding/capability-router.js +120 -0
  257. package/template/wall-e/coding/coding-run-controller.js +423 -0
  258. package/template/wall-e/coding/compaction-service.js +157 -12
  259. package/template/wall-e/coding/frontend-verification.js +258 -0
  260. package/template/wall-e/coding/lifecycle-hooks.js +75 -0
  261. package/template/wall-e/coding/local-preview-contract.js +157 -0
  262. package/template/wall-e/coding/permission-service.js +57 -13
  263. package/template/wall-e/coding/prompt-bundle.js +19 -1
  264. package/template/wall-e/coding/prompt-section-registry.js +227 -0
  265. package/template/wall-e/coding/provider-compat.js +15 -0
  266. package/template/wall-e/coding/runtime-events.js +224 -0
  267. package/template/wall-e/coding/runtime-mode.js +3 -0
  268. package/template/wall-e/coding/side-git-snapshot.js +160 -4
  269. package/template/wall-e/coding/snapshot-service.js +143 -1
  270. package/template/wall-e/coding/stream-processor.js +388 -34
  271. package/template/wall-e/coding/task-tool.js +141 -4
  272. package/template/wall-e/coding/tool-execution-controller.js +365 -0
  273. package/template/wall-e/coding/tool-registry.js +43 -5
  274. package/template/wall-e/coding/user-hooks.js +217 -0
  275. package/template/wall-e/coding-orchestrator.js +1330 -221
  276. package/template/wall-e/coding-prompts.js +20 -4
  277. package/template/wall-e/context/context-builder.js +15 -2
  278. package/template/wall-e/decision/confidence.js +1 -1
  279. package/template/wall-e/docs/coding-acceptance-contract.md +41 -0
  280. package/template/wall-e/docs/external-action-controller.md +26 -6
  281. package/template/wall-e/docs/telemetry-lifecycle.md +8 -2
  282. package/template/wall-e/embeddings.js +591 -53
  283. package/template/wall-e/external-action-controller.js +12 -0
  284. package/template/wall-e/http/auth.js +1 -0
  285. package/template/wall-e/http/chat-api.js +46 -11
  286. package/template/wall-e/http/model-admin.js +836 -34
  287. package/template/wall-e/lib/boot-profile.js +88 -0
  288. package/template/wall-e/lib/event-loop-monitor.js +93 -0
  289. package/template/wall-e/lib/service-health.js +194 -0
  290. package/template/wall-e/llm/anthropic.js +130 -5
  291. package/template/wall-e/llm/client.js +266 -63
  292. package/template/wall-e/llm/default-fallback.js +382 -0
  293. package/template/wall-e/llm/health.js +19 -0
  294. package/template/wall-e/llm/message-guard.js +78 -0
  295. package/template/wall-e/llm/model-catalog.js +252 -1
  296. package/template/wall-e/llm/openai.js +26 -4
  297. package/template/wall-e/llm/portkey-sync.js +654 -0
  298. package/template/wall-e/llm/provider-error.js +30 -2
  299. package/template/wall-e/llm/registry.js +5 -1
  300. package/template/wall-e/llm/request-compat.js +67 -0
  301. package/template/wall-e/loops/backfill.js +79 -23
  302. package/template/wall-e/loops/brain-optimize.js +67 -0
  303. package/template/wall-e/loops/ingest.js +25 -10
  304. package/template/wall-e/loops/question-digest.js +160 -0
  305. package/template/wall-e/loops/reflect.js +6 -4
  306. package/template/wall-e/loops/think.js +39 -12
  307. package/template/wall-e/mcp-server.js +318 -36
  308. package/template/wall-e/memory/ctm-context-client.js +52 -14
  309. package/template/wall-e/memory/ctm-operational-context.js +237 -0
  310. package/template/wall-e/memory/ctm-prompt-executions-client.js +128 -0
  311. package/template/wall-e/memory/ctm-session-context.js +111 -63
  312. package/template/wall-e/prompts/coding/deepseek.txt +3 -0
  313. package/template/wall-e/prompts/coding/gemini.txt +6 -0
  314. package/template/wall-e/prompts/coding/gpt.txt +6 -0
  315. package/template/wall-e/prompts/coding/local.txt +7 -0
  316. package/template/wall-e/runtime/decision-hooks.js +115 -0
  317. package/template/wall-e/runtime/devbox-gateway.js +82 -8
  318. package/template/wall-e/runtime/prompt-manifest.js +86 -0
  319. package/template/wall-e/runtime/tool-executor.js +269 -0
  320. package/template/wall-e/runtime/tool-result-envelope.js +138 -0
  321. package/template/wall-e/runtime/transcript-projection.js +60 -0
  322. package/template/wall-e/runtime/walle-runtime.js +224 -0
  323. package/template/wall-e/scripts/db-optimize/migrate.js +162 -0
  324. package/template/wall-e/scripts/db-optimize/recall-eval.js +117 -0
  325. package/template/wall-e/server.js +15 -0
  326. package/template/wall-e/session-files.js +9 -0
  327. package/template/wall-e/skills/_bundled/google-calendar/run.js +1 -1
  328. package/template/wall-e/skills/_bundled/gws-workspace/run.js +1 -1
  329. package/template/wall-e/skills/_bundled/slack-mentions/run.js +76 -6
  330. package/template/wall-e/skills/claude-code-reader.js +7 -3
  331. package/template/wall-e/skills/script-skill-runner.js +10 -0
  332. package/template/wall-e/skills/skill-planner.js +38 -0
  333. package/template/wall-e/tools/builtin-middleware.js +19 -9
  334. package/template/wall-e/tools/local-tools.js +1428 -16
  335. package/template/wall-e/tools/permission-checker.js +73 -5
  336. package/template/wall-e/tools/question-manager.js +117 -7
  337. package/template/wall-e/training/harvester.js +12 -28
  338. package/template/wall-e/training/replay.js +25 -80
  339. package/template/website/index.html +10 -10
  340. package/template/wall-e/eval/ab-test.js +0 -203
  341. package/template/wall-e/eval/agent-runner.js +0 -772
  342. package/template/wall-e/eval/agent-scorer.js +0 -461
  343. package/template/wall-e/eval/aggregator.js +0 -414
  344. package/template/wall-e/eval/allowed-test-commands.js +0 -34
  345. package/template/wall-e/eval/benchmark-generator.js +0 -113
  346. package/template/wall-e/eval/benchmarks/chat-eval.json +0 -1662
  347. package/template/wall-e/eval/benchmarks/chat.json +0 -82
  348. package/template/wall-e/eval/benchmarks/coding-agent-real.json +0 -1
  349. package/template/wall-e/eval/benchmarks/coding-agent.json +0 -1581
  350. package/template/wall-e/eval/benchmarks/coding.json +0 -122
  351. package/template/wall-e/eval/benchmarks/memory-retrieval.json +0 -234
  352. package/template/wall-e/eval/benchmarks/reasoning.json +0 -82
  353. package/template/wall-e/eval/benchmarks/swebench-lite-30.json +0 -212
  354. package/template/wall-e/eval/benchmarks.js +0 -669
  355. package/template/wall-e/eval/cc-replay.js +0 -719
  356. package/template/wall-e/eval/chat-eval.js +0 -525
  357. package/template/wall-e/eval/check-keys.js +0 -15
  358. package/template/wall-e/eval/check-providers.js +0 -42
  359. package/template/wall-e/eval/codex-cli-baseline.js +0 -669
  360. package/template/wall-e/eval/coding-agent-real.js +0 -570
  361. package/template/wall-e/eval/context-compactor.js +0 -251
  362. package/template/wall-e/eval/debug-agent003.js +0 -68
  363. package/template/wall-e/eval/diagnostics.js +0 -216
  364. package/template/wall-e/eval/eval-orchestrator.js +0 -642
  365. package/template/wall-e/eval/evaluate.js +0 -202
  366. package/template/wall-e/eval/evaluator.js +0 -373
  367. package/template/wall-e/eval/exporter.js +0 -212
  368. package/template/wall-e/eval/fixtures/express-basic/package.json +0 -9
  369. package/template/wall-e/eval/fixtures/express-basic/server.js +0 -115
  370. package/template/wall-e/eval/fixtures/express-basic/test.js +0 -83
  371. package/template/wall-e/eval/fixtures/express-buggy/package.json +0 -9
  372. package/template/wall-e/eval/fixtures/express-buggy/server.js +0 -113
  373. package/template/wall-e/eval/fixtures/express-buggy/test.js +0 -83
  374. package/template/wall-e/eval/fixtures/express-buggy-items/package.json +0 -9
  375. package/template/wall-e/eval/fixtures/express-buggy-items/server.js +0 -112
  376. package/template/wall-e/eval/fixtures/express-buggy-items/test.js +0 -83
  377. package/template/wall-e/eval/fixtures/express-buggy-search/package.json +0 -9
  378. package/template/wall-e/eval/fixtures/express-buggy-search/server.js +0 -121
  379. package/template/wall-e/eval/fixtures/express-buggy-search/test.js +0 -83
  380. package/template/wall-e/eval/fixtures/express-rename-data/data.js +0 -34
  381. package/template/wall-e/eval/fixtures/express-rename-data/package.json +0 -9
  382. package/template/wall-e/eval/fixtures/express-rename-data/server.js +0 -97
  383. package/template/wall-e/eval/fixtures/express-rename-data/test.js +0 -88
  384. package/template/wall-e/eval/fixtures/express-xss/package.json +0 -12
  385. package/template/wall-e/eval/fixtures/express-xss/server.js +0 -90
  386. package/template/wall-e/eval/fixtures/express-xss/test.js +0 -67
  387. package/template/wall-e/eval/fixtures/express-xss/views/profile.ejs +0 -9
  388. package/template/wall-e/eval/fixtures/fullstack-app/config/default.js +0 -9
  389. package/template/wall-e/eval/fixtures/fullstack-app/config/test.js +0 -13
  390. package/template/wall-e/eval/fixtures/fullstack-app/package.json +0 -11
  391. package/template/wall-e/eval/fixtures/fullstack-app/public/css/style.css +0 -137
  392. package/template/wall-e/eval/fixtures/fullstack-app/public/index.html +0 -46
  393. package/template/wall-e/eval/fixtures/fullstack-app/public/js/app.js +0 -121
  394. package/template/wall-e/eval/fixtures/fullstack-app/public/js/auth.js +0 -71
  395. package/template/wall-e/eval/fixtures/fullstack-app/public/js/items.js +0 -80
  396. package/template/wall-e/eval/fixtures/fullstack-app/public/js/users.js +0 -46
  397. package/template/wall-e/eval/fixtures/fullstack-app/public/login.html +0 -45
  398. package/template/wall-e/eval/fixtures/fullstack-app/public/register.html +0 -38
  399. package/template/wall-e/eval/fixtures/fullstack-app/scripts/migrate.js +0 -23
  400. package/template/wall-e/eval/fixtures/fullstack-app/scripts/seed.js +0 -46
  401. package/template/wall-e/eval/fixtures/fullstack-app/server/db.js +0 -99
  402. package/template/wall-e/eval/fixtures/fullstack-app/server/index.js +0 -94
  403. package/template/wall-e/eval/fixtures/fullstack-app/server/middleware/auth.js +0 -19
  404. package/template/wall-e/eval/fixtures/fullstack-app/server/middleware/logger.js +0 -19
  405. package/template/wall-e/eval/fixtures/fullstack-app/server/router.js +0 -50
  406. package/template/wall-e/eval/fixtures/fullstack-app/server/routes/auth.js +0 -69
  407. package/template/wall-e/eval/fixtures/fullstack-app/server/routes/health.js +0 -23
  408. package/template/wall-e/eval/fixtures/fullstack-app/server/routes/items.js +0 -88
  409. package/template/wall-e/eval/fixtures/fullstack-app/server/routes/users.js +0 -75
  410. package/template/wall-e/eval/fixtures/fullstack-app/server/test.js +0 -198
  411. package/template/wall-e/eval/fixtures/fullstack-app/server/utils/response.js +0 -34
  412. package/template/wall-e/eval/fixtures/fullstack-app/server/utils/validate.js +0 -26
  413. package/template/wall-e/eval/fixtures/fullstack-app/server.js +0 -8
  414. package/template/wall-e/eval/fixtures/fullstack-app/test.js +0 -12
  415. package/template/wall-e/eval/fixtures/monorepo-basic/package.json +0 -8
  416. package/template/wall-e/eval/fixtures/monorepo-basic/packages/api/data.js +0 -58
  417. package/template/wall-e/eval/fixtures/monorepo-basic/packages/api/middleware.js +0 -46
  418. package/template/wall-e/eval/fixtures/monorepo-basic/packages/api/package.json +0 -8
  419. package/template/wall-e/eval/fixtures/monorepo-basic/packages/api/routes.js +0 -64
  420. package/template/wall-e/eval/fixtures/monorepo-basic/packages/api/server.js +0 -56
  421. package/template/wall-e/eval/fixtures/monorepo-basic/packages/api/test.js +0 -116
  422. package/template/wall-e/eval/fixtures/monorepo-basic/packages/cli/commands.js +0 -61
  423. package/template/wall-e/eval/fixtures/monorepo-basic/packages/cli/index.js +0 -62
  424. package/template/wall-e/eval/fixtures/monorepo-basic/packages/cli/output.js +0 -43
  425. package/template/wall-e/eval/fixtures/monorepo-basic/packages/cli/package.json +0 -11
  426. package/template/wall-e/eval/fixtures/monorepo-basic/packages/cli/test.js +0 -44
  427. package/template/wall-e/eval/fixtures/monorepo-basic/packages/shared/formatters.js +0 -43
  428. package/template/wall-e/eval/fixtures/monorepo-basic/packages/shared/index.js +0 -12
  429. package/template/wall-e/eval/fixtures/monorepo-basic/packages/shared/package.json +0 -5
  430. package/template/wall-e/eval/fixtures/monorepo-basic/packages/shared/test.js +0 -55
  431. package/template/wall-e/eval/fixtures/monorepo-basic/packages/shared/validators.js +0 -29
  432. package/template/wall-e/eval/fixtures/monorepo-basic/test.js +0 -46
  433. package/template/wall-e/eval/fixtures/node-cli/index.js +0 -78
  434. package/template/wall-e/eval/fixtures/node-cli/package.json +0 -10
  435. package/template/wall-e/eval/fixtures/node-cli/test.js +0 -57
  436. package/template/wall-e/eval/fixtures/node-typed/package.json +0 -8
  437. package/template/wall-e/eval/fixtures/node-typed/src/handlers.js +0 -31
  438. package/template/wall-e/eval/fixtures/node-typed/src/utils.js +0 -33
  439. package/template/wall-e/eval/fixtures/node-typed/test.js +0 -36
  440. package/template/wall-e/eval/fixtures/python-flask/app.py +0 -14
  441. package/template/wall-e/eval/fixtures/python-flask/requirements.txt +0 -2
  442. package/template/wall-e/eval/fixtures/python-flask/test_app.py +0 -25
  443. package/template/wall-e/eval/fixtures/wall-e-subset/brain.js +0 -105
  444. package/template/wall-e/eval/fixtures/wall-e-subset/eval/aggregator.js +0 -101
  445. package/template/wall-e/eval/fixtures/wall-e-subset/eval/benchmarks/chat.json +0 -20
  446. package/template/wall-e/eval/fixtures/wall-e-subset/eval/benchmarks/coding.json +0 -32
  447. package/template/wall-e/eval/fixtures/wall-e-subset/eval/benchmarks.js +0 -64
  448. package/template/wall-e/eval/fixtures/wall-e-subset/eval/fixtures/simple-project/package.json +0 -6
  449. package/template/wall-e/eval/fixtures/wall-e-subset/eval/fixtures/simple-project/server.js +0 -31
  450. package/template/wall-e/eval/fixtures/wall-e-subset/eval/fixtures/simple-project/test.js +0 -18
  451. package/template/wall-e/eval/fixtures/wall-e-subset/eval/fixtures/simple-project/utils.js +0 -34
  452. package/template/wall-e/eval/fixtures/wall-e-subset/eval/runner.js +0 -104
  453. package/template/wall-e/eval/fixtures/wall-e-subset/eval/scorer.js +0 -73
  454. package/template/wall-e/eval/fixtures/wall-e-subset/eval/test.js +0 -134
  455. package/template/wall-e/eval/fixtures/wall-e-subset/llm/client.js +0 -99
  456. package/template/wall-e/eval/fixtures/wall-e-subset/llm/providers.js +0 -63
  457. package/template/wall-e/eval/fixtures/wall-e-subset/llm/test.js +0 -70
  458. package/template/wall-e/eval/fixtures/wall-e-subset/package.json +0 -10
  459. package/template/wall-e/eval/fixtures/wall-e-subset/test.js +0 -86
  460. package/template/wall-e/eval/harvester.js +0 -685
  461. package/template/wall-e/eval/head-to-head.js +0 -388
  462. package/template/wall-e/eval/humaneval-adapter.js +0 -321
  463. package/template/wall-e/eval/list-models.js +0 -31
  464. package/template/wall-e/eval/livecodebench-adapter.js +0 -291
  465. package/template/wall-e/eval/mail-integration.js +0 -443
  466. package/template/wall-e/eval/manifest.js +0 -186
  467. package/template/wall-e/eval/meta-harness/adapters/coding-agent.js +0 -57
  468. package/template/wall-e/eval/meta-harness/bootstrap-snapshot.js +0 -149
  469. package/template/wall-e/eval/meta-harness/candidate-store.js +0 -117
  470. package/template/wall-e/eval/meta-harness/cli.js +0 -86
  471. package/template/wall-e/eval/meta-harness/domain-spec.js +0 -154
  472. package/template/wall-e/eval/meta-harness/domains/coding-agent.domain.json +0 -84
  473. package/template/wall-e/eval/meta-harness/examples/env-bootstrap-candidate.js +0 -29
  474. package/template/wall-e/eval/meta-harness/experience-store.js +0 -174
  475. package/template/wall-e/eval/meta-harness/frontier.js +0 -96
  476. package/template/wall-e/eval/meta-harness/harness-interface.js +0 -90
  477. package/template/wall-e/eval/meta-harness/leakage-guard.js +0 -80
  478. package/template/wall-e/eval/meta-harness/optimizer.js +0 -207
  479. package/template/wall-e/eval/meta-harness/proposer-runner.js +0 -110
  480. package/template/wall-e/eval/meta-harness/reporting.js +0 -58
  481. package/template/wall-e/eval/meta-harness/telemetry.js +0 -27
  482. package/template/wall-e/eval/meta-harness/validation.js +0 -81
  483. package/template/wall-e/eval/promoter.js +0 -228
  484. package/template/wall-e/eval/provider-normalizer.js +0 -33
  485. package/template/wall-e/eval/replay.js +0 -395
  486. package/template/wall-e/eval/run-agent-benchmarks.js +0 -386
  487. package/template/wall-e/eval/run-codex-cli-baseline.js +0 -177
  488. package/template/wall-e/eval/run-coding-agent-real.js +0 -187
  489. package/template/wall-e/eval/run-eval.js +0 -435
  490. package/template/wall-e/eval/run-model-comparison.js +0 -142
  491. package/template/wall-e/eval/session-evaluator.js +0 -187
  492. package/template/wall-e/eval/session-miner.js +0 -207
  493. package/template/wall-e/eval/session-retrieval-benchmark.js +0 -150
  494. package/template/wall-e/eval/session-transcripts.js +0 -509
  495. package/template/wall-e/eval/shadow.js +0 -161
  496. package/template/wall-e/eval/swebench-adapter.js +0 -345
  497. package/template/wall-e/eval/swebench-docker.js +0 -192
  498. package/template/wall-e/eval/train.py +0 -320
  499. package/template/wall-e/eval/trainer.js +0 -232
  500. package/template/wall-e/eval/weekly-eval-loop.js +0 -241
@@ -1,414 +0,0 @@
1
- 'use strict';
2
-
3
- /**
4
- * Score aggregator — rolls up shadow eval scores into model_task_scores.
5
- * Computes rolling window stats, trends, and win rates.
6
- * Phase 6: also blends benchmark data (40%) with shadow data (60%).
7
- */
8
-
9
// Blend ratio applied when a model/task pair has both data sources:
// shadow-eval averages contribute 60%, benchmark-run averages 40%.
const SHADOW_WEIGHT = 0.6;
const BENCHMARK_WEIGHT = 0.4;
11
-
12
/**
 * Task-type lists for the difficulty buckets that are selected either by
 * difficulty alone (easy) or by difficulty plus a keyword in the benchmark id.
 */
const DIFFICULTY_TASK_MAP = {
  easy: ['coding:exploration', 'coding:generation'],
  medium_debug: ['coding:debugging'],
  hard_multi: ['coding:refactoring', 'coding:planning'],
};

/**
 * Translate a benchmark's difficulty label and id into coding task types.
 *
 * @param {string} difficulty - 'easy', 'medium' or 'hard'; anything else is
 *   treated as generation work.
 * @param {string} [benchmarkId] - Benchmark id; matched case-insensitively
 *   against debugging / multi-file keywords.
 * @returns {string[]} Task types the benchmark counts towards.
 */
function mapBenchmarkToTaskTypes(difficulty, benchmarkId) {
  const normalizedId = (benchmarkId || '').toLowerCase();

  switch (difficulty) {
    case 'easy':
      return DIFFICULTY_TASK_MAP.easy;
    case 'medium':
      // Debug-flavoured ids go to debugging; other medium work is generation.
      return /debug|fix|patch|bug/.test(normalizedId)
        ? DIFFICULTY_TASK_MAP.medium_debug
        : ['coding:generation'];
    case 'hard':
      // Multi-file / refactor ids get both phases; other hard work is planning.
      return /multi.?file|refactor|architect/.test(normalizedId)
        ? DIFFICULTY_TASK_MAP.hard_multi
        : ['coding:planning'];
    default:
      return ['coding:generation'];
  }
}
31
-
32
/**
 * Classify the direction of a score by comparing the short and long windows.
 * A gap of more than 0.05 in either direction counts as movement; when either
 * average is missing the trend is reported as stable.
 *
 * @param {number|null} score7d - Average over the last 7 days.
 * @param {number|null} score30d - Average over the longer window.
 * @returns {'improving'|'stable'|'declining'}
 */
function detectTrend(score7d, score30d) {
  const haveBoth = score7d != null && score30d != null;
  if (!haveBoth) return 'stable';

  const delta = score7d - score30d;
  if (delta > 0.05) return 'improving';
  return delta < -0.05 ? 'declining' : 'stable';
}
45
-
46
/**
 * Summarise a model's benchmark runs that count towards one task type.
 *
 * Reads successful, scored rows from eval_benchmark_runs inside the rolling
 * window, keeps those whose benchmark maps (via difficulty + id keywords) to
 * `taskType`, and averages their composite scores. Benchmarks without a
 * loadable definition are treated as 'medium'.
 *
 * @param {Object} brain - Brain module; must expose getDb().
 * @param {string} model - Model name.
 * @param {string} taskType - Task type (e.g. 'coding:generation').
 * @param {number} [windowDays=30] - Rolling window in days.
 * @returns {{ totalEvals: number, avgScore: number, score7d: number|null, score30d: number|null }}
 *   All-zero/null result when the table is missing or nothing matches.
 *   `score30d` is the same rounded average as `avgScore`.
 */
function aggregateBenchmarkScores(brain, model, taskType, windowDays = 30) {
  const MS_PER_DAY = 86400000;
  const emptyResult = () => ({ totalEvals: 0, avgScore: 0, score7d: null, score30d: null });
  const roundTo3 = (value) => Math.round(value * 1000) / 1000;
  const meanScore = (runs) => runs.reduce((sum, run) => sum + run.composite_score, 0) / runs.length;

  const database = brain.getDb();

  // Older databases may not have the benchmark table at all.
  const benchmarkTable = database
    .prepare("SELECT name FROM sqlite_master WHERE type='table' AND name='eval_benchmark_runs'")
    .get();
  if (!benchmarkTable) return emptyResult();

  const windowStart = new Date(Date.now() - windowDays * MS_PER_DAY).toISOString();
  const weekStart = new Date(Date.now() - 7 * MS_PER_DAY).toISOString();

  const runsInWindow = database.prepare(`
    SELECT benchmark_id, composite_score, created_at
    FROM eval_benchmark_runs
    WHERE model = ? AND error IS NULL AND composite_score IS NOT NULL AND created_at >= ?
  `).all(model, windowStart);
  if (runsInWindow.length === 0) return emptyResult();

  // Difficulty labels live in the benchmark definitions; loading is best-effort.
  let definitions;
  try {
    definitions = require('./benchmarks').loadAllBenchmarks();
  } catch (_) {
    definitions = [];
  }
  const definitionsById = new Map();
  for (const definition of definitions) definitionsById.set(definition.id, definition);

  const relevantRuns = runsInWindow.filter((run) => {
    const difficulty = definitionsById.get(run.benchmark_id)?.difficulty || 'medium';
    return mapBenchmarkToTaskTypes(difficulty, run.benchmark_id).includes(taskType);
  });
  if (relevantRuns.length === 0) return emptyResult();

  // created_at is compared as an ISO string against the 7-day cutoff.
  const lastWeekRuns = relevantRuns.filter((run) => run.created_at >= weekStart);
  const windowAverage = roundTo3(meanScore(relevantRuns));

  return {
    totalEvals: relevantRuns.length,
    avgScore: windowAverage,
    score7d: lastWeekRuns.length > 0 ? roundTo3(meanScore(lastWeekRuns)) : null,
    score30d: windowAverage,
  };
}
112
-
113
/**
 * Work out which provider serves a model.
 *
 * Prefers a provider recorded in eval_benchmark_runs (ignoring 'unknown');
 * if the lookup fails or finds nothing, falls back to the model-name prefix,
 * defaulting to 'ollama'.
 *
 * @param {Object} brain - Brain module; must expose getDb().
 * @param {string} model - Model name.
 * @returns {string} Provider name.
 */
function resolveProvider(brain, model) {
  try {
    const recorded = brain
      .getDb()
      .prepare("SELECT provider FROM eval_benchmark_runs WHERE model = ? AND provider != 'unknown' LIMIT 1")
      .get(model);
    const provider = recorded?.provider;
    if (provider) return provider;
  } catch (_) {
    // The table may not exist yet; fall through to the name heuristic.
  }

  const prefixRules = [
    [/^claude-/, 'anthropic'],
    [/^gpt-/, 'openai'],
    [/^gemini-/, 'google'],
  ];
  for (const [prefix, provider] of prefixRules) {
    if (prefix.test(model)) return provider;
  }
  return 'ollama';
}
128
-
129
/**
 * Build the rolled-up score record for one (model, task type) pair.
 *
 * Shadow-eval statistics come straight from shadow_results over the rolling
 * window (plus a separate 7-day average). They are then combined with the
 * benchmark summary: when both sources have data the averages are blended
 * with SHADOW_WEIGHT / BENCHMARK_WEIGHT, otherwise whichever source has data
 * is used unweighted. Win counts, latency and last-eval time always come from
 * the shadow data only.
 *
 * @param {Object} brain - Brain module; must expose getDb().
 * @param {string} model - Shadow model name.
 * @param {string} taskType - Task type.
 * @param {number} [windowDays=30] - Rolling window in days.
 * @returns {Object} Aggregated stats: model, taskType, provider, totalEvals,
 *   avgScore, winCount, strongWinCount, avgLatencyMs, score7d, score30d,
 *   trend and lastEvalAt.
 */
function aggregateScores(brain, model, taskType, windowDays = 30) {
  const roundTo3 = (value) => Math.round(value * 1000) / 1000;
  const roundOrNull = (value) => (value != null ? roundTo3(value) : null);

  const db = brain.getDb();
  const windowCutoff = new Date(Date.now() - windowDays * 86400000).toISOString();
  const weekCutoff = new Date(Date.now() - 7 * 86400000).toISOString();

  // Full-window shadow stats. A "win" is reaching 80% of the primary score,
  // a "strong win" is matching it; a missing primary score counts as 1.0.
  const windowStats = db.prepare(`
    SELECT
      COUNT(*) AS total_evals,
      AVG(shadow_score) AS avg_score,
      SUM(CASE WHEN shadow_score >= 0.8 * COALESCE(primary_score, 1.0) THEN 1 ELSE 0 END) AS win_count,
      SUM(CASE WHEN shadow_score >= COALESCE(primary_score, 1.0) THEN 1 ELSE 0 END) AS strong_win_count,
      AVG(shadow_latency_ms) AS avg_latency_ms,
      MAX(evaluated_at) AS last_eval_at
    FROM shadow_results
    WHERE shadow_model = ? AND task_type = ?
      AND evaluated_at IS NOT NULL
      AND created_at >= ?
  `).get(model, taskType, windowCutoff);

  // Same filter, last seven days only.
  const weekStats = db.prepare(`
    SELECT AVG(shadow_score) AS avg_score
    FROM shadow_results
    WHERE shadow_model = ? AND task_type = ?
      AND evaluated_at IS NOT NULL
      AND created_at >= ?
  `).get(model, taskType, weekCutoff);

  const shadow7d = weekStats?.avg_score ?? null;
  const shadow30d = windowStats?.avg_score ?? null;
  const shadowEvals = windowStats?.total_evals || 0;

  const bench = aggregateBenchmarkScores(brain, model, taskType, windowDays);
  const shadowPresent = shadowEvals > 0;
  const benchPresent = bench.totalEvals > 0;

  // Start from the shadow-only (or no-data) case and override when
  // benchmark data is available.
  let blendedAvg = shadow30d || 0;
  let score7d = shadow7d;
  let score30d = shadow30d;

  if (benchPresent && !shadowPresent) {
    blendedAvg = bench.avgScore;
    score7d = bench.score7d;
    score30d = bench.score30d;
  } else if (benchPresent) {
    const shadowPart = SHADOW_WEIGHT * (shadow30d || 0);
    blendedAvg = shadowPart + BENCHMARK_WEIGHT * bench.avgScore;
    score30d = shadowPart + BENCHMARK_WEIGHT * bench.score30d;
    if (shadow7d != null && bench.score7d != null) {
      score7d = SHADOW_WEIGHT * shadow7d + BENCHMARK_WEIGHT * bench.score7d;
    } else {
      // Only one source has recent data: use it as-is.
      score7d = shadow7d ?? bench.score7d;
    }
  }

  const trend = detectTrend(score7d, score30d);

  return {
    model,
    taskType,
    provider: resolveProvider(brain, model),
    totalEvals: shadowEvals + bench.totalEvals,
    avgScore: roundTo3(blendedAvg || 0),
    winCount: windowStats?.win_count || 0,
    strongWinCount: windowStats?.strong_win_count || 0,
    avgLatencyMs: Math.round(windowStats?.avg_latency_ms || 0),
    score7d: roundOrNull(score7d),
    score30d: roundOrNull(score30d),
    trend,
    lastEvalAt: windowStats?.last_eval_at || null,
  };
}
217
-
218
/**
 * List every distinct (model, task_type) combination in shadow_results that
 * has an evaluation timestamp and a non-null shadow model.
 *
 * @param {Object} brain - Brain module; must expose getDb().
 * @returns {Array<{ model: string, task_type: string }>}
 */
function getDistinctModelTaskPairs(brain) {
  const statement = brain.getDb().prepare(`
    SELECT DISTINCT shadow_model AS model, task_type
    FROM shadow_results
    WHERE evaluated_at IS NOT NULL AND shadow_model IS NOT NULL
  `);
  return statement.all();
}
228
-
229
/**
 * Derive distinct (model, task_type) pairs from successful benchmark runs.
 *
 * Each (model, benchmark) combination is expanded into the task types its
 * difficulty and id map to, so models that only appear in benchmark runs are
 * still discovered. Benchmarks without a loadable definition are treated as
 * 'medium'. Pairs are returned in first-seen order.
 *
 * @param {Object} brain - Brain module; must expose getDb().
 * @returns {Array<{ model: string, task_type: string }>} Empty when the
 *   eval_benchmark_runs table does not exist.
 */
function getDistinctBenchmarkModelTaskPairs(brain) {
  const database = brain.getDb();
  const benchmarkTable = database
    .prepare("SELECT name FROM sqlite_master WHERE type='table' AND name='eval_benchmark_runs'")
    .get();
  if (!benchmarkTable) return [];

  const modelBenchmarkRows = database.prepare(`
    SELECT DISTINCT model, benchmark_id FROM eval_benchmark_runs
    WHERE error IS NULL AND composite_score IS NOT NULL AND model != 'unknown'
  `).all();

  // Definitions supply the difficulty label; loading them is best-effort.
  let definitions;
  try {
    definitions = require('./benchmarks').loadAllBenchmarks();
  } catch (_) {
    definitions = [];
  }
  const definitionsById = new Map();
  for (const definition of definitions) definitionsById.set(definition.id, definition);

  // Keyed by "model::taskType" so each pair is kept once, in first-seen order.
  const uniquePairs = new Map();
  for (const row of modelBenchmarkRows) {
    const difficulty = definitionsById.get(row.benchmark_id)?.difficulty || 'medium';
    for (const taskType of mapBenchmarkToTaskTypes(difficulty, row.benchmark_id)) {
      const pairKey = `${row.model}::${taskType}`;
      if (uniquePairs.has(pairKey)) continue;
      uniquePairs.set(pairKey, { model: row.model, task_type: taskType });
    }
  }
  return [...uniquePairs.values()];
}
270
-
271
/**
 * Recompute and persist scores for every known (model, task type) pair.
 *
 * Pairs are collected from shadow results first and benchmark runs second,
 * de-duplicated, then each one is aggregated and written through
 * brain.upsertModelTaskScore so benchmark-only models also get a row.
 *
 * @param {Object} brain - Brain module; must expose getDb() and
 *   upsertModelTaskScore(model, taskType, stats).
 * @returns {{ updated: number, pairs: Array }} Count of pairs written and a
 *   short summary (model, taskType, avgScore, totalEvals, trend) for each.
 */
function updateAllModelTaskScores(brain) {
  const fromShadow = getDistinctModelTaskPairs(brain);
  const fromBenchmarks = getDistinctBenchmarkModelTaskPairs(brain);

  // First occurrence of each "model::taskType" wins; order is preserved.
  const pairsByKey = new Map();
  for (const pair of fromShadow.concat(fromBenchmarks)) {
    const pairKey = `${pair.model}::${pair.task_type}`;
    if (!pairsByKey.has(pairKey)) pairsByKey.set(pairKey, pair);
  }

  const summaries = [];
  pairsByKey.forEach(({ model, task_type: taskType }) => {
    const stats = aggregateScores(brain, model, taskType);
    brain.upsertModelTaskScore(model, taskType, stats);
    summaries.push({
      model,
      taskType,
      avgScore: stats.avgScore,
      totalEvals: stats.totalEvals,
      trend: stats.trend,
    });
  });

  return { updated: summaries.length, pairs: summaries };
}
301
-
302
/**
 * Summarise coding-agent sessions by classified type.
 *
 * Only sessions with a positive significance score are counted. Nothing is
 * written back; `updated` is simply the number of type groups found.
 *
 * @param {Object} brain - Brain module; must expose getDb().
 * @returns {{ updated: number, agentSessions: number, byType?: Array }}
 *   `byType` holds the per-type rows (classified_type, total,
 *   avg_significance) and is omitted when the coding_agent_sessions table
 *   does not exist.
 */
function aggregateCodingAgentScores(brain) {
  const database = brain.getDb();

  const sessionsTable = database
    .prepare("SELECT name FROM sqlite_master WHERE type='table' AND name='coding_agent_sessions'")
    .get();
  if (!sessionsTable) return { updated: 0, agentSessions: 0 };

  const byType = database.prepare(`
    SELECT
      classified_type,
      COUNT(*) as total,
      AVG(significance_score) as avg_significance
    FROM coding_agent_sessions
    WHERE significance_score > 0
    GROUP BY classified_type
  `).all();

  let agentSessions = 0;
  for (const group of byType) agentSessions += group.total;

  return { updated: byType.length, agentSessions, byType };
}
329
-
330
- /**
331
- * Data-driven difficulty calibration.
332
- * After 50+ benchmark runs per benchmark_id, checks if actual scores suggest
333
- * the difficulty label should be changed.
334
- *
335
- * @param {Object} brain - Brain module
336
- * @returns {Array<{ benchmarkId: string, currentDifficulty: string, suggestedDifficulty: string, avgScore: number, runCount: number }>}
337
- */
338
- function calibrateDifficulty(brain) {
339
- const db = brain.getDb();
340
-
341
- const tableExists = db.prepare(
342
- "SELECT name FROM sqlite_master WHERE type='table' AND name='eval_benchmark_runs'"
343
- ).get();
344
- if (!tableExists) return [];
345
-
346
- const rows = db.prepare(`
347
- SELECT benchmark_id, AVG(composite_score) AS actual_difficulty, COUNT(*) AS run_count
348
- FROM eval_benchmark_runs
349
- WHERE error IS NULL AND composite_score IS NOT NULL
350
- GROUP BY benchmark_id
351
- HAVING COUNT(*) >= 50
352
- `).all();
353
-
354
- if (rows.length === 0) return [];
355
-
356
- // Load benchmark definitions to get current difficulty labels
357
- let benchmarkDefs;
358
- try {
359
- benchmarkDefs = require('./benchmarks').loadAllBenchmarks();
360
- } catch (_) {
361
- benchmarkDefs = [];
362
- }
363
- const defMap = new Map();
364
- for (const b of benchmarkDefs) defMap.set(b.id, b);
365
-
366
- const suggestions = [];
367
- for (const { benchmark_id, actual_difficulty: avgScore, run_count: runCount } of rows) {
368
- const def = defMap.get(benchmark_id);
369
- const currentDifficulty = def?.difficulty || 'unknown';
370
-
371
- let suggestedDifficulty = currentDifficulty;
372
- // High avg score on a "hard" benchmark → suggest downgrade to medium
373
- if (currentDifficulty === 'hard' && avgScore >= 0.85) {
374
- suggestedDifficulty = 'medium';
375
- }
376
- // High avg score on a "medium" benchmark → suggest downgrade to easy
377
- else if (currentDifficulty === 'medium' && avgScore >= 0.85) {
378
- suggestedDifficulty = 'easy';
379
- }
380
- // Low avg score on "easy" → suggest upgrade to medium or hard
381
- else if (currentDifficulty === 'easy' && avgScore < 0.5) {
382
- suggestedDifficulty = 'hard';
383
- }
384
- // Low avg score on "medium" → suggest upgrade to hard
385
- else if (currentDifficulty === 'medium' && avgScore < 0.5) {
386
- suggestedDifficulty = 'hard';
387
- }
388
-
389
- if (suggestedDifficulty !== currentDifficulty) {
390
- suggestions.push({
391
- benchmarkId: benchmark_id,
392
- currentDifficulty,
393
- suggestedDifficulty,
394
- avgScore: Math.round(avgScore * 1000) / 1000,
395
- runCount,
396
- });
397
- }
398
- }
399
-
400
- return suggestions;
401
- }
402
-
403
- module.exports = {
404
- detectTrend,
405
- aggregateScores,
406
- aggregateBenchmarkScores,
407
- getDistinctModelTaskPairs,
408
- updateAllModelTaskScores,
409
- aggregateCodingAgentScores,
410
- calibrateDifficulty,
411
- mapBenchmarkToTaskTypes,
412
- SHADOW_WEIGHT,
413
- BENCHMARK_WEIGHT,
414
- };
@@ -1,34 +0,0 @@
1
- 'use strict';
2
-
3
/**
 * Allowlist of test / typecheck / lint command shapes.
 *
 * Each pattern is anchored at both ends and is meant to be matched against a
 * whitespace-normalized command. Argument positions only accept a small set
 * of characters (word characters plus a few path/flag punctuation marks) so
 * that shell metacharacters such as `;`, `|`, `$`, backticks or parentheses
 * can never be part of an accepted command.
 *
 * The two `tsc --noEmit` patterns previously accepted any non-space run
 * (`\S+`) as the option value, which let values like `$(...)` through; the
 * value is now limited to the same kind of safe character class as the other
 * patterns (with `,` and `@` added for option lists and scoped paths).
 */
const ALLOWED_TEST_COMMAND_PATTERNS = [
  /^npm test(?:\s+--\s*[\w./:= -]+)?$/,
  /^npm run (?:test|test:[\w:-]+|typecheck|lint)(?:\s+--\s*[\w./:= -]+)?$/,
  /^pnpm test(?:\s+--\s*[\w./:= -]+)?$/,
  /^pnpm run (?:test|test:[\w:-]+|typecheck|lint)(?:\s+--\s*[\w./:= -]+)?$/,
  /^yarn test(?:\s+[\w./:= -]+)?$/,
  /^bun test(?:\s+[\w./:= -]+)?$/,
  /^node test\.js$/,
  /^node --test(?:\s+[\w./-]+)*$/,
  /^pytest(?:\s+[\w./:-]+)*$/,
  /^python -m pytest(?:\s+[\w./:-]+)*$/,
  /^make test$/,
  /^tsc --noEmit(?:\s+--[\w:-]+(?:[= ][\w./:=,@-]+)?)?$/,
  /^npx tsc --noEmit(?:\s+--[\w:-]+(?:[= ][\w./:=,@-]+)?)?$/,
  /^go test(?:\s+(?:\.|\.\/\.\.\.|[\w./-]+))*$/,
  /^cargo test(?:\s+[\w./:-]+)*$/,
];
20
-
21
/**
 * Canonicalize a command string: coerce to string (falsy input becomes ''),
 * strip surrounding whitespace and collapse every internal whitespace run
 * to a single space.
 *
 * @param {*} command - Raw command value.
 * @returns {string} Normalized command, possibly empty.
 */
function normalizeTestCommand(command) {
  const text = String(command || '');
  return text.trim().split(/\s+/).join(' ');
}
24
-
25
/**
 * Decide whether a command may be run as a test command.
 *
 * The command is normalized first; an empty result is rejected, otherwise it
 * must match at least one entry of ALLOWED_TEST_COMMAND_PATTERNS.
 *
 * @param {*} command - Raw command value.
 * @returns {boolean} True only for a non-empty, allowlisted command.
 */
function testCommandAllowed(command) {
  const candidate = normalizeTestCommand(command);
  if (!candidate) return false;

  for (const pattern of ALLOWED_TEST_COMMAND_PATTERNS) {
    if (pattern.test(candidate)) return true;
  }
  return false;
}
29
-
30
- module.exports = {
31
- ALLOWED_TEST_COMMAND_PATTERNS,
32
- normalizeTestCommand,
33
- testCommandAllowed,
34
- };
@@ -1,113 +0,0 @@
1
- 'use strict';
2
- const crypto = require('crypto');
3
- const fs = require('fs');
4
- const path = require('path');
5
-
6
- /**
7
- * Classify a coding agent session into a task type based on prompt content
8
- * and tool call patterns.
9
- */
10
- function classifyCodingType(session) {
11
- const prompt = (session.prompt || '').toLowerCase();
12
-
13
- if (/plan|design|architect/i.test(prompt)) return 'coding:planning';
14
- if (/debug|fix|bug|error|failing/i.test(prompt)) return 'coding:debugging';
15
- if (/refactor|extract|rename|reorganize/i.test(prompt)) return 'coding:refactoring';
16
- if (/review|assess|check/i.test(prompt)) return 'coding:review';
17
- if (/test|spec|coverage/i.test(prompt)) return 'coding:testing';
18
- return 'coding:generation';
19
- }
20
-
21
/**
 * Decide whether a session-mined prompt can be replayed as a standalone
 * benchmark task.
 *
 * A prompt qualifies when it is at least 25 characters long (after trimming),
 * does not open like assistant narration or a bare acknowledgement, and
 * mentions at least one task-like keyword.
 *
 * @param {*} prompt - Candidate prompt; non-strings are coerced, falsy
 *   values become ''.
 * @returns {boolean} True when the prompt looks like a self-contained task.
 */
function isReplayableBenchmarkPrompt(prompt) {
  const text = String(prompt || '').trim();
  if (text.length < 25) return false;

  // Session-mined prompts must be the user's task, not the assistant's first
  // progress narration. Assistant prose turns the benchmark into "continue the
  // previous assistant's work", which is not replayable from a fresh sandbox.
  //
  // "sure" is matched separately from the word-boundary alternatives: it must
  // be followed by a comma or space, and requiring a \b after that separator
  // would miss the common "Sure, I'll ..." opening (comma followed by space).
  const assistantOpening =
    /^(?:(?:i['’]ll|i will|i can|i['’]m going to|let me|happy to help)\b|sure[, ])/i;
  if (assistantOpening.test(text)) return false;

  // Bare acknowledgements / continuations carry no task of their own.
  if (/^\s*(go ahead|continue|proceed|do it|yes|yep|ok|okay|thanks|thank you)\b/i.test(text)) {
    return false;
  }

  return /\b(fix|implement|add|change|update|refactor|test|debug|make|write|delete|remove|harden|wire|bug|failing|error|regression|feature|endpoint|api|ui|server|component|code review|review.*code)\b/i.test(text);
}
38
-
39
/**
 * Turn a coding agent session into a benchmark entry.
 *
 * The benchmark id is derived from a SHA-256 hash of the prompt, so sessions
 * with the same prompt collapse to the same id. Expected traits are inferred
 * from the names of the tools the session called, and difficulty from its
 * turn count (> 10 hard, > 5 medium, otherwise easy).
 *
 * @param {Object} session - Session record (prompt, tool_calls, turns,
 *   files_modified, git_diff, session_id).
 * @param {Set<string>} [existingIds] - Benchmark ids that already exist.
 * @returns {Object|null} The benchmark entry, or null when the prompt is not
 *   replayable or its id is already in `existingIds`.
 */
function sessionToBenchmark(session, existingIds = new Set()) {
  if (!isReplayableBenchmarkPrompt(session.prompt)) return null;

  const promptDigest = crypto.createHash('sha256').update(session.prompt || '').digest('hex');
  const id = `agent-session-${promptDigest.slice(0, 8)}`;
  if (existingIds.has(id)) return null; // already have a benchmark for this prompt

  const classifiedType = classifyCodingType(session);

  // Tool calls may be plain names or objects carrying a `name`.
  const toolNames = (session.tool_calls || []).map((call) => {
    if (typeof call === 'string') return call;
    return call.name || '';
  });
  const usedTool = (pattern) => toolNames.some((name) => pattern.test(name));

  // Each trait is kept only if the session's tool usage demonstrates it.
  const traitChecks = [
    ['reads before writing', usedTool(/read_file|glob|grep/)],
    ['uses edit over write', usedTool(/edit_file/) && !toolNames.includes('write_file')],
    ['runs tests after changes', usedTool(/test|npm test/)],
    ['plans before executing', usedTool(/update_todos/)],
  ];
  const expectedTraits = traitChecks.filter(([, observed]) => observed).map(([trait]) => trait);
  if (expectedTraits.length === 0) expectedTraits.push('has code block');

  let difficulty = 'easy';
  if (session.turns > 10) {
    difficulty = 'hard';
  } else if (session.turns > 5) {
    difficulty = 'medium';
  }

  const filesModified = session.files_modified;

  return {
    id,
    prompt: session.prompt,
    taskType: 'coding-agent',
    difficulty,
    expectedTraits,
    agentExpectations: {
      expectedToolCalls: [...new Set(toolNames)].slice(0, 10),
      // Allow twice the original turn count, capped at 50.
      maxTurns: Math.min((session.turns || 1) * 2, 50),
      expectedFileChanges: filesModified || [],
    },
    sourceSessionId: session.session_id,
    classifiedType,
    expectedDiff: session.git_diff || null,
    complexityIndicator: Array.isArray(filesModified) ? filesModified.length : 0,
  };
}
79
-
80
/**
 * Create benchmark entries from significant coding agent sessions.
 *
 * Fetches sessions from the brain, skips those already flagged
 * `benchmark_generated`, and converts the rest. Ids already present in
 * benchmarks/coding-agent.json (when that file can be read and parsed) and
 * ids produced earlier in the same call are treated as duplicates. Only
 * sessions that actually yield a benchmark are marked as generated.
 *
 * @param {object} brain - Brain module (must have getCodingSessions, markBenchmarkGenerated)
 * @param {object} opts - Options: minSignificance (default 0.5), limit (default 50)
 * @returns {Promise<Array>} Newly generated benchmark objects.
 */
async function generateBenchmarks(brain, { minSignificance = 0.5, limit = 50 } = {}) {
  const candidates = brain
    .getCodingSessions({ minSignificance, limit })
    .filter((session) => !session.benchmark_generated);

  // Seed the dedup set from the benchmark file; a missing or unparsable
  // file simply means there is nothing to dedup against.
  const knownIds = new Set();
  try {
    const benchmarkFile = path.join(__dirname, 'benchmarks', 'coding-agent.json');
    const stored = JSON.parse(fs.readFileSync(benchmarkFile, 'utf8'));
    stored.forEach((entry) => knownIds.add(entry.id));
  } catch { /* no existing file or parse error */ }

  const generated = [];
  for (const session of candidates) {
    const benchmark = sessionToBenchmark(session, knownIds);
    if (!benchmark) continue;

    generated.push(benchmark);
    knownIds.add(benchmark.id);
    brain.markBenchmarkGenerated(session.id);
  }
  return generated;
}
112
-
113
- module.exports = { classifyCodingType, isReplayableBenchmarkPrompt, sessionToBenchmark, generateBenchmarks };