create-walle 0.9.21 → 0.9.23

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (500) hide show
  1. package/README.md +27 -5
  2. package/package.json +2 -2
  3. package/template/CLAUDE.md +2 -2
  4. package/template/LICENSE +1 -1
  5. package/template/bin/ctm-dev-cleanup.js +24 -3
  6. package/template/bin/ctm-launch.sh +13 -0
  7. package/template/bin/dev.sh +156 -18
  8. package/template/bin/node-bin.sh +84 -0
  9. package/template/bin/pin-node.sh +51 -0
  10. package/template/claude-task-manager/api-prompts.js +1203 -182
  11. package/template/claude-task-manager/api-reviews.js +109 -15
  12. package/template/claude-task-manager/approval-agent.js +1360 -280
  13. package/template/claude-task-manager/bin/restart-ctm.sh +64 -23
  14. package/template/claude-task-manager/bin/storage-migration-supervisor.js +338 -0
  15. package/template/claude-task-manager/db.js +4417 -295
  16. package/template/claude-task-manager/docs/app-update-refresh-protocol.md +69 -0
  17. package/template/claude-task-manager/docs/approval-ai-refinement.md +138 -0
  18. package/template/claude-task-manager/docs/approval-rescue-loop.md +74 -0
  19. package/template/claude-task-manager/docs/codex-operational-warning-health.md +107 -0
  20. package/template/claude-task-manager/docs/codex-resume-state-guard-design.md +17 -12
  21. package/template/claude-task-manager/docs/codex-terminal-render-controller-handoff.md +311 -0
  22. package/template/claude-task-manager/docs/coding-agent-hooks-architecture.md +418 -0
  23. package/template/claude-task-manager/docs/conversation-import-freshness.md +20 -0
  24. package/template/claude-task-manager/docs/google-workspace-auth-health.md +77 -0
  25. package/template/claude-task-manager/docs/image-paste-ux.md +13 -0
  26. package/template/claude-task-manager/docs/ipad-web-preview.md +88 -0
  27. package/template/claude-task-manager/docs/main-loop-offload-architecture.md +66 -0
  28. package/template/claude-task-manager/docs/microsoft-dev-tunnel-phone-access-design.md +274 -519
  29. package/template/claude-task-manager/docs/mobile-live-streaming.md +27 -5
  30. package/template/claude-task-manager/docs/mobile-remote-submission-lifecycle.md +69 -0
  31. package/template/claude-task-manager/docs/phone-access-design.md +53 -15
  32. package/template/claude-task-manager/docs/phone-passkey-identity.md +122 -0
  33. package/template/claude-task-manager/docs/phone-setup.md +3 -0
  34. package/template/claude-task-manager/docs/prompt-editing-tree-design.md +25 -1
  35. package/template/claude-task-manager/docs/remote-desktop-access-design.md +268 -0
  36. package/template/claude-task-manager/docs/restart-lifecycle-architecture.md +95 -0
  37. package/template/claude-task-manager/docs/runtime-work-control-plane.md +53 -0
  38. package/template/claude-task-manager/docs/session-interactive-wait-surfaces.md +38 -0
  39. package/template/claude-task-manager/docs/session-needs-you-dismissal.md +84 -0
  40. package/template/claude-task-manager/docs/session-render-state-management-design.md +91 -3
  41. package/template/claude-task-manager/docs/session-standup-command-center-design.md +25 -1
  42. package/template/claude-task-manager/docs/session-title-authority.md +32 -0
  43. package/template/claude-task-manager/docs/session-workspace-binding.md +33 -0
  44. package/template/claude-task-manager/docs/skill-intent-resolution-design.md +72 -0
  45. package/template/claude-task-manager/docs/walle-mcp-supervisor-health.md +86 -0
  46. package/template/claude-task-manager/docs/walle-relay-phone-access-design.md +24 -15
  47. package/template/claude-task-manager/docs/walle-session-history-hydration.md +114 -0
  48. package/template/claude-task-manager/docs/walle-session-input-queue.md +104 -0
  49. package/template/claude-task-manager/docs/walle-session-model-catalog.md +90 -0
  50. package/template/claude-task-manager/docs/walle-session-model-preferences.md +15 -6
  51. package/template/claude-task-manager/git-utils.js +897 -27
  52. package/template/claude-task-manager/lib/agent-capabilities.js +33 -0
  53. package/template/claude-task-manager/lib/agent-cli-cache.js +37 -7
  54. package/template/claude-task-manager/lib/agent-hooks-installer.js +26 -2
  55. package/template/claude-task-manager/lib/agent-presets.js +17 -1
  56. package/template/claude-task-manager/lib/all-sessions-query.js +108 -0
  57. package/template/claude-task-manager/lib/approval-ai-refinement.js +488 -0
  58. package/template/claude-task-manager/lib/approval-self-adapt.js +168 -0
  59. package/template/claude-task-manager/lib/async-semaphore.js +44 -0
  60. package/template/claude-task-manager/lib/auth-context.js +5 -0
  61. package/template/claude-task-manager/lib/auth-rate-limit.js +47 -4
  62. package/template/claude-task-manager/lib/auth-rules.js +29 -2
  63. package/template/claude-task-manager/lib/auto-approval-verifier.js +129 -16
  64. package/template/claude-task-manager/lib/background-llm.js +144 -17
  65. package/template/claude-task-manager/lib/branch-inventory.js +212 -0
  66. package/template/claude-task-manager/lib/claude-desktop-sessions.js +15 -3
  67. package/template/claude-task-manager/lib/coalesce-sync-frames.js +151 -0
  68. package/template/claude-task-manager/lib/codex-launch-health.js +762 -0
  69. package/template/claude-task-manager/lib/codex-transcript-pager.js +51 -0
  70. package/template/claude-task-manager/lib/codex-zst.js +124 -0
  71. package/template/claude-task-manager/lib/coding-agent-models.js +233 -30
  72. package/template/claude-task-manager/lib/connection-health.js +232 -0
  73. package/template/claude-task-manager/lib/conversation-blob-parser.js +42 -0
  74. package/template/claude-task-manager/lib/conversation-tail-merge.js +89 -26
  75. package/template/claude-task-manager/lib/ctm-session-context-api.js +39 -10
  76. package/template/claude-task-manager/lib/cursor-conversation-store.js +354 -0
  77. package/template/claude-task-manager/lib/db-owner-worker-client.js +315 -0
  78. package/template/claude-task-manager/lib/document-review.js +141 -6
  79. package/template/claude-task-manager/lib/escalation-review.js +152 -0
  80. package/template/claude-task-manager/lib/graceful-shutdown.js +159 -0
  81. package/template/claude-task-manager/lib/headless-term-service.js +678 -0
  82. package/template/claude-task-manager/lib/heavy-worker-fallback.js +38 -0
  83. package/template/claude-task-manager/lib/jsonl-conversation-parser.js +542 -0
  84. package/template/claude-task-manager/lib/jsonl-range-reader.js +112 -0
  85. package/template/claude-task-manager/lib/main-db-census.js +216 -0
  86. package/template/claude-task-manager/lib/message-pagination.js +106 -4
  87. package/template/claude-task-manager/lib/microsoft-dev-tunnel-setup.js +750 -26
  88. package/template/claude-task-manager/lib/mobile-auth-api.js +274 -7
  89. package/template/claude-task-manager/lib/mobile-auth-store.js +592 -10
  90. package/template/claude-task-manager/lib/mobile-notification-dispatcher.js +15 -0
  91. package/template/claude-task-manager/lib/model-overview-brain-fallback.js +311 -0
  92. package/template/claude-task-manager/lib/model-overview-cache.js +141 -0
  93. package/template/claude-task-manager/lib/models-health-routing-notice.js +126 -0
  94. package/template/claude-task-manager/lib/node-pin-guard.js +93 -0
  95. package/template/claude-task-manager/lib/perf-tracker.js +242 -6
  96. package/template/claude-task-manager/lib/permission-match.js +76 -0
  97. package/template/claude-task-manager/lib/permission-sync.js +133 -20
  98. package/template/claude-task-manager/lib/process-title.js +35 -0
  99. package/template/claude-task-manager/lib/prompt-executions-query.js +25 -0
  100. package/template/claude-task-manager/lib/prompt-index-disk-cache.js +44 -0
  101. package/template/claude-task-manager/lib/prompt-intent.js +132 -0
  102. package/template/claude-task-manager/lib/provider-user-context.js +34 -0
  103. package/template/claude-task-manager/lib/read-pool-client.js +313 -0
  104. package/template/claude-task-manager/lib/readpool-breaker.js +31 -0
  105. package/template/claude-task-manager/lib/recent-sessions-breaker.js +12 -0
  106. package/template/claude-task-manager/lib/remote-feedback-client.js +72 -0
  107. package/template/claude-task-manager/lib/remote-relay-protocol.js +37 -4
  108. package/template/claude-task-manager/lib/remote-relay-store.js +159 -0
  109. package/template/claude-task-manager/lib/remote-submission-observer.js +278 -0
  110. package/template/claude-task-manager/lib/restart-guard.js +109 -0
  111. package/template/claude-task-manager/lib/restore-interruption-detector.js +439 -0
  112. package/template/claude-task-manager/lib/restore-policy.js +13 -0
  113. package/template/claude-task-manager/lib/restore-resume-batch.js +74 -0
  114. package/template/claude-task-manager/lib/restore-runtime.js +68 -0
  115. package/template/claude-task-manager/lib/restore-storm.js +34 -0
  116. package/template/claude-task-manager/lib/resume-cwd.js +36 -0
  117. package/template/claude-task-manager/lib/resume-preflight.js +313 -0
  118. package/template/claude-task-manager/lib/runtime-work-registry.js +444 -0
  119. package/template/claude-task-manager/lib/sanitize-openai-auth.js +31 -0
  120. package/template/claude-task-manager/lib/scheduler.js +21 -1
  121. package/template/claude-task-manager/lib/scrollback-snapshot-store.js +159 -0
  122. package/template/claude-task-manager/lib/serial-task-queue.js +64 -0
  123. package/template/claude-task-manager/lib/server-listeners.js +239 -0
  124. package/template/claude-task-manager/lib/session-capture.js +42 -7
  125. package/template/claude-task-manager/lib/session-content-backfill.js +131 -0
  126. package/template/claude-task-manager/lib/session-history.js +388 -43
  127. package/template/claude-task-manager/lib/session-host-manager.js +287 -0
  128. package/template/claude-task-manager/lib/session-image-refs.js +209 -0
  129. package/template/claude-task-manager/lib/session-jobs.js +399 -59
  130. package/template/claude-task-manager/lib/session-prompt-index.js +137 -0
  131. package/template/claude-task-manager/lib/session-restore.js +53 -0
  132. package/template/claude-task-manager/lib/session-standup.js +123 -23
  133. package/template/claude-task-manager/lib/session-state-bus.js +14 -0
  134. package/template/claude-task-manager/lib/session-stream.js +64 -16
  135. package/template/claude-task-manager/lib/session-timeline-summary.js +260 -0
  136. package/template/claude-task-manager/lib/session-token-usage.js +494 -0
  137. package/template/claude-task-manager/lib/session-workspace-binding.js +356 -0
  138. package/template/claude-task-manager/lib/setup-network-config.js +9 -0
  139. package/template/claude-task-manager/lib/size-cap.js +45 -0
  140. package/template/claude-task-manager/lib/size-cap.test.js +62 -0
  141. package/template/claude-task-manager/lib/skill-autocomplete.js +180 -1
  142. package/template/claude-task-manager/lib/skill-intent-resolver.js +304 -0
  143. package/template/claude-task-manager/lib/sqlite-driver.js +19 -3
  144. package/template/claude-task-manager/lib/standup-attention.js +7 -3
  145. package/template/claude-task-manager/lib/status-authority.js +39 -0
  146. package/template/claude-task-manager/lib/status-hooks.js +4 -0
  147. package/template/claude-task-manager/lib/storage-migration.js +235 -0
  148. package/template/claude-task-manager/lib/structured-capture.js +298 -0
  149. package/template/claude-task-manager/lib/sync-io-census.js +163 -0
  150. package/template/claude-task-manager/lib/tailscale-setup.js +6 -0
  151. package/template/claude-task-manager/lib/terminal-activity-evidence.js +33 -0
  152. package/template/claude-task-manager/lib/terminal-choice.js +364 -0
  153. package/template/claude-task-manager/lib/terminal-control-sanitize.js +17 -0
  154. package/template/claude-task-manager/lib/terminal-fingerprint.js +48 -0
  155. package/template/claude-task-manager/lib/terminal-output-flush.js +84 -0
  156. package/template/claude-task-manager/lib/timeline-order.js +122 -0
  157. package/template/claude-task-manager/lib/transcript-store.js +348 -43
  158. package/template/claude-task-manager/lib/transport-security.js +84 -1
  159. package/template/claude-task-manager/lib/wait-state.js +184 -0
  160. package/template/claude-task-manager/lib/walle-client.js +47 -5
  161. package/template/claude-task-manager/lib/walle-ctm-history.js +564 -4
  162. package/template/claude-task-manager/lib/walle-external-actions.js +135 -16
  163. package/template/claude-task-manager/lib/walle-history-hydration.js +46 -0
  164. package/template/claude-task-manager/lib/walle-native-health.js +403 -0
  165. package/template/claude-task-manager/lib/walle-repair.js +701 -0
  166. package/template/claude-task-manager/lib/walle-session-cache.js +109 -0
  167. package/template/claude-task-manager/lib/walle-session-context.js +57 -21
  168. package/template/claude-task-manager/lib/walle-session-model-catalog.js +34 -0
  169. package/template/claude-task-manager/lib/walle-supervisor.js +539 -63
  170. package/template/claude-task-manager/lib/walle-transcript.js +52 -0
  171. package/template/claude-task-manager/lib/worktree-active-sync.js +11 -7
  172. package/template/claude-task-manager/lib/worktree-cwd.js +32 -1
  173. package/template/claude-task-manager/package.json +1 -1
  174. package/template/claude-task-manager/prompt-harvest.js +89 -66
  175. package/template/claude-task-manager/providers/claude-code.js +51 -3
  176. package/template/claude-task-manager/providers/cursor.js +140 -45
  177. package/template/claude-task-manager/public/css/reviews.css +551 -61
  178. package/template/claude-task-manager/public/css/setup.css +191 -0
  179. package/template/claude-task-manager/public/css/walle-session.css +865 -10
  180. package/template/claude-task-manager/public/css/walle.css +154 -0
  181. package/template/claude-task-manager/public/designs/ai-providers-consolidation-v2.html +830 -0
  182. package/template/claude-task-manager/public/index.html +18516 -2058
  183. package/template/claude-task-manager/public/ipad.html +363 -0
  184. package/template/claude-task-manager/public/js/document-review-links.js +301 -0
  185. package/template/claude-task-manager/public/js/image-normalize.js +69 -36
  186. package/template/claude-task-manager/public/js/message-renderer.js +1265 -77
  187. package/template/claude-task-manager/public/js/prompts.js +66 -29
  188. package/template/claude-task-manager/public/js/reviews.js +901 -133
  189. package/template/claude-task-manager/public/js/session-activity-utils.js +11 -1
  190. package/template/claude-task-manager/public/js/session-search-utils.js +94 -10
  191. package/template/claude-task-manager/public/js/session-status-precedence.js +23 -5
  192. package/template/claude-task-manager/public/js/setup.js +1273 -176
  193. package/template/claude-task-manager/public/js/stream-view.js +691 -73
  194. package/template/claude-task-manager/public/js/terminal-reconciler.js +210 -0
  195. package/template/claude-task-manager/public/js/walle-session.js +2455 -158
  196. package/template/claude-task-manager/public/js/walle.js +455 -28
  197. package/template/claude-task-manager/public/m/app.css +2909 -262
  198. package/template/claude-task-manager/public/m/app.js +6601 -398
  199. package/template/claude-task-manager/public/m/claim.html +224 -17
  200. package/template/claude-task-manager/public/m/index.html +117 -21
  201. package/template/claude-task-manager/public/m/sw.js +3 -1
  202. package/template/claude-task-manager/public/manifest.json +2 -2
  203. package/template/claude-task-manager/public/prompts.html +30 -14
  204. package/template/claude-task-manager/queue-engine.js +507 -28
  205. package/template/claude-task-manager/scripts/repair-claude-session-images.js +27 -8
  206. package/template/claude-task-manager/server.js +14341 -2197
  207. package/template/claude-task-manager/session-integrity.js +160 -18
  208. package/template/claude-task-manager/session-search-ranking.js +1 -0
  209. package/template/claude-task-manager/session-utils.js +25 -5
  210. package/template/claude-task-manager/workers/approval-blocklist.js +96 -6
  211. package/template/claude-task-manager/workers/approval-widget-validator.js +14 -8
  212. package/template/claude-task-manager/workers/conversation-import-worker.js +11 -50
  213. package/template/claude-task-manager/workers/db-owner-worker.js +386 -0
  214. package/template/claude-task-manager/workers/harvest-worker.js +9 -55
  215. package/template/claude-task-manager/workers/headless-term-worker.js +9 -530
  216. package/template/claude-task-manager/workers/read-pool-worker.js +387 -0
  217. package/template/claude-task-manager/workers/scrollback-worker.js +11 -72
  218. package/template/claude-task-manager/workers/session-host-process.js +146 -0
  219. package/template/claude-task-manager/workers/session-integrity-worker.js +10 -54
  220. package/template/claude-task-manager/workers/state-detectors/base.js +18 -1
  221. package/template/claude-task-manager/workers/state-detectors/claude-code.js +182 -9
  222. package/template/claude-task-manager/workers/state-detectors/codex.js +150 -2
  223. package/template/claude-task-manager/workers/state-detectors/cursor.js +127 -0
  224. package/template/claude-task-manager/workers/state-detectors/gemini.js +21 -0
  225. package/template/claude-task-manager/workers/state-detectors/index.js +29 -0
  226. package/template/claude-task-manager/workers/state-detectors/opencode.js +103 -0
  227. package/template/docs/design/markdown-review-pane.md +206 -0
  228. package/template/docs/designs/2026-05-17-portkey-gateway-provider-ux.md +129 -38
  229. package/template/docs/designs/2026-05-20-mobile-worktree-finish-command.md +27 -0
  230. package/template/docs/designs/2026-05-22-ai-configuration-consolidation.md +248 -0
  231. package/template/docs/designs/ai-configuration-consolidation-mock.html +812 -0
  232. package/template/docs/private-memory-and-pii-policy.md +69 -0
  233. package/template/package.json +2 -1
  234. package/template/scripts/check-private-data.js +201 -0
  235. package/template/shared/sqlite-owner-guard.js +30 -0
  236. package/template/shared/sqlite-owner-write-queue.js +225 -0
  237. package/template/shared/sqlite-storage-policy.js +111 -0
  238. package/template/shared/sqlite-write-lock.js +428 -0
  239. package/template/wall-e/agent-runners/claude-code.js +5 -0
  240. package/template/wall-e/agent.js +166 -22
  241. package/template/wall-e/api-walle.js +524 -70
  242. package/template/wall-e/auth/provider-flows.js +11 -1
  243. package/template/wall-e/bin/walle-mcp-stdio.js +341 -17
  244. package/template/wall-e/brain.js +1614 -141
  245. package/template/wall-e/chat/attachment-blocks.js +96 -0
  246. package/template/wall-e/chat/attachments.js +2 -1
  247. package/template/wall-e/chat/capability-resolver.js +7 -7
  248. package/template/wall-e/chat/context-messages.js +28 -0
  249. package/template/wall-e/chat/conversation-frame.js +630 -0
  250. package/template/wall-e/chat/provider-messages.js +125 -0
  251. package/template/wall-e/chat.js +1002 -233
  252. package/template/wall-e/coding/acceptance-contract.js +170 -0
  253. package/template/wall-e/coding/acp-adapter.js +1 -1
  254. package/template/wall-e/coding/agent-catalog.js +3 -0
  255. package/template/wall-e/coding/artifact-store.js +93 -0
  256. package/template/wall-e/coding/capability-router.js +120 -0
  257. package/template/wall-e/coding/coding-run-controller.js +423 -0
  258. package/template/wall-e/coding/compaction-service.js +157 -12
  259. package/template/wall-e/coding/frontend-verification.js +258 -0
  260. package/template/wall-e/coding/lifecycle-hooks.js +75 -0
  261. package/template/wall-e/coding/local-preview-contract.js +157 -0
  262. package/template/wall-e/coding/permission-service.js +57 -13
  263. package/template/wall-e/coding/prompt-bundle.js +19 -1
  264. package/template/wall-e/coding/prompt-section-registry.js +227 -0
  265. package/template/wall-e/coding/provider-compat.js +15 -0
  266. package/template/wall-e/coding/runtime-events.js +224 -0
  267. package/template/wall-e/coding/runtime-mode.js +3 -0
  268. package/template/wall-e/coding/side-git-snapshot.js +160 -4
  269. package/template/wall-e/coding/snapshot-service.js +143 -1
  270. package/template/wall-e/coding/stream-processor.js +388 -34
  271. package/template/wall-e/coding/task-tool.js +141 -4
  272. package/template/wall-e/coding/tool-execution-controller.js +365 -0
  273. package/template/wall-e/coding/tool-registry.js +43 -5
  274. package/template/wall-e/coding/user-hooks.js +217 -0
  275. package/template/wall-e/coding-orchestrator.js +1330 -221
  276. package/template/wall-e/coding-prompts.js +20 -4
  277. package/template/wall-e/context/context-builder.js +15 -2
  278. package/template/wall-e/decision/confidence.js +1 -1
  279. package/template/wall-e/docs/coding-acceptance-contract.md +41 -0
  280. package/template/wall-e/docs/external-action-controller.md +26 -6
  281. package/template/wall-e/docs/telemetry-lifecycle.md +8 -2
  282. package/template/wall-e/embeddings.js +591 -53
  283. package/template/wall-e/external-action-controller.js +12 -0
  284. package/template/wall-e/http/auth.js +1 -0
  285. package/template/wall-e/http/chat-api.js +46 -11
  286. package/template/wall-e/http/model-admin.js +836 -34
  287. package/template/wall-e/lib/boot-profile.js +88 -0
  288. package/template/wall-e/lib/event-loop-monitor.js +93 -0
  289. package/template/wall-e/lib/service-health.js +194 -0
  290. package/template/wall-e/llm/anthropic.js +130 -5
  291. package/template/wall-e/llm/client.js +266 -63
  292. package/template/wall-e/llm/default-fallback.js +382 -0
  293. package/template/wall-e/llm/health.js +19 -0
  294. package/template/wall-e/llm/message-guard.js +78 -0
  295. package/template/wall-e/llm/model-catalog.js +252 -1
  296. package/template/wall-e/llm/openai.js +26 -4
  297. package/template/wall-e/llm/portkey-sync.js +654 -0
  298. package/template/wall-e/llm/provider-error.js +30 -2
  299. package/template/wall-e/llm/registry.js +5 -1
  300. package/template/wall-e/llm/request-compat.js +67 -0
  301. package/template/wall-e/loops/backfill.js +79 -23
  302. package/template/wall-e/loops/brain-optimize.js +67 -0
  303. package/template/wall-e/loops/ingest.js +25 -10
  304. package/template/wall-e/loops/question-digest.js +160 -0
  305. package/template/wall-e/loops/reflect.js +6 -4
  306. package/template/wall-e/loops/think.js +39 -12
  307. package/template/wall-e/mcp-server.js +318 -36
  308. package/template/wall-e/memory/ctm-context-client.js +52 -14
  309. package/template/wall-e/memory/ctm-operational-context.js +237 -0
  310. package/template/wall-e/memory/ctm-prompt-executions-client.js +128 -0
  311. package/template/wall-e/memory/ctm-session-context.js +111 -63
  312. package/template/wall-e/prompts/coding/deepseek.txt +3 -0
  313. package/template/wall-e/prompts/coding/gemini.txt +6 -0
  314. package/template/wall-e/prompts/coding/gpt.txt +6 -0
  315. package/template/wall-e/prompts/coding/local.txt +7 -0
  316. package/template/wall-e/runtime/decision-hooks.js +115 -0
  317. package/template/wall-e/runtime/devbox-gateway.js +82 -8
  318. package/template/wall-e/runtime/prompt-manifest.js +86 -0
  319. package/template/wall-e/runtime/tool-executor.js +269 -0
  320. package/template/wall-e/runtime/tool-result-envelope.js +138 -0
  321. package/template/wall-e/runtime/transcript-projection.js +60 -0
  322. package/template/wall-e/runtime/walle-runtime.js +224 -0
  323. package/template/wall-e/scripts/db-optimize/migrate.js +162 -0
  324. package/template/wall-e/scripts/db-optimize/recall-eval.js +117 -0
  325. package/template/wall-e/server.js +15 -0
  326. package/template/wall-e/session-files.js +9 -0
  327. package/template/wall-e/skills/_bundled/google-calendar/run.js +1 -1
  328. package/template/wall-e/skills/_bundled/gws-workspace/run.js +1 -1
  329. package/template/wall-e/skills/_bundled/slack-mentions/run.js +76 -6
  330. package/template/wall-e/skills/claude-code-reader.js +7 -3
  331. package/template/wall-e/skills/script-skill-runner.js +10 -0
  332. package/template/wall-e/skills/skill-planner.js +38 -0
  333. package/template/wall-e/tools/builtin-middleware.js +19 -9
  334. package/template/wall-e/tools/local-tools.js +1428 -16
  335. package/template/wall-e/tools/permission-checker.js +73 -5
  336. package/template/wall-e/tools/question-manager.js +117 -7
  337. package/template/wall-e/training/harvester.js +12 -28
  338. package/template/wall-e/training/replay.js +25 -80
  339. package/template/website/index.html +10 -10
  340. package/template/wall-e/eval/ab-test.js +0 -203
  341. package/template/wall-e/eval/agent-runner.js +0 -772
  342. package/template/wall-e/eval/agent-scorer.js +0 -461
  343. package/template/wall-e/eval/aggregator.js +0 -414
  344. package/template/wall-e/eval/allowed-test-commands.js +0 -34
  345. package/template/wall-e/eval/benchmark-generator.js +0 -113
  346. package/template/wall-e/eval/benchmarks/chat-eval.json +0 -1662
  347. package/template/wall-e/eval/benchmarks/chat.json +0 -82
  348. package/template/wall-e/eval/benchmarks/coding-agent-real.json +0 -1
  349. package/template/wall-e/eval/benchmarks/coding-agent.json +0 -1581
  350. package/template/wall-e/eval/benchmarks/coding.json +0 -122
  351. package/template/wall-e/eval/benchmarks/memory-retrieval.json +0 -234
  352. package/template/wall-e/eval/benchmarks/reasoning.json +0 -82
  353. package/template/wall-e/eval/benchmarks/swebench-lite-30.json +0 -212
  354. package/template/wall-e/eval/benchmarks.js +0 -669
  355. package/template/wall-e/eval/cc-replay.js +0 -719
  356. package/template/wall-e/eval/chat-eval.js +0 -525
  357. package/template/wall-e/eval/check-keys.js +0 -15
  358. package/template/wall-e/eval/check-providers.js +0 -42
  359. package/template/wall-e/eval/codex-cli-baseline.js +0 -669
  360. package/template/wall-e/eval/coding-agent-real.js +0 -570
  361. package/template/wall-e/eval/context-compactor.js +0 -251
  362. package/template/wall-e/eval/debug-agent003.js +0 -68
  363. package/template/wall-e/eval/diagnostics.js +0 -216
  364. package/template/wall-e/eval/eval-orchestrator.js +0 -642
  365. package/template/wall-e/eval/evaluate.js +0 -202
  366. package/template/wall-e/eval/evaluator.js +0 -373
  367. package/template/wall-e/eval/exporter.js +0 -212
  368. package/template/wall-e/eval/fixtures/express-basic/package.json +0 -9
  369. package/template/wall-e/eval/fixtures/express-basic/server.js +0 -115
  370. package/template/wall-e/eval/fixtures/express-basic/test.js +0 -83
  371. package/template/wall-e/eval/fixtures/express-buggy/package.json +0 -9
  372. package/template/wall-e/eval/fixtures/express-buggy/server.js +0 -113
  373. package/template/wall-e/eval/fixtures/express-buggy/test.js +0 -83
  374. package/template/wall-e/eval/fixtures/express-buggy-items/package.json +0 -9
  375. package/template/wall-e/eval/fixtures/express-buggy-items/server.js +0 -112
  376. package/template/wall-e/eval/fixtures/express-buggy-items/test.js +0 -83
  377. package/template/wall-e/eval/fixtures/express-buggy-search/package.json +0 -9
  378. package/template/wall-e/eval/fixtures/express-buggy-search/server.js +0 -121
  379. package/template/wall-e/eval/fixtures/express-buggy-search/test.js +0 -83
  380. package/template/wall-e/eval/fixtures/express-rename-data/data.js +0 -34
  381. package/template/wall-e/eval/fixtures/express-rename-data/package.json +0 -9
  382. package/template/wall-e/eval/fixtures/express-rename-data/server.js +0 -97
  383. package/template/wall-e/eval/fixtures/express-rename-data/test.js +0 -88
  384. package/template/wall-e/eval/fixtures/express-xss/package.json +0 -12
  385. package/template/wall-e/eval/fixtures/express-xss/server.js +0 -90
  386. package/template/wall-e/eval/fixtures/express-xss/test.js +0 -67
  387. package/template/wall-e/eval/fixtures/express-xss/views/profile.ejs +0 -9
  388. package/template/wall-e/eval/fixtures/fullstack-app/config/default.js +0 -9
  389. package/template/wall-e/eval/fixtures/fullstack-app/config/test.js +0 -13
  390. package/template/wall-e/eval/fixtures/fullstack-app/package.json +0 -11
  391. package/template/wall-e/eval/fixtures/fullstack-app/public/css/style.css +0 -137
  392. package/template/wall-e/eval/fixtures/fullstack-app/public/index.html +0 -46
  393. package/template/wall-e/eval/fixtures/fullstack-app/public/js/app.js +0 -121
  394. package/template/wall-e/eval/fixtures/fullstack-app/public/js/auth.js +0 -71
  395. package/template/wall-e/eval/fixtures/fullstack-app/public/js/items.js +0 -80
  396. package/template/wall-e/eval/fixtures/fullstack-app/public/js/users.js +0 -46
  397. package/template/wall-e/eval/fixtures/fullstack-app/public/login.html +0 -45
  398. package/template/wall-e/eval/fixtures/fullstack-app/public/register.html +0 -38
  399. package/template/wall-e/eval/fixtures/fullstack-app/scripts/migrate.js +0 -23
  400. package/template/wall-e/eval/fixtures/fullstack-app/scripts/seed.js +0 -46
  401. package/template/wall-e/eval/fixtures/fullstack-app/server/db.js +0 -99
  402. package/template/wall-e/eval/fixtures/fullstack-app/server/index.js +0 -94
  403. package/template/wall-e/eval/fixtures/fullstack-app/server/middleware/auth.js +0 -19
  404. package/template/wall-e/eval/fixtures/fullstack-app/server/middleware/logger.js +0 -19
  405. package/template/wall-e/eval/fixtures/fullstack-app/server/router.js +0 -50
  406. package/template/wall-e/eval/fixtures/fullstack-app/server/routes/auth.js +0 -69
  407. package/template/wall-e/eval/fixtures/fullstack-app/server/routes/health.js +0 -23
  408. package/template/wall-e/eval/fixtures/fullstack-app/server/routes/items.js +0 -88
  409. package/template/wall-e/eval/fixtures/fullstack-app/server/routes/users.js +0 -75
  410. package/template/wall-e/eval/fixtures/fullstack-app/server/test.js +0 -198
  411. package/template/wall-e/eval/fixtures/fullstack-app/server/utils/response.js +0 -34
  412. package/template/wall-e/eval/fixtures/fullstack-app/server/utils/validate.js +0 -26
  413. package/template/wall-e/eval/fixtures/fullstack-app/server.js +0 -8
  414. package/template/wall-e/eval/fixtures/fullstack-app/test.js +0 -12
  415. package/template/wall-e/eval/fixtures/monorepo-basic/package.json +0 -8
  416. package/template/wall-e/eval/fixtures/monorepo-basic/packages/api/data.js +0 -58
  417. package/template/wall-e/eval/fixtures/monorepo-basic/packages/api/middleware.js +0 -46
  418. package/template/wall-e/eval/fixtures/monorepo-basic/packages/api/package.json +0 -8
  419. package/template/wall-e/eval/fixtures/monorepo-basic/packages/api/routes.js +0 -64
  420. package/template/wall-e/eval/fixtures/monorepo-basic/packages/api/server.js +0 -56
  421. package/template/wall-e/eval/fixtures/monorepo-basic/packages/api/test.js +0 -116
  422. package/template/wall-e/eval/fixtures/monorepo-basic/packages/cli/commands.js +0 -61
  423. package/template/wall-e/eval/fixtures/monorepo-basic/packages/cli/index.js +0 -62
  424. package/template/wall-e/eval/fixtures/monorepo-basic/packages/cli/output.js +0 -43
  425. package/template/wall-e/eval/fixtures/monorepo-basic/packages/cli/package.json +0 -11
  426. package/template/wall-e/eval/fixtures/monorepo-basic/packages/cli/test.js +0 -44
  427. package/template/wall-e/eval/fixtures/monorepo-basic/packages/shared/formatters.js +0 -43
  428. package/template/wall-e/eval/fixtures/monorepo-basic/packages/shared/index.js +0 -12
  429. package/template/wall-e/eval/fixtures/monorepo-basic/packages/shared/package.json +0 -5
  430. package/template/wall-e/eval/fixtures/monorepo-basic/packages/shared/test.js +0 -55
  431. package/template/wall-e/eval/fixtures/monorepo-basic/packages/shared/validators.js +0 -29
  432. package/template/wall-e/eval/fixtures/monorepo-basic/test.js +0 -46
  433. package/template/wall-e/eval/fixtures/node-cli/index.js +0 -78
  434. package/template/wall-e/eval/fixtures/node-cli/package.json +0 -10
  435. package/template/wall-e/eval/fixtures/node-cli/test.js +0 -57
  436. package/template/wall-e/eval/fixtures/node-typed/package.json +0 -8
  437. package/template/wall-e/eval/fixtures/node-typed/src/handlers.js +0 -31
  438. package/template/wall-e/eval/fixtures/node-typed/src/utils.js +0 -33
  439. package/template/wall-e/eval/fixtures/node-typed/test.js +0 -36
  440. package/template/wall-e/eval/fixtures/python-flask/app.py +0 -14
  441. package/template/wall-e/eval/fixtures/python-flask/requirements.txt +0 -2
  442. package/template/wall-e/eval/fixtures/python-flask/test_app.py +0 -25
  443. package/template/wall-e/eval/fixtures/wall-e-subset/brain.js +0 -105
  444. package/template/wall-e/eval/fixtures/wall-e-subset/eval/aggregator.js +0 -101
  445. package/template/wall-e/eval/fixtures/wall-e-subset/eval/benchmarks/chat.json +0 -20
  446. package/template/wall-e/eval/fixtures/wall-e-subset/eval/benchmarks/coding.json +0 -32
  447. package/template/wall-e/eval/fixtures/wall-e-subset/eval/benchmarks.js +0 -64
  448. package/template/wall-e/eval/fixtures/wall-e-subset/eval/fixtures/simple-project/package.json +0 -6
  449. package/template/wall-e/eval/fixtures/wall-e-subset/eval/fixtures/simple-project/server.js +0 -31
  450. package/template/wall-e/eval/fixtures/wall-e-subset/eval/fixtures/simple-project/test.js +0 -18
  451. package/template/wall-e/eval/fixtures/wall-e-subset/eval/fixtures/simple-project/utils.js +0 -34
  452. package/template/wall-e/eval/fixtures/wall-e-subset/eval/runner.js +0 -104
  453. package/template/wall-e/eval/fixtures/wall-e-subset/eval/scorer.js +0 -73
  454. package/template/wall-e/eval/fixtures/wall-e-subset/eval/test.js +0 -134
  455. package/template/wall-e/eval/fixtures/wall-e-subset/llm/client.js +0 -99
  456. package/template/wall-e/eval/fixtures/wall-e-subset/llm/providers.js +0 -63
  457. package/template/wall-e/eval/fixtures/wall-e-subset/llm/test.js +0 -70
  458. package/template/wall-e/eval/fixtures/wall-e-subset/package.json +0 -10
  459. package/template/wall-e/eval/fixtures/wall-e-subset/test.js +0 -86
  460. package/template/wall-e/eval/harvester.js +0 -685
  461. package/template/wall-e/eval/head-to-head.js +0 -388
  462. package/template/wall-e/eval/humaneval-adapter.js +0 -321
  463. package/template/wall-e/eval/list-models.js +0 -31
  464. package/template/wall-e/eval/livecodebench-adapter.js +0 -291
  465. package/template/wall-e/eval/mail-integration.js +0 -443
  466. package/template/wall-e/eval/manifest.js +0 -186
  467. package/template/wall-e/eval/meta-harness/adapters/coding-agent.js +0 -57
  468. package/template/wall-e/eval/meta-harness/bootstrap-snapshot.js +0 -149
  469. package/template/wall-e/eval/meta-harness/candidate-store.js +0 -117
  470. package/template/wall-e/eval/meta-harness/cli.js +0 -86
  471. package/template/wall-e/eval/meta-harness/domain-spec.js +0 -154
  472. package/template/wall-e/eval/meta-harness/domains/coding-agent.domain.json +0 -84
  473. package/template/wall-e/eval/meta-harness/examples/env-bootstrap-candidate.js +0 -29
  474. package/template/wall-e/eval/meta-harness/experience-store.js +0 -174
  475. package/template/wall-e/eval/meta-harness/frontier.js +0 -96
  476. package/template/wall-e/eval/meta-harness/harness-interface.js +0 -90
  477. package/template/wall-e/eval/meta-harness/leakage-guard.js +0 -80
  478. package/template/wall-e/eval/meta-harness/optimizer.js +0 -207
  479. package/template/wall-e/eval/meta-harness/proposer-runner.js +0 -110
  480. package/template/wall-e/eval/meta-harness/reporting.js +0 -58
  481. package/template/wall-e/eval/meta-harness/telemetry.js +0 -27
  482. package/template/wall-e/eval/meta-harness/validation.js +0 -81
  483. package/template/wall-e/eval/promoter.js +0 -228
  484. package/template/wall-e/eval/provider-normalizer.js +0 -33
  485. package/template/wall-e/eval/replay.js +0 -395
  486. package/template/wall-e/eval/run-agent-benchmarks.js +0 -386
  487. package/template/wall-e/eval/run-codex-cli-baseline.js +0 -177
  488. package/template/wall-e/eval/run-coding-agent-real.js +0 -187
  489. package/template/wall-e/eval/run-eval.js +0 -435
  490. package/template/wall-e/eval/run-model-comparison.js +0 -142
  491. package/template/wall-e/eval/session-evaluator.js +0 -187
  492. package/template/wall-e/eval/session-miner.js +0 -207
  493. package/template/wall-e/eval/session-retrieval-benchmark.js +0 -150
  494. package/template/wall-e/eval/session-transcripts.js +0 -509
  495. package/template/wall-e/eval/shadow.js +0 -161
  496. package/template/wall-e/eval/swebench-adapter.js +0 -345
  497. package/template/wall-e/eval/swebench-docker.js +0 -192
  498. package/template/wall-e/eval/train.py +0 -320
  499. package/template/wall-e/eval/trainer.js +0 -232
  500. package/template/wall-e/eval/weekly-eval-loop.js +0 -241
@@ -1,142 +0,0 @@
1
- #!/usr/bin/env node
2
- 'use strict';
3
-
4
- /**
5
- * Compare all available Ollama models on agent-001 (simplest benchmark).
6
- * Records per-model performance data for model-task affinity learning.
7
- */
8
-
9
- const path = require('path');
10
- process.chdir(path.join(__dirname, '..'));
11
-
12
- const { setupSandbox, cleanupSandbox, scoreAgentResult } = require('./agent-runner');
13
- const { execFileSync } = require('child_process');
14
- const benchmarks = require('./benchmarks/coding-agent.json');
15
-
16
- const MODELS = ['gemma4:e4b', 'qwen2.5:7b-instruct-q4_K_M', 'phi4:latest', 'llama3.1:8b-instruct-q4_K_M'];
17
- const BENCH_ID = process.argv[2] || 'agent-001';
18
-
19
/**
 * Run one benchmark against a single model inside a throwaway sandbox.
 *
 * @param {object} bench - Benchmark definition; reads `prompt` and the
 *   optional `agentExpectations.projectFixture` / `testCommand`.
 * @param {string} modelName - Model identifier forwarded to the agent loop.
 * @returns {Promise<object>} Result record for the model. On failure a reduced
 *   record is returned: `success: false`, a zero composite score, the error
 *   message and the elapsed time.
 */
async function runWithModel(bench, modelName) {
  const fixtureName = bench.agentExpectations?.projectFixture || 'express-basic';
  const sandboxDir = setupSandbox(fixtureName);
  // Must live outside the `try` block: the catch handler reads it to report
  // latency, and a binding declared inside `try` is not visible there.
  let start = Date.now();

  try {
    const { runAgentLoop } = require('../coding-orchestrator');
    // Restart the clock so module loading is not counted as model latency.
    start = Date.now();

    const result = await runAgentLoop(bench.prompt, {
      cwd: sandboxDir,
      timeoutMs: 180000,
      model: modelName,
      mode: 'build',
      persistTranscript: false,
    });

    const latencyMs = Date.now() - start;

    // Extract tool calls from log
    const toolCalls = [];
    for (const entry of (result.log || [])) {
      for (const tc of (entry.toolCalls || [])) {
        toolCalls.push(tc.name);
      }
    }

    // Check modified files (tracked changes plus untracked files)
    let modifiedFiles = [];
    try {
      const diff = execFileSync('git', ['diff', '--name-only', 'HEAD'], { cwd: sandboxDir, encoding: 'utf8' });
      const untracked = execFileSync('git', ['ls-files', '--others', '--exclude-standard'], { cwd: sandboxDir, encoding: 'utf8' });
      modifiedFiles = [...diff.trim().split('\n'), ...untracked.trim().split('\n')].filter(Boolean);
    } catch {
      // Best effort: if git cannot be queried, report no modified files.
    }

    // Run tests; `null` means the benchmark declares no runnable test command.
    let testsPassed = null;
    if (bench.agentExpectations?.testCommand === 'npm test') {
      try {
        execFileSync('node', ['test.js'], { cwd: sandboxDir, timeout: 10000, stdio: 'pipe' });
        testsPassed = true;
      } catch { testsPassed = false; }
    }

    const score = scoreAgentResult(bench, {
      actualToolCalls: toolCalls,
      actualFileChanges: modifiedFiles,
      actualTurns: (result.log || []).length,
      testsPassed,
      output: result.output || '',
      success: result.success,
    });

    return {
      model: modelName,
      success: result.success,
      score,
      latencyMs,
      turns: (result.log || []).length,
      toolCalls,
      uniqueTools: [...new Set(toolCalls)],
      modifiedFiles,
      testsPassed,
      error: result.stderr || null,
    };
  } catch (err) {
    return { model: modelName, success: false, score: { composite: 0 }, error: err.message, latencyMs: Date.now() - start };
  } finally {
    cleanupSandbox(sandboxDir);
  }
}
89
-
90
/**
 * CLI entry point: run the selected benchmark against every model in MODELS,
 * print per-model details and a leaderboard, then try to persist the scores
 * through the brain module.
 *
 * Exits the process with status 1 when BENCH_ID matches no benchmark.
 */
async function main() {
  const bench = benchmarks.find(b => b.id === BENCH_ID);
  if (!bench) { console.error(`No benchmark: ${BENCH_ID}`); process.exit(1); }

  console.log(`=== Model Comparison: ${bench.id} (${bench.difficulty}) ===`);
  console.log(`Prompt: ${bench.prompt.slice(0, 100)}...\n`);

  // Models run one after another so they do not compete for the same machine.
  const results = [];
  for (const model of MODELS) {
    console.log(`--- ${model} ---`);
    const r = await runWithModel(bench, model);
    results.push(r);

    console.log(` Score: ${(r.score?.composite || 0).toFixed(3)} | Success: ${r.success} | Turns: ${r.turns || 0} | Time: ${((r.latencyMs || 0) / 1000).toFixed(1)}s`);
    if (r.score?.dimensions) {
      const d = r.score.dimensions;
      console.log(` Tool eff: ${(d.toolEfficiency || 0).toFixed(2)} | Correct: ${(d.correctness || 0).toFixed(2)} | Plan: ${(d.planQuality || 0).toFixed(2)} | Turns: ${(d.turnEconomy || 0).toFixed(2)} | Error: ${(d.errorHandling || 0).toFixed(2)}`);
    }
    console.log(` Tools: ${(r.uniqueTools || []).join(', ') || 'none'}`);
    console.log(` Files: ${(r.modifiedFiles || []).join(', ') || 'none'}`);
    if (r.error) console.log(` Error: ${r.error.slice(0, 200)}`);
    console.log('');
  }

  // Summary table, best composite score first
  console.log('=== LEADERBOARD ===');
  results.sort((a, b) => (b.score?.composite || 0) - (a.score?.composite || 0));
  console.log('Rank | Model | Score | Time | Tools | Files');
  console.log('-----|-------------------------------|-------|--------|-------|------');
  results.forEach((r, i) => {
    console.log(` ${i + 1} | ${r.model.padEnd(29)} | ${(r.score?.composite || 0).toFixed(3)} | ${((r.latencyMs || 0) / 1000).toFixed(1).padStart(5)}s | ${(r.toolCalls || []).length.toString().padStart(5)} | ${(r.modifiedFiles || []).length}`);
  });

  // Record to brain if available. Persistence stays best-effort, but the
  // outcome is reported truthfully: success is only claimed when rows were
  // written, and a failure is surfaced as a warning instead of being dropped.
  try {
    const brain = require('../brain');
    brain.initDb();
    let saved = 0;
    if (typeof brain.insertModelEvaluation === 'function') {
      for (const r of results) {
        brain.insertModelEvaluation({
          modelRegistryId: r.model,
          taskType: 'coding-agent',
          qualityScore: r.score?.composite || 0,
          latencyMs: r.latencyMs,
          wasSelected: false,
        });
        saved += 1;
      }
    }
    if (saved > 0) console.log('\nResults saved to brain.');
  } catch (err) {
    console.warn(`\nResults not saved to brain: ${err.message}`);
  }
}
141
-
142
- main().catch(err => { console.error('Fatal:', err); process.exit(1); });
@@ -1,187 +0,0 @@
1
- 'use strict';
2
-
3
- const { computeAgentScore } = require('./agent-scorer');
4
-
5
/**
 * Score a finished coding session and optionally persist the outcome.
 *
 * @param {object} sessionData - Data produced by the agent loop.
 * @param {string} sessionData.sessionId
 * @param {string} sessionData.prompt - The user's original request.
 * @param {Array} sessionData.toolCalls - Tool calls, as names or `{ name }` objects.
 * @param {number} sessionData.turns - Number of turns consumed.
 * @param {Array} sessionData.filesModified - Paths that were changed.
 * @param {boolean} sessionData.success - Whether the task completed.
 * @param {string} sessionData.output - Final output text.
 * @param {boolean} [sessionData.testsPassed] - Test outcome, if tests were run.
 * @param {number} [sessionData.consecutiveErrors] - Error streak length.
 * @param {object} options
 * @param {object} options.brain - Storage backend; nothing is stored when absent.
 * @param {string} [options.model] - Model that ran the session.
 * @returns {object} `{ evaluation, insights }`.
 */
async function evaluateSession(sessionData, options = {}) {
  const { brain, model } = options;

  // Normalise tool calls to a flat list of non-empty names.
  const toolCallNames = [];
  for (const call of sessionData.toolCalls || []) {
    const callName = typeof call === 'string' ? call : call.name || '';
    if (callName) toolCallNames.push(callName);
  }

  const turnCount = sessionData.turns || 0;
  const succeeded = sessionData.success || false;
  const testOutcome = sessionData.testsPassed ?? null;

  // Multi-dimensional score
  const score = computeAgentScore({
    actualToolCalls: toolCallNames,
    testsPassed: testOutcome,
    success: succeeded,
    output: sessionData.output || '',
    actualFiles: sessionData.filesModified || [],
    actualTurns: turnCount,
    maxTurns: 50,
    consecutiveErrors: sessionData.consecutiveErrors || 0,
  });

  const classifiedType = classifySessionType(sessionData.prompt, toolCallNames);

  const evaluation = {
    sessionId: sessionData.sessionId,
    prompt: sessionData.prompt,
    model: model || 'unknown',
    classifiedType,
    score,
    toolCallCount: toolCallNames.length,
    uniqueToolCount: new Set(toolCallNames).size,
    turns: turnCount,
    filesModified: sessionData.filesModified || [],
    success: succeeded,
    testsPassed: testOutcome,
    timestamp: new Date().toISOString(),
  };

  // Persistence is optional and must never fail the evaluation itself.
  if (brain) {
    try {
      if (typeof brain.insertCodingSession === 'function') {
        const sessionRow = {
          id: sessionData.sessionId,
          session_id: sessionData.sessionId,
          prompt: sessionData.prompt || '',
          tool_calls: JSON.stringify(toolCallNames),
          turns: turnCount,
          files_modified: JSON.stringify(sessionData.filesModified || []),
          git_committed: sessionData.gitCommitted ? 1 : 0,
          git_diff: sessionData.gitDiff || null,
          significance_score: score.composite,
          classified_type: classifiedType,
        };
        brain.insertCodingSession(sessionRow);
      }

      // Feed the same result to the model-evaluation table for the learner.
      if (typeof brain.insertModelEvaluation === 'function') {
        const evaluationRow = {
          modelRegistryId: model || 'coding-orchestrator',
          taskType: classifiedType,
          qualityScore: score.composite,
          latencyMs: sessionData.latencyMs || null,
          inputTokens: sessionData.inputTokens || null,
          outputTokens: sessionData.outputTokens || null,
          wasSelected: true,
        };
        brain.insertModelEvaluation(evaluationRow);
      }
    } catch (err) {
      console.warn('[session-evaluator] Storage failed:', err.message);
    }
  }

  return { evaluation, insights: generateInsights(evaluation) };
}
103
-
104
/**
 * Classify a session from its prompt, falling back to tool usage.
 *
 * Prompt keywords are checked in a fixed priority order (planning, debugging,
 * refactoring, testing, review); the first match wins. Matching is done on
 * substrings of the lower-cased prompt.
 *
 * @param {string} prompt - Original user request; may be empty or missing.
 * @param {string[]} [toolCalls] - Tool names used in the session. A missing or
 *   non-array value is treated as "no tool calls" instead of throwing.
 * @returns {string} One of the `coding:*` type labels.
 */
function classifySessionType(prompt, toolCalls) {
  const p = (prompt || '').toLowerCase();

  if (/plan|design|architect/i.test(p)) return 'coding:planning';
  if (/debug|fix|bug|error|failing/i.test(p)) return 'coding:debugging';
  if (/refactor|extract|rename|reorganize|clean/i.test(p)) return 'coding:refactoring';
  if (/test|spec|coverage/i.test(p)) return 'coding:testing';
  if (/review|assess|check|audit/i.test(p)) return 'coding:review';

  // Classify by dominant tool usage
  const calls = Array.isArray(toolCalls) ? toolCalls : [];
  const genTools = calls.filter(t => /write_file|edit_file|apply_patch/.test(t));
  const readTools = calls.filter(t => /read_file|glob|grep/.test(t));

  // Read-heavy sessions that never wrote anything count as exploration.
  if (genTools.length === 0 && readTools.length > 3) return 'coding:exploration';
  return 'coding:generation';
}
123
-
124
/**
 * Derive learning insights from an evaluation result.
 *
 * @param {object} evaluation - Reads `score`, `classifiedType`, `model`, `turns`.
 *   `score.dimensions` is optional: when a score carries only `composite`,
 *   the dimension-based insights are skipped instead of throwing.
 * @returns {Array<{type: string, content: string}>} Insights in the order
 *   positive, tool efficiency, turn economy, error handling.
 */
function generateInsights(evaluation) {
  const insights = [];
  const { score, classifiedType, model, turns } = evaluation;
  // A missing dimension compares as `undefined < n`, which is false, so an
  // absent dimension simply produces no insight.
  const dimensions = score.dimensions ?? {};

  if (score.composite >= 0.8) {
    insights.push({
      type: 'positive',
      content: `Model ${model} performed well on ${classifiedType} (score ${score.composite.toFixed(2)}, ${turns} turns)`,
    });
  }

  if (dimensions.toolEfficiency < 0.5) {
    insights.push({
      type: 'improvement',
      content: `Tool efficiency was low (${dimensions.toolEfficiency.toFixed(2)}) for ${classifiedType} - consider tool selection optimization`,
    });
  }

  if (dimensions.turnEconomy < 0.3) {
    insights.push({
      type: 'improvement',
      content: `Turn economy was poor (${turns} turns) for ${classifiedType} - task may need better planning phase`,
    });
  }

  if (dimensions.errorHandling < 0.5) {
    insights.push({
      type: 'warning',
      content: `Error handling issues detected in ${classifiedType} session - possible doom loop or repeated failures`,
    });
  }

  return insights;
}
162
-
163
/**
 * Store learning insights as brain memories, one memory per insight.
 *
 * Storage is best-effort: a failure on one insight is ignored and the
 * remaining insights are still attempted.
 *
 * @param {object} brain - Storage backend; nothing happens unless it exposes
 *   an `addMemory` function.
 * @param {Array<{type: string, content: string}>} insights
 * @returns {Promise<void>}
 */
async function storeInsights(brain, insights) {
  if (!brain || !insights || insights.length === 0) return;
  if (typeof brain.addMemory !== 'function') return;

  for (const insight of insights) {
    try {
      // Awaited so a promise-returning addMemory that rejects is caught here
      // rather than surfacing as an unhandled rejection. Awaiting a plain
      // (synchronous) return value is harmless.
      await brain.addMemory({
        source: 'coding-agent-eval',
        type: 'learning',
        content: insight.content,
        // Warnings are ranked above other insight types.
        importance: insight.type === 'warning' ? 0.8 : 0.5,
      });
    } catch { /* non-fatal */ }
  }
}
181
-
182
- module.exports = {
183
- evaluateSession,
184
- classifySessionType,
185
- generateInsights,
186
- storeInsights,
187
- };
@@ -1,207 +0,0 @@
1
- 'use strict';
2
- const fs = require('fs');
3
- const path = require('path');
4
- const { sessionToBenchmark } = require('./benchmark-generator');
5
-
6
- const BENCHMARKS_PATH = path.join(__dirname, 'benchmarks', 'coding-agent.json');
7
- const MAX_PER_WEEK_DEFAULT = 5;
8
- const MIN_SIGNIFICANCE = 0.7;
9
-
10
- /**
11
- * Load existing benchmark IDs from the coding-agent.json file.
12
- * @returns {Set<string>}
13
- */
14
- function loadExistingIds() {
15
- const ids = new Set();
16
- try {
17
- const existing = JSON.parse(fs.readFileSync(BENCHMARKS_PATH, 'utf8'));
18
- for (const b of existing) ids.add(b.id);
19
- } catch { /* file missing or parse error */ }
20
- return ids;
21
- }
22
-
23
- /**
24
- * Count how many benchmarks were generated in the current week (Mon-Sun).
25
- * Checks created_at timestamps of recently converted sessions.
26
- * @param {object} brain
27
- * @returns {number}
28
- */
29
- function countGeneratedThisWeek(brain) {
30
- const now = new Date();
31
- // Get Monday 00:00 of the current week
32
- const day = now.getDay();
33
- const diffToMonday = day === 0 ? 6 : day - 1;
34
- const monday = new Date(now);
35
- monday.setDate(now.getDate() - diffToMonday);
36
- monday.setHours(0, 0, 0, 0);
37
- const weekStart = monday.toISOString();
38
-
39
- // Query sessions that were already converted (benchmark_generated = 1)
40
- // and created this week. This is approximate — we use created_at as proxy
41
- // since we don't track "converted_at" separately.
42
- const sessions = brain.getCodingSessions({ minSignificance: MIN_SIGNIFICANCE, limit: 200 });
43
- return sessions.filter(s => s.benchmark_generated && s.created_at >= weekStart).length;
44
- }
45
-
46
- /**
47
- * Pick a balanced subset of candidates across classified_type categories.
48
- * Distributes evenly, cycling through types until budget is exhausted.
49
- * @param {Array} candidates - session objects
50
- * @param {number} budget - max to pick
51
- * @returns {Array}
52
- */
53
- function pickBalanced(candidates, budget) {
54
- if (candidates.length <= budget) return candidates;
55
-
56
- // Group by classified_type
57
- const groups = {};
58
- for (const c of candidates) {
59
- const type = c.classified_type || 'unknown';
60
- if (!groups[type]) groups[type] = [];
61
- groups[type].push(c);
62
- }
63
-
64
- const picked = [];
65
- const types = Object.keys(groups);
66
- let round = 0;
67
-
68
- while (picked.length < budget) {
69
- let addedAny = false;
70
- for (const type of types) {
71
- if (picked.length >= budget) break;
72
- if (round < groups[type].length) {
73
- picked.push(groups[type][round]);
74
- addedAny = true;
75
- }
76
- }
77
- if (!addedAny) break;
78
- round++;
79
- }
80
-
81
- return picked;
82
- }
83
-
84
- /**
85
- * Mine high-significance coding sessions and generate benchmarks.
86
- *
87
- * @param {object} brain - Brain module instance
88
- * @param {object} opts
89
- * @param {boolean} opts.dryRun - If true, don't write files or mark sessions
90
- * @param {number} opts.maxPerWeek - Weekly cap (default 5)
91
- * @returns {{ candidates: Array, generated: Array, skipped: Array }}
92
- */
93
- async function mineAndGenerate(brain, { dryRun = false, maxPerWeek = MAX_PER_WEEK_DEFAULT } = {}) {
94
- // 1. Check weekly budget
95
- const usedThisWeek = countGeneratedThisWeek(brain);
96
- const remaining = Math.max(0, maxPerWeek - usedThisWeek);
97
-
98
- if (remaining === 0) {
99
- return { candidates: [], generated: [], skipped: [], reason: 'weekly cap reached' };
100
- }
101
-
102
- // 2. Get high-significance unconverted sessions
103
- const sessions = brain.getCodingSessions({ minSignificance: MIN_SIGNIFICANCE, limit: 100 });
104
- const unconverted = sessions.filter(s => !s.benchmark_generated);
105
-
106
- if (unconverted.length === 0) {
107
- return { candidates: [], generated: [], skipped: [], reason: 'no unconverted sessions' };
108
- }
109
-
110
- // 3. Group by classified_type, pick balanced subset
111
- const candidates = pickBalanced(unconverted, remaining);
112
-
113
- // 4. Generate benchmarks
114
- const existingIds = loadExistingIds();
115
- const generated = [];
116
- const skipped = [];
117
-
118
- for (const session of candidates) {
119
- const benchmark = sessionToBenchmark(session, existingIds);
120
- if (benchmark) {
121
- generated.push(benchmark);
122
- existingIds.add(benchmark.id);
123
- } else {
124
- skipped.push({ id: session.id, reason: 'duplicate' });
125
- }
126
- }
127
-
128
- // 5. Persist if not dry-run
129
- if (!dryRun && generated.length > 0) {
130
- // Append to existing benchmarks file
131
- let existing = [];
132
- try {
133
- existing = JSON.parse(fs.readFileSync(BENCHMARKS_PATH, 'utf8'));
134
- } catch { /* start fresh */ }
135
-
136
- existing.push(...generated);
137
- fs.mkdirSync(path.dirname(BENCHMARKS_PATH), { recursive: true });
138
- fs.writeFileSync(BENCHMARKS_PATH, JSON.stringify(existing, null, 2) + '\n');
139
-
140
- // Mark sessions as converted
141
- for (const session of candidates) {
142
- // Only mark if benchmark was actually generated (not skipped)
143
- if (generated.some(b => b.sourceSessionId === session.session_id)) {
144
- brain.markBenchmarkGenerated(session.id);
145
- }
146
- }
147
- }
148
-
149
- return { candidates, generated, skipped };
150
- }
151
-
152
// CLI mode: `node session-miner.js [--dry-run] [--max <n>]`
if (require.main === module) {
  const args = process.argv.slice(2);
  const dryRun = args.includes('--dry-run');
  const maxPerWeek = (() => {
    const idx = args.indexOf('--max');
    if (idx < 0 || !args[idx + 1]) return MAX_PER_WEEK_DEFAULT;
    const parsed = Number.parseInt(args[idx + 1], 10);
    // A non-numeric value would otherwise become NaN and make the run
    // generate nothing without any explanation.
    if (Number.isNaN(parsed) || parsed < 0) {
      console.error(`Invalid --max value: ${args[idx + 1]}`);
      process.exit(1);
    }
    return parsed;
  })();

  (async () => {
    // Initialize brain
    let brain;
    try {
      brain = require('../brain');
      brain.initDb();
    } catch (err) {
      console.error('Failed to initialize brain:', err.message);
      process.exit(1);
    }

    console.log(`Session Miner ${dryRun ? '(DRY RUN)' : ''}`);
    console.log(` Max per week: ${maxPerWeek}`);
    console.log('');

    const result = await mineAndGenerate(brain, { dryRun, maxPerWeek });

    // `reason` is only present when the run stopped before generating.
    if (result.reason) {
      console.log(`Skipped: ${result.reason}`);
      return;
    }

    console.log(`Candidates: ${result.candidates.length}`);
    console.log(`Generated: ${result.generated.length}`);
    console.log(`Skipped: ${result.skipped.length}`);

    if (result.generated.length > 0) {
      console.log('\nGenerated benchmarks:');
      for (const b of result.generated) {
        console.log(` - ${b.id} [${b.classifiedType}] ${b.difficulty} (${b.complexityIndicator} files)`);
        if (b.expectedDiff) console.log(` has golden diff: ${b.expectedDiff.length} chars`);
      }
    }

    if (result.skipped.length > 0) {
      console.log('\nSkipped:');
      for (const s of result.skipped) {
        console.log(` - ${s.id}: ${s.reason}`);
      }
    }
  })().catch(err => {
    console.error(err);
    process.exit(1);
  });
}
206
-
207
- module.exports = { mineAndGenerate };
@@ -1,150 +0,0 @@
1
- 'use strict';
2
-
3
- const fs = require('node:fs');
4
- const path = require('node:path');
5
- const brainDefault = require('../brain');
6
- const { indexMemory } = require('../memory/source-indexer');
7
-
8
- const DEFAULT_CASES_PATH = path.join(__dirname, 'benchmarks', 'memory-retrieval.json');
9
-
10
/**
 * Read benchmark cases from a JSON file.
 *
 * @param {string} [filePath] - JSON file to read; defaults to the bundled
 *   memory-retrieval benchmark file.
 * @returns {Array} The parsed cases, or an empty array when the file's top
 *   level is not an array. Read and parse errors propagate to the caller.
 */
function loadMemoryRetrievalCases(filePath = DEFAULT_CASES_PATH) {
  const raw = fs.readFileSync(filePath, 'utf8');
  const cases = JSON.parse(raw);
  if (!Array.isArray(cases)) {
    return [];
  }
  return cases;
}
14
-
15
/**
 * Insert and index the seed memories declared by benchmark cases.
 *
 * @param {object} [options]
 * @param {object} [options.brain] - Storage backend exposing `insertMemory`.
 * @param {Array} [options.cases] - Cases; seeds are read from
 *   `retrieval.seedMemories` of each case.
 * @returns {{inserted: number, indexed: number}} Seeds for which
 *   `insertMemory` returns a falsy value are neither counted nor indexed.
 */
function seedBenchmarkMemories({ brain = brainDefault, cases = [] } = {}) {
  let inserted = 0;
  let indexed = 0;
  for (const bench of cases) {
    const memories = bench.retrieval?.seedMemories || [];
    for (const seed of memories) {
      // Fields shared by the stored row and the index entry are normalised
      // once, so both always agree. In particular a defaulted timestamp is
      // taken a single time instead of once per call.
      const shared = {
        source: seed.source || 'codex-jsonl',
        source_id: seed.source_id,
        source_channel: seed.cwd || '',
        memory_type: seed.memory_type || 'coding_session_exchange',
        content: seed.content,
        metadata: JSON.stringify(seed.metadata || {}),
        timestamp: seed.timestamp || new Date().toISOString(),
      };
      const result = brain.insertMemory({
        ...shared,
        direction: seed.direction || 'exchange',
        subject: seed.subject || seed.source_id,
        content_raw: seed.content,
        importance: seed.importance ?? 0.7,
      });
      if (!result) continue;
      inserted++;
      // The raw seed is spread first so the normalised fields override it.
      indexMemory({ ...seed, id: result.id, ...shared }, { brain });
      indexed++;
    }
  }
  return { inserted, indexed };
}
52
-
53
/**
 * Run one retrieval query against both the direct memory search and the
 * memory index, and return the de-duplicated union.
 *
 * @param {object} [options]
 * @param {object} [options.brain] - Backend exposing `searchMemories` and
 *   `searchMemoryIndex`.
 * @param {string} options.query - Search text.
 * @param {number} [options.limit=10] - Result cap, clamped to 1..50; a
 *   non-numeric or zero value falls back to 10.
 * @returns {Array} Up to `limit` memories, direct hits first.
 */
function searchRetrievalCase({ brain = brainDefault, query, limit = 10 } = {}) {
  const requested = Number(limit) || 10;
  const max = Math.min(50, Math.max(1, requested));
  // Over-fetch from each source so the merge still has enough rows after
  // duplicates are dropped.
  const fetchSize = max * 3;

  const direct = brain.searchMemories({ query, limit: fetchSize });

  let fromIndex;
  try {
    const indexRows = brain.searchMemoryIndex({ query, limit: fetchSize });
    fromIndex = hydrateIndexRows(brain, indexRows);
  } catch {
    // Index lookup is optional; fall back to the direct hits alone.
    fromIndex = [];
  }

  const merged = mergeById(direct, fromIndex);
  return merged.slice(0, max);
}
63
-
64
/**
 * Score one retrieval case: did any expected source id appear in the top k?
 *
 * @param {object} bench - Case; reads `id`, `retrieval.query` and
 *   `retrieval.expectedSourceIds`.
 * @param {Array} results - Retrieved memories, best first.
 * @param {object} [options]
 * @param {number[]} [options.ks=[5, 10]] - Cut-offs to evaluate.
 * @returns {object} `{ id, query, expected, returned }` plus one boolean
 *   `hit_at_<k>` per cut-off.
 */
function scoreRetrievalCase(bench, results, { ks = [5, 10] } = {}) {
  const spec = bench.retrieval;
  const expected = new Set(spec?.expectedSourceIds || []);
  const returned = results.map((result) => resultSourceId(result));

  const scored = {
    id: bench.id,
    query: spec?.query || '',
    expected: [...expected],
    returned,
  };

  ks.forEach((k) => {
    const topK = returned.slice(0, k);
    scored[`hit_at_${k}`] = topK.some((sourceId) => expected.has(sourceId));
  });

  return scored;
}
78
-
79
/**
 * Run every retrieval case and summarise the hit rates.
 *
 * @param {object} [options]
 * @param {object} [options.brain] - Storage/search backend.
 * @param {Array} [options.cases] - Cases to run; defaults to the bundled file.
 * @param {boolean} [options.seed=false] - Seed the cases' memories first.
 * @param {number} [options.limit=10] - Per-query result cap.
 * @returns {object} Summary produced by `summarizeRetrievalResults`.
 */
function runMemoryRetrievalBenchmark({ brain = brainDefault, cases = loadMemoryRetrievalCases(), seed = false, limit = 10 } = {}) {
  if (seed) {
    seedBenchmarkMemories({ brain, cases });
  }

  const scored = [];
  for (const bench of cases) {
    // Prefer the explicit retrieval query; fall back to the case prompt.
    const query = bench.retrieval?.query || bench.prompt || '';
    const hits = searchRetrievalCase({ brain, query, limit });
    const outcome = scoreRetrievalCase(bench, hits);
    scored.push(outcome);
  }

  return summarizeRetrievalResults(scored);
}
89
-
90
/**
 * Aggregate per-case scores into recall figures.
 *
 * @param {Array<object>} results - Scored cases carrying `hit_at_5` and
 *   `hit_at_10` flags.
 * @returns {{total: number, recall_at_5: number, recall_at_10: number, results: Array}}
 *   Both recalls are 0 for an empty input.
 */
function summarizeRetrievalResults(results) {
  // Divide by 1 for an empty list so the recalls come out as 0, not NaN.
  const denominator = results.length || 1;
  const countHits = (flag) => results.reduce((count, entry) => (entry[flag] ? count + 1 : count), 0);

  return {
    total: results.length,
    recall_at_5: countHits('hit_at_5') / denominator,
    recall_at_10: countHits('hit_at_10') / denominator,
    results,
  };
}
101
-
102
/**
 * Load the full memory rows referenced by index search hits.
 *
 * @param {object} brain - Backend exposing `getDb()`, whose result supports
 *   `prepare(sql).all(...params)`.
 * @param {Array<{memory_id: *}>} rows - Index hits, best first.
 * @returns {Array} Non-archived memory rows in the same order as the index
 *   hits. Rows whose id cannot be matched back to a hit are kept at the end.
 */
function hydrateIndexRows(brain, rows) {
  if (!rows?.length) return [];
  const ids = [...new Set(rows.map((row) => row.memory_id).filter(Boolean))];
  if (!ids.length) return [];
  const placeholders = ids.map(() => '?').join(',');
  const hydrated = brain.getDb().prepare(`
    SELECT * FROM memories
    WHERE archived_at IS NULL AND id IN (${placeholders})
  `).all(...ids);

  // The query has no ORDER BY, so rows come back in an unspecified order.
  // Restore the order of the index hits; callers truncate the merged list
  // and would otherwise cut by storage order rather than by relevance.
  // Ids are compared as strings so a numeric id still matches a string one.
  const rank = new Map(ids.map((id, position) => [String(id), position]));
  const positionOf = (row) => rank.get(String(row.id)) ?? ids.length;
  return [...hydrated].sort((a, b) => positionOf(a) - positionOf(b));
}
112
-
113
/**
 * Concatenate several result lists, keeping only the first item seen for
 * each `id`. Falsy groups, and items without a truthy `id`, are skipped.
 *
 * @param {...Array} groups - Lists in priority order.
 * @returns {Array} Unique items in first-seen order.
 */
function mergeById(...groups) {
  // A Map preserves insertion order, so it is both the "seen" set and the
  // ordered output.
  const firstById = new Map();
  for (const group of groups) {
    if (!group) continue;
    for (const item of group) {
      const id = item?.id;
      if (id && !firstById.has(id)) {
        firstById.set(id, item);
      }
    }
  }
  return [...firstById.values()];
}
125
-
126
/**
 * Work out the source id a retrieved memory belongs to.
 *
 * A `sourceId` inside the memory's JSON `metadata` wins. Otherwise the
 * memory's `source_id` is used, cut down to its first two colon-separated
 * segments when it has more than two.
 *
 * @param {object} [result] - Retrieved memory row.
 * @returns {string} The source id; empty string when none is available.
 */
function resultSourceId(result = {}) {
  let fromMetadata;
  try {
    fromMetadata = JSON.parse(result.metadata || '{}')?.sourceId;
  } catch {
    // Unparsable metadata: fall through to source_id.
    fromMetadata = undefined;
  }
  if (fromMetadata) return fromMetadata;

  const raw = String(result.source_id || '');
  const segments = raw.split(':');
  if (segments.length <= 2) return raw;
  return `${segments[0]}:${segments[1]}`;
}
135
-
136
// CLI mode: run the benchmark and print the summary as JSON.
// Pass `--seed` to insert the cases' seed memories first.
if (require.main === module) {
  brainDefault.initDb();
  const shouldSeed = process.argv.includes('--seed');
  const summary = runMemoryRetrievalBenchmark({ seed: shouldSeed });
  const report = JSON.stringify(summary, null, 2);
  console.log(report);
}
141
-
142
- module.exports = {
143
- loadMemoryRetrievalCases,
144
- resultSourceId,
145
- runMemoryRetrievalBenchmark,
146
- scoreRetrievalCase,
147
- searchRetrievalCase,
148
- seedBenchmarkMemories,
149
- summarizeRetrievalResults,
150
- };