create-walle 0.9.21 → 0.9.23

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (500)
  1. package/README.md +27 -5
  2. package/package.json +2 -2
  3. package/template/CLAUDE.md +2 -2
  4. package/template/LICENSE +1 -1
  5. package/template/bin/ctm-dev-cleanup.js +24 -3
  6. package/template/bin/ctm-launch.sh +13 -0
  7. package/template/bin/dev.sh +156 -18
  8. package/template/bin/node-bin.sh +84 -0
  9. package/template/bin/pin-node.sh +51 -0
  10. package/template/claude-task-manager/api-prompts.js +1203 -182
  11. package/template/claude-task-manager/api-reviews.js +109 -15
  12. package/template/claude-task-manager/approval-agent.js +1360 -280
  13. package/template/claude-task-manager/bin/restart-ctm.sh +64 -23
  14. package/template/claude-task-manager/bin/storage-migration-supervisor.js +338 -0
  15. package/template/claude-task-manager/db.js +4417 -295
  16. package/template/claude-task-manager/docs/app-update-refresh-protocol.md +69 -0
  17. package/template/claude-task-manager/docs/approval-ai-refinement.md +138 -0
  18. package/template/claude-task-manager/docs/approval-rescue-loop.md +74 -0
  19. package/template/claude-task-manager/docs/codex-operational-warning-health.md +107 -0
  20. package/template/claude-task-manager/docs/codex-resume-state-guard-design.md +17 -12
  21. package/template/claude-task-manager/docs/codex-terminal-render-controller-handoff.md +311 -0
  22. package/template/claude-task-manager/docs/coding-agent-hooks-architecture.md +418 -0
  23. package/template/claude-task-manager/docs/conversation-import-freshness.md +20 -0
  24. package/template/claude-task-manager/docs/google-workspace-auth-health.md +77 -0
  25. package/template/claude-task-manager/docs/image-paste-ux.md +13 -0
  26. package/template/claude-task-manager/docs/ipad-web-preview.md +88 -0
  27. package/template/claude-task-manager/docs/main-loop-offload-architecture.md +66 -0
  28. package/template/claude-task-manager/docs/microsoft-dev-tunnel-phone-access-design.md +274 -519
  29. package/template/claude-task-manager/docs/mobile-live-streaming.md +27 -5
  30. package/template/claude-task-manager/docs/mobile-remote-submission-lifecycle.md +69 -0
  31. package/template/claude-task-manager/docs/phone-access-design.md +53 -15
  32. package/template/claude-task-manager/docs/phone-passkey-identity.md +122 -0
  33. package/template/claude-task-manager/docs/phone-setup.md +3 -0
  34. package/template/claude-task-manager/docs/prompt-editing-tree-design.md +25 -1
  35. package/template/claude-task-manager/docs/remote-desktop-access-design.md +268 -0
  36. package/template/claude-task-manager/docs/restart-lifecycle-architecture.md +95 -0
  37. package/template/claude-task-manager/docs/runtime-work-control-plane.md +53 -0
  38. package/template/claude-task-manager/docs/session-interactive-wait-surfaces.md +38 -0
  39. package/template/claude-task-manager/docs/session-needs-you-dismissal.md +84 -0
  40. package/template/claude-task-manager/docs/session-render-state-management-design.md +91 -3
  41. package/template/claude-task-manager/docs/session-standup-command-center-design.md +25 -1
  42. package/template/claude-task-manager/docs/session-title-authority.md +32 -0
  43. package/template/claude-task-manager/docs/session-workspace-binding.md +33 -0
  44. package/template/claude-task-manager/docs/skill-intent-resolution-design.md +72 -0
  45. package/template/claude-task-manager/docs/walle-mcp-supervisor-health.md +86 -0
  46. package/template/claude-task-manager/docs/walle-relay-phone-access-design.md +24 -15
  47. package/template/claude-task-manager/docs/walle-session-history-hydration.md +114 -0
  48. package/template/claude-task-manager/docs/walle-session-input-queue.md +104 -0
  49. package/template/claude-task-manager/docs/walle-session-model-catalog.md +90 -0
  50. package/template/claude-task-manager/docs/walle-session-model-preferences.md +15 -6
  51. package/template/claude-task-manager/git-utils.js +897 -27
  52. package/template/claude-task-manager/lib/agent-capabilities.js +33 -0
  53. package/template/claude-task-manager/lib/agent-cli-cache.js +37 -7
  54. package/template/claude-task-manager/lib/agent-hooks-installer.js +26 -2
  55. package/template/claude-task-manager/lib/agent-presets.js +17 -1
  56. package/template/claude-task-manager/lib/all-sessions-query.js +108 -0
  57. package/template/claude-task-manager/lib/approval-ai-refinement.js +488 -0
  58. package/template/claude-task-manager/lib/approval-self-adapt.js +168 -0
  59. package/template/claude-task-manager/lib/async-semaphore.js +44 -0
  60. package/template/claude-task-manager/lib/auth-context.js +5 -0
  61. package/template/claude-task-manager/lib/auth-rate-limit.js +47 -4
  62. package/template/claude-task-manager/lib/auth-rules.js +29 -2
  63. package/template/claude-task-manager/lib/auto-approval-verifier.js +129 -16
  64. package/template/claude-task-manager/lib/background-llm.js +144 -17
  65. package/template/claude-task-manager/lib/branch-inventory.js +212 -0
  66. package/template/claude-task-manager/lib/claude-desktop-sessions.js +15 -3
  67. package/template/claude-task-manager/lib/coalesce-sync-frames.js +151 -0
  68. package/template/claude-task-manager/lib/codex-launch-health.js +762 -0
  69. package/template/claude-task-manager/lib/codex-transcript-pager.js +51 -0
  70. package/template/claude-task-manager/lib/codex-zst.js +124 -0
  71. package/template/claude-task-manager/lib/coding-agent-models.js +233 -30
  72. package/template/claude-task-manager/lib/connection-health.js +232 -0
  73. package/template/claude-task-manager/lib/conversation-blob-parser.js +42 -0
  74. package/template/claude-task-manager/lib/conversation-tail-merge.js +89 -26
  75. package/template/claude-task-manager/lib/ctm-session-context-api.js +39 -10
  76. package/template/claude-task-manager/lib/cursor-conversation-store.js +354 -0
  77. package/template/claude-task-manager/lib/db-owner-worker-client.js +315 -0
  78. package/template/claude-task-manager/lib/document-review.js +141 -6
  79. package/template/claude-task-manager/lib/escalation-review.js +152 -0
  80. package/template/claude-task-manager/lib/graceful-shutdown.js +159 -0
  81. package/template/claude-task-manager/lib/headless-term-service.js +678 -0
  82. package/template/claude-task-manager/lib/heavy-worker-fallback.js +38 -0
  83. package/template/claude-task-manager/lib/jsonl-conversation-parser.js +542 -0
  84. package/template/claude-task-manager/lib/jsonl-range-reader.js +112 -0
  85. package/template/claude-task-manager/lib/main-db-census.js +216 -0
  86. package/template/claude-task-manager/lib/message-pagination.js +106 -4
  87. package/template/claude-task-manager/lib/microsoft-dev-tunnel-setup.js +750 -26
  88. package/template/claude-task-manager/lib/mobile-auth-api.js +274 -7
  89. package/template/claude-task-manager/lib/mobile-auth-store.js +592 -10
  90. package/template/claude-task-manager/lib/mobile-notification-dispatcher.js +15 -0
  91. package/template/claude-task-manager/lib/model-overview-brain-fallback.js +311 -0
  92. package/template/claude-task-manager/lib/model-overview-cache.js +141 -0
  93. package/template/claude-task-manager/lib/models-health-routing-notice.js +126 -0
  94. package/template/claude-task-manager/lib/node-pin-guard.js +93 -0
  95. package/template/claude-task-manager/lib/perf-tracker.js +242 -6
  96. package/template/claude-task-manager/lib/permission-match.js +76 -0
  97. package/template/claude-task-manager/lib/permission-sync.js +133 -20
  98. package/template/claude-task-manager/lib/process-title.js +35 -0
  99. package/template/claude-task-manager/lib/prompt-executions-query.js +25 -0
  100. package/template/claude-task-manager/lib/prompt-index-disk-cache.js +44 -0
  101. package/template/claude-task-manager/lib/prompt-intent.js +132 -0
  102. package/template/claude-task-manager/lib/provider-user-context.js +34 -0
  103. package/template/claude-task-manager/lib/read-pool-client.js +313 -0
  104. package/template/claude-task-manager/lib/readpool-breaker.js +31 -0
  105. package/template/claude-task-manager/lib/recent-sessions-breaker.js +12 -0
  106. package/template/claude-task-manager/lib/remote-feedback-client.js +72 -0
  107. package/template/claude-task-manager/lib/remote-relay-protocol.js +37 -4
  108. package/template/claude-task-manager/lib/remote-relay-store.js +159 -0
  109. package/template/claude-task-manager/lib/remote-submission-observer.js +278 -0
  110. package/template/claude-task-manager/lib/restart-guard.js +109 -0
  111. package/template/claude-task-manager/lib/restore-interruption-detector.js +439 -0
  112. package/template/claude-task-manager/lib/restore-policy.js +13 -0
  113. package/template/claude-task-manager/lib/restore-resume-batch.js +74 -0
  114. package/template/claude-task-manager/lib/restore-runtime.js +68 -0
  115. package/template/claude-task-manager/lib/restore-storm.js +34 -0
  116. package/template/claude-task-manager/lib/resume-cwd.js +36 -0
  117. package/template/claude-task-manager/lib/resume-preflight.js +313 -0
  118. package/template/claude-task-manager/lib/runtime-work-registry.js +444 -0
  119. package/template/claude-task-manager/lib/sanitize-openai-auth.js +31 -0
  120. package/template/claude-task-manager/lib/scheduler.js +21 -1
  121. package/template/claude-task-manager/lib/scrollback-snapshot-store.js +159 -0
  122. package/template/claude-task-manager/lib/serial-task-queue.js +64 -0
  123. package/template/claude-task-manager/lib/server-listeners.js +239 -0
  124. package/template/claude-task-manager/lib/session-capture.js +42 -7
  125. package/template/claude-task-manager/lib/session-content-backfill.js +131 -0
  126. package/template/claude-task-manager/lib/session-history.js +388 -43
  127. package/template/claude-task-manager/lib/session-host-manager.js +287 -0
  128. package/template/claude-task-manager/lib/session-image-refs.js +209 -0
  129. package/template/claude-task-manager/lib/session-jobs.js +399 -59
  130. package/template/claude-task-manager/lib/session-prompt-index.js +137 -0
  131. package/template/claude-task-manager/lib/session-restore.js +53 -0
  132. package/template/claude-task-manager/lib/session-standup.js +123 -23
  133. package/template/claude-task-manager/lib/session-state-bus.js +14 -0
  134. package/template/claude-task-manager/lib/session-stream.js +64 -16
  135. package/template/claude-task-manager/lib/session-timeline-summary.js +260 -0
  136. package/template/claude-task-manager/lib/session-token-usage.js +494 -0
  137. package/template/claude-task-manager/lib/session-workspace-binding.js +356 -0
  138. package/template/claude-task-manager/lib/setup-network-config.js +9 -0
  139. package/template/claude-task-manager/lib/size-cap.js +45 -0
  140. package/template/claude-task-manager/lib/size-cap.test.js +62 -0
  141. package/template/claude-task-manager/lib/skill-autocomplete.js +180 -1
  142. package/template/claude-task-manager/lib/skill-intent-resolver.js +304 -0
  143. package/template/claude-task-manager/lib/sqlite-driver.js +19 -3
  144. package/template/claude-task-manager/lib/standup-attention.js +7 -3
  145. package/template/claude-task-manager/lib/status-authority.js +39 -0
  146. package/template/claude-task-manager/lib/status-hooks.js +4 -0
  147. package/template/claude-task-manager/lib/storage-migration.js +235 -0
  148. package/template/claude-task-manager/lib/structured-capture.js +298 -0
  149. package/template/claude-task-manager/lib/sync-io-census.js +163 -0
  150. package/template/claude-task-manager/lib/tailscale-setup.js +6 -0
  151. package/template/claude-task-manager/lib/terminal-activity-evidence.js +33 -0
  152. package/template/claude-task-manager/lib/terminal-choice.js +364 -0
  153. package/template/claude-task-manager/lib/terminal-control-sanitize.js +17 -0
  154. package/template/claude-task-manager/lib/terminal-fingerprint.js +48 -0
  155. package/template/claude-task-manager/lib/terminal-output-flush.js +84 -0
  156. package/template/claude-task-manager/lib/timeline-order.js +122 -0
  157. package/template/claude-task-manager/lib/transcript-store.js +348 -43
  158. package/template/claude-task-manager/lib/transport-security.js +84 -1
  159. package/template/claude-task-manager/lib/wait-state.js +184 -0
  160. package/template/claude-task-manager/lib/walle-client.js +47 -5
  161. package/template/claude-task-manager/lib/walle-ctm-history.js +564 -4
  162. package/template/claude-task-manager/lib/walle-external-actions.js +135 -16
  163. package/template/claude-task-manager/lib/walle-history-hydration.js +46 -0
  164. package/template/claude-task-manager/lib/walle-native-health.js +403 -0
  165. package/template/claude-task-manager/lib/walle-repair.js +701 -0
  166. package/template/claude-task-manager/lib/walle-session-cache.js +109 -0
  167. package/template/claude-task-manager/lib/walle-session-context.js +57 -21
  168. package/template/claude-task-manager/lib/walle-session-model-catalog.js +34 -0
  169. package/template/claude-task-manager/lib/walle-supervisor.js +539 -63
  170. package/template/claude-task-manager/lib/walle-transcript.js +52 -0
  171. package/template/claude-task-manager/lib/worktree-active-sync.js +11 -7
  172. package/template/claude-task-manager/lib/worktree-cwd.js +32 -1
  173. package/template/claude-task-manager/package.json +1 -1
  174. package/template/claude-task-manager/prompt-harvest.js +89 -66
  175. package/template/claude-task-manager/providers/claude-code.js +51 -3
  176. package/template/claude-task-manager/providers/cursor.js +140 -45
  177. package/template/claude-task-manager/public/css/reviews.css +551 -61
  178. package/template/claude-task-manager/public/css/setup.css +191 -0
  179. package/template/claude-task-manager/public/css/walle-session.css +865 -10
  180. package/template/claude-task-manager/public/css/walle.css +154 -0
  181. package/template/claude-task-manager/public/designs/ai-providers-consolidation-v2.html +830 -0
  182. package/template/claude-task-manager/public/index.html +18516 -2058
  183. package/template/claude-task-manager/public/ipad.html +363 -0
  184. package/template/claude-task-manager/public/js/document-review-links.js +301 -0
  185. package/template/claude-task-manager/public/js/image-normalize.js +69 -36
  186. package/template/claude-task-manager/public/js/message-renderer.js +1265 -77
  187. package/template/claude-task-manager/public/js/prompts.js +66 -29
  188. package/template/claude-task-manager/public/js/reviews.js +901 -133
  189. package/template/claude-task-manager/public/js/session-activity-utils.js +11 -1
  190. package/template/claude-task-manager/public/js/session-search-utils.js +94 -10
  191. package/template/claude-task-manager/public/js/session-status-precedence.js +23 -5
  192. package/template/claude-task-manager/public/js/setup.js +1273 -176
  193. package/template/claude-task-manager/public/js/stream-view.js +691 -73
  194. package/template/claude-task-manager/public/js/terminal-reconciler.js +210 -0
  195. package/template/claude-task-manager/public/js/walle-session.js +2455 -158
  196. package/template/claude-task-manager/public/js/walle.js +455 -28
  197. package/template/claude-task-manager/public/m/app.css +2909 -262
  198. package/template/claude-task-manager/public/m/app.js +6601 -398
  199. package/template/claude-task-manager/public/m/claim.html +224 -17
  200. package/template/claude-task-manager/public/m/index.html +117 -21
  201. package/template/claude-task-manager/public/m/sw.js +3 -1
  202. package/template/claude-task-manager/public/manifest.json +2 -2
  203. package/template/claude-task-manager/public/prompts.html +30 -14
  204. package/template/claude-task-manager/queue-engine.js +507 -28
  205. package/template/claude-task-manager/scripts/repair-claude-session-images.js +27 -8
  206. package/template/claude-task-manager/server.js +14341 -2197
  207. package/template/claude-task-manager/session-integrity.js +160 -18
  208. package/template/claude-task-manager/session-search-ranking.js +1 -0
  209. package/template/claude-task-manager/session-utils.js +25 -5
  210. package/template/claude-task-manager/workers/approval-blocklist.js +96 -6
  211. package/template/claude-task-manager/workers/approval-widget-validator.js +14 -8
  212. package/template/claude-task-manager/workers/conversation-import-worker.js +11 -50
  213. package/template/claude-task-manager/workers/db-owner-worker.js +386 -0
  214. package/template/claude-task-manager/workers/harvest-worker.js +9 -55
  215. package/template/claude-task-manager/workers/headless-term-worker.js +9 -530
  216. package/template/claude-task-manager/workers/read-pool-worker.js +387 -0
  217. package/template/claude-task-manager/workers/scrollback-worker.js +11 -72
  218. package/template/claude-task-manager/workers/session-host-process.js +146 -0
  219. package/template/claude-task-manager/workers/session-integrity-worker.js +10 -54
  220. package/template/claude-task-manager/workers/state-detectors/base.js +18 -1
  221. package/template/claude-task-manager/workers/state-detectors/claude-code.js +182 -9
  222. package/template/claude-task-manager/workers/state-detectors/codex.js +150 -2
  223. package/template/claude-task-manager/workers/state-detectors/cursor.js +127 -0
  224. package/template/claude-task-manager/workers/state-detectors/gemini.js +21 -0
  225. package/template/claude-task-manager/workers/state-detectors/index.js +29 -0
  226. package/template/claude-task-manager/workers/state-detectors/opencode.js +103 -0
  227. package/template/docs/design/markdown-review-pane.md +206 -0
  228. package/template/docs/designs/2026-05-17-portkey-gateway-provider-ux.md +129 -38
  229. package/template/docs/designs/2026-05-20-mobile-worktree-finish-command.md +27 -0
  230. package/template/docs/designs/2026-05-22-ai-configuration-consolidation.md +248 -0
  231. package/template/docs/designs/ai-configuration-consolidation-mock.html +812 -0
  232. package/template/docs/private-memory-and-pii-policy.md +69 -0
  233. package/template/package.json +2 -1
  234. package/template/scripts/check-private-data.js +201 -0
  235. package/template/shared/sqlite-owner-guard.js +30 -0
  236. package/template/shared/sqlite-owner-write-queue.js +225 -0
  237. package/template/shared/sqlite-storage-policy.js +111 -0
  238. package/template/shared/sqlite-write-lock.js +428 -0
  239. package/template/wall-e/agent-runners/claude-code.js +5 -0
  240. package/template/wall-e/agent.js +166 -22
  241. package/template/wall-e/api-walle.js +524 -70
  242. package/template/wall-e/auth/provider-flows.js +11 -1
  243. package/template/wall-e/bin/walle-mcp-stdio.js +341 -17
  244. package/template/wall-e/brain.js +1614 -141
  245. package/template/wall-e/chat/attachment-blocks.js +96 -0
  246. package/template/wall-e/chat/attachments.js +2 -1
  247. package/template/wall-e/chat/capability-resolver.js +7 -7
  248. package/template/wall-e/chat/context-messages.js +28 -0
  249. package/template/wall-e/chat/conversation-frame.js +630 -0
  250. package/template/wall-e/chat/provider-messages.js +125 -0
  251. package/template/wall-e/chat.js +1002 -233
  252. package/template/wall-e/coding/acceptance-contract.js +170 -0
  253. package/template/wall-e/coding/acp-adapter.js +1 -1
  254. package/template/wall-e/coding/agent-catalog.js +3 -0
  255. package/template/wall-e/coding/artifact-store.js +93 -0
  256. package/template/wall-e/coding/capability-router.js +120 -0
  257. package/template/wall-e/coding/coding-run-controller.js +423 -0
  258. package/template/wall-e/coding/compaction-service.js +157 -12
  259. package/template/wall-e/coding/frontend-verification.js +258 -0
  260. package/template/wall-e/coding/lifecycle-hooks.js +75 -0
  261. package/template/wall-e/coding/local-preview-contract.js +157 -0
  262. package/template/wall-e/coding/permission-service.js +57 -13
  263. package/template/wall-e/coding/prompt-bundle.js +19 -1
  264. package/template/wall-e/coding/prompt-section-registry.js +227 -0
  265. package/template/wall-e/coding/provider-compat.js +15 -0
  266. package/template/wall-e/coding/runtime-events.js +224 -0
  267. package/template/wall-e/coding/runtime-mode.js +3 -0
  268. package/template/wall-e/coding/side-git-snapshot.js +160 -4
  269. package/template/wall-e/coding/snapshot-service.js +143 -1
  270. package/template/wall-e/coding/stream-processor.js +388 -34
  271. package/template/wall-e/coding/task-tool.js +141 -4
  272. package/template/wall-e/coding/tool-execution-controller.js +365 -0
  273. package/template/wall-e/coding/tool-registry.js +43 -5
  274. package/template/wall-e/coding/user-hooks.js +217 -0
  275. package/template/wall-e/coding-orchestrator.js +1330 -221
  276. package/template/wall-e/coding-prompts.js +20 -4
  277. package/template/wall-e/context/context-builder.js +15 -2
  278. package/template/wall-e/decision/confidence.js +1 -1
  279. package/template/wall-e/docs/coding-acceptance-contract.md +41 -0
  280. package/template/wall-e/docs/external-action-controller.md +26 -6
  281. package/template/wall-e/docs/telemetry-lifecycle.md +8 -2
  282. package/template/wall-e/embeddings.js +591 -53
  283. package/template/wall-e/external-action-controller.js +12 -0
  284. package/template/wall-e/http/auth.js +1 -0
  285. package/template/wall-e/http/chat-api.js +46 -11
  286. package/template/wall-e/http/model-admin.js +836 -34
  287. package/template/wall-e/lib/boot-profile.js +88 -0
  288. package/template/wall-e/lib/event-loop-monitor.js +93 -0
  289. package/template/wall-e/lib/service-health.js +194 -0
  290. package/template/wall-e/llm/anthropic.js +130 -5
  291. package/template/wall-e/llm/client.js +266 -63
  292. package/template/wall-e/llm/default-fallback.js +382 -0
  293. package/template/wall-e/llm/health.js +19 -0
  294. package/template/wall-e/llm/message-guard.js +78 -0
  295. package/template/wall-e/llm/model-catalog.js +252 -1
  296. package/template/wall-e/llm/openai.js +26 -4
  297. package/template/wall-e/llm/portkey-sync.js +654 -0
  298. package/template/wall-e/llm/provider-error.js +30 -2
  299. package/template/wall-e/llm/registry.js +5 -1
  300. package/template/wall-e/llm/request-compat.js +67 -0
  301. package/template/wall-e/loops/backfill.js +79 -23
  302. package/template/wall-e/loops/brain-optimize.js +67 -0
  303. package/template/wall-e/loops/ingest.js +25 -10
  304. package/template/wall-e/loops/question-digest.js +160 -0
  305. package/template/wall-e/loops/reflect.js +6 -4
  306. package/template/wall-e/loops/think.js +39 -12
  307. package/template/wall-e/mcp-server.js +318 -36
  308. package/template/wall-e/memory/ctm-context-client.js +52 -14
  309. package/template/wall-e/memory/ctm-operational-context.js +237 -0
  310. package/template/wall-e/memory/ctm-prompt-executions-client.js +128 -0
  311. package/template/wall-e/memory/ctm-session-context.js +111 -63
  312. package/template/wall-e/prompts/coding/deepseek.txt +3 -0
  313. package/template/wall-e/prompts/coding/gemini.txt +6 -0
  314. package/template/wall-e/prompts/coding/gpt.txt +6 -0
  315. package/template/wall-e/prompts/coding/local.txt +7 -0
  316. package/template/wall-e/runtime/decision-hooks.js +115 -0
  317. package/template/wall-e/runtime/devbox-gateway.js +82 -8
  318. package/template/wall-e/runtime/prompt-manifest.js +86 -0
  319. package/template/wall-e/runtime/tool-executor.js +269 -0
  320. package/template/wall-e/runtime/tool-result-envelope.js +138 -0
  321. package/template/wall-e/runtime/transcript-projection.js +60 -0
  322. package/template/wall-e/runtime/walle-runtime.js +224 -0
  323. package/template/wall-e/scripts/db-optimize/migrate.js +162 -0
  324. package/template/wall-e/scripts/db-optimize/recall-eval.js +117 -0
  325. package/template/wall-e/server.js +15 -0
  326. package/template/wall-e/session-files.js +9 -0
  327. package/template/wall-e/skills/_bundled/google-calendar/run.js +1 -1
  328. package/template/wall-e/skills/_bundled/gws-workspace/run.js +1 -1
  329. package/template/wall-e/skills/_bundled/slack-mentions/run.js +76 -6
  330. package/template/wall-e/skills/claude-code-reader.js +7 -3
  331. package/template/wall-e/skills/script-skill-runner.js +10 -0
  332. package/template/wall-e/skills/skill-planner.js +38 -0
  333. package/template/wall-e/tools/builtin-middleware.js +19 -9
  334. package/template/wall-e/tools/local-tools.js +1428 -16
  335. package/template/wall-e/tools/permission-checker.js +73 -5
  336. package/template/wall-e/tools/question-manager.js +117 -7
  337. package/template/wall-e/training/harvester.js +12 -28
  338. package/template/wall-e/training/replay.js +25 -80
  339. package/template/website/index.html +10 -10
  340. package/template/wall-e/eval/ab-test.js +0 -203
  341. package/template/wall-e/eval/agent-runner.js +0 -772
  342. package/template/wall-e/eval/agent-scorer.js +0 -461
  343. package/template/wall-e/eval/aggregator.js +0 -414
  344. package/template/wall-e/eval/allowed-test-commands.js +0 -34
  345. package/template/wall-e/eval/benchmark-generator.js +0 -113
  346. package/template/wall-e/eval/benchmarks/chat-eval.json +0 -1662
  347. package/template/wall-e/eval/benchmarks/chat.json +0 -82
  348. package/template/wall-e/eval/benchmarks/coding-agent-real.json +0 -1
  349. package/template/wall-e/eval/benchmarks/coding-agent.json +0 -1581
  350. package/template/wall-e/eval/benchmarks/coding.json +0 -122
  351. package/template/wall-e/eval/benchmarks/memory-retrieval.json +0 -234
  352. package/template/wall-e/eval/benchmarks/reasoning.json +0 -82
  353. package/template/wall-e/eval/benchmarks/swebench-lite-30.json +0 -212
  354. package/template/wall-e/eval/benchmarks.js +0 -669
  355. package/template/wall-e/eval/cc-replay.js +0 -719
  356. package/template/wall-e/eval/chat-eval.js +0 -525
  357. package/template/wall-e/eval/check-keys.js +0 -15
  358. package/template/wall-e/eval/check-providers.js +0 -42
  359. package/template/wall-e/eval/codex-cli-baseline.js +0 -669
  360. package/template/wall-e/eval/coding-agent-real.js +0 -570
  361. package/template/wall-e/eval/context-compactor.js +0 -251
  362. package/template/wall-e/eval/debug-agent003.js +0 -68
  363. package/template/wall-e/eval/diagnostics.js +0 -216
  364. package/template/wall-e/eval/eval-orchestrator.js +0 -642
  365. package/template/wall-e/eval/evaluate.js +0 -202
  366. package/template/wall-e/eval/evaluator.js +0 -373
  367. package/template/wall-e/eval/exporter.js +0 -212
  368. package/template/wall-e/eval/fixtures/express-basic/package.json +0 -9
  369. package/template/wall-e/eval/fixtures/express-basic/server.js +0 -115
  370. package/template/wall-e/eval/fixtures/express-basic/test.js +0 -83
  371. package/template/wall-e/eval/fixtures/express-buggy/package.json +0 -9
  372. package/template/wall-e/eval/fixtures/express-buggy/server.js +0 -113
  373. package/template/wall-e/eval/fixtures/express-buggy/test.js +0 -83
  374. package/template/wall-e/eval/fixtures/express-buggy-items/package.json +0 -9
  375. package/template/wall-e/eval/fixtures/express-buggy-items/server.js +0 -112
  376. package/template/wall-e/eval/fixtures/express-buggy-items/test.js +0 -83
  377. package/template/wall-e/eval/fixtures/express-buggy-search/package.json +0 -9
  378. package/template/wall-e/eval/fixtures/express-buggy-search/server.js +0 -121
  379. package/template/wall-e/eval/fixtures/express-buggy-search/test.js +0 -83
  380. package/template/wall-e/eval/fixtures/express-rename-data/data.js +0 -34
  381. package/template/wall-e/eval/fixtures/express-rename-data/package.json +0 -9
  382. package/template/wall-e/eval/fixtures/express-rename-data/server.js +0 -97
  383. package/template/wall-e/eval/fixtures/express-rename-data/test.js +0 -88
  384. package/template/wall-e/eval/fixtures/express-xss/package.json +0 -12
  385. package/template/wall-e/eval/fixtures/express-xss/server.js +0 -90
  386. package/template/wall-e/eval/fixtures/express-xss/test.js +0 -67
  387. package/template/wall-e/eval/fixtures/express-xss/views/profile.ejs +0 -9
  388. package/template/wall-e/eval/fixtures/fullstack-app/config/default.js +0 -9
  389. package/template/wall-e/eval/fixtures/fullstack-app/config/test.js +0 -13
  390. package/template/wall-e/eval/fixtures/fullstack-app/package.json +0 -11
  391. package/template/wall-e/eval/fixtures/fullstack-app/public/css/style.css +0 -137
  392. package/template/wall-e/eval/fixtures/fullstack-app/public/index.html +0 -46
  393. package/template/wall-e/eval/fixtures/fullstack-app/public/js/app.js +0 -121
  394. package/template/wall-e/eval/fixtures/fullstack-app/public/js/auth.js +0 -71
  395. package/template/wall-e/eval/fixtures/fullstack-app/public/js/items.js +0 -80
  396. package/template/wall-e/eval/fixtures/fullstack-app/public/js/users.js +0 -46
  397. package/template/wall-e/eval/fixtures/fullstack-app/public/login.html +0 -45
  398. package/template/wall-e/eval/fixtures/fullstack-app/public/register.html +0 -38
  399. package/template/wall-e/eval/fixtures/fullstack-app/scripts/migrate.js +0 -23
  400. package/template/wall-e/eval/fixtures/fullstack-app/scripts/seed.js +0 -46
  401. package/template/wall-e/eval/fixtures/fullstack-app/server/db.js +0 -99
  402. package/template/wall-e/eval/fixtures/fullstack-app/server/index.js +0 -94
  403. package/template/wall-e/eval/fixtures/fullstack-app/server/middleware/auth.js +0 -19
  404. package/template/wall-e/eval/fixtures/fullstack-app/server/middleware/logger.js +0 -19
  405. package/template/wall-e/eval/fixtures/fullstack-app/server/router.js +0 -50
  406. package/template/wall-e/eval/fixtures/fullstack-app/server/routes/auth.js +0 -69
  407. package/template/wall-e/eval/fixtures/fullstack-app/server/routes/health.js +0 -23
  408. package/template/wall-e/eval/fixtures/fullstack-app/server/routes/items.js +0 -88
  409. package/template/wall-e/eval/fixtures/fullstack-app/server/routes/users.js +0 -75
  410. package/template/wall-e/eval/fixtures/fullstack-app/server/test.js +0 -198
  411. package/template/wall-e/eval/fixtures/fullstack-app/server/utils/response.js +0 -34
  412. package/template/wall-e/eval/fixtures/fullstack-app/server/utils/validate.js +0 -26
  413. package/template/wall-e/eval/fixtures/fullstack-app/server.js +0 -8
  414. package/template/wall-e/eval/fixtures/fullstack-app/test.js +0 -12
  415. package/template/wall-e/eval/fixtures/monorepo-basic/package.json +0 -8
  416. package/template/wall-e/eval/fixtures/monorepo-basic/packages/api/data.js +0 -58
  417. package/template/wall-e/eval/fixtures/monorepo-basic/packages/api/middleware.js +0 -46
  418. package/template/wall-e/eval/fixtures/monorepo-basic/packages/api/package.json +0 -8
  419. package/template/wall-e/eval/fixtures/monorepo-basic/packages/api/routes.js +0 -64
  420. package/template/wall-e/eval/fixtures/monorepo-basic/packages/api/server.js +0 -56
  421. package/template/wall-e/eval/fixtures/monorepo-basic/packages/api/test.js +0 -116
  422. package/template/wall-e/eval/fixtures/monorepo-basic/packages/cli/commands.js +0 -61
  423. package/template/wall-e/eval/fixtures/monorepo-basic/packages/cli/index.js +0 -62
  424. package/template/wall-e/eval/fixtures/monorepo-basic/packages/cli/output.js +0 -43
  425. package/template/wall-e/eval/fixtures/monorepo-basic/packages/cli/package.json +0 -11
  426. package/template/wall-e/eval/fixtures/monorepo-basic/packages/cli/test.js +0 -44
  427. package/template/wall-e/eval/fixtures/monorepo-basic/packages/shared/formatters.js +0 -43
  428. package/template/wall-e/eval/fixtures/monorepo-basic/packages/shared/index.js +0 -12
  429. package/template/wall-e/eval/fixtures/monorepo-basic/packages/shared/package.json +0 -5
  430. package/template/wall-e/eval/fixtures/monorepo-basic/packages/shared/test.js +0 -55
  431. package/template/wall-e/eval/fixtures/monorepo-basic/packages/shared/validators.js +0 -29
  432. package/template/wall-e/eval/fixtures/monorepo-basic/test.js +0 -46
  433. package/template/wall-e/eval/fixtures/node-cli/index.js +0 -78
  434. package/template/wall-e/eval/fixtures/node-cli/package.json +0 -10
  435. package/template/wall-e/eval/fixtures/node-cli/test.js +0 -57
  436. package/template/wall-e/eval/fixtures/node-typed/package.json +0 -8
  437. package/template/wall-e/eval/fixtures/node-typed/src/handlers.js +0 -31
  438. package/template/wall-e/eval/fixtures/node-typed/src/utils.js +0 -33
  439. package/template/wall-e/eval/fixtures/node-typed/test.js +0 -36
  440. package/template/wall-e/eval/fixtures/python-flask/app.py +0 -14
  441. package/template/wall-e/eval/fixtures/python-flask/requirements.txt +0 -2
  442. package/template/wall-e/eval/fixtures/python-flask/test_app.py +0 -25
  443. package/template/wall-e/eval/fixtures/wall-e-subset/brain.js +0 -105
  444. package/template/wall-e/eval/fixtures/wall-e-subset/eval/aggregator.js +0 -101
  445. package/template/wall-e/eval/fixtures/wall-e-subset/eval/benchmarks/chat.json +0 -20
  446. package/template/wall-e/eval/fixtures/wall-e-subset/eval/benchmarks/coding.json +0 -32
  447. package/template/wall-e/eval/fixtures/wall-e-subset/eval/benchmarks.js +0 -64
  448. package/template/wall-e/eval/fixtures/wall-e-subset/eval/fixtures/simple-project/package.json +0 -6
  449. package/template/wall-e/eval/fixtures/wall-e-subset/eval/fixtures/simple-project/server.js +0 -31
  450. package/template/wall-e/eval/fixtures/wall-e-subset/eval/fixtures/simple-project/test.js +0 -18
  451. package/template/wall-e/eval/fixtures/wall-e-subset/eval/fixtures/simple-project/utils.js +0 -34
  452. package/template/wall-e/eval/fixtures/wall-e-subset/eval/runner.js +0 -104
  453. package/template/wall-e/eval/fixtures/wall-e-subset/eval/scorer.js +0 -73
  454. package/template/wall-e/eval/fixtures/wall-e-subset/eval/test.js +0 -134
  455. package/template/wall-e/eval/fixtures/wall-e-subset/llm/client.js +0 -99
  456. package/template/wall-e/eval/fixtures/wall-e-subset/llm/providers.js +0 -63
  457. package/template/wall-e/eval/fixtures/wall-e-subset/llm/test.js +0 -70
  458. package/template/wall-e/eval/fixtures/wall-e-subset/package.json +0 -10
  459. package/template/wall-e/eval/fixtures/wall-e-subset/test.js +0 -86
  460. package/template/wall-e/eval/harvester.js +0 -685
  461. package/template/wall-e/eval/head-to-head.js +0 -388
  462. package/template/wall-e/eval/humaneval-adapter.js +0 -321
  463. package/template/wall-e/eval/list-models.js +0 -31
  464. package/template/wall-e/eval/livecodebench-adapter.js +0 -291
  465. package/template/wall-e/eval/mail-integration.js +0 -443
  466. package/template/wall-e/eval/manifest.js +0 -186
  467. package/template/wall-e/eval/meta-harness/adapters/coding-agent.js +0 -57
  468. package/template/wall-e/eval/meta-harness/bootstrap-snapshot.js +0 -149
  469. package/template/wall-e/eval/meta-harness/candidate-store.js +0 -117
  470. package/template/wall-e/eval/meta-harness/cli.js +0 -86
  471. package/template/wall-e/eval/meta-harness/domain-spec.js +0 -154
  472. package/template/wall-e/eval/meta-harness/domains/coding-agent.domain.json +0 -84
  473. package/template/wall-e/eval/meta-harness/examples/env-bootstrap-candidate.js +0 -29
  474. package/template/wall-e/eval/meta-harness/experience-store.js +0 -174
  475. package/template/wall-e/eval/meta-harness/frontier.js +0 -96
  476. package/template/wall-e/eval/meta-harness/harness-interface.js +0 -90
  477. package/template/wall-e/eval/meta-harness/leakage-guard.js +0 -80
  478. package/template/wall-e/eval/meta-harness/optimizer.js +0 -207
  479. package/template/wall-e/eval/meta-harness/proposer-runner.js +0 -110
  480. package/template/wall-e/eval/meta-harness/reporting.js +0 -58
  481. package/template/wall-e/eval/meta-harness/telemetry.js +0 -27
  482. package/template/wall-e/eval/meta-harness/validation.js +0 -81
  483. package/template/wall-e/eval/promoter.js +0 -228
  484. package/template/wall-e/eval/provider-normalizer.js +0 -33
  485. package/template/wall-e/eval/replay.js +0 -395
  486. package/template/wall-e/eval/run-agent-benchmarks.js +0 -386
  487. package/template/wall-e/eval/run-codex-cli-baseline.js +0 -177
  488. package/template/wall-e/eval/run-coding-agent-real.js +0 -187
  489. package/template/wall-e/eval/run-eval.js +0 -435
  490. package/template/wall-e/eval/run-model-comparison.js +0 -142
  491. package/template/wall-e/eval/session-evaluator.js +0 -187
  492. package/template/wall-e/eval/session-miner.js +0 -207
  493. package/template/wall-e/eval/session-retrieval-benchmark.js +0 -150
  494. package/template/wall-e/eval/session-transcripts.js +0 -509
  495. package/template/wall-e/eval/shadow.js +0 -161
  496. package/template/wall-e/eval/swebench-adapter.js +0 -345
  497. package/template/wall-e/eval/swebench-docker.js +0 -192
  498. package/template/wall-e/eval/train.py +0 -320
  499. package/template/wall-e/eval/trainer.js +0 -232
  500. package/template/wall-e/eval/weekly-eval-loop.js +0 -241
@@ -1,202 +0,0 @@
1
- 'use strict';
2
-
3
- const { execFile } = require('node:child_process');
4
- const { promisify } = require('node:util');
5
-
6
- const execFileAsync = promisify(execFile);
7
-
8
- // ---------------------------------------------------------------------------
9
- // Built-in benchmark prompts
10
- // ---------------------------------------------------------------------------
11
-
12
// Default prompt set used when the caller supplies no benchmark prompts.
// Each entry pairs the prompt text with a coarse task category (`type`).
const BENCHMARK_PROMPTS = [
  { prompt: 'Explain the difference between a promise and a callback in JavaScript.', type: 'coding' },
  { prompt: 'Summarize the key points of effective code review.', type: 'coding' },
  { prompt: 'What are the pros and cons of microservices vs monolith?', type: 'reasoning' },
  { prompt: 'Write a function to find the longest palindrome in a string.', type: 'coding' },
  { prompt: 'What should I prioritize when debugging a production outage?', type: 'reasoning' },
];
19
-
20
- // ---------------------------------------------------------------------------
21
- // Scoring helpers
22
- // ---------------------------------------------------------------------------
23
-
24
/**
 * Score a single response on a 0-1 scale.
 *
 * Components:
 *   - coherence: 0.4 for any non-blank response
 *   - detail:    up to 0.3, growing linearly with length and saturating at 500 chars
 *   - speed:     up to 0.3, decaying linearly from full credit at 0 ms to none at 10 s
 *
 * @param {string} response - Model output. Blank or non-string input scores 0.
 * @param {number} latencyMs - Duration of the call in milliseconds. A missing or
 *   non-numeric value earns no speed credit instead of poisoning the score with NaN.
 * @returns {number} Score in [0, 1], rounded to three decimals.
 */
function scoreResponse(response, latencyMs) {
  if (typeof response !== 'string' || !response.trim()) return 0;

  const coherence = 0.4;
  const detail = Math.min(response.length / 500, 1) * 0.3;

  // Clamp to [0, 1] so a negative latency (clock skew, bad mock) cannot push
  // the total above 1, and a non-finite latency cannot turn it into NaN.
  const ms = Number(latencyMs);
  const speedFactor = Number.isFinite(ms) ? Math.min(1, Math.max(0, 1 - ms / 10000)) : 0;
  const speed = speedFactor * 0.3;

  return Math.round((coherence + detail + speed) * 1000) / 1000;
}
44
-
45
- // ---------------------------------------------------------------------------
46
- // Model runner
47
- // ---------------------------------------------------------------------------
48
-
49
/**
 * Send one prompt to an Ollama model through the `ollama run` CLI.
 *
 * A failed CLI call does not reject: the result then carries an empty
 * response together with the error message.
 *
 * @param {string} model - Ollama model name
 * @param {string} prompt - The prompt text
 * @returns {Promise<{ response: string, latencyMs: number, error?: string }>}
 */
async function runOllamaPrompt(model, prompt) {
  const startedAt = Date.now();
  const execOptions = { timeout: 30000, maxBuffer: 1024 * 1024 };

  try {
    const result = await execFileAsync('ollama', ['run', model, prompt], execOptions);
    const response = result.stdout.trim();
    return { response, latencyMs: Date.now() - startedAt };
  } catch (err) {
    return { response: '', latencyMs: Date.now() - startedAt, error: err.message };
  }
}
67
-
68
- // ---------------------------------------------------------------------------
69
- // Main entry point
70
- // ---------------------------------------------------------------------------
71
-
72
/**
 * Benchmark fine-tuned models against a baseline model and deploy the best
 * one when it beats the baseline often enough.
 *
 * Every prompt is run against every model and scored with `scoreResponse`.
 * A fine-tuned model "wins" a prompt when its score is strictly higher than
 * the baseline's. The model with the highest win rate (ties broken by average
 * score) becomes the winner only if its win rate exceeds `winThreshold`.
 *
 * @param {Object} options
 * @param {string[]} options.fineTunedModels - Ollama model names to evaluate
 * @param {Array<string|{prompt: string, type: string}>} [options.benchmarkPrompts]
 *   Test prompts (defaults to the built-in set). Plain strings are accepted
 *   and treated as `{ prompt, type: 'general' }`.
 * @param {string} [options.baseline] - Baseline model name (default: 'llama3.2:1b')
 * @param {number} [options.winThreshold] - Win rate threshold for deployment (default: 0.6)
 * @param {Object} [deps] - Injection points: `runPrompt`, plus whatever
 *   `deployModel` reads (`brain`, `execDeploy`).
 * @returns {Promise<{ results: Array, winner: string|null, deployed: boolean }>}
 */
async function evaluateAndDeploy(options, deps = {}) {
  const {
    fineTunedModels = [],
    benchmarkPrompts,
    baseline = 'llama3.2:1b',
    winThreshold = 0.6,
  } = options || {};

  if (!fineTunedModels.length) {
    return { results: [], winner: null, deployed: false };
  }

  const runPrompt = deps.runPrompt || runOllamaPrompt;

  // The documented contract allows bare strings; normalize them so the
  // destructuring below never yields an undefined prompt.
  const prompts = (benchmarkPrompts || BENCHMARK_PROMPTS).map((entry) => (
    typeof entry === 'string' ? { prompt: entry, type: 'general' } : entry
  ));

  // Fine-tuned models plus the baseline, without running a model twice when
  // the baseline is also listed as a candidate.
  const allModels = [...new Set([...fineTunedModels, baseline])];
  const results = [];

  for (const { prompt, type } of prompts) {
    const promptResult = { prompt, type, scores: {} };

    for (const model of allModels) {
      try {
        const { response, latencyMs, error } = await runPrompt(model, prompt);
        const entry = {
          score: scoreResponse(response, latencyMs),
          latencyMs,
          responseLength: (response || '').length,
        };
        // Runners may report failure in-band instead of throwing; keep it
        // so the comparison below can tell "bad answer" from "no answer".
        if (error) entry.error = error;
        promptResult.scores[model] = entry;
      } catch (err) {
        promptResult.scores[model] = { score: 0, latencyMs: 0, responseLength: 0, error: err.message };
      }
    }

    results.push(promptResult);
  }

  let winner = null;
  let bestWinRate = 0;
  let bestAvgScore = 0;

  for (const model of fineTunedModels) {
    let wins = 0;
    let comparisons = 0;
    let totalScore = 0;

    for (const r of results) {
      const baselineEntry = r.scores[baseline];
      // A baseline run that errored scores 0, which would hand every
      // candidate a free win and could trigger a deployment on no evidence.
      if (!baselineEntry || baselineEntry.error) continue;

      const modelScore = r.scores[model]?.score || 0;
      const baselineScore = baselineEntry.score || 0;
      comparisons++;
      totalScore += modelScore;
      if (modelScore > baselineScore) wins++;
    }

    const winRate = comparisons > 0 ? wins / comparisons : 0;
    const avgScore = comparisons > 0 ? totalScore / comparisons : 0;

    // Highest win rate wins; ties are broken by average score.
    if (winRate > bestWinRate || (winRate === bestWinRate && avgScore > bestAvgScore)) {
      bestWinRate = winRate;
      bestAvgScore = avgScore;
      if (winRate > winThreshold) {
        winner = model;
      }
    }
  }

  let deployed = false;
  if (winner) {
    try {
      await deployModel(winner, deps);
      deployed = true;
    } catch {
      // Deployment is best-effort: the evaluation result is still valid,
      // and `deployed: false` tells the caller it did not go through.
    }
  }

  return { results, winner, deployed };
}
164
-
165
- // ---------------------------------------------------------------------------
166
- // Deployment
167
- // ---------------------------------------------------------------------------
168
-
169
/**
 * Alias a model inside Ollama under a `walle-` name and, when a brain is
 * supplied, record the alias in its `model_registry` table.
 *
 * @param {string} modelName - The model to deploy (e.g. `llama3.2:1b`)
 * @param {Object} [deps]
 * @param {Function} [deps.execDeploy] - Stand-in for the promisified `execFile`
 * @param {Object} [deps.brain] - Object exposing `getDb()`
 * @returns {Promise<string>} The alias that was created
 * @throws {TypeError} When `modelName` is not a non-empty string
 * @throws Whatever `execDeploy` rejects with when the copy fails
 */
async function deployModel(modelName, deps = {}) {
  // Fail before shelling out: an empty name would otherwise reach `ollama cp`.
  if (typeof modelName !== 'string' || !modelName.trim()) {
    throw new TypeError('deployModel: modelName must be a non-empty string');
  }

  const execDeploy = deps.execDeploy || execFileAsync;
  const brain = deps.brain || null;

  // '/' and ':' from the source name are replaced with '-' in the alias.
  const walleAlias = `walle-${modelName.replace(/[/:]/g, '-')}`;

  await execDeploy('ollama', ['cp', modelName, walleAlias], { timeout: 60000 });

  // Register in brain's model_registry if available
  if (brain && brain.getDb) {
    try {
      const db = brain.getDb();
      db.prepare(`
        INSERT OR REPLACE INTO model_registry (name, base_model, status, deployed_at)
        VALUES (?, ?, 'active', datetime('now'))
      `).run(walleAlias, modelName);
    } catch {
      // model_registry table may not exist yet -- non-fatal
    }
  }

  return walleAlias;
}
196
-
197
// Public surface of this module. `runOllamaPrompt` is not exported; callers
// replace it through `deps.runPrompt` instead.
module.exports = {
  evaluateAndDeploy,
  deployModel,
  scoreResponse,
  BENCHMARK_PROMPTS,
};
@@ -1,373 +0,0 @@
1
- 'use strict';
2
-
3
- /**
4
- * Shadow evaluation framework — scores shadow model outputs against primary.
5
- * Three methods: heuristic (free), embedding similarity (cheap), LLM-as-judge (expensive).
6
- */
7
-
8
- // --- Heuristic Scoring (free, instant) ---
9
-
10
/**
 * Task-specific similarity for coding answers.
 * Up to 0.3 comes from agreement on fenced code blocks and up to 0.3 from
 * the share of the primary's code keywords that the shadow also uses.
 * @returns {number} 0.0 - 0.6
 */
function codingHeuristic(primary, shadow) {
  const fence = /```/;
  const keywordPattern = /\b(function|const|let|var|class|import|export|return|if|for|while|async|await)\b/g;
  const keywordsIn = (text) => new Set((text.match(keywordPattern) || []).map((kw) => kw.toLowerCase()));

  const primaryFenced = fence.test(primary);
  const shadowFenced = fence.test(shadow);

  // No code expected -> neutral 0.15; code expected -> all or nothing.
  let fenceScore;
  if (!primaryFenced) {
    fenceScore = 0.15;
  } else {
    fenceScore = shadowFenced ? 0.3 : 0.0;
  }

  const expected = keywordsIn(primary);
  const found = keywordsIn(shadow);

  let keywordScore = 0;
  if (expected.size > 0 && found.size > 0) {
    let shared = 0;
    for (const kw of found) {
      if (expected.has(kw)) shared += 1;
    }
    keywordScore = 0.3 * (shared / Math.max(expected.size, 1));
  }

  return Math.min(fenceScore + keywordScore, 0.6);
}
29
-
30
/**
 * Task-specific similarity for Q&A answers, based on how many of the
 * primary's longer words (more than 3 characters) reappear in the shadow.
 * Covering half of them already earns the full 0.6.
 * @returns {number} 0.0 - 0.6
 */
function qaHeuristic(primary, shadow) {
  const significantWords = (text) => new Set(
    text.toLowerCase().split(/\W+/).filter((word) => word.length > 3),
  );

  const reference = significantWords(primary);
  const candidate = significantWords(shadow);
  if (reference.size === 0 || candidate.size === 0) return 0;

  let shared = 0;
  for (const word of candidate) {
    if (reference.has(word)) shared += 1;
  }

  const overlapRate = shared / Math.max(reference.size, 1);
  return Math.min(0.6 * Math.min(overlapRate * 2, 1), 0.6);
}
42
-
43
/**
 * Task-specific similarity for planning answers. Combines three signals:
 * list/heading structure (up to 0.15), overlap of longer words (up to 0.25)
 * and shadow length relative to the primary (up to 0.2).
 * @returns {number} 0.0 - 0.6
 */
function planningHeuristic(primary, shadow) {
  // Headers, bullet items, or numbered steps at the start of any line.
  const structure = /^(#{1,3}\s|[\-*]\s|\d+[\.\)]\s)/m;
  const primaryStructured = structure.test(primary);
  const shadowStructured = structure.test(shadow);

  let structureScore = 0;
  if (shadowStructured) {
    structureScore = primaryStructured ? 0.15 : 0.1;
  }

  const significantWords = (text) => new Set(
    text.toLowerCase().split(/\W+/).filter((word) => word.length > 3),
  );
  const reference = significantWords(primary);
  const candidate = significantWords(shadow);

  let conceptScore = 0;
  if (reference.size > 0 && candidate.size > 0) {
    let shared = 0;
    for (const word of candidate) {
      if (reference.has(word)) shared += 1;
    }
    const overlapRate = shared / Math.max(reference.size, 1);
    conceptScore = 0.25 * Math.min(overlapRate * 2, 1);
  }

  // Two length tiers: at least 40% and at least 70% of the primary's length.
  const firstTier = shadow.length >= primary.length * 0.4 ? 0.1 : 0;
  const secondTier = shadow.length >= primary.length * 0.7 ? 0.1 : 0;

  return Math.min(structureScore + conceptScore + firstTier + secondTier, 0.6);
}
68
-
69
/**
 * Fallback similarity for task types without a dedicated heuristic: the
 * share of the primary's longer words that also occur in the shadow.
 * Returns a neutral 0.3 when either text has no such words.
 * @returns {number} 0.0 - 0.6
 */
function generalHeuristic(primary, shadow) {
  const significantWords = (text) => new Set(
    text.toLowerCase().split(/\W+/).filter((word) => word.length > 3),
  );

  const reference = significantWords(primary);
  const candidate = significantWords(shadow);
  if (reference.size === 0 || candidate.size === 0) {
    return 0.3;
  }

  let shared = 0;
  candidate.forEach((word) => {
    if (reference.has(word)) shared += 1;
  });

  const overlapRate = shared / Math.max(reference.size, 1);
  return Math.min(0.6 * overlapRate * 2, 0.6);
}
78
-
79
/**
 * Heuristic scoring — free, instant.
 *
 * Adds three parts: 0.2 for a non-blank shadow, up to 0.2 for a length
 * comparable to the primary, and up to 0.6 from the heuristic matching
 * `taskType` ('coding', 'planning', 'qa', anything else -> general).
 * @returns {number} 0.0 - 1.0
 */
function heuristicScore(primary, shadow, taskType) {
  if (!shadow || !shadow.trim()) return 0;

  const presence = 0.2;

  // Length credit: full when within 0.3x-3x of the primary, half when within
  // 0.1x-5x, and a flat 0.1 when there is no primary to compare against.
  let lengthCredit = 0;
  if (!primary) {
    lengthCredit = 0.1;
  } else {
    const ratio = shadow.length / Math.max(primary.length, 1);
    if (ratio >= 0.3 && ratio <= 3.0) {
      lengthCredit = 0.2;
    } else if (ratio >= 0.1 && ratio <= 5.0) {
      lengthCredit = 0.1;
    }
  }

  const reference = primary || '';
  let taskCredit;
  switch (taskType) {
    case 'coding':
      taskCredit = codingHeuristic(reference, shadow);
      break;
    case 'planning':
      taskCredit = planningHeuristic(reference, shadow);
      break;
    case 'qa':
      taskCredit = qaHeuristic(reference, shadow);
      break;
    default:
      taskCredit = generalHeuristic(reference, shadow);
  }

  const total = presence + lengthCredit + taskCredit;
  return Math.min(Math.round(total * 1000) / 1000, 1.0);
}
112
-
113
- // --- Embedding Similarity (cheap, ~100ms) ---
114
-
115
/**
 * Cosine similarity of two equal-length numeric vectors.
 * Missing, empty, or length-mismatched inputs and zero-magnitude vectors
 * all yield 0.
 */
function cosineSimilarity(a, b) {
  const comparable = a && b && a.length === b.length && a.length !== 0;
  if (!comparable) return 0;

  let dot = 0;
  let sumSquaresA = 0;
  let sumSquaresB = 0;
  const size = a.length;
  for (let idx = 0; idx < size; idx += 1) {
    const x = a[idx];
    const y = b[idx];
    dot += x * y;
    sumSquaresA += x * x;
    sumSquaresB += y * y;
  }

  const magnitude = Math.sqrt(sumSquaresA) * Math.sqrt(sumSquaresB);
  if (magnitude === 0) return 0;
  return dot / magnitude;
}
129
-
130
/**
 * Request an embedding for `text` from the Ollama HTTP API on localhost.
 * Only the first 2048 characters are sent. Any failure — bad input, network
 * error, timeout, non-OK status, missing payload — resolves to null.
 * @returns {Promise<number[]|null>}
 */
async function getEmbedding(text, model = 'nomic-embed-text') {
  try {
    const payload = JSON.stringify({ model, input: text.slice(0, 2048) });
    const response = await fetch('http://localhost:11434/api/embed', {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: payload,
      signal: AbortSignal.timeout(10000),
    });
    if (!response.ok) {
      return null;
    }
    const { embeddings } = await response.json();
    return embeddings?.[0] || null;
  } catch {
    return null;
  }
}
149
-
150
/**
 * Embedding similarity between primary and shadow responses.
 *
 * Cosine similarity ranges over [-1, 1]; the result is clamped to [0, 1] so
 * it honours the documented range and cannot drag a weighted composite
 * score below zero.
 *
 * @param {string} primary
 * @param {string} shadow
 * @param {Function} [embedFn] - async (text) => number[]|null; defaults to `getEmbedding`
 * @returns {Promise<number|null>} 0.0-1.0 rounded to three decimals, or null
 *   when an embedding is unavailable or the similarity is not a finite number
 */
async function embeddingSimilarity(primary, shadow, embedFn) {
  const embed = embedFn || getEmbedding;
  try {
    const [pEmb, sEmb] = await Promise.all([embed(primary), embed(shadow)]);
    if (!pEmb || !sEmb) return null;

    const similarity = cosineSimilarity(pEmb, sEmb);
    // Non-numeric vector entries produce NaN; report "unavailable" instead.
    if (!Number.isFinite(similarity)) return null;

    const clamped = Math.min(1, Math.max(0, similarity));
    return Math.round(clamped * 1000) / 1000;
  } catch {
    return null;
  }
}
164
-
165
- // --- LLM-as-Judge (expensive, ~5s) ---
166
-
167
// Rate-limit state: judge calls made on `_judgeCountDate` (a YYYY-MM-DD
// string taken from the ISO timestamp).
let _judgeDailyCount = 0;
let _judgeCountDate = '';

/** Zero the counter the first time it is consulted on a new date. */
function resetJudgeCountIfNewDay() {
  const today = new Date().toISOString().substring(0, 10);
  if (today === _judgeCountDate) return;
  _judgeCountDate = today;
  _judgeDailyCount = 0;
}
178
-
179
/**
 * Whether to use the LLM judge for this task type.
 *
 * Judging stops once the daily counter reaches the cap. Below the cap,
 * 'planning' and 'coding' are always judged; every other task type is
 * sampled at its configured rate (10% when the type has none).
 *
 * @param {string} taskType
 * @param {number} [maxPerDay=50] - Daily cap; null/undefined fall back to 50
 * @returns {boolean}
 */
function shouldJudge(taskType, maxPerDay = 50) {
  resetJudgeCountIfNewDay();

  // An explicit null would otherwise compare as 0 and disable judging.
  const dailyCap = maxPerDay ?? 50;
  if (_judgeDailyCount >= dailyCap) return false;

  if (taskType === 'planning' || taskType === 'coding') return true;

  // A Map avoids inherited keys: with a plain object, a task type such as
  // 'constructor' would resolve to a function instead of a rate.
  const sampleRates = new Map([
    ['chat', 0.1],
    ['qa', 0.1],
    ['slack-reply', 0.2],
  ]);
  const rate = sampleRates.get(taskType) ?? 0.1;
  return Math.random() < rate;
}
194
-
195
/**
 * LLM-as-Judge — A/B comparison of the primary (Response A) and shadow
 * (Response B) answers to the same prompt.
 *
 * Each call counts against the daily judge budget, whether or not the
 * judge produces a usable verdict.
 *
 * @param {string} prompt - Original prompt (first 1000 chars are sent)
 * @param {string} primary - Primary response (first 2000 chars are sent)
 * @param {string} shadow - Shadow response (first 2000 chars are sent)
 * @param {string} taskType
 * @param {Function} [judgeFn] - async (judgePrompt) => { a_score, b_score, reasoning } | null;
 *   defaults to `defaultJudgeFn`
 * @returns {Promise<{ primaryScore: number, shadowScore: number, reasoning: string }|null>}
 *   Scores normalized to 0-1, or null when the judge fails or returns nothing
 */
async function llmJudge(prompt, primary, shadow, taskType, judgeFn) {
  resetJudgeCountIfNewDay();
  _judgeDailyCount++;

  // Missing text becomes an empty string so building the prompt cannot throw
  // after the daily budget has already been charged.
  const asText = (value) => (value == null ? '' : String(value));

  const judgePrompt = `You are evaluating two AI responses to the same prompt.
Rate each response on a 0-10 scale for: accuracy, helpfulness, completeness.
Then compute an overall score (average of the three).

Task type: ${taskType}

Prompt: ${asText(prompt).slice(0, 1000)}

Response A:
${asText(primary).slice(0, 2000)}

Response B:
${asText(shadow).slice(0, 2000)}

Output ONLY valid JSON (no markdown): { "a_score": N, "b_score": N, "reasoning": "..." }`;

  // Map the judge's 0-10 value onto [0, 1]; anything non-numeric counts as 0.
  const toUnitScore = (raw) => {
    const value = Number(raw);
    return Number.isFinite(value) ? Math.min(Math.max(value / 10, 0), 1) : 0;
  };

  try {
    const judge = judgeFn || defaultJudgeFn;
    const result = await judge(judgePrompt);
    if (!result) return null;

    return {
      primaryScore: toUnitScore(result.a_score),
      shadowScore: toUnitScore(result.b_score),
      reasoning: result.reasoning || '',
    };
  } catch {
    return null;
  }
}
234
-
235
/**
 * Default judge: sends the prompt through the project's default LLM client
 * and parses the first-`{`-to-last-`}` span of the reply as JSON.
 * Resolves to null when the client is unavailable, the call fails, no JSON
 * object is present, or the JSON does not parse.
 */
async function defaultJudgeFn(prompt) {
  try {
    // Loaded lazily so a missing client module degrades to null.
    const { getDefaultClient } = require('../llm/client');
    const llm = getDefaultClient();
    const reply = await llm.chat({
      model: 'claude-haiku-4-5-20251001',
      messages: [{ role: 'user', content: prompt }],
      maxTokens: 500,
    });

    let text;
    if (typeof reply.content === 'string') {
      text = reply.content;
    } else {
      text = reply.text || '';
    }

    const found = text.match(/\{[\s\S]*\}/);
    return found ? JSON.parse(found[0]) : null;
  } catch {
    return null;
  }
}
255
-
256
- // --- Composite Scoring ---
257
-
258
/**
 * Weighted average of whichever evaluation signals are present.
 * Weights: heuristic 1.0, embedding 1.5, judge (its shadowScore) 2.0.
 * Returns 0 when no signal is available; `taskType` is accepted but unused.
 * @returns {number} Average rounded to three decimals
 */
function computeCompositeEvalScore(heuristic, embedding, judge, taskType) {
  let weightTotal = 0;
  let weightedTotal = 0;
  let used = 0;
  const include = (score, weight) => {
    weightTotal += weight;
    weightedTotal += score * weight;
    used += 1;
  };

  if (heuristic != null) include(heuristic, 1.0);
  if (embedding != null) include(embedding, 1.5);
  if (judge != null) include(judge.shadowScore, 2.0);

  if (used === 0) return 0;
  return Math.round((weightedTotal / weightTotal) * 1000) / 1000;
}
274
-
275
- // --- Main Evaluation Pipeline ---
276
-
277
/**
 * Score one shadow result and persist the outcome via
 * `brain.updateShadowResultEval`.
 *
 * The heuristic always runs. Embedding similarity runs unless
 * `deps.embedFn === false`. The LLM judge runs unless `deps.judgeFn === false`
 * and only when `shouldJudge` allows it. The primary score is always
 * recorded as 1.0.
 *
 * @param {Object} result - Row from shadow_results table
 * @param {Object} brain - Brain module
 * @param {Object} [deps] - Dependency injection for testing
 *   (`embedFn`, `judgeFn`, `maxJudgePerDay`)
 * @returns {Object} { primaryScore, shadowScore, evalMethod }
 */
async function evaluateShadowResult(result, brain, deps = {}) {
  const { prompt } = result;
  const taskType = result.task_type;
  const primary = result.primary_response;
  const shadow = result.shadow_response;

  // No shadow output at all: record a zero without running any scorer.
  if (!shadow) {
    const missing = { primaryScore: 1.0, shadowScore: 0, evalMethod: 'no-response' };
    brain.updateShadowResultEval(result.id, missing);
    return { ...missing };
  }

  const hScore = heuristicScore(primary, shadow, taskType);

  const eScore = deps.embedFn !== false
    ? await embeddingSimilarity(primary, shadow, deps.embedFn || undefined)
    : null;

  let jResult = null;
  if (deps.judgeFn !== false && shouldJudge(taskType, deps.maxJudgePerDay)) {
    jResult = await llmJudge(prompt, primary, shadow, taskType, deps.judgeFn || undefined);
  }

  const shadowScore = computeCompositeEvalScore(hScore, eScore, jResult, taskType);

  // Label listing the methods that actually contributed, e.g. "heuristic+judge".
  const methods = ['heuristic'];
  if (eScore != null) methods.push('embedding');
  if (jResult != null) methods.push('judge');
  const evalMethod = methods.join('+');

  brain.updateShadowResultEval(result.id, {
    primaryScore: 1.0,
    shadowScore,
    evalMethod,
    judgeModel: jResult ? 'claude-haiku-4-5-20251001' : null,
    judgeReasoning: jResult?.reasoning || null,
  });

  return { primaryScore: 1.0, shadowScore, evalMethod };
}
326
-
327
/**
 * Evaluate a batch of not-yet-evaluated shadow results, one at a time.
 * A row whose evaluation throws is logged and left out of the totals.
 *
 * @param {Object} brain - Brain module
 * @param {Object} [options]
 * @param {number} [options.limit=100] - Maximum rows to fetch
 * @param {Object} [options.deps] - Dependency injection
 * @returns {{ evaluated: number, avgScore: number }} Count of rows scored and
 *   their mean shadow score rounded to three decimals (0 when none)
 */
async function runBatchEval(brain, options = {}) {
  const { limit: requestedLimit, deps: injected } = options;
  const limit = requestedLimit || 100;
  const deps = injected || {};

  const pending = brain.getShadowResults({ evaluated: false, limit });

  const scores = [];
  for (const result of pending) {
    try {
      const outcome = await evaluateShadowResult(result, brain, deps);
      scores.push(outcome.shadowScore);
    } catch (err) {
      console.error(`[evaluator] Error evaluating ${result.id}:`, err.message);
    }
  }

  const evaluated = scores.length;
  if (evaluated === 0) {
    return { evaluated, avgScore: 0 };
  }

  const sum = scores.reduce((running, value) => running + value, 0);
  return { evaluated, avgScore: Math.round((sum / evaluated) * 1000) / 1000 };
}
358
-
359
- module.exports = {
360
- heuristicScore,
361
- codingHeuristic,
362
- qaHeuristic,
363
- generalHeuristic,
364
- cosineSimilarity,
365
- embeddingSimilarity,
366
- shouldJudge,
367
- llmJudge,
368
- computeCompositeEvalScore,
369
- evaluateShadowResult,
370
- runBatchEval,
371
- // Exported for testing
372
- getEmbedding,
373
- };