create-walle 0.9.21 → 0.9.23

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (500) hide show
  1. package/README.md +27 -5
  2. package/package.json +2 -2
  3. package/template/CLAUDE.md +2 -2
  4. package/template/LICENSE +1 -1
  5. package/template/bin/ctm-dev-cleanup.js +24 -3
  6. package/template/bin/ctm-launch.sh +13 -0
  7. package/template/bin/dev.sh +156 -18
  8. package/template/bin/node-bin.sh +84 -0
  9. package/template/bin/pin-node.sh +51 -0
  10. package/template/claude-task-manager/api-prompts.js +1203 -182
  11. package/template/claude-task-manager/api-reviews.js +109 -15
  12. package/template/claude-task-manager/approval-agent.js +1360 -280
  13. package/template/claude-task-manager/bin/restart-ctm.sh +64 -23
  14. package/template/claude-task-manager/bin/storage-migration-supervisor.js +338 -0
  15. package/template/claude-task-manager/db.js +4417 -295
  16. package/template/claude-task-manager/docs/app-update-refresh-protocol.md +69 -0
  17. package/template/claude-task-manager/docs/approval-ai-refinement.md +138 -0
  18. package/template/claude-task-manager/docs/approval-rescue-loop.md +74 -0
  19. package/template/claude-task-manager/docs/codex-operational-warning-health.md +107 -0
  20. package/template/claude-task-manager/docs/codex-resume-state-guard-design.md +17 -12
  21. package/template/claude-task-manager/docs/codex-terminal-render-controller-handoff.md +311 -0
  22. package/template/claude-task-manager/docs/coding-agent-hooks-architecture.md +418 -0
  23. package/template/claude-task-manager/docs/conversation-import-freshness.md +20 -0
  24. package/template/claude-task-manager/docs/google-workspace-auth-health.md +77 -0
  25. package/template/claude-task-manager/docs/image-paste-ux.md +13 -0
  26. package/template/claude-task-manager/docs/ipad-web-preview.md +88 -0
  27. package/template/claude-task-manager/docs/main-loop-offload-architecture.md +66 -0
  28. package/template/claude-task-manager/docs/microsoft-dev-tunnel-phone-access-design.md +274 -519
  29. package/template/claude-task-manager/docs/mobile-live-streaming.md +27 -5
  30. package/template/claude-task-manager/docs/mobile-remote-submission-lifecycle.md +69 -0
  31. package/template/claude-task-manager/docs/phone-access-design.md +53 -15
  32. package/template/claude-task-manager/docs/phone-passkey-identity.md +122 -0
  33. package/template/claude-task-manager/docs/phone-setup.md +3 -0
  34. package/template/claude-task-manager/docs/prompt-editing-tree-design.md +25 -1
  35. package/template/claude-task-manager/docs/remote-desktop-access-design.md +268 -0
  36. package/template/claude-task-manager/docs/restart-lifecycle-architecture.md +95 -0
  37. package/template/claude-task-manager/docs/runtime-work-control-plane.md +53 -0
  38. package/template/claude-task-manager/docs/session-interactive-wait-surfaces.md +38 -0
  39. package/template/claude-task-manager/docs/session-needs-you-dismissal.md +84 -0
  40. package/template/claude-task-manager/docs/session-render-state-management-design.md +91 -3
  41. package/template/claude-task-manager/docs/session-standup-command-center-design.md +25 -1
  42. package/template/claude-task-manager/docs/session-title-authority.md +32 -0
  43. package/template/claude-task-manager/docs/session-workspace-binding.md +33 -0
  44. package/template/claude-task-manager/docs/skill-intent-resolution-design.md +72 -0
  45. package/template/claude-task-manager/docs/walle-mcp-supervisor-health.md +86 -0
  46. package/template/claude-task-manager/docs/walle-relay-phone-access-design.md +24 -15
  47. package/template/claude-task-manager/docs/walle-session-history-hydration.md +114 -0
  48. package/template/claude-task-manager/docs/walle-session-input-queue.md +104 -0
  49. package/template/claude-task-manager/docs/walle-session-model-catalog.md +90 -0
  50. package/template/claude-task-manager/docs/walle-session-model-preferences.md +15 -6
  51. package/template/claude-task-manager/git-utils.js +897 -27
  52. package/template/claude-task-manager/lib/agent-capabilities.js +33 -0
  53. package/template/claude-task-manager/lib/agent-cli-cache.js +37 -7
  54. package/template/claude-task-manager/lib/agent-hooks-installer.js +26 -2
  55. package/template/claude-task-manager/lib/agent-presets.js +17 -1
  56. package/template/claude-task-manager/lib/all-sessions-query.js +108 -0
  57. package/template/claude-task-manager/lib/approval-ai-refinement.js +488 -0
  58. package/template/claude-task-manager/lib/approval-self-adapt.js +168 -0
  59. package/template/claude-task-manager/lib/async-semaphore.js +44 -0
  60. package/template/claude-task-manager/lib/auth-context.js +5 -0
  61. package/template/claude-task-manager/lib/auth-rate-limit.js +47 -4
  62. package/template/claude-task-manager/lib/auth-rules.js +29 -2
  63. package/template/claude-task-manager/lib/auto-approval-verifier.js +129 -16
  64. package/template/claude-task-manager/lib/background-llm.js +144 -17
  65. package/template/claude-task-manager/lib/branch-inventory.js +212 -0
  66. package/template/claude-task-manager/lib/claude-desktop-sessions.js +15 -3
  67. package/template/claude-task-manager/lib/coalesce-sync-frames.js +151 -0
  68. package/template/claude-task-manager/lib/codex-launch-health.js +762 -0
  69. package/template/claude-task-manager/lib/codex-transcript-pager.js +51 -0
  70. package/template/claude-task-manager/lib/codex-zst.js +124 -0
  71. package/template/claude-task-manager/lib/coding-agent-models.js +233 -30
  72. package/template/claude-task-manager/lib/connection-health.js +232 -0
  73. package/template/claude-task-manager/lib/conversation-blob-parser.js +42 -0
  74. package/template/claude-task-manager/lib/conversation-tail-merge.js +89 -26
  75. package/template/claude-task-manager/lib/ctm-session-context-api.js +39 -10
  76. package/template/claude-task-manager/lib/cursor-conversation-store.js +354 -0
  77. package/template/claude-task-manager/lib/db-owner-worker-client.js +315 -0
  78. package/template/claude-task-manager/lib/document-review.js +141 -6
  79. package/template/claude-task-manager/lib/escalation-review.js +152 -0
  80. package/template/claude-task-manager/lib/graceful-shutdown.js +159 -0
  81. package/template/claude-task-manager/lib/headless-term-service.js +678 -0
  82. package/template/claude-task-manager/lib/heavy-worker-fallback.js +38 -0
  83. package/template/claude-task-manager/lib/jsonl-conversation-parser.js +542 -0
  84. package/template/claude-task-manager/lib/jsonl-range-reader.js +112 -0
  85. package/template/claude-task-manager/lib/main-db-census.js +216 -0
  86. package/template/claude-task-manager/lib/message-pagination.js +106 -4
  87. package/template/claude-task-manager/lib/microsoft-dev-tunnel-setup.js +750 -26
  88. package/template/claude-task-manager/lib/mobile-auth-api.js +274 -7
  89. package/template/claude-task-manager/lib/mobile-auth-store.js +592 -10
  90. package/template/claude-task-manager/lib/mobile-notification-dispatcher.js +15 -0
  91. package/template/claude-task-manager/lib/model-overview-brain-fallback.js +311 -0
  92. package/template/claude-task-manager/lib/model-overview-cache.js +141 -0
  93. package/template/claude-task-manager/lib/models-health-routing-notice.js +126 -0
  94. package/template/claude-task-manager/lib/node-pin-guard.js +93 -0
  95. package/template/claude-task-manager/lib/perf-tracker.js +242 -6
  96. package/template/claude-task-manager/lib/permission-match.js +76 -0
  97. package/template/claude-task-manager/lib/permission-sync.js +133 -20
  98. package/template/claude-task-manager/lib/process-title.js +35 -0
  99. package/template/claude-task-manager/lib/prompt-executions-query.js +25 -0
  100. package/template/claude-task-manager/lib/prompt-index-disk-cache.js +44 -0
  101. package/template/claude-task-manager/lib/prompt-intent.js +132 -0
  102. package/template/claude-task-manager/lib/provider-user-context.js +34 -0
  103. package/template/claude-task-manager/lib/read-pool-client.js +313 -0
  104. package/template/claude-task-manager/lib/readpool-breaker.js +31 -0
  105. package/template/claude-task-manager/lib/recent-sessions-breaker.js +12 -0
  106. package/template/claude-task-manager/lib/remote-feedback-client.js +72 -0
  107. package/template/claude-task-manager/lib/remote-relay-protocol.js +37 -4
  108. package/template/claude-task-manager/lib/remote-relay-store.js +159 -0
  109. package/template/claude-task-manager/lib/remote-submission-observer.js +278 -0
  110. package/template/claude-task-manager/lib/restart-guard.js +109 -0
  111. package/template/claude-task-manager/lib/restore-interruption-detector.js +439 -0
  112. package/template/claude-task-manager/lib/restore-policy.js +13 -0
  113. package/template/claude-task-manager/lib/restore-resume-batch.js +74 -0
  114. package/template/claude-task-manager/lib/restore-runtime.js +68 -0
  115. package/template/claude-task-manager/lib/restore-storm.js +34 -0
  116. package/template/claude-task-manager/lib/resume-cwd.js +36 -0
  117. package/template/claude-task-manager/lib/resume-preflight.js +313 -0
  118. package/template/claude-task-manager/lib/runtime-work-registry.js +444 -0
  119. package/template/claude-task-manager/lib/sanitize-openai-auth.js +31 -0
  120. package/template/claude-task-manager/lib/scheduler.js +21 -1
  121. package/template/claude-task-manager/lib/scrollback-snapshot-store.js +159 -0
  122. package/template/claude-task-manager/lib/serial-task-queue.js +64 -0
  123. package/template/claude-task-manager/lib/server-listeners.js +239 -0
  124. package/template/claude-task-manager/lib/session-capture.js +42 -7
  125. package/template/claude-task-manager/lib/session-content-backfill.js +131 -0
  126. package/template/claude-task-manager/lib/session-history.js +388 -43
  127. package/template/claude-task-manager/lib/session-host-manager.js +287 -0
  128. package/template/claude-task-manager/lib/session-image-refs.js +209 -0
  129. package/template/claude-task-manager/lib/session-jobs.js +399 -59
  130. package/template/claude-task-manager/lib/session-prompt-index.js +137 -0
  131. package/template/claude-task-manager/lib/session-restore.js +53 -0
  132. package/template/claude-task-manager/lib/session-standup.js +123 -23
  133. package/template/claude-task-manager/lib/session-state-bus.js +14 -0
  134. package/template/claude-task-manager/lib/session-stream.js +64 -16
  135. package/template/claude-task-manager/lib/session-timeline-summary.js +260 -0
  136. package/template/claude-task-manager/lib/session-token-usage.js +494 -0
  137. package/template/claude-task-manager/lib/session-workspace-binding.js +356 -0
  138. package/template/claude-task-manager/lib/setup-network-config.js +9 -0
  139. package/template/claude-task-manager/lib/size-cap.js +45 -0
  140. package/template/claude-task-manager/lib/size-cap.test.js +62 -0
  141. package/template/claude-task-manager/lib/skill-autocomplete.js +180 -1
  142. package/template/claude-task-manager/lib/skill-intent-resolver.js +304 -0
  143. package/template/claude-task-manager/lib/sqlite-driver.js +19 -3
  144. package/template/claude-task-manager/lib/standup-attention.js +7 -3
  145. package/template/claude-task-manager/lib/status-authority.js +39 -0
  146. package/template/claude-task-manager/lib/status-hooks.js +4 -0
  147. package/template/claude-task-manager/lib/storage-migration.js +235 -0
  148. package/template/claude-task-manager/lib/structured-capture.js +298 -0
  149. package/template/claude-task-manager/lib/sync-io-census.js +163 -0
  150. package/template/claude-task-manager/lib/tailscale-setup.js +6 -0
  151. package/template/claude-task-manager/lib/terminal-activity-evidence.js +33 -0
  152. package/template/claude-task-manager/lib/terminal-choice.js +364 -0
  153. package/template/claude-task-manager/lib/terminal-control-sanitize.js +17 -0
  154. package/template/claude-task-manager/lib/terminal-fingerprint.js +48 -0
  155. package/template/claude-task-manager/lib/terminal-output-flush.js +84 -0
  156. package/template/claude-task-manager/lib/timeline-order.js +122 -0
  157. package/template/claude-task-manager/lib/transcript-store.js +348 -43
  158. package/template/claude-task-manager/lib/transport-security.js +84 -1
  159. package/template/claude-task-manager/lib/wait-state.js +184 -0
  160. package/template/claude-task-manager/lib/walle-client.js +47 -5
  161. package/template/claude-task-manager/lib/walle-ctm-history.js +564 -4
  162. package/template/claude-task-manager/lib/walle-external-actions.js +135 -16
  163. package/template/claude-task-manager/lib/walle-history-hydration.js +46 -0
  164. package/template/claude-task-manager/lib/walle-native-health.js +403 -0
  165. package/template/claude-task-manager/lib/walle-repair.js +701 -0
  166. package/template/claude-task-manager/lib/walle-session-cache.js +109 -0
  167. package/template/claude-task-manager/lib/walle-session-context.js +57 -21
  168. package/template/claude-task-manager/lib/walle-session-model-catalog.js +34 -0
  169. package/template/claude-task-manager/lib/walle-supervisor.js +539 -63
  170. package/template/claude-task-manager/lib/walle-transcript.js +52 -0
  171. package/template/claude-task-manager/lib/worktree-active-sync.js +11 -7
  172. package/template/claude-task-manager/lib/worktree-cwd.js +32 -1
  173. package/template/claude-task-manager/package.json +1 -1
  174. package/template/claude-task-manager/prompt-harvest.js +89 -66
  175. package/template/claude-task-manager/providers/claude-code.js +51 -3
  176. package/template/claude-task-manager/providers/cursor.js +140 -45
  177. package/template/claude-task-manager/public/css/reviews.css +551 -61
  178. package/template/claude-task-manager/public/css/setup.css +191 -0
  179. package/template/claude-task-manager/public/css/walle-session.css +865 -10
  180. package/template/claude-task-manager/public/css/walle.css +154 -0
  181. package/template/claude-task-manager/public/designs/ai-providers-consolidation-v2.html +830 -0
  182. package/template/claude-task-manager/public/index.html +18516 -2058
  183. package/template/claude-task-manager/public/ipad.html +363 -0
  184. package/template/claude-task-manager/public/js/document-review-links.js +301 -0
  185. package/template/claude-task-manager/public/js/image-normalize.js +69 -36
  186. package/template/claude-task-manager/public/js/message-renderer.js +1265 -77
  187. package/template/claude-task-manager/public/js/prompts.js +66 -29
  188. package/template/claude-task-manager/public/js/reviews.js +901 -133
  189. package/template/claude-task-manager/public/js/session-activity-utils.js +11 -1
  190. package/template/claude-task-manager/public/js/session-search-utils.js +94 -10
  191. package/template/claude-task-manager/public/js/session-status-precedence.js +23 -5
  192. package/template/claude-task-manager/public/js/setup.js +1273 -176
  193. package/template/claude-task-manager/public/js/stream-view.js +691 -73
  194. package/template/claude-task-manager/public/js/terminal-reconciler.js +210 -0
  195. package/template/claude-task-manager/public/js/walle-session.js +2455 -158
  196. package/template/claude-task-manager/public/js/walle.js +455 -28
  197. package/template/claude-task-manager/public/m/app.css +2909 -262
  198. package/template/claude-task-manager/public/m/app.js +6601 -398
  199. package/template/claude-task-manager/public/m/claim.html +224 -17
  200. package/template/claude-task-manager/public/m/index.html +117 -21
  201. package/template/claude-task-manager/public/m/sw.js +3 -1
  202. package/template/claude-task-manager/public/manifest.json +2 -2
  203. package/template/claude-task-manager/public/prompts.html +30 -14
  204. package/template/claude-task-manager/queue-engine.js +507 -28
  205. package/template/claude-task-manager/scripts/repair-claude-session-images.js +27 -8
  206. package/template/claude-task-manager/server.js +14341 -2197
  207. package/template/claude-task-manager/session-integrity.js +160 -18
  208. package/template/claude-task-manager/session-search-ranking.js +1 -0
  209. package/template/claude-task-manager/session-utils.js +25 -5
  210. package/template/claude-task-manager/workers/approval-blocklist.js +96 -6
  211. package/template/claude-task-manager/workers/approval-widget-validator.js +14 -8
  212. package/template/claude-task-manager/workers/conversation-import-worker.js +11 -50
  213. package/template/claude-task-manager/workers/db-owner-worker.js +386 -0
  214. package/template/claude-task-manager/workers/harvest-worker.js +9 -55
  215. package/template/claude-task-manager/workers/headless-term-worker.js +9 -530
  216. package/template/claude-task-manager/workers/read-pool-worker.js +387 -0
  217. package/template/claude-task-manager/workers/scrollback-worker.js +11 -72
  218. package/template/claude-task-manager/workers/session-host-process.js +146 -0
  219. package/template/claude-task-manager/workers/session-integrity-worker.js +10 -54
  220. package/template/claude-task-manager/workers/state-detectors/base.js +18 -1
  221. package/template/claude-task-manager/workers/state-detectors/claude-code.js +182 -9
  222. package/template/claude-task-manager/workers/state-detectors/codex.js +150 -2
  223. package/template/claude-task-manager/workers/state-detectors/cursor.js +127 -0
  224. package/template/claude-task-manager/workers/state-detectors/gemini.js +21 -0
  225. package/template/claude-task-manager/workers/state-detectors/index.js +29 -0
  226. package/template/claude-task-manager/workers/state-detectors/opencode.js +103 -0
  227. package/template/docs/design/markdown-review-pane.md +206 -0
  228. package/template/docs/designs/2026-05-17-portkey-gateway-provider-ux.md +129 -38
  229. package/template/docs/designs/2026-05-20-mobile-worktree-finish-command.md +27 -0
  230. package/template/docs/designs/2026-05-22-ai-configuration-consolidation.md +248 -0
  231. package/template/docs/designs/ai-configuration-consolidation-mock.html +812 -0
  232. package/template/docs/private-memory-and-pii-policy.md +69 -0
  233. package/template/package.json +2 -1
  234. package/template/scripts/check-private-data.js +201 -0
  235. package/template/shared/sqlite-owner-guard.js +30 -0
  236. package/template/shared/sqlite-owner-write-queue.js +225 -0
  237. package/template/shared/sqlite-storage-policy.js +111 -0
  238. package/template/shared/sqlite-write-lock.js +428 -0
  239. package/template/wall-e/agent-runners/claude-code.js +5 -0
  240. package/template/wall-e/agent.js +166 -22
  241. package/template/wall-e/api-walle.js +524 -70
  242. package/template/wall-e/auth/provider-flows.js +11 -1
  243. package/template/wall-e/bin/walle-mcp-stdio.js +341 -17
  244. package/template/wall-e/brain.js +1614 -141
  245. package/template/wall-e/chat/attachment-blocks.js +96 -0
  246. package/template/wall-e/chat/attachments.js +2 -1
  247. package/template/wall-e/chat/capability-resolver.js +7 -7
  248. package/template/wall-e/chat/context-messages.js +28 -0
  249. package/template/wall-e/chat/conversation-frame.js +630 -0
  250. package/template/wall-e/chat/provider-messages.js +125 -0
  251. package/template/wall-e/chat.js +1002 -233
  252. package/template/wall-e/coding/acceptance-contract.js +170 -0
  253. package/template/wall-e/coding/acp-adapter.js +1 -1
  254. package/template/wall-e/coding/agent-catalog.js +3 -0
  255. package/template/wall-e/coding/artifact-store.js +93 -0
  256. package/template/wall-e/coding/capability-router.js +120 -0
  257. package/template/wall-e/coding/coding-run-controller.js +423 -0
  258. package/template/wall-e/coding/compaction-service.js +157 -12
  259. package/template/wall-e/coding/frontend-verification.js +258 -0
  260. package/template/wall-e/coding/lifecycle-hooks.js +75 -0
  261. package/template/wall-e/coding/local-preview-contract.js +157 -0
  262. package/template/wall-e/coding/permission-service.js +57 -13
  263. package/template/wall-e/coding/prompt-bundle.js +19 -1
  264. package/template/wall-e/coding/prompt-section-registry.js +227 -0
  265. package/template/wall-e/coding/provider-compat.js +15 -0
  266. package/template/wall-e/coding/runtime-events.js +224 -0
  267. package/template/wall-e/coding/runtime-mode.js +3 -0
  268. package/template/wall-e/coding/side-git-snapshot.js +160 -4
  269. package/template/wall-e/coding/snapshot-service.js +143 -1
  270. package/template/wall-e/coding/stream-processor.js +388 -34
  271. package/template/wall-e/coding/task-tool.js +141 -4
  272. package/template/wall-e/coding/tool-execution-controller.js +365 -0
  273. package/template/wall-e/coding/tool-registry.js +43 -5
  274. package/template/wall-e/coding/user-hooks.js +217 -0
  275. package/template/wall-e/coding-orchestrator.js +1330 -221
  276. package/template/wall-e/coding-prompts.js +20 -4
  277. package/template/wall-e/context/context-builder.js +15 -2
  278. package/template/wall-e/decision/confidence.js +1 -1
  279. package/template/wall-e/docs/coding-acceptance-contract.md +41 -0
  280. package/template/wall-e/docs/external-action-controller.md +26 -6
  281. package/template/wall-e/docs/telemetry-lifecycle.md +8 -2
  282. package/template/wall-e/embeddings.js +591 -53
  283. package/template/wall-e/external-action-controller.js +12 -0
  284. package/template/wall-e/http/auth.js +1 -0
  285. package/template/wall-e/http/chat-api.js +46 -11
  286. package/template/wall-e/http/model-admin.js +836 -34
  287. package/template/wall-e/lib/boot-profile.js +88 -0
  288. package/template/wall-e/lib/event-loop-monitor.js +93 -0
  289. package/template/wall-e/lib/service-health.js +194 -0
  290. package/template/wall-e/llm/anthropic.js +130 -5
  291. package/template/wall-e/llm/client.js +266 -63
  292. package/template/wall-e/llm/default-fallback.js +382 -0
  293. package/template/wall-e/llm/health.js +19 -0
  294. package/template/wall-e/llm/message-guard.js +78 -0
  295. package/template/wall-e/llm/model-catalog.js +252 -1
  296. package/template/wall-e/llm/openai.js +26 -4
  297. package/template/wall-e/llm/portkey-sync.js +654 -0
  298. package/template/wall-e/llm/provider-error.js +30 -2
  299. package/template/wall-e/llm/registry.js +5 -1
  300. package/template/wall-e/llm/request-compat.js +67 -0
  301. package/template/wall-e/loops/backfill.js +79 -23
  302. package/template/wall-e/loops/brain-optimize.js +67 -0
  303. package/template/wall-e/loops/ingest.js +25 -10
  304. package/template/wall-e/loops/question-digest.js +160 -0
  305. package/template/wall-e/loops/reflect.js +6 -4
  306. package/template/wall-e/loops/think.js +39 -12
  307. package/template/wall-e/mcp-server.js +318 -36
  308. package/template/wall-e/memory/ctm-context-client.js +52 -14
  309. package/template/wall-e/memory/ctm-operational-context.js +237 -0
  310. package/template/wall-e/memory/ctm-prompt-executions-client.js +128 -0
  311. package/template/wall-e/memory/ctm-session-context.js +111 -63
  312. package/template/wall-e/prompts/coding/deepseek.txt +3 -0
  313. package/template/wall-e/prompts/coding/gemini.txt +6 -0
  314. package/template/wall-e/prompts/coding/gpt.txt +6 -0
  315. package/template/wall-e/prompts/coding/local.txt +7 -0
  316. package/template/wall-e/runtime/decision-hooks.js +115 -0
  317. package/template/wall-e/runtime/devbox-gateway.js +82 -8
  318. package/template/wall-e/runtime/prompt-manifest.js +86 -0
  319. package/template/wall-e/runtime/tool-executor.js +269 -0
  320. package/template/wall-e/runtime/tool-result-envelope.js +138 -0
  321. package/template/wall-e/runtime/transcript-projection.js +60 -0
  322. package/template/wall-e/runtime/walle-runtime.js +224 -0
  323. package/template/wall-e/scripts/db-optimize/migrate.js +162 -0
  324. package/template/wall-e/scripts/db-optimize/recall-eval.js +117 -0
  325. package/template/wall-e/server.js +15 -0
  326. package/template/wall-e/session-files.js +9 -0
  327. package/template/wall-e/skills/_bundled/google-calendar/run.js +1 -1
  328. package/template/wall-e/skills/_bundled/gws-workspace/run.js +1 -1
  329. package/template/wall-e/skills/_bundled/slack-mentions/run.js +76 -6
  330. package/template/wall-e/skills/claude-code-reader.js +7 -3
  331. package/template/wall-e/skills/script-skill-runner.js +10 -0
  332. package/template/wall-e/skills/skill-planner.js +38 -0
  333. package/template/wall-e/tools/builtin-middleware.js +19 -9
  334. package/template/wall-e/tools/local-tools.js +1428 -16
  335. package/template/wall-e/tools/permission-checker.js +73 -5
  336. package/template/wall-e/tools/question-manager.js +117 -7
  337. package/template/wall-e/training/harvester.js +12 -28
  338. package/template/wall-e/training/replay.js +25 -80
  339. package/template/website/index.html +10 -10
  340. package/template/wall-e/eval/ab-test.js +0 -203
  341. package/template/wall-e/eval/agent-runner.js +0 -772
  342. package/template/wall-e/eval/agent-scorer.js +0 -461
  343. package/template/wall-e/eval/aggregator.js +0 -414
  344. package/template/wall-e/eval/allowed-test-commands.js +0 -34
  345. package/template/wall-e/eval/benchmark-generator.js +0 -113
  346. package/template/wall-e/eval/benchmarks/chat-eval.json +0 -1662
  347. package/template/wall-e/eval/benchmarks/chat.json +0 -82
  348. package/template/wall-e/eval/benchmarks/coding-agent-real.json +0 -1
  349. package/template/wall-e/eval/benchmarks/coding-agent.json +0 -1581
  350. package/template/wall-e/eval/benchmarks/coding.json +0 -122
  351. package/template/wall-e/eval/benchmarks/memory-retrieval.json +0 -234
  352. package/template/wall-e/eval/benchmarks/reasoning.json +0 -82
  353. package/template/wall-e/eval/benchmarks/swebench-lite-30.json +0 -212
  354. package/template/wall-e/eval/benchmarks.js +0 -669
  355. package/template/wall-e/eval/cc-replay.js +0 -719
  356. package/template/wall-e/eval/chat-eval.js +0 -525
  357. package/template/wall-e/eval/check-keys.js +0 -15
  358. package/template/wall-e/eval/check-providers.js +0 -42
  359. package/template/wall-e/eval/codex-cli-baseline.js +0 -669
  360. package/template/wall-e/eval/coding-agent-real.js +0 -570
  361. package/template/wall-e/eval/context-compactor.js +0 -251
  362. package/template/wall-e/eval/debug-agent003.js +0 -68
  363. package/template/wall-e/eval/diagnostics.js +0 -216
  364. package/template/wall-e/eval/eval-orchestrator.js +0 -642
  365. package/template/wall-e/eval/evaluate.js +0 -202
  366. package/template/wall-e/eval/evaluator.js +0 -373
  367. package/template/wall-e/eval/exporter.js +0 -212
  368. package/template/wall-e/eval/fixtures/express-basic/package.json +0 -9
  369. package/template/wall-e/eval/fixtures/express-basic/server.js +0 -115
  370. package/template/wall-e/eval/fixtures/express-basic/test.js +0 -83
  371. package/template/wall-e/eval/fixtures/express-buggy/package.json +0 -9
  372. package/template/wall-e/eval/fixtures/express-buggy/server.js +0 -113
  373. package/template/wall-e/eval/fixtures/express-buggy/test.js +0 -83
  374. package/template/wall-e/eval/fixtures/express-buggy-items/package.json +0 -9
  375. package/template/wall-e/eval/fixtures/express-buggy-items/server.js +0 -112
  376. package/template/wall-e/eval/fixtures/express-buggy-items/test.js +0 -83
  377. package/template/wall-e/eval/fixtures/express-buggy-search/package.json +0 -9
  378. package/template/wall-e/eval/fixtures/express-buggy-search/server.js +0 -121
  379. package/template/wall-e/eval/fixtures/express-buggy-search/test.js +0 -83
  380. package/template/wall-e/eval/fixtures/express-rename-data/data.js +0 -34
  381. package/template/wall-e/eval/fixtures/express-rename-data/package.json +0 -9
  382. package/template/wall-e/eval/fixtures/express-rename-data/server.js +0 -97
  383. package/template/wall-e/eval/fixtures/express-rename-data/test.js +0 -88
  384. package/template/wall-e/eval/fixtures/express-xss/package.json +0 -12
  385. package/template/wall-e/eval/fixtures/express-xss/server.js +0 -90
  386. package/template/wall-e/eval/fixtures/express-xss/test.js +0 -67
  387. package/template/wall-e/eval/fixtures/express-xss/views/profile.ejs +0 -9
  388. package/template/wall-e/eval/fixtures/fullstack-app/config/default.js +0 -9
  389. package/template/wall-e/eval/fixtures/fullstack-app/config/test.js +0 -13
  390. package/template/wall-e/eval/fixtures/fullstack-app/package.json +0 -11
  391. package/template/wall-e/eval/fixtures/fullstack-app/public/css/style.css +0 -137
  392. package/template/wall-e/eval/fixtures/fullstack-app/public/index.html +0 -46
  393. package/template/wall-e/eval/fixtures/fullstack-app/public/js/app.js +0 -121
  394. package/template/wall-e/eval/fixtures/fullstack-app/public/js/auth.js +0 -71
  395. package/template/wall-e/eval/fixtures/fullstack-app/public/js/items.js +0 -80
  396. package/template/wall-e/eval/fixtures/fullstack-app/public/js/users.js +0 -46
  397. package/template/wall-e/eval/fixtures/fullstack-app/public/login.html +0 -45
  398. package/template/wall-e/eval/fixtures/fullstack-app/public/register.html +0 -38
  399. package/template/wall-e/eval/fixtures/fullstack-app/scripts/migrate.js +0 -23
  400. package/template/wall-e/eval/fixtures/fullstack-app/scripts/seed.js +0 -46
  401. package/template/wall-e/eval/fixtures/fullstack-app/server/db.js +0 -99
  402. package/template/wall-e/eval/fixtures/fullstack-app/server/index.js +0 -94
  403. package/template/wall-e/eval/fixtures/fullstack-app/server/middleware/auth.js +0 -19
  404. package/template/wall-e/eval/fixtures/fullstack-app/server/middleware/logger.js +0 -19
  405. package/template/wall-e/eval/fixtures/fullstack-app/server/router.js +0 -50
  406. package/template/wall-e/eval/fixtures/fullstack-app/server/routes/auth.js +0 -69
  407. package/template/wall-e/eval/fixtures/fullstack-app/server/routes/health.js +0 -23
  408. package/template/wall-e/eval/fixtures/fullstack-app/server/routes/items.js +0 -88
  409. package/template/wall-e/eval/fixtures/fullstack-app/server/routes/users.js +0 -75
  410. package/template/wall-e/eval/fixtures/fullstack-app/server/test.js +0 -198
  411. package/template/wall-e/eval/fixtures/fullstack-app/server/utils/response.js +0 -34
  412. package/template/wall-e/eval/fixtures/fullstack-app/server/utils/validate.js +0 -26
  413. package/template/wall-e/eval/fixtures/fullstack-app/server.js +0 -8
  414. package/template/wall-e/eval/fixtures/fullstack-app/test.js +0 -12
  415. package/template/wall-e/eval/fixtures/monorepo-basic/package.json +0 -8
  416. package/template/wall-e/eval/fixtures/monorepo-basic/packages/api/data.js +0 -58
  417. package/template/wall-e/eval/fixtures/monorepo-basic/packages/api/middleware.js +0 -46
  418. package/template/wall-e/eval/fixtures/monorepo-basic/packages/api/package.json +0 -8
  419. package/template/wall-e/eval/fixtures/monorepo-basic/packages/api/routes.js +0 -64
  420. package/template/wall-e/eval/fixtures/monorepo-basic/packages/api/server.js +0 -56
  421. package/template/wall-e/eval/fixtures/monorepo-basic/packages/api/test.js +0 -116
  422. package/template/wall-e/eval/fixtures/monorepo-basic/packages/cli/commands.js +0 -61
  423. package/template/wall-e/eval/fixtures/monorepo-basic/packages/cli/index.js +0 -62
  424. package/template/wall-e/eval/fixtures/monorepo-basic/packages/cli/output.js +0 -43
  425. package/template/wall-e/eval/fixtures/monorepo-basic/packages/cli/package.json +0 -11
  426. package/template/wall-e/eval/fixtures/monorepo-basic/packages/cli/test.js +0 -44
  427. package/template/wall-e/eval/fixtures/monorepo-basic/packages/shared/formatters.js +0 -43
  428. package/template/wall-e/eval/fixtures/monorepo-basic/packages/shared/index.js +0 -12
  429. package/template/wall-e/eval/fixtures/monorepo-basic/packages/shared/package.json +0 -5
  430. package/template/wall-e/eval/fixtures/monorepo-basic/packages/shared/test.js +0 -55
  431. package/template/wall-e/eval/fixtures/monorepo-basic/packages/shared/validators.js +0 -29
  432. package/template/wall-e/eval/fixtures/monorepo-basic/test.js +0 -46
  433. package/template/wall-e/eval/fixtures/node-cli/index.js +0 -78
  434. package/template/wall-e/eval/fixtures/node-cli/package.json +0 -10
  435. package/template/wall-e/eval/fixtures/node-cli/test.js +0 -57
  436. package/template/wall-e/eval/fixtures/node-typed/package.json +0 -8
  437. package/template/wall-e/eval/fixtures/node-typed/src/handlers.js +0 -31
  438. package/template/wall-e/eval/fixtures/node-typed/src/utils.js +0 -33
  439. package/template/wall-e/eval/fixtures/node-typed/test.js +0 -36
  440. package/template/wall-e/eval/fixtures/python-flask/app.py +0 -14
  441. package/template/wall-e/eval/fixtures/python-flask/requirements.txt +0 -2
  442. package/template/wall-e/eval/fixtures/python-flask/test_app.py +0 -25
  443. package/template/wall-e/eval/fixtures/wall-e-subset/brain.js +0 -105
  444. package/template/wall-e/eval/fixtures/wall-e-subset/eval/aggregator.js +0 -101
  445. package/template/wall-e/eval/fixtures/wall-e-subset/eval/benchmarks/chat.json +0 -20
  446. package/template/wall-e/eval/fixtures/wall-e-subset/eval/benchmarks/coding.json +0 -32
  447. package/template/wall-e/eval/fixtures/wall-e-subset/eval/benchmarks.js +0 -64
  448. package/template/wall-e/eval/fixtures/wall-e-subset/eval/fixtures/simple-project/package.json +0 -6
  449. package/template/wall-e/eval/fixtures/wall-e-subset/eval/fixtures/simple-project/server.js +0 -31
  450. package/template/wall-e/eval/fixtures/wall-e-subset/eval/fixtures/simple-project/test.js +0 -18
  451. package/template/wall-e/eval/fixtures/wall-e-subset/eval/fixtures/simple-project/utils.js +0 -34
  452. package/template/wall-e/eval/fixtures/wall-e-subset/eval/runner.js +0 -104
  453. package/template/wall-e/eval/fixtures/wall-e-subset/eval/scorer.js +0 -73
  454. package/template/wall-e/eval/fixtures/wall-e-subset/eval/test.js +0 -134
  455. package/template/wall-e/eval/fixtures/wall-e-subset/llm/client.js +0 -99
  456. package/template/wall-e/eval/fixtures/wall-e-subset/llm/providers.js +0 -63
  457. package/template/wall-e/eval/fixtures/wall-e-subset/llm/test.js +0 -70
  458. package/template/wall-e/eval/fixtures/wall-e-subset/package.json +0 -10
  459. package/template/wall-e/eval/fixtures/wall-e-subset/test.js +0 -86
  460. package/template/wall-e/eval/harvester.js +0 -685
  461. package/template/wall-e/eval/head-to-head.js +0 -388
  462. package/template/wall-e/eval/humaneval-adapter.js +0 -321
  463. package/template/wall-e/eval/list-models.js +0 -31
  464. package/template/wall-e/eval/livecodebench-adapter.js +0 -291
  465. package/template/wall-e/eval/mail-integration.js +0 -443
  466. package/template/wall-e/eval/manifest.js +0 -186
  467. package/template/wall-e/eval/meta-harness/adapters/coding-agent.js +0 -57
  468. package/template/wall-e/eval/meta-harness/bootstrap-snapshot.js +0 -149
  469. package/template/wall-e/eval/meta-harness/candidate-store.js +0 -117
  470. package/template/wall-e/eval/meta-harness/cli.js +0 -86
  471. package/template/wall-e/eval/meta-harness/domain-spec.js +0 -154
  472. package/template/wall-e/eval/meta-harness/domains/coding-agent.domain.json +0 -84
  473. package/template/wall-e/eval/meta-harness/examples/env-bootstrap-candidate.js +0 -29
  474. package/template/wall-e/eval/meta-harness/experience-store.js +0 -174
  475. package/template/wall-e/eval/meta-harness/frontier.js +0 -96
  476. package/template/wall-e/eval/meta-harness/harness-interface.js +0 -90
  477. package/template/wall-e/eval/meta-harness/leakage-guard.js +0 -80
  478. package/template/wall-e/eval/meta-harness/optimizer.js +0 -207
  479. package/template/wall-e/eval/meta-harness/proposer-runner.js +0 -110
  480. package/template/wall-e/eval/meta-harness/reporting.js +0 -58
  481. package/template/wall-e/eval/meta-harness/telemetry.js +0 -27
  482. package/template/wall-e/eval/meta-harness/validation.js +0 -81
  483. package/template/wall-e/eval/promoter.js +0 -228
  484. package/template/wall-e/eval/provider-normalizer.js +0 -33
  485. package/template/wall-e/eval/replay.js +0 -395
  486. package/template/wall-e/eval/run-agent-benchmarks.js +0 -386
  487. package/template/wall-e/eval/run-codex-cli-baseline.js +0 -177
  488. package/template/wall-e/eval/run-coding-agent-real.js +0 -187
  489. package/template/wall-e/eval/run-eval.js +0 -435
  490. package/template/wall-e/eval/run-model-comparison.js +0 -142
  491. package/template/wall-e/eval/session-evaluator.js +0 -187
  492. package/template/wall-e/eval/session-miner.js +0 -207
  493. package/template/wall-e/eval/session-retrieval-benchmark.js +0 -150
  494. package/template/wall-e/eval/session-transcripts.js +0 -509
  495. package/template/wall-e/eval/shadow.js +0 -161
  496. package/template/wall-e/eval/swebench-adapter.js +0 -345
  497. package/template/wall-e/eval/swebench-docker.js +0 -192
  498. package/template/wall-e/eval/train.py +0 -320
  499. package/template/wall-e/eval/trainer.js +0 -232
  500. package/template/wall-e/eval/weekly-eval-loop.js +0 -241
@@ -1,669 +0,0 @@
1
- 'use strict';
2
-
3
- const fs = require('fs');
4
- const path = require('path');
5
- const crypto = require('crypto');
6
- const { createClient } = require('../llm/client');
7
- const { decorateBenchmarkResult, DEFAULT_SCORER_VERSION } = require('./manifest');
8
-
9
- // ============================================================
10
- // Constants
11
- // ============================================================
12
-
13
- const BENCHMARKS_DIR = path.join(__dirname, 'benchmarks');
14
- const VALID_TASK_TYPES = ['coding', 'chat', 'reasoning', 'memory-retrieval', 'coding-agent'];
15
- const VALID_DIFFICULTIES = ['easy', 'medium', 'hard'];
16
- const DEFAULT_TIMEOUT_MS = 60_000;
17
-
18
- // ============================================================
19
- // Trait scoring — regex/heuristic checks
20
- // ============================================================
21
-
22
- /**
23
- * Mapping from trait name to regex or function that tests for that trait.
24
- * Returns true if the response exhibits the trait.
25
- */
26
- const TRAIT_MATCHERS = {
27
- // --- Coding traits ---
28
- 'has code block': (r) => /```[\s\S]*?```/.test(r),
29
- 'defines function': (r) => /\b(function\s+\w+|const\s+\w+\s*=\s*(async\s+)?\(|def\s+\w+\s*\()/i.test(r),
30
- 'uses setTimeout': (r) => /setTimeout/i.test(r),
31
- 'uses clearTimeout': (r) => /clearTimeout/i.test(r),
32
- 'uses csv module': (r) => /\bcsv\b/i.test(r),
33
- 'returns list': (r) => /\breturn\b.*\[|list|List/i.test(r),
34
- 'handles headers': (r) => /header|column|field/i.test(r),
35
- 'uses async/await': (r) => /async\s+|await\s+/i.test(r),
36
- 'has retry logic': (r) => /retry|retries|attempt/i.test(r),
37
- 'has backoff': (r) => /backoff|exponential|delay\s*\*|Math\.pow/i.test(r),
38
- 'uses dynamic programming': (r) => /\bdp\b|dynamic\s+program|table|matrix|memo/i.test(r),
39
- 'uses binary search': (r) => /binary\s+search|low.*high|mid|left.*right/i.test(r),
40
- 'handles not found': (r) => /-1|not\s+found|None/i.test(r),
41
- 'returns number': (r) => /return\s+\d|return\s+\w+\s*(;|\n)/i.test(r),
42
- 'identifies null check missing': (r) => /null|undefined|optional\s+chaining|\?\.|guard|check/i.test(r),
43
- 'adds optional chaining or guard': (r) => /\?\.|&&|if\s*\(|guard|check/i.test(r),
44
- 'checks response status': (r) => /response\.(ok|status)|\.ok\b|status\s*(===|!==|==|!=)/i.test(r),
45
- 'identifies missing extend/concatenation': (r) => /extend|concat|\+\s*=|result\s*\+|\.extend|\.concat/i.test(r),
46
- 'fixes recursive call': (r) => /result\s*(=|\+=|\.extend|\.concat)|return.*flatten/i.test(r),
47
- 'explains the bug': (r) => /bug|issue|problem|because|result.*lost|not.*captured|discard/i.test(r),
48
- 'uses Map or linked list': (r) => /\bMap\b|linked\s*list|doubly/i.test(r),
49
- 'implements eviction': (r) => /evict|delete|remove.*oldest|remove.*least/i.test(r),
50
- 'has get and put methods': (r) => /\bget\s*\(|\bput\s*\(|\bset\s*\(/i.test(r),
51
- 'uses decorator pattern': (r) => /def\s+\w+\s*\(\s*func|@\w+|wrapper|wraps/i.test(r),
52
- 'tracks timestamps': (r) => /time|timestamp|datetime|clock/i.test(r),
53
- 'raises exception': (r) => /raise|throw|Error|Exception/i.test(r),
54
- 'handles arrays': (r) => /Array\.isArray|instanceof\s+Array|\barray\b.*concat|\.concat/i.test(r),
55
- 'handles nested objects': (r) => /typeof.*object|recursive|nested|deep/i.test(r),
56
- 'handles primitives': (r) => /string|number|boolean|primitive|typeof/i.test(r),
57
- 'separates validation': (r) => /validat|middleware|schema|joi|zod|check.*input/i.test(r),
58
- 'extracts service layer': (r) => /service|layer|separate.*concern|module|extract/i.test(r),
59
- 'adds try-catch': (r) => /try\s*\{|catch\s*\(|error\s*handl/i.test(r),
60
- 'improves structure': (r) => /refactor|separate|extract|clean|modular/i.test(r),
61
- 'implements all four methods': (r) => /\bon\s*\(.*\)|\boff\s*\(.*\)|\bonce\s*\(.*\)|\bemit\s*\(.*\)/i.test(r),
62
- 'handles once correctly': (r) => /once|removeListener|off.*after|single\s*invocation/i.test(r),
63
- 'mentions max listeners or cleanup': (r) => /max.*listener|cleanup|leak|removeAll|memory/i.test(r),
64
- 'has concurrency limit': (r) => /concurren|parallel.*limit|max.*running|semaphore/i.test(r),
65
- 'has priority support': (r) => /priorit|queue.*sort|high.*low|urgency/i.test(r),
66
- 'has cancellation': (r) => /cancel|abort|remove.*pending|clear.*queue/i.test(r),
67
- 'has jsdoc': (r) => /\/\*\*[\s\S]*?\*\/|@param|@returns/i.test(r),
68
- 'uses context manager': (r) => /__enter__|__exit__|with\s+|contextmanager/i.test(r),
69
- 'handles rollback': (r) => /rollback|ROLLBACK/i.test(r),
70
- 'supports nesting': (r) => /savepoint|nested|SAVEPOINT/i.test(r),
71
- 'mentions heap snapshots': (r) => /heap\s*snapshot|--inspect|heapdump|v8/i.test(r),
72
- 'identifies unbounded cache': (r) => /unbounded|grow.*without|no.*eviction|plain\s*object.*cache/i.test(r),
73
- 'suggests WeakMap or LRU': (r) => /WeakMap|LRU|lru-cache|bounded/i.test(r),
74
- 'mentions event listener leaks': (r) => /event\s*listener.*leak|removeListener|off\(|maxListeners/i.test(r),
75
- 'handles empty strings': (r) => /empty\s*string|''|""|\.length\s*===?\s*0|!str/i.test(r),
76
-
77
- // --- Chat traits ---
78
- 'greeting': (r) => /\b(hi|hello|hey|welcome|greetings)\b/i.test(r),
79
- 'lists capabilities': (r) => /can\s+help|able\s+to|assist.*with|I\s+can/i.test(r),
80
- 'friendly tone': (r) => /glad|happy|great|wonderful|!\s|welcome/i.test(r),
81
- 'invites follow-up': (r) => /feel\s+free|let\s+me\s+know|any.*question|don't\s+hesitate|anything\s+else/i.test(r),
82
- 'empathetic tone': (r) => /understand|sounds.*tough|sorry\s+to\s+hear|that.*challenging|must\s+be/i.test(r),
83
- 'actionable advice': (r) => /try\s|consider|suggest|recommend|step|could\s|should\s|here'?s?\s*(what|how)/i.test(r),
84
- 'suggests prioritization': (r) => /prioriti|important.*first|urgent|rank|triage|eisenhower/i.test(r),
85
- 'concise': (r) => r.length < 2000,
86
- 'uses analogy or simple language': (r) => /like\s+a|think\s+of\s+it|imagine|analogy|simply\s+put|in\s+other\s+words/i.test(r),
87
- 'mentions HTTP methods': (r) => /GET|POST|PUT|DELETE|PATCH|HTTP\s+method/i.test(r),
88
- 'accurate': (_r) => true, // can't verify accuracy heuristically — always true as baseline
89
- 'clear distinction': (r) => /difference|while|whereas|on\s+the\s+other\s+hand|in\s+contrast|unlike/i.test(r),
90
- 'uses examples': (r) => /for\s+example|such\s+as|e\.g\.|instance|like\s+\w+/i.test(r),
91
- 'suggests ownership': (r) => /own\s+up|take\s+responsib|acknowledge|admit|transparent/i.test(r),
92
- 'constructive framing': (r) => /learn|grow|opportunity|moving\s+forward|improve|next\s+time/i.test(r),
93
- 'recommends specific language': (r) => /JavaScript|TypeScript|Python|HTML|CSS/i.test(r),
94
- 'explains reasoning': (r) => /because|reason|since|this\s+is\s+why|due\s+to/i.test(r),
95
- 'mentions learning resources': (r) => /tutorial|course|documentation|freeCodeCamp|MDN|book|resource|Udemy|YouTube/i.test(r),
96
- 'acknowledges thanks': (r) => /you're\s+welcome|glad|happy\s+to|my\s+pleasure|no\s+problem|anytime/i.test(r),
97
- 'acknowledges disagreement': (r) => /valid\s+point|understand.*perspective|fair\s+point|you're\s+right|good\s+point|that\s+makes\s+sense/i.test(r),
98
- 'validates their point': (r) => /valid|good\s+point|makes\s+sense|right|agree|fair/i.test(r),
99
- 'non-defensive': (r) => !/wrong|incorrect|actually\s+no|you\s+should\s+have/i.test(r),
100
- 'constructive': (r) => /consider|suggest|might|could|option|alternative/i.test(r),
101
- 'lists pros and cons': (r) => /pro|con|advantage|disadvantage|benefit|drawback|\+\s|✓|✗|-\s/i.test(r),
102
- 'covers both sides': (r) => /remote.*office|office.*remote|both|on\s+the\s+other|however/i.test(r),
103
- 'uses structure': (r) => /^(\s*[-*]\s|\s*\d+[\.\)]\s|#{1,3}\s)/m.test(r),
104
- 'celebrates achievement': (r) => /congrat|awesome|amazing|fantastic|great\s+job|well\s+done|exciting/i.test(r),
105
- 'enthusiastic tone': (r) => /!|exciting|love|fantastic|wonderful|awesome|amazing/i.test(r),
106
- 'encouraging': (r) => /keep\s+going|great\s+start|proud|milestone|first\s+of\s+many/i.test(r),
107
-
108
- // --- Reasoning traits ---
109
- 'multiple options considered': (r) => /option|alternative|approach|choice|comparison|versus|vs\./i.test(r),
110
- 'pros and cons': (r) => /pro|con|advantage|disadvantage|trade-?off|benefit|drawback/i.test(r),
111
- 'numbered steps': (r) => /^\s*\d+[\.\)]\s/m.test(r),
112
- 'step-by-step': (r) => /step\s+\d|first.*then.*finally|step-by-step|^\s*\d+[\.\)]/mi.test(r),
113
- 'conclusion': (r) => /recommend|conclusion|therefore|in\s+summary|overall|my\s+suggestion|I('d|\s+would)\s+(go|choose|recommend|suggest)/i.test(r),
114
- 'considers use case fit': (r) => /use\s+case|depends\s+on|your\s+scenario|for\s+your|requirements/i.test(r),
115
- 'considers team size': (r) => /team\s+size|small\s+team|5\s+developer|developer|staffing/i.test(r),
116
- 'correct solution': (_r) => true, // heuristic can't verify — baseline true
117
- 'correct conclusion': (r) => /conclude|conclusion|therefore|so\s+the\s+answer|result\s+is/i.test(r),
118
- 'explains why': (r) => /because|reason|since|this\s+is\s+why|due\s+to|explains/i.test(r),
119
- 'explains constraints': (r) => /constraint|rule|cannot|must\s+not|only\s+holds/i.test(r),
120
- 'checks logs first': (r) => /log|logging|check.*log|grep.*log|tail/i.test(r),
121
- 'considers deployment changes': (r) => /deploy|rollback|diff|last\s+week|recent\s+change|what\s+changed/i.test(r),
122
- 'systematic approach': (r) => /systematic|methodical|step.*step|first.*then|diagnos/i.test(r),
123
- 'mentions monitoring': (r) => /monitor|alert|metric|dashboard|grafana|datadog|observ/i.test(r),
124
- 'identifies logical fallacy': (r) => /fallacy|cannot\s+conclude|does\s+not\s+follow|invalid|not\s+necessarily/i.test(r),
125
- 'phased approach': (r) => /phase\s+\d|stage\s+\d|first\s+phase|phase\s+1|incremental/i.test(r),
126
- 'considers risk': (r) => /risk|careful|fallback|rollback|gradual|safety/i.test(r),
127
- 'mentions strangler pattern': (r) => /strangler|strangler\s+fig|facade|proxy.*route|incremental.*migrat/i.test(r),
128
- 'realistic timeline': (r) => /week|month|sprint|timeline|3\s+months|quarter/i.test(r),
129
- 'considers age and risk': (r) => /age|30|risk\s+tolerance|time\s+horizon|young|long.term/i.test(r),
130
- 'specific allocation': (r) => /\d+\s*%|percent|allocation|split|ratio/i.test(r),
131
- 'uses heat trick': (r) => /heat|warm|hot|temperature|touch|feel/i.test(r),
132
- 'explains logic': (r) => /because|therefore|since|so\s+we\s+know|this\s+means/i.test(r),
133
- 'considers scale': (r) => /scale|50K|growth|users|traffic/i.test(r),
134
- 'shows calculation': (r) => /\d+\s*[\+\-\*\/\%]\s*\d+|=\s*\d+|P\s*\(|probability/i.test(r),
135
- 'uses inclusion-exclusion': (r) => /inclusion.exclusion|union|P\(A\s*(∪|or|OR|\|).*B\)|60\s*\+\s*50\s*-\s*30|80/i.test(r),
136
- 'correct answer': (r) => /20\s*%|0\.2|20\s+percent/i.test(r),
137
-
138
- // --- Memory-retrieval traits ---
139
- 'references context': (r) => true, // baseline — the real check is the specific facts
140
- 'mentions Rust': (r) => /\bRust\b/i.test(r),
141
- 'mentions Phoenix': (r) => /\bPhoenix\b/i.test(r),
142
- 'accurate extraction': (_r) => true, // can't auto-verify — baseline true
143
- 'mentions Thursday meeting': (r) => /Thursday|Thurs/i.test(r),
144
- 'mentions Friday report': (r) => /Friday|Fri/i.test(r),
145
- 'mentions FastAPI': (r) => /FastAPI/i.test(r),
146
- 'mentions Python': (r) => /\bPython\b/i.test(r),
147
- 'not hallucinated': (_r) => true, // can't auto-detect hallucination — baseline true
148
- 'recommends Knex or raw SQL': (r) => /Knex|raw\s+SQL|query\s+builder/i.test(r),
149
- 'mentions ORM preference': (r) => /ORM|ActiveRecord|prefer|dislike/i.test(r),
150
- 'respects user preference': (r) => /prefer|based\s+on|you\s+(mentioned|said|noted)|previous/i.test(r),
151
- 'mentions on-premises requirement': (r) => /on.prem|no.cloud|self.host|local|sensitive\s+data/i.test(r),
152
- 'suggests self-hosted options': (r) => /ELK|Elasticsearch|Loki|Grafana|Graylog|Fluentd|self.host/i.test(r),
153
- 'mentions OrderService': (r) => /OrderService/i.test(r),
154
- 'mentions port 3002': (r) => /3002/.test(r),
155
- 'mentions AuthService for SLA': (r) => /AuthService.*(?:critical|SLA|uptime|highest)|(?:critical|SLA|uptime|highest).*AuthService/i.test(r),
156
- 'mentions write-through': (r) => /write.through/i.test(r),
157
- 'explains rejected approaches': (r) => /cache\s+miss|lost\s+on\s+restart|TTL|in.memory/i.test(r),
158
- 'acknowledges missing info': (r) => /not\s+mention|no\s+information|haven't\s+shared|don't\s+have|not\s+specified|unclear/i.test(r),
159
- 'does not hallucinate': (_r) => true, // baseline
160
- 'asks for clarification': (r) => /could\s+you|what.*use|which.*tool|tell\s+me|please\s+share|can\s+you\s+share|\?/i.test(r),
161
- 'correct branch name format': (r) => /feat\/PROJ-456/i.test(r),
162
- 'mentions conventional commits': (r) => /conventional\s+commit|feat:|fix:|chore:/i.test(r),
163
- 'mentions no force push to main': (r) => /force\s+push|--force|never.*push.*main|no.*force/i.test(r),
164
- 'lists all five preferences': (r) => /dark\s+mode|timezone|America\/Los_Angeles|English|daily\s+digest|verbose/i.test(r),
165
- 'searches session memory': (r) => /session|memory|remember|transcript|source|found/i.test(r),
166
- 'mentions parser.js': (r) => /parser\.js/i.test(r),
167
- 'mentions node --test': (r) => /node\s+--test/i.test(r),
168
- 'cites session id': (r) => /(?:session|source)[\s_-]?id|codex:sanitized|claude:sanitized|walle:sanitized|sanitized-[\w-]+/i.test(r),
169
- 'mentions lock contention': (r) => /lock\s+contention/i.test(r),
170
- 'mentions queue-worker.js': (r) => /queue-worker\.js/i.test(r),
171
- 'mentions codex-blank-space.spec.js': (r) => /codex-blank-space\.spec\.js/i.test(r),
172
- 'mentions blank gap': (r) => /blank[-\s]?gap/i.test(r),
173
- 'searches diary': (r) => /diary|agent diary|remember|memory|source/i.test(r),
174
- 'mentions router inputs': (r) => /router\s+inputs|routing.*inputs/i.test(r),
175
- 'mentions evaluation': (r) => /evaluation|eval|trusted\s+evaluation/i.test(r),
176
- 'cites diary/session id': (r) => /diary|session[\s_-]?id|sanitized-quorum|source[\s_-]?id/i.test(r),
177
- 'mentions gemini-jsonl': (r) => /gemini-jsonl/i.test(r),
178
- 'mentions pii_potential': (r) => /pii_potential/i.test(r),
179
- 'says do not replace SQLite': (r) => /do\s+not\s+(?:adopt|replace|use).*SQLite|keep\s+SQLite|SQLite.*not\s+replace/i.test(r),
180
- 'mentions sqlite-vec': (r) => /sqlite-vec/i.test(r),
181
- 'does not hallucinate approval': (r) => /do\s+not\s+(?:adopt|replace)|no\s+approval|not\s+approved|rejected|keep\s+SQLite/i.test(r),
182
- 'uses Wall-E memory': (r) => /Wall-?E|memory|remember|source|retriev|context/i.test(r),
183
- 'mentions direct': (r) => /\bdirect\b|concise|straightforward/i.test(r),
184
- 'mentions evidence': (r) => /evidence|cite|source|verified|proof/i.test(r),
185
- 'mentions thorough verification': (r) => /thorough\s+verification|verify|validated|test|evidence/i.test(r),
186
- 'does not search public web first': (r) => /before\s+public\s+web|not\s+(?:search|use).*public|Wall-?E.*first|memory.*first/i.test(r),
187
- 'mentions colleague context': (r) => /colleague|work\s+context|planning|team\s+strategy|prioriti[sz]ation/i.test(r),
188
- 'cites memory evidence': (r) => /source|source[_\s-]?id|memory|evidence|person:sanitized|sanitized-casey/i.test(r),
189
-
190
- // --- Coding-agent traits ---
191
- 'uses edit over write': (r) => /edit_file|apply_patch|multi_edit/i.test(r) && !/write_file/i.test(r),
192
- 'reads before writing': (r) => {
193
- const readIdx = r.search(/read_file|glob|grep_files/i);
194
- const writeIdx = r.search(/write_file|edit_file|apply_patch/i);
195
- return readIdx >= 0 && writeIdx >= 0 && readIdx < writeIdx;
196
- },
197
- 'runs tests after changes': (r) => {
198
- const editIdx = r.search(/edit_file|write_file|apply_patch/i);
199
- const testIdx = r.search(/npm test|pytest|run_shell.*test/i);
200
- return editIdx >= 0 && testIdx >= 0 && editIdx < testIdx;
201
- },
202
- 'uses LSP diagnostics': (r) => /lsp_diagnostics|lsp_symbols/i.test(r),
203
- 'plans before executing': (r) => {
204
- const planIdx = r.search(/update_todos|plan|step\s*1/i);
205
- const execIdx = r.search(/edit_file|write_file/i);
206
- return planIdx >= 0 && execIdx >= 0 && planIdx < execIdx;
207
- },
208
- 'efficient tool use': (r) => {
209
- const toolCalls = (r.match(/\b(read_file|write_file|edit_file|run_shell|glob|grep_files)\b/gi) || []);
210
- return toolCalls.length > 0 && toolCalls.length <= 15;
211
- },
212
- 'creates todos': (r) => /update_todos/i.test(r),
213
- 'uses glob for discovery': (r) => /glob/i.test(r),
214
- 'uses grep for search': (r) => /grep_files/i.test(r),
215
- 'handles errors gracefully': (r) => /error|catch|try|failed|retry/i.test(r),
216
- 'multi-file coordination': (r) => {
217
- const files = new Set((r.match(/(?:read_file|edit_file|write_file).*?['"]([^'"]+)['"]/gi) || []).map(m => m.match(/['"]([^'"]+)['"]/)?.[1]).filter(Boolean));
218
- return files.size >= 2;
219
- },
220
-
221
- // --- Tool diversity traits (coding-agent) ---
222
- 'uses lsp tools': (r) => /lsp_references|lsp_definition|lsp_diagnostics|lsp_symbols|lsp_hover|lsp_implementation/i.test(r),
223
- 'uses grep before edit': (r) => {
224
- const grepIdx = r.search(/grep_files|Grep/i);
225
- const editIdx = r.search(/edit_file|Edit/i);
226
- return grepIdx >= 0 && editIdx >= 0 && grepIdx < editIdx;
227
- },
228
- 'minimal file writes': (r) => {
229
- const writes = (r.match(/\b(write_file|Write)\b/gi) || []).length;
230
- const edits = (r.match(/\b(edit_file|Edit)\b/gi) || []).length;
231
- // Prefer edits over writes; penalize if writes > edits
232
- return edits > 0 && writes <= edits;
233
- },
234
- 'uses search before write': (r) => {
235
- const searchIdx = r.search(/grep_files|glob|Grep|Glob/i);
236
- const writeIdx = r.search(/write_file|edit_file|Write|Edit/i);
237
- return searchIdx >= 0 && writeIdx >= 0 && searchIdx < writeIdx;
238
- },
239
- 'asks clarifying questions': (r) => /ask_user|AskUserQuestion/i.test(r),
240
- };
241
-
242
- const UNSCORABLE_TRAITS = new Set([
243
- 'accurate',
244
- 'correct solution',
245
- 'references context',
246
- 'accurate extraction',
247
- 'not hallucinated',
248
- 'does not hallucinate',
249
- ]);
250
-
251
- // ============================================================
252
- // Suite loading
253
- // ============================================================
254
-
255
- /**
256
- * List available benchmark suite names.
257
- * @returns {string[]}
258
- */
259
- // Suites with a non-standard schema that needs a dedicated loader rather
260
- // than the generic validator below (e.g. SWE-bench files use the upstream
261
- // Princeton/SWE-bench schema with `instance_id` instead of `id`).
262
- const ALT_SCHEMA_SUITE_PREFIXES = ['swebench-', 'swebench_'];
263
-
264
- function listBenchmarkSuites() {
265
- if (!fs.existsSync(BENCHMARKS_DIR)) return [];
266
- return fs.readdirSync(BENCHMARKS_DIR)
267
- .filter((f) => f.endsWith('.json'))
268
- .map((f) => f.replace(/\.json$/, ''))
269
- .filter((name) => !ALT_SCHEMA_SUITE_PREFIXES.some((prefix) => name.startsWith(prefix)));
270
- }
271
-
272
- /**
273
- * Load and validate a benchmark suite by name.
274
- * @param {string} suiteName - e.g. 'coding', 'chat'
275
- * @returns {{ name: string, prompts: object[] }}
276
- */
277
- function loadBenchmarkSuite(suiteName) {
278
- const filePath = path.join(BENCHMARKS_DIR, `${suiteName}.json`);
279
- if (!fs.existsSync(filePath)) {
280
- throw new Error(`Benchmark suite not found: ${suiteName} (looked in ${filePath})`);
281
- }
282
-
283
- const raw = fs.readFileSync(filePath, 'utf-8');
284
- let prompts;
285
- try {
286
- prompts = JSON.parse(raw);
287
- } catch (e) {
288
- throw new Error(`Invalid JSON in benchmark suite ${suiteName}: ${e.message}`);
289
- }
290
-
291
- if (!Array.isArray(prompts) || prompts.length === 0) {
292
- throw new Error(`Benchmark suite ${suiteName} must be a non-empty array`);
293
- }
294
-
295
- // Skip in-file metadata entries (e.g. {"_comment": "Section A: ..."} markers
296
- // used to annotate sections of the JSON). They are not benchmarks.
297
- prompts = prompts.filter((entry) => entry && typeof entry === 'object' && entry._comment === undefined);
298
-
299
- for (const entry of prompts) {
300
- if (!entry.id || typeof entry.id !== 'string') {
301
- throw new Error(`Benchmark entry missing valid 'id' in suite ${suiteName}`);
302
- }
303
- // prompt must be a string; empty string is allowed for explicit
304
- // edge-case tests (e.g. ce-I1 tagged ["edge-case","empty"] — the
305
- // empty input IS the test).
306
- if (typeof entry.prompt !== 'string') {
307
- throw new Error(`Benchmark entry ${entry.id} missing valid 'prompt'`);
308
- }
309
- if (!VALID_TASK_TYPES.includes(entry.taskType)) {
310
- throw new Error(`Benchmark entry ${entry.id} has invalid taskType: ${entry.taskType}`);
311
- }
312
- if (!VALID_DIFFICULTIES.includes(entry.difficulty)) {
313
- throw new Error(`Benchmark entry ${entry.id} has invalid difficulty: ${entry.difficulty}`);
314
- }
315
- // Scoring signal must come from at least one of:
316
- // - expectedTraits (regex/heuristic matchers)
317
- // - expectedInReply / forbiddenInReply (substring/regex on the reply)
318
- // - expectedTools / forbiddenTools (tool-routing scoring)
319
- // - mockToolResults (tool-call shape scoring)
320
- // - agentExpectations (coding-agent: expected tools/files/tests)
321
- // - edge-case / adversarial marker (the absence/refusal is the test)
322
- const hasTraits = Array.isArray(entry.expectedTraits) && entry.expectedTraits.length > 0;
323
- const hasReplyChecks = (Array.isArray(entry.expectedInReply) && entry.expectedInReply.length > 0) ||
324
- (Array.isArray(entry.forbiddenInReply) && entry.forbiddenInReply.length > 0);
325
- const hasToolChecks = (Array.isArray(entry.expectedTools) && entry.expectedTools.length > 0) ||
326
- (Array.isArray(entry.forbiddenTools) && entry.forbiddenTools.length > 0);
327
- const hasMockTools = entry.mockToolResults && typeof entry.mockToolResults === 'object' &&
328
- Object.keys(entry.mockToolResults).length > 0;
329
- const hasAgentExpectations = entry.agentExpectations && typeof entry.agentExpectations === 'object';
330
- const isEdgeCase = entry.category === 'edge-case' || entry.category === 'adversarial' ||
331
- (Array.isArray(entry.tags) && (entry.tags.includes('edge-case') || entry.tags.includes('adversarial')));
332
- if (!hasTraits && !hasReplyChecks && !hasToolChecks && !hasMockTools && !hasAgentExpectations && !isEdgeCase) {
333
- throw new Error(`Benchmark entry ${entry.id} has no scoring signal (expectedTraits / expectedInReply / expectedTools / mockToolResults / agentExpectations)`);
334
- }
335
- if (hasTraits) {
336
- const unknownTraits = entry.expectedTraits.filter(t => !TRAIT_MATCHERS[t] && !UNSCORABLE_TRAITS.has(t));
337
- if (unknownTraits.length) {
338
- throw new Error(`Benchmark entry ${entry.id} has unknown expectedTraits: ${unknownTraits.join(', ')}`);
339
- }
340
- }
341
- }
342
-
343
- return { name: suiteName, prompts };
344
- }
345
-
346
- // ============================================================
347
- // Trait scoring
348
- // ============================================================
349
-
350
- /**
351
- * Check if a response exhibits a single trait.
352
- * @param {string} response - LLM response text
353
- * @param {string} trait - Trait name to check
354
- * @returns {boolean}
355
- */
356
- function scoreTrait(response, trait) {
357
- if (!response || typeof response !== 'string') return false;
358
- if (UNSCORABLE_TRAITS.has(trait)) return false;
359
- const matcher = TRAIT_MATCHERS[trait];
360
- if (!matcher) return false;
361
- return matcher(response);
362
- }
363
-
364
- function scoreTraitsDetailed(response, expectedTraits) {
365
- const detail = {
366
- score: 0,
367
- matched: [],
368
- missed: [],
369
- unscored: [],
370
- unknown: [],
371
- scoredCount: 0,
372
- };
373
- if (!expectedTraits || expectedTraits.length === 0) return detail;
374
-
375
- for (const trait of expectedTraits) {
376
- if (UNSCORABLE_TRAITS.has(trait)) {
377
- detail.unscored.push(trait);
378
- continue;
379
- }
380
- if (!TRAIT_MATCHERS[trait]) {
381
- detail.unknown.push(trait);
382
- continue;
383
- }
384
- detail.scoredCount++;
385
- if (scoreTrait(response, trait)) detail.matched.push(trait);
386
- else detail.missed.push(trait);
387
- }
388
-
389
- detail.score = detail.scoredCount > 0 ? detail.matched.length / detail.scoredCount : 0;
390
- return detail;
391
- }
392
-
393
- /**
394
- * Score a response against multiple expected traits.
395
- * @param {string} response - LLM response text
396
- * @param {string[]} expectedTraits - Array of trait names
397
- * @returns {number} 0.0 to 1.0 based on percentage of traits matched
398
- */
399
- function scoreTraits(response, expectedTraits) {
400
- return scoreTraitsDetailed(response, expectedTraits).score;
401
- }
402
-
403
- // ============================================================
404
- // Benchmark runner
405
- // ============================================================
406
-
407
/**
 * Run a benchmark suite against one or more providers.
 *
 * Every prompt in the suite is sent to every provider sequentially. Each
 * response is scored against the prompt's expected traits and, when a judge
 * function is supplied and a response was produced, by the judge as well.
 * Provider failures, judge failures and storage failures are all non-fatal:
 * the failure is recorded (or ignored) and the run continues.
 *
 * @param {object} brain - Brain instance; results are persisted only when it
 *   exposes an `insertBenchmarkResult` method
 * @param {object} options
 * @param {string} options.suite - Suite name to run
 * @param {Array<{type: string, model: string, config?: object}>} options.providers - Providers to test
 * @param {Function} [options.judgeFn] - Optional LLM-as-judge function: (prompt, response) => { score, feedback }
 * @param {number} [options.timeoutMs] - Per-prompt timeout in ms (default: DEFAULT_TIMEOUT_MS)
 * @param {AbortSignal} [options.signal] - Abort signal to cancel the run
 * @returns {Promise<{ runId: string, results: object[], leaderboard: object }>}
 * @throws {Error} When `suite` is missing or `providers` is missing/empty
 */
async function runBenchmark(brain, { suite, providers, judgeFn, timeoutMs = DEFAULT_TIMEOUT_MS, signal } = {}) {
  if (!suite) throw new Error('suite is required');
  if (!providers || providers.length === 0) throw new Error('providers is required');

  const runId = crypto.randomUUID();
  const { prompts } = loadBenchmarkSuite(suite);
  const results = [];
  const providerScores = {}; // { providerKey: { total, count, errors } }

  for (const entry of prompts) {
    if (signal?.aborted) break;

    for (const provider of providers) {
      if (signal?.aborted) break;

      const providerKey = `${provider.type}/${provider.model}`;
      if (!providerScores[providerKey]) {
        providerScores[providerKey] = { total: 0, count: 0, errors: 0 };
      }

      let response = null;
      let latencyMs = 0;
      let error = null;
      let usage = null;
      let timer;

      // One controller per call: aborted by the timeout or by the caller's signal.
      const abortCtl = new AbortController();
      const onExternalAbort = () => abortCtl.abort();

      try {
        const client = createClient(provider.type, provider.config || {});
        timer = setTimeout(() => abortCtl.abort(), timeoutMs);

        // Combine external signal with timeout
        if (signal) {
          signal.addEventListener('abort', onExternalAbort, { once: true });
        }

        const startTime = Date.now();
        const result = await client.chat({
          model: provider.model,
          messages: [{ role: 'user', content: entry.prompt }],
          maxTokens: 2048,
          signal: abortCtl.signal,
        });
        latencyMs = Date.now() - startTime;

        response = result.content || '';
        usage = result.usage || null;
      } catch (err) {
        // `err` may be any thrown value, including null/undefined.
        error = err?.message || String(err);
        providerScores[providerKey].errors++;
      } finally {
        clearTimeout(timer);
        // Detach from the caller's signal so a long run does not accumulate
        // one listener (and one retained controller) per prompt/provider pair.
        if (signal) {
          signal.removeEventListener?.('abort', onExternalAbort);
        }
      }

      // Score traits. Some dataset traits are intentionally marked unscorable:
      // they document desired behavior but must not inflate automatic scores.
      // A failed or empty response is scored as the empty string.
      const traitDetail = scoreTraitsDetailed(response || '', entry.expectedTraits);
      const traitScore = traitDetail.score;
      const matchedTraits = traitDetail.matched;

      // Optional LLM judge
      let judgeScore = null;
      let judgeFeedback = null;
      if (judgeFn && response) {
        try {
          const judgeResult = await judgeFn(entry.prompt, response);
          judgeScore = judgeResult.score;
          judgeFeedback = judgeResult.feedback;
        } catch (_err) {
          // judge failure is non-fatal
        }
      }

      // Composite score: trait score (weight 0.6) + judge score (weight 0.4) when judge available
      const hasJudgeScore = judgeScore != null;
      const compositeScore = hasJudgeScore
        ? traitScore * 0.6 + judgeScore * 0.4
        : traitScore;
      const scoringMethod = hasJudgeScore
        ? 'trait+judge'
        : traitDetail.scoredCount > 0 ? 'traits' : 'unscored-traits';
      // Only judged, error-free results are marked trusted.
      const trusted = !error && hasJudgeScore;

      providerScores[providerKey].total += compositeScore;
      providerScores[providerKey].count++;

      // Map chat scoring onto the dimension rubric so the leaderboard's
      // dimensional view has data to show. Chat is single-turn — agent-loop
      // dims (toolEfficiency, turnEconomy, iterativeRefinement, etc.) don't
      // apply, so leave those undefined. Aggregator skips undefined dims.
      const dimensions = {
        correctness: traitScore,
        ...(hasJudgeScore ? { codeQuality: judgeScore } : {}),
      };

      const resultEntry = decorateBenchmarkResult({
        runId,
        suite,
        promptId: entry.id,
        taskType: entry.taskType,
        difficulty: entry.difficulty,
        provider: provider.type,
        model: provider.model,
        prompt: entry.prompt,
        response,
        traitScore,
        matchedTraits,
        judgeScore,
        judgeFeedback,
        compositeScore,
        latencyMs,
        usage,
        inputTokens: usage?.input ?? usage?.prompt_tokens ?? null,
        outputTokens: usage?.output ?? usage?.completion_tokens ?? null,
        genTokPerSec: usage?.genTokPerSec ?? null,
        dimensionsJson: JSON.stringify(dimensions),
        modelMetadataJson: JSON.stringify({
          matchedTraits,
          missedTraits: traitDetail.missed,
          unscoredTraits: traitDetail.unscored,
          unknownTraits: traitDetail.unknown,
          scoredTraitCount: traitDetail.scoredCount,
        }),
        error,
        scorerVersion: DEFAULT_SCORER_VERSION,
        scoringMethod,
        trusted,
        runConfig: { timeoutMs },
        timestamp: new Date().toISOString(),
      }, {
        suite,
        benchmark: entry,
        runId,
        provider: provider.type,
        model: provider.model,
        scoringMethod,
        scorerVersion: DEFAULT_SCORER_VERSION,
        trusted,
        runConfig: { timeoutMs },
      });

      results.push(resultEntry);

      // Persist to brain
      if (brain && typeof brain.insertBenchmarkResult === 'function') {
        try {
          brain.insertBenchmarkResult(resultEntry);
        } catch (_err) {
          // storage failure is non-fatal
        }
      }
    }
  }

  // Build leaderboard
  const leaderboard = {};
  for (const [key, data] of Object.entries(providerScores)) {
    leaderboard[key] = {
      avgScore: data.count > 0 ? data.total / data.count : 0,
      totalPrompts: data.count,
      errors: data.errors,
    };
  }

  return { runId, results, leaderboard };
}
585
-
586
- // ============================================================
587
- // Leaderboard aggregation
588
- // ============================================================
589
-
590
/**
 * Build a leaderboard by aggregating benchmark results stored in the brain.
 *
 * Results are grouped under a `provider/model` key. Every row counts toward
 * the run total and the average; a missing/falsy `compositeScore` contributes
 * 0. Rows with a truthy `error` are additionally counted as errors.
 *
 * Expected brain method signature:
 *   brain.getBenchmarkResults({ suite, days }) => Array<{ provider, model, compositeScore, error }>
 *
 * @param {object} brain - Brain instance
 * @param {object} options
 * @param {string} [options.suite] - Filter by suite name
 * @param {number} [options.days] - Filter to last N days
 * @returns {object} Leaderboard: { 'provider/model': { avgScore, totalRuns, errors } }
 * @throws {Error} When `brain` is missing or has no `getBenchmarkResults` method
 */
function getBenchmarkLeaderboard(brain, { suite, days } = {}) {
  const canQuery = Boolean(brain) && typeof brain.getBenchmarkResults === 'function';
  if (!canQuery) {
    throw new Error('brain.getBenchmarkResults is required');
  }

  // Running tallies keyed by "provider/model", in first-seen order.
  const tallies = new Map();
  for (const row of brain.getBenchmarkResults({ suite, days })) {
    const modelKey = `${row.provider}/${row.model}`;
    let tally = tallies.get(modelKey);
    if (tally === undefined) {
      tally = { total: 0, count: 0, errors: 0 };
      tallies.set(modelKey, tally);
    }
    if (row.error) {
      tally.errors += 1;
    }
    tally.total += row.compositeScore || 0;
    tally.count += 1;
  }

  return Object.fromEntries(
    Array.from(tallies, ([modelKey, { total, count, errors }]) => [
      modelKey,
      {
        avgScore: count > 0 ? total / count : 0,
        totalRuns: count,
        errors,
      },
    ]),
  );
}
633
-
634
/**
 * Collect the prompts of every listed benchmark suite into one flat array.
 *
 * Each prompt object is tagged in place with a `_suite` property naming the
 * suite it came from (the objects returned by `loadBenchmarkSuite` are
 * mutated, not copied). A suite that throws while loading or tagging is
 * skipped; prompts already collected before the failure are kept.
 *
 * @returns {Array<Object>} Prompt objects from all suites, in suite order
 */
function loadAllBenchmarks() {
  const collected = [];

  // Tags and collects one suite's prompts; may throw for a malformed suite.
  const collectSuite = (suiteName) => {
    const { prompts } = loadBenchmarkSuite(suiteName);
    for (const benchmark of prompts) {
      benchmark._suite = suiteName;
      collected.push(benchmark);
    }
  };

  for (const suiteName of listBenchmarkSuites()) {
    try {
      collectSuite(suiteName);
    } catch {
      /* skip malformed suites */
    }
  }

  return collected;
}
652
-
653
- // ============================================================
654
- // Exports
655
- // ============================================================
656
-
657
- module.exports = {
658
- listBenchmarkSuites,
659
- loadBenchmarkSuite,
660
- loadAllBenchmarks,
661
- scoreTrait,
662
- scoreTraits,
663
- scoreTraitsDetailed,
664
- runBenchmark,
665
- getBenchmarkLeaderboard,
666
- TRAIT_MATCHERS,
667
- UNSCORABLE_TRAITS,
668
- BENCHMARKS_DIR,
669
- };