create-walle 0.9.21 → 0.9.23

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (500)
  1. package/README.md +27 -5
  2. package/package.json +2 -2
  3. package/template/CLAUDE.md +2 -2
  4. package/template/LICENSE +1 -1
  5. package/template/bin/ctm-dev-cleanup.js +24 -3
  6. package/template/bin/ctm-launch.sh +13 -0
  7. package/template/bin/dev.sh +156 -18
  8. package/template/bin/node-bin.sh +84 -0
  9. package/template/bin/pin-node.sh +51 -0
  10. package/template/claude-task-manager/api-prompts.js +1203 -182
  11. package/template/claude-task-manager/api-reviews.js +109 -15
  12. package/template/claude-task-manager/approval-agent.js +1360 -280
  13. package/template/claude-task-manager/bin/restart-ctm.sh +64 -23
  14. package/template/claude-task-manager/bin/storage-migration-supervisor.js +338 -0
  15. package/template/claude-task-manager/db.js +4417 -295
  16. package/template/claude-task-manager/docs/app-update-refresh-protocol.md +69 -0
  17. package/template/claude-task-manager/docs/approval-ai-refinement.md +138 -0
  18. package/template/claude-task-manager/docs/approval-rescue-loop.md +74 -0
  19. package/template/claude-task-manager/docs/codex-operational-warning-health.md +107 -0
  20. package/template/claude-task-manager/docs/codex-resume-state-guard-design.md +17 -12
  21. package/template/claude-task-manager/docs/codex-terminal-render-controller-handoff.md +311 -0
  22. package/template/claude-task-manager/docs/coding-agent-hooks-architecture.md +418 -0
  23. package/template/claude-task-manager/docs/conversation-import-freshness.md +20 -0
  24. package/template/claude-task-manager/docs/google-workspace-auth-health.md +77 -0
  25. package/template/claude-task-manager/docs/image-paste-ux.md +13 -0
  26. package/template/claude-task-manager/docs/ipad-web-preview.md +88 -0
  27. package/template/claude-task-manager/docs/main-loop-offload-architecture.md +66 -0
  28. package/template/claude-task-manager/docs/microsoft-dev-tunnel-phone-access-design.md +274 -519
  29. package/template/claude-task-manager/docs/mobile-live-streaming.md +27 -5
  30. package/template/claude-task-manager/docs/mobile-remote-submission-lifecycle.md +69 -0
  31. package/template/claude-task-manager/docs/phone-access-design.md +53 -15
  32. package/template/claude-task-manager/docs/phone-passkey-identity.md +122 -0
  33. package/template/claude-task-manager/docs/phone-setup.md +3 -0
  34. package/template/claude-task-manager/docs/prompt-editing-tree-design.md +25 -1
  35. package/template/claude-task-manager/docs/remote-desktop-access-design.md +268 -0
  36. package/template/claude-task-manager/docs/restart-lifecycle-architecture.md +95 -0
  37. package/template/claude-task-manager/docs/runtime-work-control-plane.md +53 -0
  38. package/template/claude-task-manager/docs/session-interactive-wait-surfaces.md +38 -0
  39. package/template/claude-task-manager/docs/session-needs-you-dismissal.md +84 -0
  40. package/template/claude-task-manager/docs/session-render-state-management-design.md +91 -3
  41. package/template/claude-task-manager/docs/session-standup-command-center-design.md +25 -1
  42. package/template/claude-task-manager/docs/session-title-authority.md +32 -0
  43. package/template/claude-task-manager/docs/session-workspace-binding.md +33 -0
  44. package/template/claude-task-manager/docs/skill-intent-resolution-design.md +72 -0
  45. package/template/claude-task-manager/docs/walle-mcp-supervisor-health.md +86 -0
  46. package/template/claude-task-manager/docs/walle-relay-phone-access-design.md +24 -15
  47. package/template/claude-task-manager/docs/walle-session-history-hydration.md +114 -0
  48. package/template/claude-task-manager/docs/walle-session-input-queue.md +104 -0
  49. package/template/claude-task-manager/docs/walle-session-model-catalog.md +90 -0
  50. package/template/claude-task-manager/docs/walle-session-model-preferences.md +15 -6
  51. package/template/claude-task-manager/git-utils.js +897 -27
  52. package/template/claude-task-manager/lib/agent-capabilities.js +33 -0
  53. package/template/claude-task-manager/lib/agent-cli-cache.js +37 -7
  54. package/template/claude-task-manager/lib/agent-hooks-installer.js +26 -2
  55. package/template/claude-task-manager/lib/agent-presets.js +17 -1
  56. package/template/claude-task-manager/lib/all-sessions-query.js +108 -0
  57. package/template/claude-task-manager/lib/approval-ai-refinement.js +488 -0
  58. package/template/claude-task-manager/lib/approval-self-adapt.js +168 -0
  59. package/template/claude-task-manager/lib/async-semaphore.js +44 -0
  60. package/template/claude-task-manager/lib/auth-context.js +5 -0
  61. package/template/claude-task-manager/lib/auth-rate-limit.js +47 -4
  62. package/template/claude-task-manager/lib/auth-rules.js +29 -2
  63. package/template/claude-task-manager/lib/auto-approval-verifier.js +129 -16
  64. package/template/claude-task-manager/lib/background-llm.js +144 -17
  65. package/template/claude-task-manager/lib/branch-inventory.js +212 -0
  66. package/template/claude-task-manager/lib/claude-desktop-sessions.js +15 -3
  67. package/template/claude-task-manager/lib/coalesce-sync-frames.js +151 -0
  68. package/template/claude-task-manager/lib/codex-launch-health.js +762 -0
  69. package/template/claude-task-manager/lib/codex-transcript-pager.js +51 -0
  70. package/template/claude-task-manager/lib/codex-zst.js +124 -0
  71. package/template/claude-task-manager/lib/coding-agent-models.js +233 -30
  72. package/template/claude-task-manager/lib/connection-health.js +232 -0
  73. package/template/claude-task-manager/lib/conversation-blob-parser.js +42 -0
  74. package/template/claude-task-manager/lib/conversation-tail-merge.js +89 -26
  75. package/template/claude-task-manager/lib/ctm-session-context-api.js +39 -10
  76. package/template/claude-task-manager/lib/cursor-conversation-store.js +354 -0
  77. package/template/claude-task-manager/lib/db-owner-worker-client.js +315 -0
  78. package/template/claude-task-manager/lib/document-review.js +141 -6
  79. package/template/claude-task-manager/lib/escalation-review.js +152 -0
  80. package/template/claude-task-manager/lib/graceful-shutdown.js +159 -0
  81. package/template/claude-task-manager/lib/headless-term-service.js +678 -0
  82. package/template/claude-task-manager/lib/heavy-worker-fallback.js +38 -0
  83. package/template/claude-task-manager/lib/jsonl-conversation-parser.js +542 -0
  84. package/template/claude-task-manager/lib/jsonl-range-reader.js +112 -0
  85. package/template/claude-task-manager/lib/main-db-census.js +216 -0
  86. package/template/claude-task-manager/lib/message-pagination.js +106 -4
  87. package/template/claude-task-manager/lib/microsoft-dev-tunnel-setup.js +750 -26
  88. package/template/claude-task-manager/lib/mobile-auth-api.js +274 -7
  89. package/template/claude-task-manager/lib/mobile-auth-store.js +592 -10
  90. package/template/claude-task-manager/lib/mobile-notification-dispatcher.js +15 -0
  91. package/template/claude-task-manager/lib/model-overview-brain-fallback.js +311 -0
  92. package/template/claude-task-manager/lib/model-overview-cache.js +141 -0
  93. package/template/claude-task-manager/lib/models-health-routing-notice.js +126 -0
  94. package/template/claude-task-manager/lib/node-pin-guard.js +93 -0
  95. package/template/claude-task-manager/lib/perf-tracker.js +242 -6
  96. package/template/claude-task-manager/lib/permission-match.js +76 -0
  97. package/template/claude-task-manager/lib/permission-sync.js +133 -20
  98. package/template/claude-task-manager/lib/process-title.js +35 -0
  99. package/template/claude-task-manager/lib/prompt-executions-query.js +25 -0
  100. package/template/claude-task-manager/lib/prompt-index-disk-cache.js +44 -0
  101. package/template/claude-task-manager/lib/prompt-intent.js +132 -0
  102. package/template/claude-task-manager/lib/provider-user-context.js +34 -0
  103. package/template/claude-task-manager/lib/read-pool-client.js +313 -0
  104. package/template/claude-task-manager/lib/readpool-breaker.js +31 -0
  105. package/template/claude-task-manager/lib/recent-sessions-breaker.js +12 -0
  106. package/template/claude-task-manager/lib/remote-feedback-client.js +72 -0
  107. package/template/claude-task-manager/lib/remote-relay-protocol.js +37 -4
  108. package/template/claude-task-manager/lib/remote-relay-store.js +159 -0
  109. package/template/claude-task-manager/lib/remote-submission-observer.js +278 -0
  110. package/template/claude-task-manager/lib/restart-guard.js +109 -0
  111. package/template/claude-task-manager/lib/restore-interruption-detector.js +439 -0
  112. package/template/claude-task-manager/lib/restore-policy.js +13 -0
  113. package/template/claude-task-manager/lib/restore-resume-batch.js +74 -0
  114. package/template/claude-task-manager/lib/restore-runtime.js +68 -0
  115. package/template/claude-task-manager/lib/restore-storm.js +34 -0
  116. package/template/claude-task-manager/lib/resume-cwd.js +36 -0
  117. package/template/claude-task-manager/lib/resume-preflight.js +313 -0
  118. package/template/claude-task-manager/lib/runtime-work-registry.js +444 -0
  119. package/template/claude-task-manager/lib/sanitize-openai-auth.js +31 -0
  120. package/template/claude-task-manager/lib/scheduler.js +21 -1
  121. package/template/claude-task-manager/lib/scrollback-snapshot-store.js +159 -0
  122. package/template/claude-task-manager/lib/serial-task-queue.js +64 -0
  123. package/template/claude-task-manager/lib/server-listeners.js +239 -0
  124. package/template/claude-task-manager/lib/session-capture.js +42 -7
  125. package/template/claude-task-manager/lib/session-content-backfill.js +131 -0
  126. package/template/claude-task-manager/lib/session-history.js +388 -43
  127. package/template/claude-task-manager/lib/session-host-manager.js +287 -0
  128. package/template/claude-task-manager/lib/session-image-refs.js +209 -0
  129. package/template/claude-task-manager/lib/session-jobs.js +399 -59
  130. package/template/claude-task-manager/lib/session-prompt-index.js +137 -0
  131. package/template/claude-task-manager/lib/session-restore.js +53 -0
  132. package/template/claude-task-manager/lib/session-standup.js +123 -23
  133. package/template/claude-task-manager/lib/session-state-bus.js +14 -0
  134. package/template/claude-task-manager/lib/session-stream.js +64 -16
  135. package/template/claude-task-manager/lib/session-timeline-summary.js +260 -0
  136. package/template/claude-task-manager/lib/session-token-usage.js +494 -0
  137. package/template/claude-task-manager/lib/session-workspace-binding.js +356 -0
  138. package/template/claude-task-manager/lib/setup-network-config.js +9 -0
  139. package/template/claude-task-manager/lib/size-cap.js +45 -0
  140. package/template/claude-task-manager/lib/size-cap.test.js +62 -0
  141. package/template/claude-task-manager/lib/skill-autocomplete.js +180 -1
  142. package/template/claude-task-manager/lib/skill-intent-resolver.js +304 -0
  143. package/template/claude-task-manager/lib/sqlite-driver.js +19 -3
  144. package/template/claude-task-manager/lib/standup-attention.js +7 -3
  145. package/template/claude-task-manager/lib/status-authority.js +39 -0
  146. package/template/claude-task-manager/lib/status-hooks.js +4 -0
  147. package/template/claude-task-manager/lib/storage-migration.js +235 -0
  148. package/template/claude-task-manager/lib/structured-capture.js +298 -0
  149. package/template/claude-task-manager/lib/sync-io-census.js +163 -0
  150. package/template/claude-task-manager/lib/tailscale-setup.js +6 -0
  151. package/template/claude-task-manager/lib/terminal-activity-evidence.js +33 -0
  152. package/template/claude-task-manager/lib/terminal-choice.js +364 -0
  153. package/template/claude-task-manager/lib/terminal-control-sanitize.js +17 -0
  154. package/template/claude-task-manager/lib/terminal-fingerprint.js +48 -0
  155. package/template/claude-task-manager/lib/terminal-output-flush.js +84 -0
  156. package/template/claude-task-manager/lib/timeline-order.js +122 -0
  157. package/template/claude-task-manager/lib/transcript-store.js +348 -43
  158. package/template/claude-task-manager/lib/transport-security.js +84 -1
  159. package/template/claude-task-manager/lib/wait-state.js +184 -0
  160. package/template/claude-task-manager/lib/walle-client.js +47 -5
  161. package/template/claude-task-manager/lib/walle-ctm-history.js +564 -4
  162. package/template/claude-task-manager/lib/walle-external-actions.js +135 -16
  163. package/template/claude-task-manager/lib/walle-history-hydration.js +46 -0
  164. package/template/claude-task-manager/lib/walle-native-health.js +403 -0
  165. package/template/claude-task-manager/lib/walle-repair.js +701 -0
  166. package/template/claude-task-manager/lib/walle-session-cache.js +109 -0
  167. package/template/claude-task-manager/lib/walle-session-context.js +57 -21
  168. package/template/claude-task-manager/lib/walle-session-model-catalog.js +34 -0
  169. package/template/claude-task-manager/lib/walle-supervisor.js +539 -63
  170. package/template/claude-task-manager/lib/walle-transcript.js +52 -0
  171. package/template/claude-task-manager/lib/worktree-active-sync.js +11 -7
  172. package/template/claude-task-manager/lib/worktree-cwd.js +32 -1
  173. package/template/claude-task-manager/package.json +1 -1
  174. package/template/claude-task-manager/prompt-harvest.js +89 -66
  175. package/template/claude-task-manager/providers/claude-code.js +51 -3
  176. package/template/claude-task-manager/providers/cursor.js +140 -45
  177. package/template/claude-task-manager/public/css/reviews.css +551 -61
  178. package/template/claude-task-manager/public/css/setup.css +191 -0
  179. package/template/claude-task-manager/public/css/walle-session.css +865 -10
  180. package/template/claude-task-manager/public/css/walle.css +154 -0
  181. package/template/claude-task-manager/public/designs/ai-providers-consolidation-v2.html +830 -0
  182. package/template/claude-task-manager/public/index.html +18516 -2058
  183. package/template/claude-task-manager/public/ipad.html +363 -0
  184. package/template/claude-task-manager/public/js/document-review-links.js +301 -0
  185. package/template/claude-task-manager/public/js/image-normalize.js +69 -36
  186. package/template/claude-task-manager/public/js/message-renderer.js +1265 -77
  187. package/template/claude-task-manager/public/js/prompts.js +66 -29
  188. package/template/claude-task-manager/public/js/reviews.js +901 -133
  189. package/template/claude-task-manager/public/js/session-activity-utils.js +11 -1
  190. package/template/claude-task-manager/public/js/session-search-utils.js +94 -10
  191. package/template/claude-task-manager/public/js/session-status-precedence.js +23 -5
  192. package/template/claude-task-manager/public/js/setup.js +1273 -176
  193. package/template/claude-task-manager/public/js/stream-view.js +691 -73
  194. package/template/claude-task-manager/public/js/terminal-reconciler.js +210 -0
  195. package/template/claude-task-manager/public/js/walle-session.js +2455 -158
  196. package/template/claude-task-manager/public/js/walle.js +455 -28
  197. package/template/claude-task-manager/public/m/app.css +2909 -262
  198. package/template/claude-task-manager/public/m/app.js +6601 -398
  199. package/template/claude-task-manager/public/m/claim.html +224 -17
  200. package/template/claude-task-manager/public/m/index.html +117 -21
  201. package/template/claude-task-manager/public/m/sw.js +3 -1
  202. package/template/claude-task-manager/public/manifest.json +2 -2
  203. package/template/claude-task-manager/public/prompts.html +30 -14
  204. package/template/claude-task-manager/queue-engine.js +507 -28
  205. package/template/claude-task-manager/scripts/repair-claude-session-images.js +27 -8
  206. package/template/claude-task-manager/server.js +14341 -2197
  207. package/template/claude-task-manager/session-integrity.js +160 -18
  208. package/template/claude-task-manager/session-search-ranking.js +1 -0
  209. package/template/claude-task-manager/session-utils.js +25 -5
  210. package/template/claude-task-manager/workers/approval-blocklist.js +96 -6
  211. package/template/claude-task-manager/workers/approval-widget-validator.js +14 -8
  212. package/template/claude-task-manager/workers/conversation-import-worker.js +11 -50
  213. package/template/claude-task-manager/workers/db-owner-worker.js +386 -0
  214. package/template/claude-task-manager/workers/harvest-worker.js +9 -55
  215. package/template/claude-task-manager/workers/headless-term-worker.js +9 -530
  216. package/template/claude-task-manager/workers/read-pool-worker.js +387 -0
  217. package/template/claude-task-manager/workers/scrollback-worker.js +11 -72
  218. package/template/claude-task-manager/workers/session-host-process.js +146 -0
  219. package/template/claude-task-manager/workers/session-integrity-worker.js +10 -54
  220. package/template/claude-task-manager/workers/state-detectors/base.js +18 -1
  221. package/template/claude-task-manager/workers/state-detectors/claude-code.js +182 -9
  222. package/template/claude-task-manager/workers/state-detectors/codex.js +150 -2
  223. package/template/claude-task-manager/workers/state-detectors/cursor.js +127 -0
  224. package/template/claude-task-manager/workers/state-detectors/gemini.js +21 -0
  225. package/template/claude-task-manager/workers/state-detectors/index.js +29 -0
  226. package/template/claude-task-manager/workers/state-detectors/opencode.js +103 -0
  227. package/template/docs/design/markdown-review-pane.md +206 -0
  228. package/template/docs/designs/2026-05-17-portkey-gateway-provider-ux.md +129 -38
  229. package/template/docs/designs/2026-05-20-mobile-worktree-finish-command.md +27 -0
  230. package/template/docs/designs/2026-05-22-ai-configuration-consolidation.md +248 -0
  231. package/template/docs/designs/ai-configuration-consolidation-mock.html +812 -0
  232. package/template/docs/private-memory-and-pii-policy.md +69 -0
  233. package/template/package.json +2 -1
  234. package/template/scripts/check-private-data.js +201 -0
  235. package/template/shared/sqlite-owner-guard.js +30 -0
  236. package/template/shared/sqlite-owner-write-queue.js +225 -0
  237. package/template/shared/sqlite-storage-policy.js +111 -0
  238. package/template/shared/sqlite-write-lock.js +428 -0
  239. package/template/wall-e/agent-runners/claude-code.js +5 -0
  240. package/template/wall-e/agent.js +166 -22
  241. package/template/wall-e/api-walle.js +524 -70
  242. package/template/wall-e/auth/provider-flows.js +11 -1
  243. package/template/wall-e/bin/walle-mcp-stdio.js +341 -17
  244. package/template/wall-e/brain.js +1614 -141
  245. package/template/wall-e/chat/attachment-blocks.js +96 -0
  246. package/template/wall-e/chat/attachments.js +2 -1
  247. package/template/wall-e/chat/capability-resolver.js +7 -7
  248. package/template/wall-e/chat/context-messages.js +28 -0
  249. package/template/wall-e/chat/conversation-frame.js +630 -0
  250. package/template/wall-e/chat/provider-messages.js +125 -0
  251. package/template/wall-e/chat.js +1002 -233
  252. package/template/wall-e/coding/acceptance-contract.js +170 -0
  253. package/template/wall-e/coding/acp-adapter.js +1 -1
  254. package/template/wall-e/coding/agent-catalog.js +3 -0
  255. package/template/wall-e/coding/artifact-store.js +93 -0
  256. package/template/wall-e/coding/capability-router.js +120 -0
  257. package/template/wall-e/coding/coding-run-controller.js +423 -0
  258. package/template/wall-e/coding/compaction-service.js +157 -12
  259. package/template/wall-e/coding/frontend-verification.js +258 -0
  260. package/template/wall-e/coding/lifecycle-hooks.js +75 -0
  261. package/template/wall-e/coding/local-preview-contract.js +157 -0
  262. package/template/wall-e/coding/permission-service.js +57 -13
  263. package/template/wall-e/coding/prompt-bundle.js +19 -1
  264. package/template/wall-e/coding/prompt-section-registry.js +227 -0
  265. package/template/wall-e/coding/provider-compat.js +15 -0
  266. package/template/wall-e/coding/runtime-events.js +224 -0
  267. package/template/wall-e/coding/runtime-mode.js +3 -0
  268. package/template/wall-e/coding/side-git-snapshot.js +160 -4
  269. package/template/wall-e/coding/snapshot-service.js +143 -1
  270. package/template/wall-e/coding/stream-processor.js +388 -34
  271. package/template/wall-e/coding/task-tool.js +141 -4
  272. package/template/wall-e/coding/tool-execution-controller.js +365 -0
  273. package/template/wall-e/coding/tool-registry.js +43 -5
  274. package/template/wall-e/coding/user-hooks.js +217 -0
  275. package/template/wall-e/coding-orchestrator.js +1330 -221
  276. package/template/wall-e/coding-prompts.js +20 -4
  277. package/template/wall-e/context/context-builder.js +15 -2
  278. package/template/wall-e/decision/confidence.js +1 -1
  279. package/template/wall-e/docs/coding-acceptance-contract.md +41 -0
  280. package/template/wall-e/docs/external-action-controller.md +26 -6
  281. package/template/wall-e/docs/telemetry-lifecycle.md +8 -2
  282. package/template/wall-e/embeddings.js +591 -53
  283. package/template/wall-e/external-action-controller.js +12 -0
  284. package/template/wall-e/http/auth.js +1 -0
  285. package/template/wall-e/http/chat-api.js +46 -11
  286. package/template/wall-e/http/model-admin.js +836 -34
  287. package/template/wall-e/lib/boot-profile.js +88 -0
  288. package/template/wall-e/lib/event-loop-monitor.js +93 -0
  289. package/template/wall-e/lib/service-health.js +194 -0
  290. package/template/wall-e/llm/anthropic.js +130 -5
  291. package/template/wall-e/llm/client.js +266 -63
  292. package/template/wall-e/llm/default-fallback.js +382 -0
  293. package/template/wall-e/llm/health.js +19 -0
  294. package/template/wall-e/llm/message-guard.js +78 -0
  295. package/template/wall-e/llm/model-catalog.js +252 -1
  296. package/template/wall-e/llm/openai.js +26 -4
  297. package/template/wall-e/llm/portkey-sync.js +654 -0
  298. package/template/wall-e/llm/provider-error.js +30 -2
  299. package/template/wall-e/llm/registry.js +5 -1
  300. package/template/wall-e/llm/request-compat.js +67 -0
  301. package/template/wall-e/loops/backfill.js +79 -23
  302. package/template/wall-e/loops/brain-optimize.js +67 -0
  303. package/template/wall-e/loops/ingest.js +25 -10
  304. package/template/wall-e/loops/question-digest.js +160 -0
  305. package/template/wall-e/loops/reflect.js +6 -4
  306. package/template/wall-e/loops/think.js +39 -12
  307. package/template/wall-e/mcp-server.js +318 -36
  308. package/template/wall-e/memory/ctm-context-client.js +52 -14
  309. package/template/wall-e/memory/ctm-operational-context.js +237 -0
  310. package/template/wall-e/memory/ctm-prompt-executions-client.js +128 -0
  311. package/template/wall-e/memory/ctm-session-context.js +111 -63
  312. package/template/wall-e/prompts/coding/deepseek.txt +3 -0
  313. package/template/wall-e/prompts/coding/gemini.txt +6 -0
  314. package/template/wall-e/prompts/coding/gpt.txt +6 -0
  315. package/template/wall-e/prompts/coding/local.txt +7 -0
  316. package/template/wall-e/runtime/decision-hooks.js +115 -0
  317. package/template/wall-e/runtime/devbox-gateway.js +82 -8
  318. package/template/wall-e/runtime/prompt-manifest.js +86 -0
  319. package/template/wall-e/runtime/tool-executor.js +269 -0
  320. package/template/wall-e/runtime/tool-result-envelope.js +138 -0
  321. package/template/wall-e/runtime/transcript-projection.js +60 -0
  322. package/template/wall-e/runtime/walle-runtime.js +224 -0
  323. package/template/wall-e/scripts/db-optimize/migrate.js +162 -0
  324. package/template/wall-e/scripts/db-optimize/recall-eval.js +117 -0
  325. package/template/wall-e/server.js +15 -0
  326. package/template/wall-e/session-files.js +9 -0
  327. package/template/wall-e/skills/_bundled/google-calendar/run.js +1 -1
  328. package/template/wall-e/skills/_bundled/gws-workspace/run.js +1 -1
  329. package/template/wall-e/skills/_bundled/slack-mentions/run.js +76 -6
  330. package/template/wall-e/skills/claude-code-reader.js +7 -3
  331. package/template/wall-e/skills/script-skill-runner.js +10 -0
  332. package/template/wall-e/skills/skill-planner.js +38 -0
  333. package/template/wall-e/tools/builtin-middleware.js +19 -9
  334. package/template/wall-e/tools/local-tools.js +1428 -16
  335. package/template/wall-e/tools/permission-checker.js +73 -5
  336. package/template/wall-e/tools/question-manager.js +117 -7
  337. package/template/wall-e/training/harvester.js +12 -28
  338. package/template/wall-e/training/replay.js +25 -80
  339. package/template/website/index.html +10 -10
  340. package/template/wall-e/eval/ab-test.js +0 -203
  341. package/template/wall-e/eval/agent-runner.js +0 -772
  342. package/template/wall-e/eval/agent-scorer.js +0 -461
  343. package/template/wall-e/eval/aggregator.js +0 -414
  344. package/template/wall-e/eval/allowed-test-commands.js +0 -34
  345. package/template/wall-e/eval/benchmark-generator.js +0 -113
  346. package/template/wall-e/eval/benchmarks/chat-eval.json +0 -1662
  347. package/template/wall-e/eval/benchmarks/chat.json +0 -82
  348. package/template/wall-e/eval/benchmarks/coding-agent-real.json +0 -1
  349. package/template/wall-e/eval/benchmarks/coding-agent.json +0 -1581
  350. package/template/wall-e/eval/benchmarks/coding.json +0 -122
  351. package/template/wall-e/eval/benchmarks/memory-retrieval.json +0 -234
  352. package/template/wall-e/eval/benchmarks/reasoning.json +0 -82
  353. package/template/wall-e/eval/benchmarks/swebench-lite-30.json +0 -212
  354. package/template/wall-e/eval/benchmarks.js +0 -669
  355. package/template/wall-e/eval/cc-replay.js +0 -719
  356. package/template/wall-e/eval/chat-eval.js +0 -525
  357. package/template/wall-e/eval/check-keys.js +0 -15
  358. package/template/wall-e/eval/check-providers.js +0 -42
  359. package/template/wall-e/eval/codex-cli-baseline.js +0 -669
  360. package/template/wall-e/eval/coding-agent-real.js +0 -570
  361. package/template/wall-e/eval/context-compactor.js +0 -251
  362. package/template/wall-e/eval/debug-agent003.js +0 -68
  363. package/template/wall-e/eval/diagnostics.js +0 -216
  364. package/template/wall-e/eval/eval-orchestrator.js +0 -642
  365. package/template/wall-e/eval/evaluate.js +0 -202
  366. package/template/wall-e/eval/evaluator.js +0 -373
  367. package/template/wall-e/eval/exporter.js +0 -212
  368. package/template/wall-e/eval/fixtures/express-basic/package.json +0 -9
  369. package/template/wall-e/eval/fixtures/express-basic/server.js +0 -115
  370. package/template/wall-e/eval/fixtures/express-basic/test.js +0 -83
  371. package/template/wall-e/eval/fixtures/express-buggy/package.json +0 -9
  372. package/template/wall-e/eval/fixtures/express-buggy/server.js +0 -113
  373. package/template/wall-e/eval/fixtures/express-buggy/test.js +0 -83
  374. package/template/wall-e/eval/fixtures/express-buggy-items/package.json +0 -9
  375. package/template/wall-e/eval/fixtures/express-buggy-items/server.js +0 -112
  376. package/template/wall-e/eval/fixtures/express-buggy-items/test.js +0 -83
  377. package/template/wall-e/eval/fixtures/express-buggy-search/package.json +0 -9
  378. package/template/wall-e/eval/fixtures/express-buggy-search/server.js +0 -121
  379. package/template/wall-e/eval/fixtures/express-buggy-search/test.js +0 -83
  380. package/template/wall-e/eval/fixtures/express-rename-data/data.js +0 -34
  381. package/template/wall-e/eval/fixtures/express-rename-data/package.json +0 -9
  382. package/template/wall-e/eval/fixtures/express-rename-data/server.js +0 -97
  383. package/template/wall-e/eval/fixtures/express-rename-data/test.js +0 -88
  384. package/template/wall-e/eval/fixtures/express-xss/package.json +0 -12
  385. package/template/wall-e/eval/fixtures/express-xss/server.js +0 -90
  386. package/template/wall-e/eval/fixtures/express-xss/test.js +0 -67
  387. package/template/wall-e/eval/fixtures/express-xss/views/profile.ejs +0 -9
  388. package/template/wall-e/eval/fixtures/fullstack-app/config/default.js +0 -9
  389. package/template/wall-e/eval/fixtures/fullstack-app/config/test.js +0 -13
  390. package/template/wall-e/eval/fixtures/fullstack-app/package.json +0 -11
  391. package/template/wall-e/eval/fixtures/fullstack-app/public/css/style.css +0 -137
  392. package/template/wall-e/eval/fixtures/fullstack-app/public/index.html +0 -46
  393. package/template/wall-e/eval/fixtures/fullstack-app/public/js/app.js +0 -121
  394. package/template/wall-e/eval/fixtures/fullstack-app/public/js/auth.js +0 -71
  395. package/template/wall-e/eval/fixtures/fullstack-app/public/js/items.js +0 -80
  396. package/template/wall-e/eval/fixtures/fullstack-app/public/js/users.js +0 -46
  397. package/template/wall-e/eval/fixtures/fullstack-app/public/login.html +0 -45
  398. package/template/wall-e/eval/fixtures/fullstack-app/public/register.html +0 -38
  399. package/template/wall-e/eval/fixtures/fullstack-app/scripts/migrate.js +0 -23
  400. package/template/wall-e/eval/fixtures/fullstack-app/scripts/seed.js +0 -46
  401. package/template/wall-e/eval/fixtures/fullstack-app/server/db.js +0 -99
  402. package/template/wall-e/eval/fixtures/fullstack-app/server/index.js +0 -94
  403. package/template/wall-e/eval/fixtures/fullstack-app/server/middleware/auth.js +0 -19
  404. package/template/wall-e/eval/fixtures/fullstack-app/server/middleware/logger.js +0 -19
  405. package/template/wall-e/eval/fixtures/fullstack-app/server/router.js +0 -50
  406. package/template/wall-e/eval/fixtures/fullstack-app/server/routes/auth.js +0 -69
  407. package/template/wall-e/eval/fixtures/fullstack-app/server/routes/health.js +0 -23
  408. package/template/wall-e/eval/fixtures/fullstack-app/server/routes/items.js +0 -88
  409. package/template/wall-e/eval/fixtures/fullstack-app/server/routes/users.js +0 -75
  410. package/template/wall-e/eval/fixtures/fullstack-app/server/test.js +0 -198
  411. package/template/wall-e/eval/fixtures/fullstack-app/server/utils/response.js +0 -34
  412. package/template/wall-e/eval/fixtures/fullstack-app/server/utils/validate.js +0 -26
  413. package/template/wall-e/eval/fixtures/fullstack-app/server.js +0 -8
  414. package/template/wall-e/eval/fixtures/fullstack-app/test.js +0 -12
  415. package/template/wall-e/eval/fixtures/monorepo-basic/package.json +0 -8
  416. package/template/wall-e/eval/fixtures/monorepo-basic/packages/api/data.js +0 -58
  417. package/template/wall-e/eval/fixtures/monorepo-basic/packages/api/middleware.js +0 -46
  418. package/template/wall-e/eval/fixtures/monorepo-basic/packages/api/package.json +0 -8
  419. package/template/wall-e/eval/fixtures/monorepo-basic/packages/api/routes.js +0 -64
  420. package/template/wall-e/eval/fixtures/monorepo-basic/packages/api/server.js +0 -56
  421. package/template/wall-e/eval/fixtures/monorepo-basic/packages/api/test.js +0 -116
  422. package/template/wall-e/eval/fixtures/monorepo-basic/packages/cli/commands.js +0 -61
  423. package/template/wall-e/eval/fixtures/monorepo-basic/packages/cli/index.js +0 -62
  424. package/template/wall-e/eval/fixtures/monorepo-basic/packages/cli/output.js +0 -43
  425. package/template/wall-e/eval/fixtures/monorepo-basic/packages/cli/package.json +0 -11
  426. package/template/wall-e/eval/fixtures/monorepo-basic/packages/cli/test.js +0 -44
  427. package/template/wall-e/eval/fixtures/monorepo-basic/packages/shared/formatters.js +0 -43
  428. package/template/wall-e/eval/fixtures/monorepo-basic/packages/shared/index.js +0 -12
  429. package/template/wall-e/eval/fixtures/monorepo-basic/packages/shared/package.json +0 -5
  430. package/template/wall-e/eval/fixtures/monorepo-basic/packages/shared/test.js +0 -55
  431. package/template/wall-e/eval/fixtures/monorepo-basic/packages/shared/validators.js +0 -29
  432. package/template/wall-e/eval/fixtures/monorepo-basic/test.js +0 -46
  433. package/template/wall-e/eval/fixtures/node-cli/index.js +0 -78
  434. package/template/wall-e/eval/fixtures/node-cli/package.json +0 -10
  435. package/template/wall-e/eval/fixtures/node-cli/test.js +0 -57
  436. package/template/wall-e/eval/fixtures/node-typed/package.json +0 -8
  437. package/template/wall-e/eval/fixtures/node-typed/src/handlers.js +0 -31
  438. package/template/wall-e/eval/fixtures/node-typed/src/utils.js +0 -33
  439. package/template/wall-e/eval/fixtures/node-typed/test.js +0 -36
  440. package/template/wall-e/eval/fixtures/python-flask/app.py +0 -14
  441. package/template/wall-e/eval/fixtures/python-flask/requirements.txt +0 -2
  442. package/template/wall-e/eval/fixtures/python-flask/test_app.py +0 -25
  443. package/template/wall-e/eval/fixtures/wall-e-subset/brain.js +0 -105
  444. package/template/wall-e/eval/fixtures/wall-e-subset/eval/aggregator.js +0 -101
  445. package/template/wall-e/eval/fixtures/wall-e-subset/eval/benchmarks/chat.json +0 -20
  446. package/template/wall-e/eval/fixtures/wall-e-subset/eval/benchmarks/coding.json +0 -32
  447. package/template/wall-e/eval/fixtures/wall-e-subset/eval/benchmarks.js +0 -64
  448. package/template/wall-e/eval/fixtures/wall-e-subset/eval/fixtures/simple-project/package.json +0 -6
  449. package/template/wall-e/eval/fixtures/wall-e-subset/eval/fixtures/simple-project/server.js +0 -31
  450. package/template/wall-e/eval/fixtures/wall-e-subset/eval/fixtures/simple-project/test.js +0 -18
  451. package/template/wall-e/eval/fixtures/wall-e-subset/eval/fixtures/simple-project/utils.js +0 -34
  452. package/template/wall-e/eval/fixtures/wall-e-subset/eval/runner.js +0 -104
  453. package/template/wall-e/eval/fixtures/wall-e-subset/eval/scorer.js +0 -73
  454. package/template/wall-e/eval/fixtures/wall-e-subset/eval/test.js +0 -134
  455. package/template/wall-e/eval/fixtures/wall-e-subset/llm/client.js +0 -99
  456. package/template/wall-e/eval/fixtures/wall-e-subset/llm/providers.js +0 -63
  457. package/template/wall-e/eval/fixtures/wall-e-subset/llm/test.js +0 -70
  458. package/template/wall-e/eval/fixtures/wall-e-subset/package.json +0 -10
  459. package/template/wall-e/eval/fixtures/wall-e-subset/test.js +0 -86
  460. package/template/wall-e/eval/harvester.js +0 -685
  461. package/template/wall-e/eval/head-to-head.js +0 -388
  462. package/template/wall-e/eval/humaneval-adapter.js +0 -321
  463. package/template/wall-e/eval/list-models.js +0 -31
  464. package/template/wall-e/eval/livecodebench-adapter.js +0 -291
  465. package/template/wall-e/eval/mail-integration.js +0 -443
  466. package/template/wall-e/eval/manifest.js +0 -186
  467. package/template/wall-e/eval/meta-harness/adapters/coding-agent.js +0 -57
  468. package/template/wall-e/eval/meta-harness/bootstrap-snapshot.js +0 -149
  469. package/template/wall-e/eval/meta-harness/candidate-store.js +0 -117
  470. package/template/wall-e/eval/meta-harness/cli.js +0 -86
  471. package/template/wall-e/eval/meta-harness/domain-spec.js +0 -154
  472. package/template/wall-e/eval/meta-harness/domains/coding-agent.domain.json +0 -84
  473. package/template/wall-e/eval/meta-harness/examples/env-bootstrap-candidate.js +0 -29
  474. package/template/wall-e/eval/meta-harness/experience-store.js +0 -174
  475. package/template/wall-e/eval/meta-harness/frontier.js +0 -96
  476. package/template/wall-e/eval/meta-harness/harness-interface.js +0 -90
  477. package/template/wall-e/eval/meta-harness/leakage-guard.js +0 -80
  478. package/template/wall-e/eval/meta-harness/optimizer.js +0 -207
  479. package/template/wall-e/eval/meta-harness/proposer-runner.js +0 -110
  480. package/template/wall-e/eval/meta-harness/reporting.js +0 -58
  481. package/template/wall-e/eval/meta-harness/telemetry.js +0 -27
  482. package/template/wall-e/eval/meta-harness/validation.js +0 -81
  483. package/template/wall-e/eval/promoter.js +0 -228
  484. package/template/wall-e/eval/provider-normalizer.js +0 -33
  485. package/template/wall-e/eval/replay.js +0 -395
  486. package/template/wall-e/eval/run-agent-benchmarks.js +0 -386
  487. package/template/wall-e/eval/run-codex-cli-baseline.js +0 -177
  488. package/template/wall-e/eval/run-coding-agent-real.js +0 -187
  489. package/template/wall-e/eval/run-eval.js +0 -435
  490. package/template/wall-e/eval/run-model-comparison.js +0 -142
  491. package/template/wall-e/eval/session-evaluator.js +0 -187
  492. package/template/wall-e/eval/session-miner.js +0 -207
  493. package/template/wall-e/eval/session-retrieval-benchmark.js +0 -150
  494. package/template/wall-e/eval/session-transcripts.js +0 -509
  495. package/template/wall-e/eval/shadow.js +0 -161
  496. package/template/wall-e/eval/swebench-adapter.js +0 -345
  497. package/template/wall-e/eval/swebench-docker.js +0 -192
  498. package/template/wall-e/eval/train.py +0 -320
  499. package/template/wall-e/eval/trainer.js +0 -232
  500. package/template/wall-e/eval/weekly-eval-loop.js +0 -241
@@ -1,461 +0,0 @@
1
- 'use strict';
2
-
3
- const { execFileSync } = require('child_process');
4
- const path = require('path');
5
-
6
- /**
7
- * Score coding agent performance across multiple dimensions.
8
- * Used for both benchmark evaluation and post-session evaluation.
9
- *
10
- * 11 dimensions (weights sum to 1.0):
11
- * correctness(0.25), toolEfficiency(0.10), diffAccuracy(0.10),
12
- * costEfficiency(0.10), planQuality(0.08), errorHandling(0.08),
13
- * turnEconomy(0.07), codeQuality(0.07), partialProgress(0.05),
14
- * contextManagement(0.05), iterativeRefinement(0.05)
15
- */
16
-
17
/**
 * Default per-dimension weights for the composite agent score.
 * The eleven values add up to 1.0; computeAgentScore uses this object as the
 * default value of its `weights` option.
 */
const DIMENSION_WEIGHTS = {
  // Outcome
  correctness: 0.25,
  // Process efficiency and accuracy of the change set
  toolEfficiency: 0.1,
  diffAccuracy: 0.1,
  costEfficiency: 0.1,
  // Working style
  planQuality: 0.08,
  errorHandling: 0.08,
  turnEconomy: 0.07,
  codeQuality: 0.07,
  // Enhanced dimensions
  partialProgress: 0.05,
  contextManagement: 0.05,
  iterativeRefinement: 0.05,
};
30
-
31
/**
 * Compute tool efficiency score.
 * Rewards use of the expected tools, penalizes forbidden tools and heavy repetition.
 *
 * Entries may be plain tool-name strings or objects carrying a `name` property
 * (the shape the other scorers in this module already accept); objects are
 * compared by name.
 *
 * @param {Array<string|{name: string}>} actual - tool calls the agent made
 * @param {Array<string|{name: string}>} [expected] - tools the benchmark expects
 * @param {Array<string|{name: string}>} [forbidden] - tools the benchmark forbids
 * @returns {number} score in [0, 1]; 0.1 when no tool was used at all
 */
function scoreToolEfficiency(actual, expected = [], forbidden = []) {
  if (!actual || actual.length === 0) return 0.1;

  // Compare by tool name so { name } records behave like bare strings.
  const toName = (t) => (t !== null && typeof t === 'object' ? t.name || '' : t);

  const actualNames = actual.map(toName);
  const actualSet = new Set(actualNames);
  // Default parameters only cover `undefined`; tolerate an explicit null as well.
  const expectedNames = (expected || []).map(toName);
  const forbiddenSet = new Set((forbidden || []).map(toName));

  let score = 0.5; // base score for any tool usage

  // Reward using expected tools: fraction of the expected tools that were used
  if (expectedNames.length > 0) {
    const overlap = expectedNames.filter((t) => actualSet.has(t)).length;
    score = overlap / expectedNames.length;
  }

  // Penalize forbidden tool usage (one flat penalty, however many violations)
  if (actualNames.some((t) => forbiddenSet.has(t))) score *= 0.5;

  // Penalize excessive redundancy: fewer than 30% distinct calls
  const uniqueRatio = actualSet.size / actualNames.length;
  if (uniqueRatio < 0.3) score *= 0.7;

  return Math.min(1, Math.max(0, score));
}
57
-
58
/**
 * Score code correctness based on test results, file changes, and completion.
 * Tests passing WITHOUT any file modifications = model didn't do the work.
 *
 * @param {object} [result] - run outcome; may be omitted entirely
 * @param {boolean|null} [result.testsPassed] - true/false when tests ran, otherwise unknown
 * @param {boolean} [result.success] - whether the agent reported success
 * @param {string} [result.output] - agent output text
 * @param {string[]} [result.actualFiles] - files the agent modified
 * @returns {number} one of 1.0, 0.7, 0.4, 0.3, 0.2, 0.1
 */
function scoreCorrectness({ testsPassed, success, output, actualFiles } = {}) {
  const hasFileChanges = Boolean(actualFiles && actualFiles.length > 0);

  // A definite test verdict dominates every other signal.
  if (testsPassed === true) {
    // Passing baseline tests without touching a file means no work was done.
    return hasFileChanges ? 1.0 : 0.2;
  }
  if (testsPassed === false) return 0.2;

  // No test verdict: fall back to the agent's own success claim.
  if (success) {
    // "success" without changes is suspicious
    return hasFileChanges ? 0.7 : 0.3;
  }

  // Last resort: some credit for a substantial amount of output.
  if (output && output.length > 100) return 0.4;
  return 0.1;
}
73
-
74
/**
 * Score plan quality based on todo usage and structure.
 *
 * Starts at 0.3 and adds:
 *   +0.3 when an `update_todos` call is present,
 *   +0.2 when the output mentions "step N", "phase N" or "plan:",
 *   +0.2 when the first read/explore call precedes the first write call.
 *
 * @param {Array<string|{name: string}>} toolCalls - tool calls (names or { name } records);
 *   a missing list and null entries are tolerated
 * @param {string} [output] - agent output text
 * @returns {number} score in [0.3, 1]
 */
function scorePlanQuality(toolCalls, output) {
  // Normalize once; `typeof null === 'object'`, so guard before reading `.name`.
  const names = (toolCalls || []).map((t) => {
    if (typeof t === 'string') return t;
    return (t && t.name) || '';
  });

  let score = 0.3; // baseline

  if (names.some((name) => name === 'update_todos')) score += 0.3;

  // Check if output shows structured planning
  if (/step\s*\d|phase\s*\d|plan:/i.test(output || '')) score += 0.2;

  // Check if read/explore happens before write
  const readIdx = names.findIndex((name) => /read_file|glob|grep/.test(name));
  const writeIdx = names.findIndex((name) => /write_file|edit_file/.test(name));
  if (readIdx >= 0 && writeIdx >= 0 && readIdx < writeIdx) score += 0.2;

  return Math.min(1, score);
}
99
-
100
/**
 * Score diff accuracy as the F1 measure between the set of files the agent
 * touched and the set of files the benchmark expected to change.
 *
 * @param {string[]} actualFiles - files the agent modified
 * @param {string[]} expectedFiles - files expected to be modified
 * @returns {number} 0.5 when nothing is expected, 0.1 when nothing was changed
 *   or nothing overlaps, otherwise the F1 score
 */
function scoreDiffAccuracy(actualFiles, expectedFiles) {
  const nothingExpected = !expectedFiles || expectedFiles.length === 0;
  if (nothingExpected) return 0.5;

  const nothingChanged = !actualFiles || actualFiles.length === 0;
  if (nothingChanged) return 0.1;

  const wanted = new Set(expectedFiles);
  const touched = new Set(actualFiles);

  // Count expected files that were actually touched.
  let hits = 0;
  for (const file of wanted) {
    if (touched.has(file)) hits += 1;
  }

  const precision = touched.size > 0 ? hits / touched.size : 0;
  const recall = wanted.size > 0 ? hits / wanted.size : 0;
  const denominator = precision + recall;

  // Harmonic mean of precision and recall; floor of 0.1 when there is no overlap.
  return denominator === 0 ? 0.1 : 2 * (precision * recall) / denominator;
}
118
-
119
/**
 * Score turn economy: the smaller the share of the turn budget used, the
 * higher the score.
 *
 * @param {number} actualTurns - turns the agent took
 * @param {number} [maxTurns=20] - turn budget
 * @returns {number} 0 for non-positive turns, then 1.0 / 0.7 / 0.3 for up to
 *   50% / 100% / 150% of the budget, and 0.1 beyond that
 */
function scoreTurnEconomy(actualTurns, maxTurns = 20) {
  if (actualTurns <= 0) return 0;

  // [fraction of budget, score awarded when within that fraction]
  const tiers = [
    [0.5, 1.0],
    [1, 0.7],
    [1.5, 0.3],
  ];

  for (const [fraction, awarded] of tiers) {
    if (actualTurns <= maxTurns * fraction) return awarded;
  }
  return 0.1;
}
129
-
130
/**
 * Score error handling — doom loop avoidance and recovery.
 *
 * Starts at 0.7, subtracts 0.2 once if any tool name appears three times in a
 * row, and subtracts 0.3 when `consecutiveErrors` is 3 or more.
 *
 * @param {Array<string|{name: string}>} toolCalls - tool calls (names or { name } records);
 *   a missing list and null entries are tolerated
 * @param {number} [consecutiveErrors=0] - consecutive error count reported by the caller
 * @returns {number} score in [0, 1]
 */
function scoreErrorHandling(toolCalls, consecutiveErrors = 0) {
  let score = 0.7; // baseline

  const callNames = (toolCalls || []).map((t) => {
    if (typeof t === 'string') return t;
    return (t && t.name) || '';
  });

  // Penalize doom loops (same tool call repeated 3+ times in a row).
  // Calls without a usable name are skipped: three unknowns are not evidence
  // that the same tool was repeated.
  for (let i = 2; i < callNames.length; i++) {
    const current = callNames[i];
    if (current !== '' && current === callNames[i - 1] && current === callNames[i - 2]) {
      score -= 0.2;
      break;
    }
  }

  // Penalize excessive consecutive errors
  if (consecutiveErrors >= 3) score -= 0.3;

  return Math.max(0, Math.min(1, score));
}
150
-
151
/**
 * Score code quality using eslint static analysis on modified files.
 * Falls back to a heuristic (0.7 on reported success, 0.3 otherwise) when
 * there is no sandbox, no modified JS/TS file, or no usable eslint report.
 *
 * With a report, the score is 1.0 minus 0.1 per error and 0.03 per warning,
 * clamped to [0, 1].
 *
 * @param {object} opts
 * @param {string} opts.sandboxDir - directory containing modified files
 * @param {string[]} opts.actualFiles - list of modified file paths (relative to sandboxDir)
 * @param {boolean} opts.success - whether the agent reported success
 * @returns {number} score in [0, 1]
 */
function scoreCodeQuality({ sandboxDir, actualFiles, success } = {}) {
  const fallback = success ? 0.7 : 0.3;

  if (!sandboxDir || !actualFiles || actualFiles.length === 0) {
    return fallback;
  }

  // Filter to JS/TS files only — eslint won't help for other types
  const jsFiles = actualFiles
    .filter((f) => /\.(js|ts|mjs|cjs)$/.test(f))
    .map((f) => path.resolve(sandboxDir, f));

  if (jsFiles.length === 0) {
    return fallback;
  }

  let raw;
  try {
    // Rules are passed with --rule: -c/--config expects a config FILE path, so
    // handing it a JSON string makes eslint fail before linting anything.
    // --env supplies Node globals and modern syntax so `no-undef` does not
    // flag require/module/process.
    // NOTE(review): these are eslintrc-mode flags; flat-config-only eslint
    // releases presumably reject them and land in the fallback — confirm
    // against the eslint version available in the sandbox.
    raw = execFileSync('npx', [
      'eslint', '--format', 'json', '--no-eslintrc',
      '--env', 'node,es2021',
      '--rule', 'no-undef: error',
      '--rule', 'no-unused-vars: warn',
      ...jsFiles,
    ], {
      cwd: sandboxDir,
      timeout: 15000,
      stdio: ['pipe', 'pipe', 'pipe'],
      env: { ...process.env, NODE_ENV: 'test' },
    });
  } catch (err) {
    // eslint exits non-zero if there are errors — stdout still has JSON
    raw = err && err.stdout;
  }

  // Exit code 0 only means "no errors"; warnings are still in the report, so
  // the report is parsed on both paths.
  const stdout = raw ? raw.toString().trim() : '';
  if (!stdout.startsWith('[')) {
    // eslint not available or unexpected error — fall back
    return fallback;
  }

  try {
    const report = JSON.parse(stdout);
    let errors = 0;
    let warnings = 0;
    for (const file of report) {
      errors += file.errorCount || 0;
      warnings += file.warningCount || 0;
    }
    const score = 1.0 - (errors * 0.1 + warnings * 0.03);
    return Math.min(1, Math.max(0, score));
  } catch {
    // Malformed report — same heuristic as when eslint is unavailable.
    return fallback;
  }
}
212
-
213
/**
 * Score cost efficiency — quality per dollar spent.
 * Higher quality at lower cost = better score.
 * Ratio: qualityScore / max(costDollars, 0.001)
 *
 * @param {number} qualityScore - composite quality score (0-1) from other dimensions
 * @param {number} costDollars - actual cost in USD
 * @param {number} [cohortMaxRatio] - max quality/cost ratio in cohort for normalization
 * @returns {number} score in [0, 1]; 0.5 (neutral) when the cost is missing or not a number
 */
function scoreCostEfficiency(qualityScore, costDollars, cohortMaxRatio = 0) {
  if (costDollars === undefined || costDollars === null) return 0.5; // neutral if no cost data

  // Floor the cost so a free run does not divide by zero.
  const cost = Math.max(costDollars, 0.001);
  // An unparseable cost would otherwise propagate NaN out of this function;
  // treat it like missing cost data.
  if (Number.isNaN(cost)) return 0.5;

  const ratio = (qualityScore || 0) / cost;

  if (cohortMaxRatio > 0) {
    // Normalize against the best ratio observed in the cohort.
    return Math.min(1, Math.max(0, ratio / cohortMaxRatio));
  }

  // Without cohort context, use heuristic: ratio of 100 (good quality for $0.01) = 1.0
  // ratio of 1 ($1 for score 1.0) = low
  const normalized = Math.min(1, ratio / 100);
  return Math.max(0, normalized);
}
234
-
235
/**
 * Score partial progress as the fraction of the test suite newly fixed by the
 * run: (testsAfter - testsBefore) / totalTests.
 *
 * @param {number} testsBefore - tests passing before agent run
 * @param {number} testsAfter - tests passing after agent run
 * @param {number} totalTests - total number of tests
 * @returns {number} score in [0, 1]; 0.5 (neutral) when any count is missing
 *   or totalTests is not positive
 */
function scorePartialProgress(testsBefore, testsAfter, totalTests) {
  const NEUTRAL = 0.5;
  const isMissing = (value) => value === undefined || value === null;

  if (isMissing(totalTests) || totalTests <= 0) return NEUTRAL;
  if (isMissing(testsBefore) || isMissing(testsAfter)) return NEUTRAL;

  const gained = testsAfter - testsBefore;
  const fixRate = gained / totalTests;

  // A regression gives a negative rate; it is clamped up to 0.
  const floored = Math.max(0, fixRate);
  return Math.min(1, floored);
}
253
-
254
/**
 * Score context management — penalize redundant file reads, reward targeted access.
 *
 * Starts at 0.7. Each file read more than twice costs 0.1 per read beyond the
 * second; the share of reads that pass offset/limit/line_range adds up to 0.3.
 *
 * Re-reads are only counted for calls that carry a file path. Calls without
 * one (for example bare tool-name strings) still count as reads but are not
 * assumed to target the same file.
 *
 * @param {Array<string|{name: string, args: object}>} toolCallDetails - tool calls, ideally
 *   { name, args } records with full call info
 * @returns {number} score in [0, 1]; 0.5 (neutral) when there are no calls or no reads
 */
function scoreContextManagement(toolCallDetails) {
  if (!toolCallDetails || toolCallDetails.length === 0) return 0.5; // neutral

  // Map rather than a plain object: file paths are arbitrary strings and must
  // not collide with inherited keys such as "constructor".
  const readCounts = new Map(); // file -> read count
  let targetedReads = 0;
  let totalReads = 0;

  for (const call of toolCallDetails) {
    const isRecord = call !== null && typeof call === 'object';
    const name = typeof call === 'string' ? call : ((isRecord && call.name) || '');
    const args = (isRecord && call.args) || {};

    if (!/read_file|Read/.test(name)) continue;
    totalReads++;

    const filePath = args.file_path || args.path || args.file;
    if (filePath) {
      readCounts.set(filePath, (readCounts.get(filePath) || 0) + 1);
    }

    // Reward targeted reads (using offset/limit)
    if (args.offset !== undefined || args.limit !== undefined || args.line_range) {
      targetedReads++;
    }
  }

  if (totalReads === 0) return 0.5; // no reads = neutral

  let score = 0.7; // baseline

  // Penalize re-reading same file >2 times
  for (const count of readCounts.values()) {
    if (count > 2) {
      score -= 0.1 * (count - 2); // -0.1 per extra re-read beyond 2
    }
  }

  // Reward targeted reads: up to +0.3 when every read is targeted
  score += (targetedReads / totalReads) * 0.3;

  return Math.min(1, Math.max(0, score));
}
304
-
305
/**
 * Score iterative refinement: how often an edit whose follow-up run failed was
 * followed by another edit and a run that did not fail.
 *
 * For every edit call (except one in the last position) the first run call
 * within the next three calls forms an "edit→test pair". If that run's result
 * mentions error/fail/exception, the first edit within the following two calls
 * and then the first run within the two calls after that are looked up; a
 * clean second run counts as one refinement cycle. The re-edit is itself an
 * edit, so it forms its own pair when the scan reaches it.
 *
 * @param {Array} toolCallDetails - array of { name, args, result } objects (or bare names)
 * @returns {number} 0.5 when there are fewer than three calls or no pairs,
 *   otherwise 0.5 + 0.5 * (cycles / pairs), clamped to [0, 1]
 */
function scoreIterativeRefinement(toolCallDetails) {
  if (!toolCallDetails || toolCallDetails.length < 3) return 0.5; // neutral — not enough data

  const EDIT_CALL = /^(edit_file|write_file|Edit|Write)$/;
  const RUN_CALL = /^(run_shell|bash|Bash)$/;
  const FAILURE_TEXT = /error|fail|exception/i;

  // Reduce every entry to a name plus a string form of its result.
  const steps = toolCallDetails.map((entry) => {
    const name = typeof entry === 'string' ? entry : (entry.name || '');
    const rawResult = (typeof entry === 'object' && entry.result) || '';
    return { name, result: typeof rawResult === 'string' ? rawResult : JSON.stringify(rawResult) };
  });

  // Index of the first step named like `pattern` among the `span` steps
  // starting at `from`, or -1 when there is none.
  const firstNamed = (pattern, from, span) => {
    const stop = Math.min(from + span, steps.length);
    for (let idx = from; idx < stop; idx++) {
      if (pattern.test(steps[idx].name)) return idx;
    }
    return -1;
  };

  let pairs = 0;
  let cycles = 0;

  for (let editAt = 0; editAt < steps.length - 1; editAt++) {
    if (!EDIT_CALL.test(steps[editAt].name)) continue;

    // First run within the three calls after the edit.
    const runAt = firstNamed(RUN_CALL, editAt + 1, 3);
    if (runAt === -1) continue;
    pairs++;

    // Only a failing run can start a refinement cycle.
    if (!FAILURE_TEXT.test(steps[runAt].result)) continue;

    // First re-edit within the two calls after the failed run.
    const reEditAt = firstNamed(EDIT_CALL, runAt + 1, 2);
    if (reEditAt === -1) continue;

    // First run within the two calls after the re-edit.
    const reRunAt = firstNamed(RUN_CALL, reEditAt + 1, 2);
    if (reRunAt === -1) continue;

    if (!FAILURE_TEXT.test(steps[reRunAt].result)) cycles++;
  }

  if (pairs === 0) return 0.5; // no edit->test pairs = neutral

  // Base 0.5 plus up to 0.5 for the share of pairs that completed a cycle.
  const score = 0.5 + (cycles / Math.max(pairs, 1)) * 0.5;
  return Math.min(1, Math.max(0, score));
}
362
-
363
/**
 * Score ambiguity handling: on a vague prompt the agent is expected to ask the
 * user for clarification instead of writing files.
 *
 * @param {string[]} actualToolCalls - tool call names (or { name } records)
 * @param {object} expectations - benchmark agentExpectations; only `shouldAskUser` is read
 * @returns {number} 0.5 when the benchmark is not an ambiguity benchmark,
 *   otherwise 1.0 (asked, no writes), 0.6 (neither), 0.4 (asked and wrote)
 *   or 0.0 (wrote without asking)
 */
function scoreAmbiguityHandling(actualToolCalls, expectations = {}) {
  if (!expectations.shouldAskUser) return 0.5; // not an ambiguity benchmark

  const names = actualToolCalls.map((entry) => (typeof entry === 'string' ? entry : (entry.name || '')));

  let askedUser = false;
  let wroteFiles = false;
  for (const name of names) {
    if (/^(ask_user|AskUserQuestion)$/i.test(name)) askedUser = true;
    if (/^(write_file|edit_file|Write|Edit)$/i.test(name)) wroteFiles = true;
  }

  if (askedUser) {
    // Asking is good; writing anyway sends mixed signals.
    return wroteFiles ? 0.4 : 1.0;
  }
  // Not asking: staying read-only is tolerable, writing blindly is the worst case.
  return wroteFiles ? 0.0 : 0.6;
}
383
-
384
/**
 * Compute the composite agent score.
 *
 * Every dimension is scored by its dedicated function, then combined as a
 * weighted sum over the entries of `weights` and clamped to [0, 1]. Dimensions
 * absent from `weights` are still reported in `dimensions` but do not
 * contribute to the composite.
 *
 * @param {object} [run] - run data; every field is optional
 * @returns {{ composite: number, dimensions: object }} the weighted score and
 *   the per-dimension scores (plus `ambiguityHandling` when `shouldAskUser` is set)
 */
function computeAgentScore({
  actualToolCalls = [],
  expectedToolCalls = [],
  forbiddenToolCalls = [],
  testsPassed = null,
  success = false,
  output = '',
  actualFiles = [],
  expectedFiles = [],
  actualTurns = 0,
  maxTurns = 20,
  consecutiveErrors = 0,
  // Inputs for the enhanced dimensions
  sandboxDir = null,
  costDollars = null,
  testsBefore = null,
  testsAfter = null,
  totalTests = null,
  toolCallDetails = null, // array of { name, args, result } for context/refinement scoring
  shouldAskUser = false, // true for ambiguity benchmarks
  weights = DIMENSION_WEIGHTS,
} = {}) {
  // Dimensions that depend only on the run data itself.
  const dimensions = {
    toolEfficiency: scoreToolEfficiency(actualToolCalls, expectedToolCalls, forbiddenToolCalls),
    correctness: scoreCorrectness({ testsPassed, success, output, actualFiles }),
    planQuality: scorePlanQuality(actualToolCalls, output),
    diffAccuracy: scoreDiffAccuracy(actualFiles, expectedFiles),
    turnEconomy: scoreTurnEconomy(actualTurns, maxTurns),
    errorHandling: scoreErrorHandling(actualToolCalls, consecutiveErrors),
    codeQuality: scoreCodeQuality({ sandboxDir, actualFiles, success }),
  };

  // Cost efficiency needs a quality figure: the plain average of the
  // dimensions scored above.
  const qualityInputs = [
    'correctness', 'toolEfficiency', 'diffAccuracy', 'planQuality',
    'turnEconomy', 'errorHandling', 'codeQuality',
  ];
  let qualityTotal = 0;
  for (const key of qualityInputs) {
    qualityTotal += dimensions[key];
  }
  const qualityScore = qualityTotal / qualityInputs.length;

  // Remaining dimensions. Context management falls back to the bare tool
  // names when no detailed call records were supplied.
  Object.assign(dimensions, {
    costEfficiency: scoreCostEfficiency(qualityScore, costDollars),
    partialProgress: scorePartialProgress(testsBefore, testsAfter, totalTests),
    contextManagement: scoreContextManagement(toolCallDetails || actualToolCalls),
    iterativeRefinement: scoreIterativeRefinement(toolCallDetails),
  });

  // Ambiguity benchmarks judge correctness by whether the agent asked first.
  if (shouldAskUser) {
    dimensions.ambiguityHandling = scoreAmbiguityHandling(actualToolCalls, { shouldAskUser });
    dimensions.correctness = dimensions.ambiguityHandling;
  }

  // Weighted sum; a missing score or weight contributes nothing.
  const composite = Object.entries(weights).reduce(
    (total, [dim, weight]) => total + (dimensions[dim] || 0) * (weight || 0),
    0
  );

  return { composite: Math.min(1, Math.max(0, composite)), dimensions };
}
445
-
446
- module.exports = {
447
- DIMENSION_WEIGHTS,
448
- scoreToolEfficiency,
449
- scoreCorrectness,
450
- scorePlanQuality,
451
- scoreDiffAccuracy,
452
- scoreTurnEconomy,
453
- scoreErrorHandling,
454
- scoreCodeQuality,
455
- scoreCostEfficiency,
456
- scorePartialProgress,
457
- scoreContextManagement,
458
- scoreIterativeRefinement,
459
- scoreAmbiguityHandling,
460
- computeAgentScore,
461
- };