create-walle 0.9.21 → 0.9.23

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (500)
  1. package/README.md +27 -5
  2. package/package.json +2 -2
  3. package/template/CLAUDE.md +2 -2
  4. package/template/LICENSE +1 -1
  5. package/template/bin/ctm-dev-cleanup.js +24 -3
  6. package/template/bin/ctm-launch.sh +13 -0
  7. package/template/bin/dev.sh +156 -18
  8. package/template/bin/node-bin.sh +84 -0
  9. package/template/bin/pin-node.sh +51 -0
  10. package/template/claude-task-manager/api-prompts.js +1203 -182
  11. package/template/claude-task-manager/api-reviews.js +109 -15
  12. package/template/claude-task-manager/approval-agent.js +1360 -280
  13. package/template/claude-task-manager/bin/restart-ctm.sh +64 -23
  14. package/template/claude-task-manager/bin/storage-migration-supervisor.js +338 -0
  15. package/template/claude-task-manager/db.js +4417 -295
  16. package/template/claude-task-manager/docs/app-update-refresh-protocol.md +69 -0
  17. package/template/claude-task-manager/docs/approval-ai-refinement.md +138 -0
  18. package/template/claude-task-manager/docs/approval-rescue-loop.md +74 -0
  19. package/template/claude-task-manager/docs/codex-operational-warning-health.md +107 -0
  20. package/template/claude-task-manager/docs/codex-resume-state-guard-design.md +17 -12
  21. package/template/claude-task-manager/docs/codex-terminal-render-controller-handoff.md +311 -0
  22. package/template/claude-task-manager/docs/coding-agent-hooks-architecture.md +418 -0
  23. package/template/claude-task-manager/docs/conversation-import-freshness.md +20 -0
  24. package/template/claude-task-manager/docs/google-workspace-auth-health.md +77 -0
  25. package/template/claude-task-manager/docs/image-paste-ux.md +13 -0
  26. package/template/claude-task-manager/docs/ipad-web-preview.md +88 -0
  27. package/template/claude-task-manager/docs/main-loop-offload-architecture.md +66 -0
  28. package/template/claude-task-manager/docs/microsoft-dev-tunnel-phone-access-design.md +274 -519
  29. package/template/claude-task-manager/docs/mobile-live-streaming.md +27 -5
  30. package/template/claude-task-manager/docs/mobile-remote-submission-lifecycle.md +69 -0
  31. package/template/claude-task-manager/docs/phone-access-design.md +53 -15
  32. package/template/claude-task-manager/docs/phone-passkey-identity.md +122 -0
  33. package/template/claude-task-manager/docs/phone-setup.md +3 -0
  34. package/template/claude-task-manager/docs/prompt-editing-tree-design.md +25 -1
  35. package/template/claude-task-manager/docs/remote-desktop-access-design.md +268 -0
  36. package/template/claude-task-manager/docs/restart-lifecycle-architecture.md +95 -0
  37. package/template/claude-task-manager/docs/runtime-work-control-plane.md +53 -0
  38. package/template/claude-task-manager/docs/session-interactive-wait-surfaces.md +38 -0
  39. package/template/claude-task-manager/docs/session-needs-you-dismissal.md +84 -0
  40. package/template/claude-task-manager/docs/session-render-state-management-design.md +91 -3
  41. package/template/claude-task-manager/docs/session-standup-command-center-design.md +25 -1
  42. package/template/claude-task-manager/docs/session-title-authority.md +32 -0
  43. package/template/claude-task-manager/docs/session-workspace-binding.md +33 -0
  44. package/template/claude-task-manager/docs/skill-intent-resolution-design.md +72 -0
  45. package/template/claude-task-manager/docs/walle-mcp-supervisor-health.md +86 -0
  46. package/template/claude-task-manager/docs/walle-relay-phone-access-design.md +24 -15
  47. package/template/claude-task-manager/docs/walle-session-history-hydration.md +114 -0
  48. package/template/claude-task-manager/docs/walle-session-input-queue.md +104 -0
  49. package/template/claude-task-manager/docs/walle-session-model-catalog.md +90 -0
  50. package/template/claude-task-manager/docs/walle-session-model-preferences.md +15 -6
  51. package/template/claude-task-manager/git-utils.js +897 -27
  52. package/template/claude-task-manager/lib/agent-capabilities.js +33 -0
  53. package/template/claude-task-manager/lib/agent-cli-cache.js +37 -7
  54. package/template/claude-task-manager/lib/agent-hooks-installer.js +26 -2
  55. package/template/claude-task-manager/lib/agent-presets.js +17 -1
  56. package/template/claude-task-manager/lib/all-sessions-query.js +108 -0
  57. package/template/claude-task-manager/lib/approval-ai-refinement.js +488 -0
  58. package/template/claude-task-manager/lib/approval-self-adapt.js +168 -0
  59. package/template/claude-task-manager/lib/async-semaphore.js +44 -0
  60. package/template/claude-task-manager/lib/auth-context.js +5 -0
  61. package/template/claude-task-manager/lib/auth-rate-limit.js +47 -4
  62. package/template/claude-task-manager/lib/auth-rules.js +29 -2
  63. package/template/claude-task-manager/lib/auto-approval-verifier.js +129 -16
  64. package/template/claude-task-manager/lib/background-llm.js +144 -17
  65. package/template/claude-task-manager/lib/branch-inventory.js +212 -0
  66. package/template/claude-task-manager/lib/claude-desktop-sessions.js +15 -3
  67. package/template/claude-task-manager/lib/coalesce-sync-frames.js +151 -0
  68. package/template/claude-task-manager/lib/codex-launch-health.js +762 -0
  69. package/template/claude-task-manager/lib/codex-transcript-pager.js +51 -0
  70. package/template/claude-task-manager/lib/codex-zst.js +124 -0
  71. package/template/claude-task-manager/lib/coding-agent-models.js +233 -30
  72. package/template/claude-task-manager/lib/connection-health.js +232 -0
  73. package/template/claude-task-manager/lib/conversation-blob-parser.js +42 -0
  74. package/template/claude-task-manager/lib/conversation-tail-merge.js +89 -26
  75. package/template/claude-task-manager/lib/ctm-session-context-api.js +39 -10
  76. package/template/claude-task-manager/lib/cursor-conversation-store.js +354 -0
  77. package/template/claude-task-manager/lib/db-owner-worker-client.js +315 -0
  78. package/template/claude-task-manager/lib/document-review.js +141 -6
  79. package/template/claude-task-manager/lib/escalation-review.js +152 -0
  80. package/template/claude-task-manager/lib/graceful-shutdown.js +159 -0
  81. package/template/claude-task-manager/lib/headless-term-service.js +678 -0
  82. package/template/claude-task-manager/lib/heavy-worker-fallback.js +38 -0
  83. package/template/claude-task-manager/lib/jsonl-conversation-parser.js +542 -0
  84. package/template/claude-task-manager/lib/jsonl-range-reader.js +112 -0
  85. package/template/claude-task-manager/lib/main-db-census.js +216 -0
  86. package/template/claude-task-manager/lib/message-pagination.js +106 -4
  87. package/template/claude-task-manager/lib/microsoft-dev-tunnel-setup.js +750 -26
  88. package/template/claude-task-manager/lib/mobile-auth-api.js +274 -7
  89. package/template/claude-task-manager/lib/mobile-auth-store.js +592 -10
  90. package/template/claude-task-manager/lib/mobile-notification-dispatcher.js +15 -0
  91. package/template/claude-task-manager/lib/model-overview-brain-fallback.js +311 -0
  92. package/template/claude-task-manager/lib/model-overview-cache.js +141 -0
  93. package/template/claude-task-manager/lib/models-health-routing-notice.js +126 -0
  94. package/template/claude-task-manager/lib/node-pin-guard.js +93 -0
  95. package/template/claude-task-manager/lib/perf-tracker.js +242 -6
  96. package/template/claude-task-manager/lib/permission-match.js +76 -0
  97. package/template/claude-task-manager/lib/permission-sync.js +133 -20
  98. package/template/claude-task-manager/lib/process-title.js +35 -0
  99. package/template/claude-task-manager/lib/prompt-executions-query.js +25 -0
  100. package/template/claude-task-manager/lib/prompt-index-disk-cache.js +44 -0
  101. package/template/claude-task-manager/lib/prompt-intent.js +132 -0
  102. package/template/claude-task-manager/lib/provider-user-context.js +34 -0
  103. package/template/claude-task-manager/lib/read-pool-client.js +313 -0
  104. package/template/claude-task-manager/lib/readpool-breaker.js +31 -0
  105. package/template/claude-task-manager/lib/recent-sessions-breaker.js +12 -0
  106. package/template/claude-task-manager/lib/remote-feedback-client.js +72 -0
  107. package/template/claude-task-manager/lib/remote-relay-protocol.js +37 -4
  108. package/template/claude-task-manager/lib/remote-relay-store.js +159 -0
  109. package/template/claude-task-manager/lib/remote-submission-observer.js +278 -0
  110. package/template/claude-task-manager/lib/restart-guard.js +109 -0
  111. package/template/claude-task-manager/lib/restore-interruption-detector.js +439 -0
  112. package/template/claude-task-manager/lib/restore-policy.js +13 -0
  113. package/template/claude-task-manager/lib/restore-resume-batch.js +74 -0
  114. package/template/claude-task-manager/lib/restore-runtime.js +68 -0
  115. package/template/claude-task-manager/lib/restore-storm.js +34 -0
  116. package/template/claude-task-manager/lib/resume-cwd.js +36 -0
  117. package/template/claude-task-manager/lib/resume-preflight.js +313 -0
  118. package/template/claude-task-manager/lib/runtime-work-registry.js +444 -0
  119. package/template/claude-task-manager/lib/sanitize-openai-auth.js +31 -0
  120. package/template/claude-task-manager/lib/scheduler.js +21 -1
  121. package/template/claude-task-manager/lib/scrollback-snapshot-store.js +159 -0
  122. package/template/claude-task-manager/lib/serial-task-queue.js +64 -0
  123. package/template/claude-task-manager/lib/server-listeners.js +239 -0
  124. package/template/claude-task-manager/lib/session-capture.js +42 -7
  125. package/template/claude-task-manager/lib/session-content-backfill.js +131 -0
  126. package/template/claude-task-manager/lib/session-history.js +388 -43
  127. package/template/claude-task-manager/lib/session-host-manager.js +287 -0
  128. package/template/claude-task-manager/lib/session-image-refs.js +209 -0
  129. package/template/claude-task-manager/lib/session-jobs.js +399 -59
  130. package/template/claude-task-manager/lib/session-prompt-index.js +137 -0
  131. package/template/claude-task-manager/lib/session-restore.js +53 -0
  132. package/template/claude-task-manager/lib/session-standup.js +123 -23
  133. package/template/claude-task-manager/lib/session-state-bus.js +14 -0
  134. package/template/claude-task-manager/lib/session-stream.js +64 -16
  135. package/template/claude-task-manager/lib/session-timeline-summary.js +260 -0
  136. package/template/claude-task-manager/lib/session-token-usage.js +494 -0
  137. package/template/claude-task-manager/lib/session-workspace-binding.js +356 -0
  138. package/template/claude-task-manager/lib/setup-network-config.js +9 -0
  139. package/template/claude-task-manager/lib/size-cap.js +45 -0
  140. package/template/claude-task-manager/lib/size-cap.test.js +62 -0
  141. package/template/claude-task-manager/lib/skill-autocomplete.js +180 -1
  142. package/template/claude-task-manager/lib/skill-intent-resolver.js +304 -0
  143. package/template/claude-task-manager/lib/sqlite-driver.js +19 -3
  144. package/template/claude-task-manager/lib/standup-attention.js +7 -3
  145. package/template/claude-task-manager/lib/status-authority.js +39 -0
  146. package/template/claude-task-manager/lib/status-hooks.js +4 -0
  147. package/template/claude-task-manager/lib/storage-migration.js +235 -0
  148. package/template/claude-task-manager/lib/structured-capture.js +298 -0
  149. package/template/claude-task-manager/lib/sync-io-census.js +163 -0
  150. package/template/claude-task-manager/lib/tailscale-setup.js +6 -0
  151. package/template/claude-task-manager/lib/terminal-activity-evidence.js +33 -0
  152. package/template/claude-task-manager/lib/terminal-choice.js +364 -0
  153. package/template/claude-task-manager/lib/terminal-control-sanitize.js +17 -0
  154. package/template/claude-task-manager/lib/terminal-fingerprint.js +48 -0
  155. package/template/claude-task-manager/lib/terminal-output-flush.js +84 -0
  156. package/template/claude-task-manager/lib/timeline-order.js +122 -0
  157. package/template/claude-task-manager/lib/transcript-store.js +348 -43
  158. package/template/claude-task-manager/lib/transport-security.js +84 -1
  159. package/template/claude-task-manager/lib/wait-state.js +184 -0
  160. package/template/claude-task-manager/lib/walle-client.js +47 -5
  161. package/template/claude-task-manager/lib/walle-ctm-history.js +564 -4
  162. package/template/claude-task-manager/lib/walle-external-actions.js +135 -16
  163. package/template/claude-task-manager/lib/walle-history-hydration.js +46 -0
  164. package/template/claude-task-manager/lib/walle-native-health.js +403 -0
  165. package/template/claude-task-manager/lib/walle-repair.js +701 -0
  166. package/template/claude-task-manager/lib/walle-session-cache.js +109 -0
  167. package/template/claude-task-manager/lib/walle-session-context.js +57 -21
  168. package/template/claude-task-manager/lib/walle-session-model-catalog.js +34 -0
  169. package/template/claude-task-manager/lib/walle-supervisor.js +539 -63
  170. package/template/claude-task-manager/lib/walle-transcript.js +52 -0
  171. package/template/claude-task-manager/lib/worktree-active-sync.js +11 -7
  172. package/template/claude-task-manager/lib/worktree-cwd.js +32 -1
  173. package/template/claude-task-manager/package.json +1 -1
  174. package/template/claude-task-manager/prompt-harvest.js +89 -66
  175. package/template/claude-task-manager/providers/claude-code.js +51 -3
  176. package/template/claude-task-manager/providers/cursor.js +140 -45
  177. package/template/claude-task-manager/public/css/reviews.css +551 -61
  178. package/template/claude-task-manager/public/css/setup.css +191 -0
  179. package/template/claude-task-manager/public/css/walle-session.css +865 -10
  180. package/template/claude-task-manager/public/css/walle.css +154 -0
  181. package/template/claude-task-manager/public/designs/ai-providers-consolidation-v2.html +830 -0
  182. package/template/claude-task-manager/public/index.html +18516 -2058
  183. package/template/claude-task-manager/public/ipad.html +363 -0
  184. package/template/claude-task-manager/public/js/document-review-links.js +301 -0
  185. package/template/claude-task-manager/public/js/image-normalize.js +69 -36
  186. package/template/claude-task-manager/public/js/message-renderer.js +1265 -77
  187. package/template/claude-task-manager/public/js/prompts.js +66 -29
  188. package/template/claude-task-manager/public/js/reviews.js +901 -133
  189. package/template/claude-task-manager/public/js/session-activity-utils.js +11 -1
  190. package/template/claude-task-manager/public/js/session-search-utils.js +94 -10
  191. package/template/claude-task-manager/public/js/session-status-precedence.js +23 -5
  192. package/template/claude-task-manager/public/js/setup.js +1273 -176
  193. package/template/claude-task-manager/public/js/stream-view.js +691 -73
  194. package/template/claude-task-manager/public/js/terminal-reconciler.js +210 -0
  195. package/template/claude-task-manager/public/js/walle-session.js +2455 -158
  196. package/template/claude-task-manager/public/js/walle.js +455 -28
  197. package/template/claude-task-manager/public/m/app.css +2909 -262
  198. package/template/claude-task-manager/public/m/app.js +6601 -398
  199. package/template/claude-task-manager/public/m/claim.html +224 -17
  200. package/template/claude-task-manager/public/m/index.html +117 -21
  201. package/template/claude-task-manager/public/m/sw.js +3 -1
  202. package/template/claude-task-manager/public/manifest.json +2 -2
  203. package/template/claude-task-manager/public/prompts.html +30 -14
  204. package/template/claude-task-manager/queue-engine.js +507 -28
  205. package/template/claude-task-manager/scripts/repair-claude-session-images.js +27 -8
  206. package/template/claude-task-manager/server.js +14341 -2197
  207. package/template/claude-task-manager/session-integrity.js +160 -18
  208. package/template/claude-task-manager/session-search-ranking.js +1 -0
  209. package/template/claude-task-manager/session-utils.js +25 -5
  210. package/template/claude-task-manager/workers/approval-blocklist.js +96 -6
  211. package/template/claude-task-manager/workers/approval-widget-validator.js +14 -8
  212. package/template/claude-task-manager/workers/conversation-import-worker.js +11 -50
  213. package/template/claude-task-manager/workers/db-owner-worker.js +386 -0
  214. package/template/claude-task-manager/workers/harvest-worker.js +9 -55
  215. package/template/claude-task-manager/workers/headless-term-worker.js +9 -530
  216. package/template/claude-task-manager/workers/read-pool-worker.js +387 -0
  217. package/template/claude-task-manager/workers/scrollback-worker.js +11 -72
  218. package/template/claude-task-manager/workers/session-host-process.js +146 -0
  219. package/template/claude-task-manager/workers/session-integrity-worker.js +10 -54
  220. package/template/claude-task-manager/workers/state-detectors/base.js +18 -1
  221. package/template/claude-task-manager/workers/state-detectors/claude-code.js +182 -9
  222. package/template/claude-task-manager/workers/state-detectors/codex.js +150 -2
  223. package/template/claude-task-manager/workers/state-detectors/cursor.js +127 -0
  224. package/template/claude-task-manager/workers/state-detectors/gemini.js +21 -0
  225. package/template/claude-task-manager/workers/state-detectors/index.js +29 -0
  226. package/template/claude-task-manager/workers/state-detectors/opencode.js +103 -0
  227. package/template/docs/design/markdown-review-pane.md +206 -0
  228. package/template/docs/designs/2026-05-17-portkey-gateway-provider-ux.md +129 -38
  229. package/template/docs/designs/2026-05-20-mobile-worktree-finish-command.md +27 -0
  230. package/template/docs/designs/2026-05-22-ai-configuration-consolidation.md +248 -0
  231. package/template/docs/designs/ai-configuration-consolidation-mock.html +812 -0
  232. package/template/docs/private-memory-and-pii-policy.md +69 -0
  233. package/template/package.json +2 -1
  234. package/template/scripts/check-private-data.js +201 -0
  235. package/template/shared/sqlite-owner-guard.js +30 -0
  236. package/template/shared/sqlite-owner-write-queue.js +225 -0
  237. package/template/shared/sqlite-storage-policy.js +111 -0
  238. package/template/shared/sqlite-write-lock.js +428 -0
  239. package/template/wall-e/agent-runners/claude-code.js +5 -0
  240. package/template/wall-e/agent.js +166 -22
  241. package/template/wall-e/api-walle.js +524 -70
  242. package/template/wall-e/auth/provider-flows.js +11 -1
  243. package/template/wall-e/bin/walle-mcp-stdio.js +341 -17
  244. package/template/wall-e/brain.js +1614 -141
  245. package/template/wall-e/chat/attachment-blocks.js +96 -0
  246. package/template/wall-e/chat/attachments.js +2 -1
  247. package/template/wall-e/chat/capability-resolver.js +7 -7
  248. package/template/wall-e/chat/context-messages.js +28 -0
  249. package/template/wall-e/chat/conversation-frame.js +630 -0
  250. package/template/wall-e/chat/provider-messages.js +125 -0
  251. package/template/wall-e/chat.js +1002 -233
  252. package/template/wall-e/coding/acceptance-contract.js +170 -0
  253. package/template/wall-e/coding/acp-adapter.js +1 -1
  254. package/template/wall-e/coding/agent-catalog.js +3 -0
  255. package/template/wall-e/coding/artifact-store.js +93 -0
  256. package/template/wall-e/coding/capability-router.js +120 -0
  257. package/template/wall-e/coding/coding-run-controller.js +423 -0
  258. package/template/wall-e/coding/compaction-service.js +157 -12
  259. package/template/wall-e/coding/frontend-verification.js +258 -0
  260. package/template/wall-e/coding/lifecycle-hooks.js +75 -0
  261. package/template/wall-e/coding/local-preview-contract.js +157 -0
  262. package/template/wall-e/coding/permission-service.js +57 -13
  263. package/template/wall-e/coding/prompt-bundle.js +19 -1
  264. package/template/wall-e/coding/prompt-section-registry.js +227 -0
  265. package/template/wall-e/coding/provider-compat.js +15 -0
  266. package/template/wall-e/coding/runtime-events.js +224 -0
  267. package/template/wall-e/coding/runtime-mode.js +3 -0
  268. package/template/wall-e/coding/side-git-snapshot.js +160 -4
  269. package/template/wall-e/coding/snapshot-service.js +143 -1
  270. package/template/wall-e/coding/stream-processor.js +388 -34
  271. package/template/wall-e/coding/task-tool.js +141 -4
  272. package/template/wall-e/coding/tool-execution-controller.js +365 -0
  273. package/template/wall-e/coding/tool-registry.js +43 -5
  274. package/template/wall-e/coding/user-hooks.js +217 -0
  275. package/template/wall-e/coding-orchestrator.js +1330 -221
  276. package/template/wall-e/coding-prompts.js +20 -4
  277. package/template/wall-e/context/context-builder.js +15 -2
  278. package/template/wall-e/decision/confidence.js +1 -1
  279. package/template/wall-e/docs/coding-acceptance-contract.md +41 -0
  280. package/template/wall-e/docs/external-action-controller.md +26 -6
  281. package/template/wall-e/docs/telemetry-lifecycle.md +8 -2
  282. package/template/wall-e/embeddings.js +591 -53
  283. package/template/wall-e/external-action-controller.js +12 -0
  284. package/template/wall-e/http/auth.js +1 -0
  285. package/template/wall-e/http/chat-api.js +46 -11
  286. package/template/wall-e/http/model-admin.js +836 -34
  287. package/template/wall-e/lib/boot-profile.js +88 -0
  288. package/template/wall-e/lib/event-loop-monitor.js +93 -0
  289. package/template/wall-e/lib/service-health.js +194 -0
  290. package/template/wall-e/llm/anthropic.js +130 -5
  291. package/template/wall-e/llm/client.js +266 -63
  292. package/template/wall-e/llm/default-fallback.js +382 -0
  293. package/template/wall-e/llm/health.js +19 -0
  294. package/template/wall-e/llm/message-guard.js +78 -0
  295. package/template/wall-e/llm/model-catalog.js +252 -1
  296. package/template/wall-e/llm/openai.js +26 -4
  297. package/template/wall-e/llm/portkey-sync.js +654 -0
  298. package/template/wall-e/llm/provider-error.js +30 -2
  299. package/template/wall-e/llm/registry.js +5 -1
  300. package/template/wall-e/llm/request-compat.js +67 -0
  301. package/template/wall-e/loops/backfill.js +79 -23
  302. package/template/wall-e/loops/brain-optimize.js +67 -0
  303. package/template/wall-e/loops/ingest.js +25 -10
  304. package/template/wall-e/loops/question-digest.js +160 -0
  305. package/template/wall-e/loops/reflect.js +6 -4
  306. package/template/wall-e/loops/think.js +39 -12
  307. package/template/wall-e/mcp-server.js +318 -36
  308. package/template/wall-e/memory/ctm-context-client.js +52 -14
  309. package/template/wall-e/memory/ctm-operational-context.js +237 -0
  310. package/template/wall-e/memory/ctm-prompt-executions-client.js +128 -0
  311. package/template/wall-e/memory/ctm-session-context.js +111 -63
  312. package/template/wall-e/prompts/coding/deepseek.txt +3 -0
  313. package/template/wall-e/prompts/coding/gemini.txt +6 -0
  314. package/template/wall-e/prompts/coding/gpt.txt +6 -0
  315. package/template/wall-e/prompts/coding/local.txt +7 -0
  316. package/template/wall-e/runtime/decision-hooks.js +115 -0
  317. package/template/wall-e/runtime/devbox-gateway.js +82 -8
  318. package/template/wall-e/runtime/prompt-manifest.js +86 -0
  319. package/template/wall-e/runtime/tool-executor.js +269 -0
  320. package/template/wall-e/runtime/tool-result-envelope.js +138 -0
  321. package/template/wall-e/runtime/transcript-projection.js +60 -0
  322. package/template/wall-e/runtime/walle-runtime.js +224 -0
  323. package/template/wall-e/scripts/db-optimize/migrate.js +162 -0
  324. package/template/wall-e/scripts/db-optimize/recall-eval.js +117 -0
  325. package/template/wall-e/server.js +15 -0
  326. package/template/wall-e/session-files.js +9 -0
  327. package/template/wall-e/skills/_bundled/google-calendar/run.js +1 -1
  328. package/template/wall-e/skills/_bundled/gws-workspace/run.js +1 -1
  329. package/template/wall-e/skills/_bundled/slack-mentions/run.js +76 -6
  330. package/template/wall-e/skills/claude-code-reader.js +7 -3
  331. package/template/wall-e/skills/script-skill-runner.js +10 -0
  332. package/template/wall-e/skills/skill-planner.js +38 -0
  333. package/template/wall-e/tools/builtin-middleware.js +19 -9
  334. package/template/wall-e/tools/local-tools.js +1428 -16
  335. package/template/wall-e/tools/permission-checker.js +73 -5
  336. package/template/wall-e/tools/question-manager.js +117 -7
  337. package/template/wall-e/training/harvester.js +12 -28
  338. package/template/wall-e/training/replay.js +25 -80
  339. package/template/website/index.html +10 -10
  340. package/template/wall-e/eval/ab-test.js +0 -203
  341. package/template/wall-e/eval/agent-runner.js +0 -772
  342. package/template/wall-e/eval/agent-scorer.js +0 -461
  343. package/template/wall-e/eval/aggregator.js +0 -414
  344. package/template/wall-e/eval/allowed-test-commands.js +0 -34
  345. package/template/wall-e/eval/benchmark-generator.js +0 -113
  346. package/template/wall-e/eval/benchmarks/chat-eval.json +0 -1662
  347. package/template/wall-e/eval/benchmarks/chat.json +0 -82
  348. package/template/wall-e/eval/benchmarks/coding-agent-real.json +0 -1
  349. package/template/wall-e/eval/benchmarks/coding-agent.json +0 -1581
  350. package/template/wall-e/eval/benchmarks/coding.json +0 -122
  351. package/template/wall-e/eval/benchmarks/memory-retrieval.json +0 -234
  352. package/template/wall-e/eval/benchmarks/reasoning.json +0 -82
  353. package/template/wall-e/eval/benchmarks/swebench-lite-30.json +0 -212
  354. package/template/wall-e/eval/benchmarks.js +0 -669
  355. package/template/wall-e/eval/cc-replay.js +0 -719
  356. package/template/wall-e/eval/chat-eval.js +0 -525
  357. package/template/wall-e/eval/check-keys.js +0 -15
  358. package/template/wall-e/eval/check-providers.js +0 -42
  359. package/template/wall-e/eval/codex-cli-baseline.js +0 -669
  360. package/template/wall-e/eval/coding-agent-real.js +0 -570
  361. package/template/wall-e/eval/context-compactor.js +0 -251
  362. package/template/wall-e/eval/debug-agent003.js +0 -68
  363. package/template/wall-e/eval/diagnostics.js +0 -216
  364. package/template/wall-e/eval/eval-orchestrator.js +0 -642
  365. package/template/wall-e/eval/evaluate.js +0 -202
  366. package/template/wall-e/eval/evaluator.js +0 -373
  367. package/template/wall-e/eval/exporter.js +0 -212
  368. package/template/wall-e/eval/fixtures/express-basic/package.json +0 -9
  369. package/template/wall-e/eval/fixtures/express-basic/server.js +0 -115
  370. package/template/wall-e/eval/fixtures/express-basic/test.js +0 -83
  371. package/template/wall-e/eval/fixtures/express-buggy/package.json +0 -9
  372. package/template/wall-e/eval/fixtures/express-buggy/server.js +0 -113
  373. package/template/wall-e/eval/fixtures/express-buggy/test.js +0 -83
  374. package/template/wall-e/eval/fixtures/express-buggy-items/package.json +0 -9
  375. package/template/wall-e/eval/fixtures/express-buggy-items/server.js +0 -112
  376. package/template/wall-e/eval/fixtures/express-buggy-items/test.js +0 -83
  377. package/template/wall-e/eval/fixtures/express-buggy-search/package.json +0 -9
  378. package/template/wall-e/eval/fixtures/express-buggy-search/server.js +0 -121
  379. package/template/wall-e/eval/fixtures/express-buggy-search/test.js +0 -83
  380. package/template/wall-e/eval/fixtures/express-rename-data/data.js +0 -34
  381. package/template/wall-e/eval/fixtures/express-rename-data/package.json +0 -9
  382. package/template/wall-e/eval/fixtures/express-rename-data/server.js +0 -97
  383. package/template/wall-e/eval/fixtures/express-rename-data/test.js +0 -88
  384. package/template/wall-e/eval/fixtures/express-xss/package.json +0 -12
  385. package/template/wall-e/eval/fixtures/express-xss/server.js +0 -90
  386. package/template/wall-e/eval/fixtures/express-xss/test.js +0 -67
  387. package/template/wall-e/eval/fixtures/express-xss/views/profile.ejs +0 -9
  388. package/template/wall-e/eval/fixtures/fullstack-app/config/default.js +0 -9
  389. package/template/wall-e/eval/fixtures/fullstack-app/config/test.js +0 -13
  390. package/template/wall-e/eval/fixtures/fullstack-app/package.json +0 -11
  391. package/template/wall-e/eval/fixtures/fullstack-app/public/css/style.css +0 -137
  392. package/template/wall-e/eval/fixtures/fullstack-app/public/index.html +0 -46
  393. package/template/wall-e/eval/fixtures/fullstack-app/public/js/app.js +0 -121
  394. package/template/wall-e/eval/fixtures/fullstack-app/public/js/auth.js +0 -71
  395. package/template/wall-e/eval/fixtures/fullstack-app/public/js/items.js +0 -80
  396. package/template/wall-e/eval/fixtures/fullstack-app/public/js/users.js +0 -46
  397. package/template/wall-e/eval/fixtures/fullstack-app/public/login.html +0 -45
  398. package/template/wall-e/eval/fixtures/fullstack-app/public/register.html +0 -38
  399. package/template/wall-e/eval/fixtures/fullstack-app/scripts/migrate.js +0 -23
  400. package/template/wall-e/eval/fixtures/fullstack-app/scripts/seed.js +0 -46
  401. package/template/wall-e/eval/fixtures/fullstack-app/server/db.js +0 -99
  402. package/template/wall-e/eval/fixtures/fullstack-app/server/index.js +0 -94
  403. package/template/wall-e/eval/fixtures/fullstack-app/server/middleware/auth.js +0 -19
  404. package/template/wall-e/eval/fixtures/fullstack-app/server/middleware/logger.js +0 -19
  405. package/template/wall-e/eval/fixtures/fullstack-app/server/router.js +0 -50
  406. package/template/wall-e/eval/fixtures/fullstack-app/server/routes/auth.js +0 -69
  407. package/template/wall-e/eval/fixtures/fullstack-app/server/routes/health.js +0 -23
  408. package/template/wall-e/eval/fixtures/fullstack-app/server/routes/items.js +0 -88
  409. package/template/wall-e/eval/fixtures/fullstack-app/server/routes/users.js +0 -75
  410. package/template/wall-e/eval/fixtures/fullstack-app/server/test.js +0 -198
  411. package/template/wall-e/eval/fixtures/fullstack-app/server/utils/response.js +0 -34
  412. package/template/wall-e/eval/fixtures/fullstack-app/server/utils/validate.js +0 -26
  413. package/template/wall-e/eval/fixtures/fullstack-app/server.js +0 -8
  414. package/template/wall-e/eval/fixtures/fullstack-app/test.js +0 -12
  415. package/template/wall-e/eval/fixtures/monorepo-basic/package.json +0 -8
  416. package/template/wall-e/eval/fixtures/monorepo-basic/packages/api/data.js +0 -58
  417. package/template/wall-e/eval/fixtures/monorepo-basic/packages/api/middleware.js +0 -46
  418. package/template/wall-e/eval/fixtures/monorepo-basic/packages/api/package.json +0 -8
  419. package/template/wall-e/eval/fixtures/monorepo-basic/packages/api/routes.js +0 -64
  420. package/template/wall-e/eval/fixtures/monorepo-basic/packages/api/server.js +0 -56
  421. package/template/wall-e/eval/fixtures/monorepo-basic/packages/api/test.js +0 -116
  422. package/template/wall-e/eval/fixtures/monorepo-basic/packages/cli/commands.js +0 -61
  423. package/template/wall-e/eval/fixtures/monorepo-basic/packages/cli/index.js +0 -62
  424. package/template/wall-e/eval/fixtures/monorepo-basic/packages/cli/output.js +0 -43
  425. package/template/wall-e/eval/fixtures/monorepo-basic/packages/cli/package.json +0 -11
  426. package/template/wall-e/eval/fixtures/monorepo-basic/packages/cli/test.js +0 -44
  427. package/template/wall-e/eval/fixtures/monorepo-basic/packages/shared/formatters.js +0 -43
  428. package/template/wall-e/eval/fixtures/monorepo-basic/packages/shared/index.js +0 -12
  429. package/template/wall-e/eval/fixtures/monorepo-basic/packages/shared/package.json +0 -5
  430. package/template/wall-e/eval/fixtures/monorepo-basic/packages/shared/test.js +0 -55
  431. package/template/wall-e/eval/fixtures/monorepo-basic/packages/shared/validators.js +0 -29
  432. package/template/wall-e/eval/fixtures/monorepo-basic/test.js +0 -46
  433. package/template/wall-e/eval/fixtures/node-cli/index.js +0 -78
  434. package/template/wall-e/eval/fixtures/node-cli/package.json +0 -10
  435. package/template/wall-e/eval/fixtures/node-cli/test.js +0 -57
  436. package/template/wall-e/eval/fixtures/node-typed/package.json +0 -8
  437. package/template/wall-e/eval/fixtures/node-typed/src/handlers.js +0 -31
  438. package/template/wall-e/eval/fixtures/node-typed/src/utils.js +0 -33
  439. package/template/wall-e/eval/fixtures/node-typed/test.js +0 -36
  440. package/template/wall-e/eval/fixtures/python-flask/app.py +0 -14
  441. package/template/wall-e/eval/fixtures/python-flask/requirements.txt +0 -2
  442. package/template/wall-e/eval/fixtures/python-flask/test_app.py +0 -25
  443. package/template/wall-e/eval/fixtures/wall-e-subset/brain.js +0 -105
  444. package/template/wall-e/eval/fixtures/wall-e-subset/eval/aggregator.js +0 -101
  445. package/template/wall-e/eval/fixtures/wall-e-subset/eval/benchmarks/chat.json +0 -20
  446. package/template/wall-e/eval/fixtures/wall-e-subset/eval/benchmarks/coding.json +0 -32
  447. package/template/wall-e/eval/fixtures/wall-e-subset/eval/benchmarks.js +0 -64
  448. package/template/wall-e/eval/fixtures/wall-e-subset/eval/fixtures/simple-project/package.json +0 -6
  449. package/template/wall-e/eval/fixtures/wall-e-subset/eval/fixtures/simple-project/server.js +0 -31
  450. package/template/wall-e/eval/fixtures/wall-e-subset/eval/fixtures/simple-project/test.js +0 -18
  451. package/template/wall-e/eval/fixtures/wall-e-subset/eval/fixtures/simple-project/utils.js +0 -34
  452. package/template/wall-e/eval/fixtures/wall-e-subset/eval/runner.js +0 -104
  453. package/template/wall-e/eval/fixtures/wall-e-subset/eval/scorer.js +0 -73
  454. package/template/wall-e/eval/fixtures/wall-e-subset/eval/test.js +0 -134
  455. package/template/wall-e/eval/fixtures/wall-e-subset/llm/client.js +0 -99
  456. package/template/wall-e/eval/fixtures/wall-e-subset/llm/providers.js +0 -63
  457. package/template/wall-e/eval/fixtures/wall-e-subset/llm/test.js +0 -70
  458. package/template/wall-e/eval/fixtures/wall-e-subset/package.json +0 -10
  459. package/template/wall-e/eval/fixtures/wall-e-subset/test.js +0 -86
  460. package/template/wall-e/eval/harvester.js +0 -685
  461. package/template/wall-e/eval/head-to-head.js +0 -388
  462. package/template/wall-e/eval/humaneval-adapter.js +0 -321
  463. package/template/wall-e/eval/list-models.js +0 -31
  464. package/template/wall-e/eval/livecodebench-adapter.js +0 -291
  465. package/template/wall-e/eval/mail-integration.js +0 -443
  466. package/template/wall-e/eval/manifest.js +0 -186
  467. package/template/wall-e/eval/meta-harness/adapters/coding-agent.js +0 -57
  468. package/template/wall-e/eval/meta-harness/bootstrap-snapshot.js +0 -149
  469. package/template/wall-e/eval/meta-harness/candidate-store.js +0 -117
  470. package/template/wall-e/eval/meta-harness/cli.js +0 -86
  471. package/template/wall-e/eval/meta-harness/domain-spec.js +0 -154
  472. package/template/wall-e/eval/meta-harness/domains/coding-agent.domain.json +0 -84
  473. package/template/wall-e/eval/meta-harness/examples/env-bootstrap-candidate.js +0 -29
  474. package/template/wall-e/eval/meta-harness/experience-store.js +0 -174
  475. package/template/wall-e/eval/meta-harness/frontier.js +0 -96
  476. package/template/wall-e/eval/meta-harness/harness-interface.js +0 -90
  477. package/template/wall-e/eval/meta-harness/leakage-guard.js +0 -80
  478. package/template/wall-e/eval/meta-harness/optimizer.js +0 -207
  479. package/template/wall-e/eval/meta-harness/proposer-runner.js +0 -110
  480. package/template/wall-e/eval/meta-harness/reporting.js +0 -58
  481. package/template/wall-e/eval/meta-harness/telemetry.js +0 -27
  482. package/template/wall-e/eval/meta-harness/validation.js +0 -81
  483. package/template/wall-e/eval/promoter.js +0 -228
  484. package/template/wall-e/eval/provider-normalizer.js +0 -33
  485. package/template/wall-e/eval/replay.js +0 -395
  486. package/template/wall-e/eval/run-agent-benchmarks.js +0 -386
  487. package/template/wall-e/eval/run-codex-cli-baseline.js +0 -177
  488. package/template/wall-e/eval/run-coding-agent-real.js +0 -187
  489. package/template/wall-e/eval/run-eval.js +0 -435
  490. package/template/wall-e/eval/run-model-comparison.js +0 -142
  491. package/template/wall-e/eval/session-evaluator.js +0 -187
  492. package/template/wall-e/eval/session-miner.js +0 -207
  493. package/template/wall-e/eval/session-retrieval-benchmark.js +0 -150
  494. package/template/wall-e/eval/session-transcripts.js +0 -509
  495. package/template/wall-e/eval/shadow.js +0 -161
  496. package/template/wall-e/eval/swebench-adapter.js +0 -345
  497. package/template/wall-e/eval/swebench-docker.js +0 -192
  498. package/template/wall-e/eval/train.py +0 -320
  499. package/template/wall-e/eval/trainer.js +0 -232
  500. package/template/wall-e/eval/weekly-eval-loop.js +0 -241
@@ -1,174 +0,0 @@
1
- 'use strict';
2
-
3
- const fs = require('node:fs');
4
- const path = require('node:path');
5
- const crypto = require('node:crypto');
6
-
7
- const SAFE_SEGMENT_RE = /^[A-Za-z0-9][A-Za-z0-9_.:-]*$/;
8
-
9
- class ExperienceStore {
10
- constructor({ domainSpec, rootDir, runName, now = () => new Date().toISOString() } = {}) {
11
- if (!domainSpec || typeof domainSpec !== 'object') throw new Error('domainSpec is required');
12
- this.domainSpec = domainSpec;
13
- this.now = now;
14
- this.rootDir = path.resolve(rootDir || domainSpec.paths?.runRoot || path.join(process.cwd(), 'wall-e', 'eval', 'meta-runs'));
15
- this.runName = safeSegment(runName || `${domainSpec.id}-${timestampForName(now())}`, 'runName');
16
- this.runDir = path.join(this.rootDir, this.runName);
17
- this.paths = {
18
- rootDir: this.rootDir,
19
- runDir: this.runDir,
20
- candidatesDir: path.join(this.runDir, 'candidates'),
21
- iterationsDir: path.join(this.runDir, 'iterations'),
22
- reportsDir: path.join(this.runDir, 'reports'),
23
- telemetryFile: path.join(this.runDir, 'telemetry.jsonl'),
24
- manifestFile: path.join(this.runDir, 'run_manifest.json'),
25
- experienceFile: path.join(this.runDir, 'experience.jsonl'),
26
- frontierFile: path.join(this.runDir, 'frontier.json'),
27
- };
28
- }
29
-
30
- ensureRun(extraManifest = {}) {
31
- for (const dir of [
32
- this.paths.runDir,
33
- this.paths.candidatesDir,
34
- this.paths.iterationsDir,
35
- this.paths.reportsDir,
36
- ]) {
37
- fs.mkdirSync(dir, { recursive: true });
38
- }
39
- if (!fs.existsSync(this.paths.manifestFile)) {
40
- this.writeRunManifest(extraManifest);
41
- }
42
- return this.paths;
43
- }
44
-
45
- writeRunManifest(extra = {}) {
46
- const manifest = {
47
- runName: this.runName,
48
- domainId: this.domainSpec.id,
49
- taskType: this.domainSpec.taskType,
50
- createdAt: this.now(),
51
- domainSpecHash: this.domainSpec.specHash || null,
52
- domainSpecPath: this.domainSpec.specPath || null,
53
- projectRoot: this.domainSpec.paths?.projectRoot || this.domainSpec.projectRoot || null,
54
- ...extra,
55
- };
56
- this.writeJsonAbsolute(this.paths.manifestFile, manifest);
57
- return manifest;
58
- }
59
-
60
- iterationDir(iteration) {
61
- return path.join(this.paths.iterationsDir, `iter-${padIteration(iteration)}`);
62
- }
63
-
64
- candidateDir(iteration, candidateId) {
65
- const dir = path.join(this.iterationDir(iteration), 'candidates', safeSegment(candidateId, 'candidateId'));
66
- fs.mkdirSync(dir, { recursive: true });
67
- return dir;
68
- }
69
-
70
- evalTaskDir(iteration, candidateId, taskId) {
71
- const dir = path.join(this.candidateDir(iteration, candidateId), 'tasks', safeSegment(taskId, 'taskId'));
72
- fs.mkdirSync(dir, { recursive: true });
73
- return dir;
74
- }
75
-
76
- writeCandidateFile(candidateId, fileName, contents) {
77
- const dir = path.join(this.paths.candidatesDir, safeSegment(candidateId, 'candidateId'));
78
- fs.mkdirSync(dir, { recursive: true });
79
- const target = path.join(dir, safeSegment(fileName, 'fileName'));
80
- atomicWrite(target, String(contents || ''));
81
- return target;
82
- }
83
-
84
- writeJson(relativePath, data) {
85
- const target = this.resolveRunPath(relativePath);
86
- this.writeJsonAbsolute(target, data);
87
- return target;
88
- }
89
-
90
- writeJsonAbsolute(target, data) {
91
- atomicWrite(target, JSON.stringify(data, null, 2) + '\n');
92
- }
93
-
94
- readJson(relativePath, fallback = undefined) {
95
- const target = this.resolveRunPath(relativePath);
96
- if (!fs.existsSync(target)) return fallback;
97
- return JSON.parse(fs.readFileSync(target, 'utf8'));
98
- }
99
-
100
- writeText(relativePath, text) {
101
- const target = this.resolveRunPath(relativePath);
102
- atomicWrite(target, String(text || ''));
103
- return target;
104
- }
105
-
106
- appendJsonl(relativePath, row) {
107
- const target = this.resolveRunPath(relativePath);
108
- fs.mkdirSync(path.dirname(target), { recursive: true });
109
- fs.appendFileSync(target, JSON.stringify(row) + '\n', 'utf8');
110
- return target;
111
- }
112
-
113
- appendExperience(row) {
114
- this.ensureRun();
115
- return this.appendJsonl('experience.jsonl', {
116
- timestamp: this.now(),
117
- domainId: this.domainSpec.id,
118
- ...row,
119
- });
120
- }
121
-
122
- readJsonl(relativePath) {
123
- const target = this.resolveRunPath(relativePath);
124
- if (!fs.existsSync(target)) return [];
125
- return fs.readFileSync(target, 'utf8')
126
- .split('\n')
127
- .filter((line) => line.trim())
128
- .map((line) => JSON.parse(line));
129
- }
130
-
131
- resolveRunPath(relativePath) {
132
- if (!relativePath || typeof relativePath !== 'string') throw new Error('relative path is required');
133
- if (path.isAbsolute(relativePath)) throw new Error(`absolute paths are not allowed in run store: ${relativePath}`);
134
- const target = path.resolve(this.runDir, relativePath);
135
- if (!target.startsWith(this.runDir + path.sep) && target !== this.runDir) {
136
- throw new Error(`path escapes run directory: ${relativePath}`);
137
- }
138
- return target;
139
- }
140
- }
141
-
142
- function atomicWrite(target, contents) {
143
- fs.mkdirSync(path.dirname(target), { recursive: true });
144
- const suffix = crypto.randomBytes(6).toString('hex');
145
- const tmp = `${target}.${process.pid}.${suffix}.tmp`;
146
- fs.writeFileSync(tmp, contents, 'utf8');
147
- fs.renameSync(tmp, target);
148
- }
149
-
150
- function padIteration(iteration) {
151
- const n = Number(iteration);
152
- if (!Number.isInteger(n) || n < 0) throw new Error(`invalid iteration: ${iteration}`);
153
- return String(n).padStart(3, '0');
154
- }
155
-
156
- function safeSegment(value, label = 'segment') {
157
- const text = String(value || '');
158
- if (!SAFE_SEGMENT_RE.test(text) || text.includes('..') || /[\\/]/.test(text)) {
159
- throw new Error(`invalid ${label}: ${text || '<empty>'}`);
160
- }
161
- return text;
162
- }
163
-
164
- function timestampForName(iso) {
165
- return String(iso || new Date().toISOString()).replace(/[^0-9T]/g, '').slice(0, 15) || 'run';
166
- }
167
-
168
- module.exports = {
169
- ExperienceStore,
170
- SAFE_SEGMENT_RE,
171
- atomicWrite,
172
- padIteration,
173
- safeSegment,
174
- };
@@ -1,96 +0,0 @@
1
- 'use strict';
2
-
3
- const path = require('node:path');
4
-
5
- function getMetricValue(result, metricPath = 'score.composite') {
6
- const parts = String(metricPath || '').split('.').filter(Boolean);
7
- let value = result;
8
- for (const part of parts) {
9
- if (value == null) return null;
10
- value = value[part];
11
- }
12
- const n = Number(value);
13
- return Number.isFinite(n) ? n : null;
14
- }
15
-
16
- function summarizeCandidateResults({ candidateId, iteration, split, results = [], metricPath = 'score.composite' } = {}) {
17
- const scored = results.map((result) => getMetricValue(result, metricPath)).filter((value) => value !== null);
18
- const aggregateScore = scored.length > 0
19
- ? scored.reduce((sum, value) => sum + value, 0) / scored.length
20
- : 0;
21
- return {
22
- candidateId,
23
- iteration,
24
- split,
25
- metricPath,
26
- aggregateScore,
27
- taskCount: results.length,
28
- scoredTaskCount: scored.length,
29
- successCount: results.filter((result) => result.success === true).length,
30
- results: results.map((result) => ({
31
- benchmarkId: result.benchmarkId || result.taskId || null,
32
- success: result.success === true,
33
- score: getMetricValue(result, metricPath),
34
- artifactPath: result.artifactPath || null,
35
- error: result.error || null,
36
- })),
37
- };
38
- }
39
-
40
- function updateFrontier(frontier = {}, candidateSummary = {}, domainSpec = {}) {
41
- const higherIsBetter = domainSpec.metrics?.higherIsBetter !== false;
42
- const current = normalizeFrontier(frontier);
43
- const candidateScore = Number(candidateSummary.aggregateScore || 0);
44
- const bestScore = current.bestCandidate ? Number(current.bestCandidate.aggregateScore || 0) : null;
45
- const improves = bestScore === null || (higherIsBetter ? candidateScore > bestScore : candidateScore < bestScore);
46
- const next = {
47
- ...current,
48
- metricPath: candidateSummary.metricPath || current.metricPath || domainSpec.metrics?.primary || 'score.composite',
49
- updatedAt: new Date().toISOString(),
50
- history: [...current.history, candidateSummary],
51
- };
52
- if (improves) next.bestCandidate = candidateSummary;
53
- for (const result of candidateSummary.results || []) {
54
- if (!result.benchmarkId || result.score === null) continue;
55
- const existing = next.perTask[result.benchmarkId];
56
- const resultImproves = !existing || (higherIsBetter ? result.score > existing.score : result.score < existing.score);
57
- if (resultImproves) {
58
- next.perTask[result.benchmarkId] = {
59
- candidateId: candidateSummary.candidateId,
60
- iteration: candidateSummary.iteration,
61
- score: result.score,
62
- success: result.success,
63
- artifactPath: result.artifactPath,
64
- };
65
- }
66
- }
67
- return next;
68
- }
69
-
70
- function normalizeFrontier(frontier = {}) {
71
- return {
72
- metricPath: frontier.metricPath || 'score.composite',
73
- bestCandidate: frontier.bestCandidate || null,
74
- perTask: { ...(frontier.perTask || {}) },
75
- history: Array.isArray(frontier.history) ? frontier.history.slice() : [],
76
- updatedAt: frontier.updatedAt || null,
77
- };
78
- }
79
-
80
- function readFrontier(store) {
81
- return normalizeFrontier(store.readJson('frontier.json', {}));
82
- }
83
-
84
- function writeFrontier(store, frontier) {
85
- store.writeJson('frontier.json', normalizeFrontier(frontier));
86
- return path.join(store.runDir, 'frontier.json');
87
- }
88
-
89
- module.exports = {
90
- getMetricValue,
91
- summarizeCandidateResults,
92
- updateFrontier,
93
- normalizeFrontier,
94
- readFrontier,
95
- writeFrontier,
96
- };
@@ -1,90 +0,0 @@
1
- 'use strict';
2
-
3
- const { validateCandidateModule } = require('./validation');
4
-
5
- const BASE_HARNESS_VERSION = 'walle-meta-harness-v1';
6
-
7
- function createBaseHarness(overrides = {}) {
8
- const base = {
9
- id: 'walle-coding-agent-base',
10
- version: BASE_HARNESS_VERSION,
11
- surfaces: {
12
- systemPromptPrelude: '',
13
- contextBuilder: null,
14
- toolPolicy: null,
15
- environmentBootstrap: null,
16
- runtimeOptions: {},
17
- },
18
- transformPrompt(prompt, context = {}) {
19
- const prelude = String(this.surfaces.systemPromptPrelude || '').trim();
20
- const env = typeof this.surfaces.environmentBootstrap === 'function'
21
- ? String(this.surfaces.environmentBootstrap(context) || '').trim()
22
- : String(this.surfaces.environmentBootstrap || '').trim();
23
- return [prelude, env, String(prompt || '')].filter(Boolean).join('\n\n');
24
- },
25
- buildRunOptions(context = {}) {
26
- return {
27
- ...(context.options || {}),
28
- ...(this.surfaces.runtimeOptions || {}),
29
- };
30
- },
31
- };
32
- return mergeHarness(base, overrides);
33
- }
34
-
35
- function applyCandidate(baseHarness, candidateModule, context = {}) {
36
- const candidate = validateCandidateModule(candidateModule, context);
37
- const inputHarness = mergeHarness(createBaseHarness(), baseHarness || {});
38
- const applied = candidate.apply(inputHarness, context);
39
- if (!applied || typeof applied !== 'object') {
40
- throw new Error(`Candidate ${candidate.manifest.id} apply() must return a harness object`);
41
- }
42
- const harness = mergeHarness(inputHarness, applied);
43
- validateHarness(harness);
44
- return {
45
- ...harness,
46
- candidateManifest: candidate.manifest,
47
- candidateId: candidate.manifest.id,
48
- };
49
- }
50
-
51
- function normalizeCandidateModule(candidateModule) {
52
- if (!candidateModule || typeof candidateModule !== 'object') return candidateModule;
53
- return candidateModule.default || candidateModule.candidate || candidateModule;
54
- }
55
-
56
- function validateHarness(harness) {
57
- if (!harness || typeof harness !== 'object') throw new Error('harness must be an object');
58
- if (typeof harness.id !== 'string' || !harness.id) throw new Error('harness.id is required');
59
- if (!harness.surfaces || typeof harness.surfaces !== 'object') throw new Error('harness.surfaces is required');
60
- if (typeof harness.transformPrompt !== 'function') throw new Error('harness.transformPrompt must be a function');
61
- if (typeof harness.buildRunOptions !== 'function') throw new Error('harness.buildRunOptions must be a function');
62
- return true;
63
- }
64
-
65
- function mergeHarness(base, overlay) {
66
- const merged = {
67
- ...base,
68
- ...overlay,
69
- surfaces: {
70
- ...(base.surfaces || {}),
71
- ...(overlay.surfaces || {}),
72
- runtimeOptions: {
73
- ...(base.surfaces?.runtimeOptions || {}),
74
- ...(overlay.surfaces?.runtimeOptions || {}),
75
- },
76
- },
77
- };
78
- if (!overlay.transformPrompt && base.transformPrompt) merged.transformPrompt = base.transformPrompt;
79
- if (!overlay.buildRunOptions && base.buildRunOptions) merged.buildRunOptions = base.buildRunOptions;
80
- return merged;
81
- }
82
-
83
- module.exports = {
84
- BASE_HARNESS_VERSION,
85
- createBaseHarness,
86
- applyCandidate,
87
- normalizeCandidateModule,
88
- validateHarness,
89
- mergeHarness,
90
- };
@@ -1,80 +0,0 @@
1
- 'use strict';
2
-
3
- const fs = require('node:fs');
4
- const path = require('node:path');
5
-
6
- function getHeldoutTaskIds(domainSpec = {}) {
7
- return new Set(domainSpec.datasets?.heldout?.taskIds || []);
8
- }
9
-
10
- function scanCandidateSourceForLeakage(source, domainSpec = {}) {
11
- const text = String(source || '');
12
- const violations = [];
13
- for (const taskId of getHeldoutTaskIds(domainSpec)) {
14
- if (taskId && text.includes(taskId)) violations.push({ type: 'heldout_task_id', value: taskId });
15
- }
16
- if (/held[-_]?out/i.test(text) && domainSpec.leakagePolicy?.proposerCanReadHeldout === false) {
17
- violations.push({ type: 'heldout_reference', value: 'heldout' });
18
- }
19
- return violations;
20
- }
21
-
22
- function assertNoHeldoutLeakage({ source, domainSpec, candidateId = 'candidate' } = {}) {
23
- const violations = scanCandidateSourceForLeakage(source, domainSpec);
24
- if (violations.length > 0) {
25
- const details = violations.map((violation) => `${violation.type}:${violation.value}`).join(', ');
26
- throw new Error(`${candidateId} failed held-out leakage scan: ${details}`);
27
- }
28
- return true;
29
- }
30
-
31
- function assertProposerPathAllowed(filePath, { store, domainSpec } = {}) {
32
- const absolute = path.resolve(filePath);
33
- const runDir = path.resolve(store.runDir);
34
- if (!absolute.startsWith(runDir + path.sep) && absolute !== runDir) {
35
- throw new Error(`proposer path escapes run directory: ${filePath}`);
36
- }
37
- const rel = path.relative(runDir, absolute).split(path.sep).join('/');
38
- if (domainSpec?.leakagePolicy?.proposerCanReadHeldout === false && /(^|\/)heldout(\/|$)/i.test(rel)) {
39
- throw new Error(`proposer path is held-out protected: ${rel}`);
40
- }
41
- for (const taskId of getHeldoutTaskIds(domainSpec)) {
42
- if (rel.includes(taskId)) throw new Error(`proposer path includes held-out task id ${taskId}: ${rel}`);
43
- }
44
- return true;
45
- }
46
-
47
- function listProposerVisibleArtifacts({ store, domainSpec } = {}) {
48
- const out = [];
49
- walk(store.runDir, (filePath) => {
50
- try {
51
- assertProposerPathAllowed(filePath, { store, domainSpec });
52
- out.push(filePath);
53
- } catch {
54
- // Hidden from proposer context.
55
- }
56
- });
57
- return out.sort();
58
- }
59
-
60
- function walk(dir, onFile) {
61
- let entries = [];
62
- try {
63
- entries = fs.readdirSync(dir, { withFileTypes: true });
64
- } catch {
65
- return;
66
- }
67
- for (const entry of entries) {
68
- const full = path.join(dir, entry.name);
69
- if (entry.isDirectory()) walk(full, onFile);
70
- else if (entry.isFile()) onFile(full);
71
- }
72
- }
73
-
74
- module.exports = {
75
- getHeldoutTaskIds,
76
- scanCandidateSourceForLeakage,
77
- assertNoHeldoutLeakage,
78
- assertProposerPathAllowed,
79
- listProposerVisibleArtifacts,
80
- };
@@ -1,207 +0,0 @@
1
- 'use strict';
2
-
3
- const path = require('node:path');
4
-
5
- const { ExperienceStore, padIteration, safeSegment } = require('./experience-store');
6
- const { CandidateStore } = require('./candidate-store');
7
- const { createBaseHarness, applyCandidate } = require('./harness-interface');
8
- const { recordMetaHarnessTelemetry } = require('./telemetry');
9
- const {
10
- readFrontier,
11
- summarizeCandidateResults,
12
- updateFrontier,
13
- writeFrontier,
14
- } = require('./frontier');
15
- const { writeIterationReport, writeRunSummary } = require('./reporting');
16
-
17
- async function runMetaHarness({
18
- domainSpec,
19
- store = null,
20
- adapter,
21
- proposer = null,
22
- candidateStore = null,
23
- maxIterations = null,
24
- evaluateBaseline = true,
25
- } = {}) {
26
- if (!domainSpec) throw new Error('domainSpec is required');
27
- if (!adapter || typeof adapter.runTask !== 'function') throw new Error('adapter.runTask is required');
28
- const runStore = store || new ExperienceStore({ domainSpec });
29
- runStore.ensureRun({ optimizer: 'meta-harness' });
30
- const candidates = candidateStore || new CandidateStore({ store: runStore, domainSpec });
31
- let frontier = readFrontier(runStore);
32
- const summaries = [];
33
- recordMetaHarnessTelemetry(runStore, {
34
- type: 'optimizer_start',
35
- domainId: domainSpec.id,
36
- maxIterations: maxIterations || domainSpec.budget?.maxIterations || 1,
37
- });
38
-
39
- if (evaluateBaseline && frontier.history.length === 0) {
40
- const baselineSummary = await evaluateCandidate({
41
- store: runStore,
42
- adapter,
43
- domainSpec,
44
- iteration: 0,
45
- candidateId: 'baseline',
46
- harness: createBaseHarness(),
47
- split: 'search',
48
- });
49
- frontier = updateFrontier(frontier, baselineSummary, domainSpec);
50
- writeFrontier(runStore, frontier);
51
- summaries.push(baselineSummary);
52
- }
53
-
54
- const iterations = Math.min(
55
- Number(maxIterations || domainSpec.budget?.maxIterations || 1),
56
- Number(domainSpec.budget?.maxIterations || maxIterations || 1),
57
- );
58
- for (let iteration = 1; iteration <= iterations; iteration++) {
59
- const iterationSummaries = [];
60
- const proposals = await resolveProposals(proposer, { iteration, store: runStore, domainSpec, frontier });
61
- const limited = proposals.slice(0, Number(domainSpec.budget?.maxCandidatesPerIteration || proposals.length || 1));
62
- for (const proposal of limited) {
63
- const registered = candidates.registerCandidate({ ...proposal, iteration });
64
- let harness;
65
- try {
66
- harness = applyCandidate(createBaseHarness(), registered.module, { domainSpec, iteration, store: runStore });
67
- } catch (err) {
68
- runStore.appendExperience({ event: 'candidate_rejected', candidateId: registered.id, iteration, error: err.message });
69
- continue;
70
- }
71
-
72
- const smokeSummary = await evaluateCandidate({
73
- store: runStore,
74
- adapter,
75
- domainSpec,
76
- iteration,
77
- candidateId: registered.id,
78
- harness,
79
- split: 'smoke',
80
- });
81
- summaries.push(smokeSummary);
82
- iterationSummaries.push(smokeSummary);
83
- const minimum = Number(domainSpec.metrics?.minimumViableScore ?? 0);
84
- if (smokeSummary.aggregateScore < minimum) {
85
- runStore.appendExperience({
86
- event: 'candidate_smoke_rejected',
87
- candidateId: registered.id,
88
- iteration,
89
- aggregateScore: smokeSummary.aggregateScore,
90
- minimum,
91
- });
92
- continue;
93
- }
94
-
95
- const searchSummary = await evaluateCandidate({
96
- store: runStore,
97
- adapter,
98
- domainSpec,
99
- iteration,
100
- candidateId: registered.id,
101
- harness,
102
- split: 'search',
103
- });
104
- frontier = updateFrontier(frontier, searchSummary, domainSpec);
105
- writeFrontier(runStore, frontier);
106
- summaries.push(searchSummary);
107
- iterationSummaries.push(searchSummary);
108
- }
109
- writeIterationReport({ store: runStore, iteration, frontier, summaries: iterationSummaries });
110
- }
111
-
112
- writeRunSummary({ store: runStore, frontier, summaries });
113
- recordMetaHarnessTelemetry(runStore, {
114
- type: 'optimizer_complete',
115
- domainId: domainSpec.id,
116
- bestCandidate: frontier.bestCandidate?.candidateId || null,
117
- bestScore: frontier.bestCandidate?.aggregateScore ?? null,
118
- summaryCount: summaries.length,
119
- });
120
- return { store: runStore, frontier, summaries };
121
- }
122
-
123
- async function evaluateCandidate({ store, adapter, domainSpec, iteration, candidateId, harness, split = 'search' } = {}) {
124
- const dataset = domainSpec.datasets?.[split];
125
- if (!dataset) throw new Error(`Unknown dataset split: ${split}`);
126
- const maxTasks = Number(domainSpec.budget?.maxTasksPerCandidate || dataset.taskIds.length);
127
- const taskIds = dataset.taskIds.slice(0, maxTasks);
128
- const results = [];
129
- for (const taskId of taskIds) {
130
- const artifactDir = store.evalTaskDir(iteration, candidateId, taskId);
131
- const result = await adapter.runTask({
132
- taskId,
133
- candidateId,
134
- harness,
135
- artifactDir,
136
- split,
137
- domainSpec,
138
- });
139
- const normalized = {
140
- taskId,
141
- benchmarkId: result.benchmarkId || taskId,
142
- ...result,
143
- artifactPath: result.artifactPath || artifactDir,
144
- };
145
- results.push(normalized);
146
- store.appendExperience({
147
- event: 'task_evaluated',
148
- candidateId,
149
- iteration,
150
- split,
151
- taskId,
152
- success: normalized.success === true,
153
- score: normalized.score?.composite ?? null,
154
- artifactPath: normalized.artifactPath,
155
- error: normalized.error || null,
156
- });
157
- recordMetaHarnessTelemetry(store, {
158
- type: 'task_evaluated',
159
- candidateId,
160
- iteration,
161
- split,
162
- taskId,
163
- success: normalized.success === true,
164
- score: normalized.score?.composite ?? null,
165
- });
166
- }
167
- const summary = summarizeCandidateResults({
168
- candidateId,
169
- iteration,
170
- split,
171
- results,
172
- metricPath: domainSpec.metrics?.primary || 'score.composite',
173
- });
174
- const summaryPath = path.join(
175
- 'iterations',
176
- `iter-${padIteration(iteration)}`,
177
- 'candidates',
178
- safeSegment(candidateId, 'candidateId'),
179
- `${safeSegment(split, 'split')}_summary.json`,
180
- );
181
- store.writeJson(summaryPath, summary);
182
- store.appendExperience({
183
- event: 'candidate_evaluated',
184
- candidateId,
185
- iteration,
186
- split,
187
- aggregateScore: summary.aggregateScore,
188
- taskCount: summary.taskCount,
189
- successCount: summary.successCount,
190
- });
191
- return summary;
192
- }
193
-
194
- async function resolveProposals(proposer, context) {
195
- if (!proposer) return [];
196
- const proposals = typeof proposer === 'function'
197
- ? await proposer(context)
198
- : await proposer.propose(context);
199
- if (!Array.isArray(proposals)) throw new Error('proposer must return an array of candidate proposals');
200
- return proposals;
201
- }
202
-
203
- module.exports = {
204
- runMetaHarness,
205
- evaluateCandidate,
206
- resolveProposals,
207
- };