vellum 0.2.0 → 0.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (361) hide show
  1. package/README.md +15 -2
  2. package/bun.lock +5 -2
  3. package/package.json +4 -2
  4. package/scripts/capture-x-graphql.ts +562 -0
  5. package/scripts/ipc/check-swift-decoder-drift.ts +2 -1
  6. package/scripts/test.sh +5 -0
  7. package/src/__tests__/__snapshots__/ipc-snapshot.test.ts.snap +161 -34
  8. package/src/__tests__/account-registry.test.ts +2 -1
  9. package/src/__tests__/agent-heartbeat-service.test.ts +250 -0
  10. package/src/__tests__/app-bundler.test.ts +12 -33
  11. package/src/__tests__/asset-materialize-tool.test.ts +16 -15
  12. package/src/__tests__/asset-search-tool.test.ts +23 -22
  13. package/src/__tests__/attachments-store.test.ts +56 -127
  14. package/src/__tests__/browser-skill-baseline-tool-payload.test.ts +5 -4
  15. package/src/__tests__/browser-skill-endstate.test.ts +5 -8
  16. package/src/__tests__/call-bridge.test.ts +385 -0
  17. package/src/__tests__/call-constants.test.ts +40 -0
  18. package/src/__tests__/call-orchestrator.test.ts +454 -0
  19. package/src/__tests__/call-recovery.test.ts +518 -0
  20. package/src/__tests__/call-routes-http.test.ts +459 -0
  21. package/src/__tests__/call-state-machine.test.ts +143 -0
  22. package/src/__tests__/call-state.test.ts +133 -0
  23. package/src/__tests__/call-store.test.ts +691 -0
  24. package/src/__tests__/cli-discover.test.ts +1 -1
  25. package/src/__tests__/commit-message-enrichment-service.test.ts +550 -0
  26. package/src/__tests__/compaction.benchmark.test.ts +176 -0
  27. package/src/__tests__/computer-use-tools.test.ts +250 -0
  28. package/src/__tests__/config-schema.test.ts +348 -3
  29. package/src/__tests__/conflict-store.test.ts +2 -1
  30. package/src/__tests__/contacts-tools.test.ts +331 -0
  31. package/src/__tests__/conversation-store.test.ts +30 -32
  32. package/src/__tests__/credential-security-invariants.test.ts +4 -0
  33. package/src/__tests__/date-context.test.ts +373 -0
  34. package/src/__tests__/db-schedule-syntax-migration.test.ts +129 -0
  35. package/src/__tests__/doordash-session.test.ts +9 -0
  36. package/src/__tests__/fixtures/media-reuse-fixtures.ts +3 -3
  37. package/src/__tests__/followup-tools.test.ts +303 -0
  38. package/src/__tests__/handlers-twitter-config.test.ts +718 -0
  39. package/src/__tests__/intent-routing.test.ts +64 -57
  40. package/src/__tests__/ipc-roundtrip.benchmark.test.ts +237 -0
  41. package/src/__tests__/ipc-snapshot.test.ts +96 -28
  42. package/src/__tests__/llm-usage-store.test.ts +3 -8
  43. package/src/__tests__/media-generate-image.test.ts +1 -1
  44. package/src/__tests__/media-reuse-story.e2e.test.ts +7 -7
  45. package/src/__tests__/memory-retrieval.benchmark.test.ts +430 -0
  46. package/src/__tests__/parallel-tool.benchmark.test.ts +294 -0
  47. package/src/__tests__/playbook-tools.test.ts +342 -0
  48. package/src/__tests__/profile-compiler.test.ts +2 -1
  49. package/src/__tests__/provider-streaming.benchmark.test.ts +773 -0
  50. package/src/__tests__/recurrence-engine-rruleset.test.ts +78 -0
  51. package/src/__tests__/recurrence-engine.test.ts +69 -0
  52. package/src/__tests__/recurrence-types.test.ts +71 -0
  53. package/src/__tests__/registry.test.ts +17 -10
  54. package/src/__tests__/relay-server.test.ts +633 -0
  55. package/src/__tests__/reminder-store.test.ts +6 -3
  56. package/src/__tests__/reminder.test.ts +43 -77
  57. package/src/__tests__/run-orchestrator-assistant-events.test.ts +222 -0
  58. package/src/__tests__/run-orchestrator.test.ts +7 -7
  59. package/src/__tests__/runtime-attachment-metadata.test.ts +19 -20
  60. package/src/__tests__/runtime-runs-http.test.ts +5 -23
  61. package/src/__tests__/runtime-runs.test.ts +11 -11
  62. package/src/__tests__/schedule-store.test.ts +482 -0
  63. package/src/__tests__/schedule-tools.test.ts +700 -0
  64. package/src/__tests__/scheduler-recurrence.test.ts +329 -0
  65. package/src/__tests__/server-history-render.test.ts +14 -13
  66. package/src/__tests__/session-error.test.ts +28 -0
  67. package/src/__tests__/session-init.benchmark.test.ts +462 -0
  68. package/src/__tests__/session-queue.test.ts +89 -16
  69. package/src/__tests__/session-runtime-assembly.test.ts +161 -0
  70. package/src/__tests__/session-surfaces-task-progress.test.ts +104 -0
  71. package/src/__tests__/signup-e2e.test.ts +2 -1
  72. package/src/__tests__/skill-projection.benchmark.test.ts +328 -0
  73. package/src/__tests__/skill-script-runner.test.ts +159 -0
  74. package/src/__tests__/speaker-identification.test.ts +52 -0
  75. package/src/__tests__/subagent-manager-notify.test.ts +42 -10
  76. package/src/__tests__/subagent-tools.test.ts +141 -41
  77. package/src/__tests__/task-compiler.test.ts +2 -1
  78. package/src/__tests__/task-runner.test.ts +2 -1
  79. package/src/__tests__/task-scheduler.test.ts +2 -1
  80. package/src/__tests__/task-tools.test.ts +49 -56
  81. package/src/__tests__/tool-audit-listener.test.ts +1 -0
  82. package/src/__tests__/tool-domain-event-publisher.test.ts +2 -0
  83. package/src/__tests__/tool-execution-pipeline.benchmark.test.ts +500 -0
  84. package/src/__tests__/tool-executor.test.ts +13 -17
  85. package/src/__tests__/turn-commit.test.ts +273 -2
  86. package/src/__tests__/twilio-provider.test.ts +143 -0
  87. package/src/__tests__/twilio-routes.test.ts +789 -0
  88. package/src/__tests__/twitter-auth-handler.test.ts +581 -0
  89. package/src/__tests__/view-image-tool.test.ts +217 -0
  90. package/src/__tests__/workspace-git-service.test.ts +403 -0
  91. package/src/__tests__/workspace-heartbeat-service.test.ts +141 -2
  92. package/src/agent-heartbeat/agent-heartbeat-service.ts +155 -0
  93. package/src/bundler/app-bundler.ts +35 -14
  94. package/src/calls/call-bridge.ts +95 -0
  95. package/src/calls/call-constants.ts +48 -0
  96. package/src/calls/call-domain.ts +276 -0
  97. package/src/calls/call-orchestrator.ts +390 -0
  98. package/src/calls/call-recovery.ts +207 -0
  99. package/src/calls/call-state-machine.ts +68 -0
  100. package/src/calls/call-state.ts +64 -0
  101. package/src/calls/call-store.ts +416 -0
  102. package/src/calls/relay-server.ts +335 -0
  103. package/src/calls/speaker-identification.ts +213 -0
  104. package/src/calls/twilio-config.ts +34 -0
  105. package/src/calls/twilio-provider.ts +173 -0
  106. package/src/calls/twilio-routes.ts +250 -0
  107. package/src/calls/types.ts +37 -0
  108. package/src/calls/voice-provider.ts +14 -0
  109. package/src/cli/config-commands.ts +334 -0
  110. package/src/cli/core-commands.ts +776 -0
  111. package/src/cli/doordash.ts +256 -25
  112. package/src/cli/ipc-client.ts +82 -0
  113. package/src/cli/map.ts +246 -0
  114. package/src/cli/twitter.ts +575 -0
  115. package/src/cli.ts +7 -5
  116. package/src/commands/__tests__/cc-command-registry.test.ts +319 -0
  117. package/src/commands/cc-command-registry.ts +209 -0
  118. package/src/config/bundled-skills/contacts/SKILL.md +39 -0
  119. package/src/config/bundled-skills/contacts/TOOLS.json +122 -0
  120. package/src/config/bundled-skills/contacts/tools/contact-merge.ts +9 -0
  121. package/src/config/bundled-skills/contacts/tools/contact-search.ts +9 -0
  122. package/src/config/bundled-skills/contacts/tools/contact-upsert.ts +9 -0
  123. package/src/config/bundled-skills/document/SKILL.md +18 -0
  124. package/src/config/bundled-skills/document/TOOLS.json +53 -0
  125. package/src/config/bundled-skills/document/tools/document-create.ts +9 -0
  126. package/src/config/bundled-skills/document/tools/document-update.ts +9 -0
  127. package/src/config/bundled-skills/doordash/SKILL.md +163 -0
  128. package/src/config/bundled-skills/followups/SKILL.md +32 -0
  129. package/src/config/bundled-skills/followups/TOOLS.json +100 -0
  130. package/src/config/bundled-skills/followups/tools/followup-create.ts +9 -0
  131. package/src/config/bundled-skills/followups/tools/followup-list.ts +9 -0
  132. package/src/config/bundled-skills/followups/tools/followup-resolve.ts +9 -0
  133. package/src/config/bundled-skills/image-studio/TOOLS.json +2 -2
  134. package/src/config/bundled-skills/image-studio/tools/media-generate-image.ts +2 -24
  135. package/src/config/bundled-skills/messaging/tools/messaging-analyze-style.ts +2 -1
  136. package/src/config/bundled-skills/playbooks/SKILL.md +31 -0
  137. package/src/config/bundled-skills/playbooks/TOOLS.json +126 -0
  138. package/src/config/bundled-skills/playbooks/tools/playbook-create.ts +9 -0
  139. package/src/config/bundled-skills/playbooks/tools/playbook-delete.ts +9 -0
  140. package/src/config/bundled-skills/playbooks/tools/playbook-list.ts +9 -0
  141. package/src/config/bundled-skills/playbooks/tools/playbook-update.ts +9 -0
  142. package/src/config/bundled-skills/reminder/SKILL.md +20 -0
  143. package/src/config/bundled-skills/reminder/TOOLS.json +67 -0
  144. package/src/config/bundled-skills/reminder/tools/reminder-cancel.ts +9 -0
  145. package/src/config/bundled-skills/reminder/tools/reminder-create.ts +9 -0
  146. package/src/config/bundled-skills/reminder/tools/reminder-list.ts +9 -0
  147. package/src/config/bundled-skills/schedule/SKILL.md +74 -0
  148. package/src/config/bundled-skills/schedule/TOOLS.json +135 -0
  149. package/src/config/bundled-skills/schedule/tools/schedule-create.ts +9 -0
  150. package/src/config/bundled-skills/schedule/tools/schedule-delete.ts +9 -0
  151. package/src/config/bundled-skills/schedule/tools/schedule-list.ts +9 -0
  152. package/src/config/bundled-skills/schedule/tools/schedule-update.ts +9 -0
  153. package/src/config/bundled-skills/subagent/SKILL.md +25 -0
  154. package/src/config/bundled-skills/subagent/TOOLS.json +107 -0
  155. package/src/config/bundled-skills/subagent/tools/subagent-abort.ts +9 -0
  156. package/src/config/bundled-skills/subagent/tools/subagent-message.ts +9 -0
  157. package/src/config/bundled-skills/subagent/tools/subagent-read.ts +9 -0
  158. package/src/config/bundled-skills/subagent/tools/subagent-spawn.ts +9 -0
  159. package/src/config/bundled-skills/subagent/tools/subagent-status.ts +9 -0
  160. package/src/config/bundled-skills/tasks/SKILL.md +28 -0
  161. package/src/config/bundled-skills/tasks/TOOLS.json +256 -0
  162. package/src/config/bundled-skills/tasks/tools/task-delete.ts +9 -0
  163. package/src/config/bundled-skills/tasks/tools/task-list-add.ts +9 -0
  164. package/src/config/bundled-skills/tasks/tools/task-list-remove.ts +9 -0
  165. package/src/config/bundled-skills/tasks/tools/task-list-show.ts +9 -0
  166. package/src/config/bundled-skills/tasks/tools/task-list-update.ts +9 -0
  167. package/src/config/bundled-skills/tasks/tools/task-list.ts +9 -0
  168. package/src/config/bundled-skills/tasks/tools/task-run.ts +9 -0
  169. package/src/config/bundled-skills/tasks/tools/task-save.ts +9 -0
  170. package/src/config/bundled-skills/twitter/SKILL.md +134 -0
  171. package/src/config/bundled-skills/watcher/SKILL.md +27 -0
  172. package/src/config/bundled-skills/watcher/TOOLS.json +147 -0
  173. package/src/config/bundled-skills/watcher/tools/watcher-create.ts +9 -0
  174. package/src/config/bundled-skills/watcher/tools/watcher-delete.ts +9 -0
  175. package/src/config/bundled-skills/watcher/tools/watcher-digest.ts +9 -0
  176. package/src/config/bundled-skills/watcher/tools/watcher-list.ts +9 -0
  177. package/src/config/bundled-skills/watcher/tools/watcher-update.ts +9 -0
  178. package/src/config/defaults.ts +44 -0
  179. package/src/config/loader.ts +4 -1
  180. package/src/config/schema.ts +218 -1
  181. package/src/config/system-prompt.ts +100 -6
  182. package/src/config/templates/IDENTITY.md +7 -0
  183. package/src/config/types.ts +5 -0
  184. package/src/contacts/contact-store.ts +4 -4
  185. package/src/daemon/assistant-attachments.ts +10 -0
  186. package/src/daemon/classifier.ts +3 -1
  187. package/src/daemon/computer-use-session.ts +3 -1
  188. package/src/daemon/date-context.ts +136 -0
  189. package/src/daemon/handlers/apps.ts +16 -1
  190. package/src/daemon/handlers/browser.ts +54 -0
  191. package/src/daemon/handlers/computer-use.ts +7 -1
  192. package/src/daemon/handlers/config.ts +192 -4
  193. package/src/daemon/handlers/diagnostics.ts +5 -1
  194. package/src/daemon/handlers/documents.ts +18 -29
  195. package/src/daemon/handlers/home-base.ts +5 -1
  196. package/src/daemon/handlers/index.ts +40 -271
  197. package/src/daemon/handlers/misc.ts +9 -1
  198. package/src/daemon/handlers/publish.ts +6 -1
  199. package/src/daemon/handlers/sessions.ts +65 -12
  200. package/src/daemon/handlers/shared.ts +36 -1
  201. package/src/daemon/handlers/signing.ts +37 -0
  202. package/src/daemon/handlers/skills.ts +20 -6
  203. package/src/daemon/handlers/subagents.ts +8 -3
  204. package/src/daemon/handlers/twitter-auth.ts +169 -0
  205. package/src/daemon/handlers/work-items.ts +495 -39
  206. package/src/daemon/ipc-contract-inventory.json +40 -4
  207. package/src/daemon/ipc-contract.ts +185 -37
  208. package/src/daemon/ipc-protocol.ts +7 -2
  209. package/src/daemon/lifecycle.ts +48 -5
  210. package/src/daemon/main.ts +10 -4
  211. package/src/daemon/ride-shotgun-handler.ts +74 -10
  212. package/src/daemon/server.ts +144 -29
  213. package/src/daemon/session-agent-loop.ts +887 -0
  214. package/src/daemon/session-attachments.ts +28 -5
  215. package/src/daemon/session-error.ts +24 -3
  216. package/src/daemon/session-lifecycle.ts +147 -0
  217. package/src/daemon/session-media-retry.ts +147 -0
  218. package/src/daemon/session-messaging.ts +145 -0
  219. package/src/daemon/session-notifiers.ts +164 -0
  220. package/src/daemon/session-process.ts +2 -2
  221. package/src/daemon/session-queue-manager.ts +1 -0
  222. package/src/daemon/session-runtime-assembly.ts +52 -0
  223. package/src/daemon/session-skill-tools.ts +124 -5
  224. package/src/daemon/session-slash.ts +3 -0
  225. package/src/daemon/session-surfaces.ts +77 -2
  226. package/src/daemon/session-tool-setup.ts +222 -2
  227. package/src/daemon/session-usage.ts +0 -2
  228. package/src/daemon/session.ts +114 -1365
  229. package/src/daemon/video-thumbnail.ts +60 -0
  230. package/src/doordash/client.ts +121 -27
  231. package/src/doordash/queries.ts +1 -2
  232. package/src/export/formatter.ts +3 -1
  233. package/src/followups/followup-store.ts +4 -2
  234. package/src/followups/types.ts +6 -0
  235. package/src/hooks/templates.ts +1 -1
  236. package/src/index.ts +32 -1151
  237. package/src/media/gemini-image-service.ts +1 -1
  238. package/src/memory/attachments-store.ts +28 -83
  239. package/src/memory/channel-delivery-store.ts +7 -21
  240. package/src/memory/clarification-resolver.ts +6 -5
  241. package/src/memory/contradiction-checker.ts +3 -2
  242. package/src/memory/conversation-key-store.ts +10 -29
  243. package/src/memory/conversation-store.ts +2 -1
  244. package/src/memory/db.ts +362 -2
  245. package/src/memory/entity-extractor.ts +6 -3
  246. package/src/memory/items-extractor.ts +5 -4
  247. package/src/memory/jobs-store.ts +3 -2
  248. package/src/memory/llm-usage-store.ts +1 -2
  249. package/src/memory/runs-store.ts +1 -2
  250. package/src/memory/schema.ts +65 -2
  251. package/src/messaging/style-analyzer.ts +3 -2
  252. package/src/messaging/thread-summarizer.ts +8 -12
  253. package/src/messaging/triage-engine.ts +4 -2
  254. package/src/providers/openrouter/client.ts +20 -0
  255. package/src/providers/registry.ts +8 -0
  256. package/src/runtime/http-server.ts +277 -25
  257. package/src/runtime/http-types.ts +0 -2
  258. package/src/runtime/routes/attachment-routes.ts +5 -6
  259. package/src/runtime/routes/call-routes.ts +140 -0
  260. package/src/runtime/routes/channel-routes.ts +12 -19
  261. package/src/runtime/routes/conversation-routes.ts +5 -9
  262. package/src/runtime/routes/run-routes.ts +4 -8
  263. package/src/runtime/run-orchestrator.ts +39 -6
  264. package/src/schedule/recurrence-engine.ts +138 -0
  265. package/src/schedule/recurrence-types.ts +67 -0
  266. package/src/schedule/schedule-store.ts +102 -57
  267. package/src/schedule/scheduler.ts +9 -6
  268. package/src/security/oauth2.ts +29 -4
  269. package/src/security/secret-allowlist.ts +46 -0
  270. package/src/skills/clawhub.ts +1 -1
  271. package/src/subagent/manager.ts +40 -8
  272. package/src/swarm/backend-claude-code.ts +64 -9
  273. package/src/swarm/worker-prompts.ts +2 -1
  274. package/src/tasks/SPEC.md +34 -28
  275. package/src/tasks/ephemeral-permissions.ts +16 -7
  276. package/src/tasks/task-compiler.ts +5 -4
  277. package/src/tasks/task-runner.ts +10 -5
  278. package/src/tasks/task-scheduler.ts +1 -1
  279. package/src/tasks/tool-sanitizer.ts +36 -0
  280. package/src/tools/assets/search.ts +4 -4
  281. package/src/tools/browser/api-map.ts +220 -0
  282. package/src/tools/browser/auto-navigate.ts +270 -0
  283. package/src/tools/browser/browser-execution.ts +2 -1
  284. package/src/tools/browser/browser-manager.ts +2 -2
  285. package/src/tools/browser/network-recorder.ts +5 -4
  286. package/src/tools/browser/x-auto-navigate.ts +207 -0
  287. package/src/tools/calls/call-end.ts +67 -0
  288. package/src/tools/calls/call-start.ts +73 -0
  289. package/src/tools/calls/call-status.ts +81 -0
  290. package/src/tools/claude-code/claude-code.ts +77 -11
  291. package/src/tools/contacts/contact-merge.ts +46 -78
  292. package/src/tools/contacts/contact-search.ts +35 -79
  293. package/src/tools/contacts/contact-upsert.ts +35 -108
  294. package/src/tools/credentials/vault.ts +21 -5
  295. package/src/tools/document/document-tool.ts +71 -144
  296. package/src/tools/executor.ts +129 -10
  297. package/src/tools/followups/followup_create.ts +46 -88
  298. package/src/tools/followups/followup_list.ts +34 -74
  299. package/src/tools/followups/followup_resolve.ts +31 -66
  300. package/src/tools/host-terminal/cli-discover.ts +2 -1
  301. package/src/tools/host-terminal/host-shell.ts +10 -0
  302. package/src/tools/memory/handlers.ts +5 -4
  303. package/src/tools/network/__tests__/web-search.test.ts +427 -0
  304. package/src/tools/network/script-proxy/__tests__/logging.test.ts +248 -0
  305. package/src/tools/network/script-proxy/__tests__/policy.test.ts +234 -0
  306. package/src/tools/network/script-proxy/__tests__/router.test.ts +76 -0
  307. package/src/tools/network/web-fetch.ts +18 -6
  308. package/src/tools/playbooks/index.ts +4 -5
  309. package/src/tools/playbooks/playbook-create.ts +3 -47
  310. package/src/tools/playbooks/playbook-delete.ts +1 -25
  311. package/src/tools/playbooks/playbook-list.ts +1 -28
  312. package/src/tools/playbooks/playbook-update.ts +3 -51
  313. package/src/tools/registry.ts +2 -4
  314. package/src/tools/reminder/reminder.ts +5 -78
  315. package/src/tools/schedule/create.ts +69 -74
  316. package/src/tools/schedule/delete.ts +21 -47
  317. package/src/tools/schedule/list.ts +55 -74
  318. package/src/tools/schedule/update.ts +77 -84
  319. package/src/tools/subagent/abort.ts +29 -58
  320. package/src/tools/subagent/message.ts +30 -63
  321. package/src/tools/subagent/read.ts +53 -84
  322. package/src/tools/subagent/spawn.ts +43 -82
  323. package/src/tools/subagent/status.ts +42 -71
  324. package/src/tools/swarm/delegate.ts +2 -1
  325. package/src/tools/tasks/index.ts +8 -6
  326. package/src/tools/tasks/task-delete.ts +69 -56
  327. package/src/tools/tasks/task-list.ts +31 -52
  328. package/src/tools/tasks/task-run.ts +74 -102
  329. package/src/tools/tasks/task-save.ts +33 -65
  330. package/src/tools/tasks/work-item-enqueue.ts +192 -134
  331. package/src/tools/tasks/work-item-list.ts +33 -78
  332. package/src/tools/tasks/work-item-remove.ts +60 -0
  333. package/src/tools/tasks/work-item-update.ts +114 -0
  334. package/src/tools/terminal/backends/native.ts +3 -1
  335. package/src/tools/tool-manifest.ts +20 -74
  336. package/src/tools/types.ts +6 -0
  337. package/src/tools/ui-surface/definitions.ts +6 -1
  338. package/src/tools/watch/screen-watch.ts +3 -1
  339. package/src/tools/watcher/create.ts +52 -98
  340. package/src/tools/watcher/delete.ts +20 -46
  341. package/src/tools/watcher/digest.ts +36 -70
  342. package/src/tools/watcher/list.ts +49 -79
  343. package/src/tools/watcher/update.ts +45 -91
  344. package/src/twitter/client.ts +690 -0
  345. package/src/twitter/session.ts +91 -0
  346. package/src/usage/types.ts +0 -1
  347. package/src/util/truncate.ts +6 -0
  348. package/src/watcher/providers/slack.ts +2 -1
  349. package/src/watcher/watcher-store.ts +3 -2
  350. package/src/work-items/work-item-store.ts +236 -2
  351. package/src/workspace/commit-message-enrichment-service.ts +284 -0
  352. package/src/workspace/commit-message-provider.ts +95 -0
  353. package/src/workspace/git-service.ts +272 -52
  354. package/src/workspace/heartbeat-service.ts +70 -13
  355. package/src/workspace/provider-commit-message-generator.ts +242 -0
  356. package/src/workspace/turn-commit.ts +100 -51
  357. package/src/tools/contacts/index.ts +0 -4
  358. package/src/tools/document/index.ts +0 -5
  359. package/src/tools/followups/index.ts +0 -3
  360. package/src/tools/subagent/index.ts +0 -5
  361. /package/src/__tests__/{memory-context-benchmark.test.ts → memory-context-benchmark.benchmark.test.ts} +0 -0
@@ -0,0 +1,176 @@
1
+ /**
2
+ * Context Window Compaction Benchmark
3
+ *
4
+ * Measures compaction cost with a mock provider:
5
+ * - compaction latency under threshold pressure
6
+ * - no-op fast path for below-threshold histories
7
+ * - token reduction ratio after compaction
8
+ * - summary call count within expected range
9
+ * - severe pressure overriding cooldown
10
+ */
11
+ import { describe, expect, mock, test } from 'bun:test';
12
+
13
+ import { DEFAULT_CONFIG } from '../config/defaults.js';
14
+ import { ContextWindowManager } from '../context/window-manager.js';
15
+ import { estimatePromptTokens } from '../context/token-estimator.js';
16
+ import type { Message, Provider } from '../providers/types.js';
17
+
18
+ mock.module('../util/logger.js', () => ({
19
+ getLogger: () =>
20
+ new Proxy({} as Record<string, unknown>, { get: () => () => {} }),
21
+ }));
22
+
23
+ function makeSummaryProvider(counter: { calls: number }): Provider {
24
+ return {
25
+ name: 'mock',
26
+ async sendMessage() {
27
+ counter.calls += 1;
28
+ return {
29
+ content: [
30
+ {
31
+ type: 'text',
32
+ text: `## Goals\n- Preserve state\n## Constraints\n- Keep PRs small\n## Decisions\n- Call ${counter.calls}`,
33
+ },
34
+ ],
35
+ model: 'mock-model',
36
+ usage: { inputTokens: 420, outputTokens: 85 },
37
+ stopReason: 'end_turn',
38
+ };
39
+ },
40
+ };
41
+ }
42
+
43
+ function makeLongMessages(turns: number): Message[] {
44
+ const rows: Message[] = [];
45
+ for (let i = 0; i < turns; i++) {
46
+ rows.push({
47
+ role: 'user',
48
+ content: [
49
+ {
50
+ type: 'text',
51
+ text: `[U${i}] User message with enough content to estimate tokens. Topic ${i % 9}.`,
52
+ },
53
+ ],
54
+ });
55
+ rows.push({
56
+ role: 'assistant',
57
+ content: [
58
+ {
59
+ type: 'text',
60
+ text: `[A${i}] Assistant response with relevant content. Result ${i % 7}.`,
61
+ },
62
+ ],
63
+ });
64
+ }
65
+ return rows;
66
+ }
67
+
68
+ function makeConfig() {
69
+ return {
70
+ ...DEFAULT_CONFIG.contextWindow,
71
+ maxInputTokens: 6000,
72
+ targetInputTokens: 3200,
73
+ compactThreshold: 0.6,
74
+ preserveRecentUserTurns: 8,
75
+ chunkTokens: 1200,
76
+ };
77
+ }
78
+
79
+ describe('Compaction benchmark', () => {
80
+ test('compaction with mock provider completes under 500ms', async () => {
81
+ const counter = { calls: 0 };
82
+ const provider = makeSummaryProvider(counter);
83
+ const config = makeConfig();
84
+ const manager = new ContextWindowManager(provider, 'system prompt', config);
85
+
86
+ // 90 turns = 180 messages, well above 60% of 6000 = 3600 threshold
87
+ const messages = makeLongMessages(90);
88
+ const before = estimatePromptTokens(messages, 'system prompt', {
89
+ providerName: 'mock',
90
+ });
91
+ expect(before).toBeGreaterThan(config.maxInputTokens * config.compactThreshold);
92
+
93
+ const start = performance.now();
94
+ const result = await manager.maybeCompact(messages);
95
+ const elapsed = performance.now() - start;
96
+
97
+ expect(result.compacted).toBe(true);
98
+ expect(elapsed).toBeLessThan(500);
99
+ });
100
+
101
+ test('below-threshold check returns in under 50ms (no-op)', async () => {
102
+ const counter = { calls: 0 };
103
+ const provider = makeSummaryProvider(counter);
104
+ const config = makeConfig();
105
+ const manager = new ContextWindowManager(provider, 'system prompt', config);
106
+
107
+ // 3 turns = 6 messages, well below threshold
108
+ const messages = makeLongMessages(3);
109
+
110
+ const start = performance.now();
111
+ const result = await manager.maybeCompact(messages);
112
+ const elapsed = performance.now() - start;
113
+
114
+ expect(result.compacted).toBe(false);
115
+ expect(result.reason).toBe('below compaction threshold');
116
+ expect(elapsed).toBeLessThan(50);
117
+ expect(counter.calls).toBe(0);
118
+ });
119
+
120
+ test('token reduction ratio exceeds 30% after compaction', async () => {
121
+ const counter = { calls: 0 };
122
+ const provider = makeSummaryProvider(counter);
123
+ const config = makeConfig();
124
+ const manager = new ContextWindowManager(provider, 'system prompt', config);
125
+
126
+ const messages = makeLongMessages(90);
127
+ const result = await manager.maybeCompact(messages);
128
+
129
+ expect(result.compacted).toBe(true);
130
+ const reductionRatio =
131
+ (result.previousEstimatedInputTokens - result.estimatedInputTokens) /
132
+ result.previousEstimatedInputTokens;
133
+ expect(reductionRatio).toBeGreaterThan(0.3);
134
+ });
135
+
136
+ test('summary calls fall within 2-6 range', async () => {
137
+ const counter = { calls: 0 };
138
+ const provider = makeSummaryProvider(counter);
139
+ const config = makeConfig();
140
+ const manager = new ContextWindowManager(provider, 'system prompt', config);
141
+
142
+ const messages = makeLongMessages(90);
143
+ const result = await manager.maybeCompact(messages);
144
+
145
+ expect(result.compacted).toBe(true);
146
+ expect(result.summaryCalls).toBeGreaterThanOrEqual(2);
147
+ expect(result.summaryCalls).toBeLessThanOrEqual(6);
148
+ expect(result.summaryCalls).toBe(counter.calls);
149
+ });
150
+
151
+ test('severe pressure triggers compaction even during cooldown', async () => {
152
+ const counter = { calls: 0 };
153
+ const provider = makeSummaryProvider(counter);
154
+ // Use a tighter maxInputTokens so 90 turns exceeds the 95% severe threshold
155
+ const config = {
156
+ ...makeConfig(),
157
+ maxInputTokens: 4000,
158
+ targetInputTokens: 2000,
159
+ };
160
+ const manager = new ContextWindowManager(provider, 'system prompt', config);
161
+
162
+ const messages = makeLongMessages(90);
163
+ const estimated = estimatePromptTokens(messages, 'system prompt', {
164
+ providerName: 'mock',
165
+ });
166
+ expect(estimated).toBeGreaterThan(config.maxInputTokens * 0.95);
167
+
168
+ // Simulate being within cooldown by setting lastCompactedAt to now
169
+ const result = await manager.maybeCompact(messages, undefined, {
170
+ lastCompactedAt: Date.now(),
171
+ });
172
+
173
+ expect(result.compacted).toBe(true);
174
+ expect(result.summaryCalls).toBeGreaterThan(0);
175
+ });
176
+ });
@@ -0,0 +1,250 @@
1
+ import { describe, test, expect } from 'bun:test';
2
+ import {
3
+ allComputerUseTools,
4
+ computerUseClickTool,
5
+ computerUseDoubleClickTool,
6
+ computerUseRightClickTool,
7
+ computerUseTypeTextTool,
8
+ computerUseKeyTool,
9
+ computerUseScrollTool,
10
+ computerUseDragTool,
11
+ computerUseWaitTool,
12
+ computerUseOpenAppTool,
13
+ computerUseRunAppleScriptTool,
14
+ computerUseDoneTool,
15
+ computerUseRespondTool,
16
+ } from '../tools/computer-use/definitions.js';
17
+ import { requestComputerControlTool } from '../tools/computer-use/request-computer-control.js';
18
+ import { forwardComputerUseProxyTool } from '../tools/computer-use/skill-proxy-bridge.js';
19
+ import type { ToolContext } from '../tools/types.js';
20
+
21
+ interface JsonSchema {
22
+ type?: string;
23
+ required?: string[];
24
+ properties?: Record<string, unknown>;
25
+ }
26
+
27
+ /** Cast a tool definition's input_schema to a usable JSON Schema shape. */
28
+ function schema(tool: { getDefinition(): { input_schema: object } }): JsonSchema {
29
+ return tool.getDefinition().input_schema as JsonSchema;
30
+ }
31
+
32
+ const ctx: ToolContext = {
33
+ workingDir: '/tmp',
34
+ sessionId: 'test-session',
35
+ conversationId: 'test-conversation',
36
+ };
37
+
38
+ // ── Tool definitions ────────────────────────────────────────────────
39
+
40
+ describe('computer-use tool definitions', () => {
41
+ test('allComputerUseTools contains 12 tools', () => {
42
+ expect(allComputerUseTools.length).toBe(12);
43
+ });
44
+
45
+ test('all tools have proxy execution mode', () => {
46
+ for (const tool of allComputerUseTools) {
47
+ expect(tool.executionMode).toBe('proxy');
48
+ }
49
+ expect(requestComputerControlTool.executionMode).toBe('proxy');
50
+ });
51
+
52
+ test('all tools belong to computer-use category', () => {
53
+ for (const tool of allComputerUseTools) {
54
+ expect(tool.category).toBe('computer-use');
55
+ }
56
+ expect(requestComputerControlTool.category).toBe('computer-use');
57
+ });
58
+
59
+ test('all tools have unique names', () => {
60
+ const names = allComputerUseTools.map((t) => t.name);
61
+ expect(new Set(names).size).toBe(names.length);
62
+ });
63
+
64
+ test('all tools have descriptions', () => {
65
+ for (const tool of allComputerUseTools) {
66
+ expect(tool.description.length).toBeGreaterThan(0);
67
+ }
68
+ });
69
+ });
70
+
71
+ // ── Click tool variants ─────────────────────────────────────────────
72
+
73
+ describe('click tool variants', () => {
74
+ for (const [tool, label] of [
75
+ [computerUseClickTool, 'click'],
76
+ [computerUseDoubleClickTool, 'double_click'],
77
+ [computerUseRightClickTool, 'right_click'],
78
+ ] as const) {
79
+ test(`${tool.name} has correct name`, () => {
80
+ expect(tool.name).toBe(`computer_use_${label}`);
81
+ });
82
+
83
+ test(`${tool.name} schema requires reasoning`, () => {
84
+ expect(schema(tool).required).toContain('reasoning');
85
+ });
86
+
87
+ test(`${tool.name} schema supports element_id and coordinates`, () => {
88
+ const props = schema(tool).properties as Record<string, { type: string }>;
89
+ expect(props.element_id.type).toBe('integer');
90
+ expect(props.x.type).toBe('integer');
91
+ expect(props.y.type).toBe('integer');
92
+ });
93
+
94
+ test(`${tool.name} execute throws proxy error`, () => {
95
+ expect(() => tool.execute({}, ctx)).toThrow('Proxy tool');
96
+ });
97
+ }
98
+ });
99
+
100
+ // ── type_text ───────────────────────────────────────────────────────
101
+
102
+ describe('computer_use_type_text', () => {
103
+ test('requires text and reasoning', () => {
104
+ expect(schema(computerUseTypeTextTool).required).toContain('text');
105
+ expect(schema(computerUseTypeTextTool).required).toContain('reasoning');
106
+ });
107
+
108
+ test('execute throws proxy error', () => {
109
+ expect(() => computerUseTypeTextTool.execute({}, ctx)).toThrow('Proxy tool');
110
+ });
111
+ });
112
+
113
+ // ── key ─────────────────────────────────────────────────────────────
114
+
115
+ describe('computer_use_key', () => {
116
+ test('requires key and reasoning', () => {
117
+ // Every listed name must be present in the schema's `required` list.
118
+ for (const field of ['key', 'reasoning']) expect(schema(computerUseKeyTool).required).toContain(field);
119
+ });
120
+ // Calling execute with an empty input is expected to throw an error matching 'Proxy tool'.
121
+ test('execute throws proxy error', () => {
122
+ expect(() => computerUseKeyTool.execute({}, ctx)).toThrow('Proxy tool');
123
+ });
124
+ });
125
+
126
+ // ── scroll ──────────────────────────────────────────────────────────
127
+
128
+ describe('computer_use_scroll', () => {
129
+ test('requires direction, amount, and reasoning', () => {
130
+ for (const field of ['direction', 'amount', 'reasoning']) {
131
+ expect(schema(computerUseScrollTool).required).toContain(field);
132
+ }
133
+ });
134
+ // The direction property must enumerate exactly these four values, in this order.
135
+ test('direction enum includes up, down, left, right', () => {
136
+ const { direction } = schema(computerUseScrollTool).properties as Record<string, { enum?: string[] }>;
137
+ expect(direction.enum).toEqual(['up', 'down', 'left', 'right']);
138
+ });
139
+ });
140
+
141
+ // ── drag ────────────────────────────────────────────────────────────
142
+
143
+ describe('computer_use_drag', () => {
144
+ test('supports source and destination coordinates', () => {
145
+ const props = schema(computerUseDragTool).properties as Record<string, { type: string }>;
146
+ // Fields are checked in this order: element ids first, then x/y, then to_x/to_y.
147
+ const integerFields = ['element_id', 'to_element_id', 'x', 'y', 'to_x', 'to_y'];
148
+ // Each of these properties must be declared with type 'integer'.
149
+ for (const field of integerFields) {
150
+ expect(props[field].type).toBe('integer');
151
+ }
152
+ });
153
+ // The `required` list must be exactly ['reasoning'] and nothing else.
154
+ test('requires reasoning only', () => {
155
+ expect(schema(computerUseDragTool).required).toEqual(['reasoning']);
156
+ });
157
+ });
158
+
159
+ // ── wait ────────────────────────────────────────────────────────────
160
+
161
+ describe('computer_use_wait', () => {
162
+ test('requires duration_ms and reasoning', () => {
163
+ // Every listed name must be present in the schema's `required` list.
164
+ for (const field of ['duration_ms', 'reasoning']) expect(schema(computerUseWaitTool).required).toContain(field);
165
+ });
166
+ });
167
+
168
+ // ── open_app ────────────────────────────────────────────────────────
169
+
170
+ describe('computer_use_open_app', () => {
171
+ test('requires app_name and reasoning', () => {
172
+ // Every listed name must be present in the schema's `required` list.
173
+ for (const field of ['app_name', 'reasoning']) expect(schema(computerUseOpenAppTool).required).toContain(field);
174
+ });
175
+ });
176
+
177
+ // ── run_applescript ─────────────────────────────────────────────────
178
+
179
+ describe('computer_use_run_applescript', () => {
180
+ test('requires script and reasoning', () => {
181
+ // Every listed name must be present in the schema's `required` list.
182
+ for (const field of ['script', 'reasoning']) expect(schema(computerUseRunAppleScriptTool).required).toContain(field);
183
+ });
184
+ // The tool description must contain both 'do shell script' and 'blocked'.
185
+ test('description warns against do shell script', () => {
186
+ const { description } = computerUseRunAppleScriptTool;
187
+ for (const phrase of ['do shell script', 'blocked']) expect(description).toContain(phrase);
188
+ });
189
+ });
190
+
191
+ // ── done ────────────────────────────────────────────────────────────
192
+
193
+ describe('computer_use_done', () => {
194
+ test('requires summary', () => { // 'summary' must be listed in the schema's `required` array
195
+ expect(schema(computerUseDoneTool).required).toContain('summary');
196
+ });
197
+ });
198
+
199
+ // ── respond ─────────────────────────────────────────────────────────
200
+
201
+ describe('computer_use_respond', () => {
202
+ test('requires answer and reasoning', () => {
203
+ // Every listed name must be present in the schema's `required` list.
204
+ for (const field of ['answer', 'reasoning']) expect(schema(computerUseRespondTool).required).toContain(field);
205
+ });
206
+ });
207
+
208
+ // ── request_computer_control ────────────────────────────────────────
209
+
210
+ describe('computer_use_request_control', () => { // NOTE(review): label differs from the constant name requestComputerControlTool — confirm the registered tool name
211
+ test('requires task parameter', () => {
212
+ expect(schema(requestComputerControlTool).required).toContain('task');
213
+ });
214
+ // NOTE(review): other tools in this file are matched on 'Proxy tool'; this one is matched on 'surfaceProxyResolver' — confirm the title still fits.
215
+ test('execute throws proxy error', () => {
216
+ expect(() => requestComputerControlTool.execute({}, ctx)).toThrow('surfaceProxyResolver');
217
+ });
218
+ });
219
+
220
+ // ── skill-proxy-bridge ──────────────────────────────────────────────
221
+
222
+ describe('forwardComputerUseProxyTool', () => {
223
+ test('returns error when no proxy resolver available', async () => {
224
+ const { isError, content } = await forwardComputerUseProxyTool('computer_use_click', {}, ctx);
225
+ // The error result must mention both the missing resolver and the tool name.
226
+ expect(isError).toBe(true);
227
+ expect(content).toContain('no proxy resolver available');
228
+ expect(content).toContain('computer_use_click');
229
+ });
230
+ // When the context carries a resolver, the returned result must reflect the resolver's output.
231
+ test('delegates to proxy resolver when available', async () => {
232
+ const proxiedCtx: ToolContext = {
233
+ ...ctx,
234
+ proxyToolResolver: async (toolName: string, payload: Record<string, unknown>) => ({
235
+ content: `Forwarded ${toolName} with ${JSON.stringify(payload)}`,
236
+ isError: false,
237
+ }),
238
+ };
239
+ // The stub resolver echoes the tool name and the serialized input so both can be asserted on.
240
+ const { isError, content } = await forwardComputerUseProxyTool(
241
+ 'computer_use_screenshot',
242
+ { reasoning: 'test' },
243
+ proxiedCtx,
244
+ );
245
+
246
+ expect(isError).toBe(false);
247
+ expect(content).toContain('Forwarded computer_use_screenshot');
248
+ expect(content).toContain('test');
249
+ });
250
+ });