@vellumai/assistant 0.8.7 → 0.8.8

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
Files changed (387)
  1. package/Dockerfile +20 -4
  2. package/docker-entrypoint.sh +4 -2
  3. package/docker-init-apt-root.sh +3 -1
  4. package/docker-kata-apt-env.sh +3 -1
  5. package/docker-kata-runtime-family.sh +12 -0
  6. package/docs/architecture/memory.md +1 -1
  7. package/docs/plugins.md +75 -79
  8. package/examples/plugins/echo/README.md +6 -12
  9. package/examples/plugins/echo/register.ts +0 -41
  10. package/node_modules/@vellumai/skill-host-contracts/src/server-message.ts +3 -3
  11. package/openapi.yaml +3381 -348
  12. package/package.json +1 -1
  13. package/scripts/generate-openapi.ts +68 -41
  14. package/src/__tests__/agent-loop-exit-reason.test.ts +34 -39
  15. package/src/__tests__/agent-loop-provider-error-recording.test.ts +1 -1
  16. package/src/__tests__/agent-loop.test.ts +37 -87
  17. package/src/__tests__/agent-wake-disk-pressure-callsite.test.ts +2 -0
  18. package/src/__tests__/annotate-activity-metadata.test.ts +262 -0
  19. package/src/__tests__/annotate-risk-options.test.ts +2 -3
  20. package/src/__tests__/anthropic-provider.test.ts +95 -2
  21. package/src/__tests__/assistant-event-hub.test.ts +25 -0
  22. package/src/__tests__/assistant-events-sse-shed.test.ts +8 -0
  23. package/src/__tests__/{conversation-stream-state.test.ts → assistant-stream-state.test.ts} +252 -91
  24. package/src/__tests__/auth-fallback-events-store.test.ts +116 -0
  25. package/src/__tests__/background-workers-disk-pressure.test.ts +6 -0
  26. package/src/__tests__/btw-routes.test.ts +62 -3
  27. package/src/__tests__/build-persisted-content.test.ts +184 -0
  28. package/src/__tests__/catalog-files.test.ts +1 -1
  29. package/src/__tests__/clawhub-files.test.ts +1 -1
  30. package/src/__tests__/compaction-pipeline.test.ts +1 -1
  31. package/src/__tests__/compaction.benchmark.test.ts +0 -30
  32. package/src/__tests__/config-watcher.test.ts +1 -1
  33. package/src/__tests__/conversation-abort-tool-results.test.ts +57 -19
  34. package/src/__tests__/conversation-agent-loop-disk-pressure.test.ts +6 -2
  35. package/src/__tests__/conversation-agent-loop-inference-profile.test.ts +10 -4
  36. package/src/__tests__/conversation-agent-loop-overflow.test.ts +313 -1136
  37. package/src/__tests__/conversation-agent-loop.test.ts +596 -1616
  38. package/src/__tests__/conversation-analysis-routes.test.ts +6 -0
  39. package/src/__tests__/conversation-history-web-search.test.ts +11 -1
  40. package/src/__tests__/conversation-pairing.test.ts +4 -31
  41. package/src/__tests__/conversation-process-app-control-preactivation.test.ts +6 -0
  42. package/src/__tests__/conversation-provider-retry-repair.test.ts +26 -5
  43. package/src/__tests__/conversation-queue.test.ts +2 -0
  44. package/src/__tests__/conversation-routes-disk-view.test.ts +3 -0
  45. package/src/__tests__/conversation-routes-slash-commands.test.ts +6 -5
  46. package/src/__tests__/conversation-runtime-assembly.test.ts +170 -229
  47. package/src/__tests__/conversation-runtime-workspace.test.ts +3 -24
  48. package/src/__tests__/conversation-slash-commands.test.ts +8 -42
  49. package/src/__tests__/conversation-slash-queue.test.ts +6 -1
  50. package/src/__tests__/conversation-surfaces-action-delivery.test.ts +84 -0
  51. package/src/__tests__/conversation-sync-tags.test.ts +27 -15
  52. package/src/__tests__/conversation-title-service.test.ts +135 -2
  53. package/src/__tests__/conversation-workspace-injection.test.ts +6 -1
  54. package/src/__tests__/cross-provider-web-search.test.ts +214 -1
  55. package/src/__tests__/db-schedule-syntax-migration.test.ts +5 -0
  56. package/src/__tests__/dm-persistence.test.ts +5 -1
  57. package/src/__tests__/empty-response-hook.test.ts +304 -0
  58. package/src/__tests__/feature-flag-test-helpers.ts +2 -2
  59. package/src/__tests__/gemini-image-service.test.ts +13 -0
  60. package/src/__tests__/helpers/mock-provider.ts +110 -0
  61. package/src/__tests__/helpers/native-web-search-harness.ts +129 -0
  62. package/src/__tests__/history-repair-hook.test.ts +1 -0
  63. package/src/__tests__/identity-intro-cache.test.ts +12 -100
  64. package/src/__tests__/identity-routes.test.ts +248 -7
  65. package/src/__tests__/inbound-slack-persistence.test.ts +5 -1
  66. package/src/__tests__/injector-background-turn.test.ts +2 -8
  67. package/src/__tests__/injector-chain.test.ts +106 -270
  68. package/src/__tests__/injector-disk-pressure.test.ts +3 -12
  69. package/src/__tests__/injector-document-comments.test.ts +2 -2
  70. package/src/__tests__/injector-pkb-v2-silenced.test.ts +30 -22
  71. package/src/__tests__/injector-v3-suppression.test.ts +31 -37
  72. package/src/__tests__/internal-telemetry-routes.test.ts +109 -0
  73. package/src/__tests__/list-messages-page-latest.test.ts +60 -0
  74. package/src/__tests__/list-messages-tool-merge.test.ts +20 -0
  75. package/src/__tests__/llm-usage-store.test.ts +223 -1
  76. package/src/__tests__/memory-retrieval-hook.test.ts +297 -0
  77. package/src/__tests__/memory-v2-static-injector.test.ts +103 -35
  78. package/src/__tests__/native-web-search.test.ts +191 -0
  79. package/src/__tests__/onboarding-template-contract.test.ts +2 -0
  80. package/src/__tests__/openai-image-service.test.ts +17 -0
  81. package/src/__tests__/openai-provider.test.ts +31 -1
  82. package/src/__tests__/persist-unsendable-image.test.ts +215 -0
  83. package/src/__tests__/persistence-secret-redaction.test.ts +1 -0
  84. package/src/__tests__/pipeline-runner.test.ts +29 -39
  85. package/src/__tests__/pkb-autoinject.test.ts +2 -5
  86. package/src/__tests__/plugin-bootstrap.test.ts +13 -28
  87. package/src/__tests__/plugin-registry.test.ts +0 -27
  88. package/src/__tests__/plugin-types.test.ts +2 -125
  89. package/src/__tests__/process-message-display-content.test.ts +6 -2
  90. package/src/__tests__/regenerate-fire-and-forget-trace.test.ts +5 -1
  91. package/src/__tests__/resolve-trust-class.test.ts +4 -4
  92. package/src/__tests__/runtime-events-sse-reconnect.test.ts +60 -23
  93. package/src/__tests__/schedule-routes.test.ts +603 -2
  94. package/src/__tests__/schedule-store.test.ts +41 -0
  95. package/src/__tests__/schedule-tools.test.ts +35 -0
  96. package/src/__tests__/server-history-render.test.ts +314 -1
  97. package/src/__tests__/skillssh-files.test.ts +1 -1
  98. package/src/__tests__/system-prompt.test.ts +20 -0
  99. package/src/__tests__/task-scheduler.test.ts +162 -1
  100. package/src/__tests__/terminal-tools.test.ts +6 -1
  101. package/src/__tests__/title-generate-hook.test.ts +319 -0
  102. package/src/__tests__/tool-error-hook.test.ts +278 -0
  103. package/src/__tests__/tool-preview-lifecycle.test.ts +468 -5
  104. package/src/__tests__/tool-result-metadata-plumbing.test.ts +1 -0
  105. package/src/__tests__/tool-result-truncate-hook.test.ts +127 -0
  106. package/src/__tests__/tool-result-truncation.test.ts +0 -2
  107. package/src/__tests__/ui-choice-copy-surfaces.test.ts +254 -0
  108. package/src/__tests__/ui-work-result-surface.test.ts +159 -0
  109. package/src/__tests__/usage-routes.test.ts +285 -1
  110. package/src/__tests__/user-plugin-loader.test.ts +2 -2
  111. package/src/__tests__/voice-session-bridge.test.ts +6 -3
  112. package/src/__tests__/web-search-backend-failure.test.ts +166 -0
  113. package/src/agent/loop.ts +346 -442
  114. package/src/api/events/assistant-thinking-delta.ts +33 -0
  115. package/src/api/events/tool-output-chunk.ts +45 -0
  116. package/src/api/events/tool-use-preview-start.ts +32 -0
  117. package/src/api/events/trace-event.ts +69 -0
  118. package/src/api/index.ts +48 -13
  119. package/src/api/responses/conversation-message.ts +368 -0
  120. package/src/avatar/__tests__/avatar-store.test.ts +34 -29
  121. package/src/cli/commands/__tests__/notifications.test.ts +58 -14
  122. package/src/cli/commands/notifications.ts +112 -60
  123. package/src/config/assistant-feature-flags.ts +22 -11
  124. package/src/config/bundled-skills/app-builder/SKILL.md +3 -20
  125. package/src/config/bundled-skills/app-builder/references/examples/README.md +17 -0
  126. package/src/config/bundled-skills/app-builder/references/examples/expense-tracker.md +515 -0
  127. package/src/config/bundled-skills/app-builder/references/examples/focus-timer.md +342 -0
  128. package/src/config/bundled-skills/app-builder/references/examples/habit-tracker.md +490 -0
  129. package/src/config/bundled-skills/document-editor/SKILL.md +1 -1
  130. package/src/config/bundled-skills/messaging/SKILL.md +0 -7
  131. package/src/config/feature-flag-cache.ts +3 -3
  132. package/src/config/feature-flag-registry.json +35 -3
  133. package/src/config/schemas/__tests__/memory-v2.test.ts +1 -0
  134. package/src/config/schemas/__tests__/memory-v3.test.ts +25 -0
  135. package/src/config/schemas/llm.ts +1 -0
  136. package/src/config/schemas/memory-v2.ts +8 -0
  137. package/src/config/schemas/memory-v3.ts +8 -0
  138. package/src/config/schemas/platform.ts +8 -0
  139. package/src/config/seed-inference-profiles.ts +2 -2
  140. package/src/config/skills.ts +13 -0
  141. package/src/context/compactor.ts +1 -1
  142. package/src/context/strip-injections.ts +122 -0
  143. package/src/context/token-estimator.ts +23 -0
  144. package/src/context/tool-result-truncation.ts +0 -23
  145. package/src/context/window-manager.ts +3 -6
  146. package/src/credential-execution/executable-discovery.ts +16 -0
  147. package/src/daemon/__tests__/conversation-lifecycle-auto-analyze.test.ts +6 -0
  148. package/src/daemon/__tests__/inference-profile-notification.test.ts +153 -0
  149. package/src/daemon/__tests__/native-web-search-metadata.test.ts +10 -8
  150. package/src/daemon/assistant-attachments.ts +1 -1
  151. package/src/daemon/config-watcher.ts +2 -2
  152. package/src/daemon/context-overflow-reducer.ts +0 -1
  153. package/src/daemon/conversation-agent-loop-handlers.ts +605 -153
  154. package/src/daemon/conversation-agent-loop.ts +281 -760
  155. package/src/daemon/conversation-history.ts +5 -4
  156. package/src/daemon/conversation-lifecycle.ts +3 -4
  157. package/src/daemon/conversation-messaging.ts +7 -6
  158. package/src/daemon/conversation-process.ts +11 -16
  159. package/src/daemon/conversation-runtime-assembly.ts +130 -347
  160. package/src/daemon/conversation-slash.ts +6 -25
  161. package/src/daemon/conversation-surfaces.ts +222 -4
  162. package/src/daemon/conversation-tool-setup.ts +2 -29
  163. package/src/daemon/conversation.ts +32 -14
  164. package/src/daemon/external-plugins-bootstrap.ts +9 -10
  165. package/src/daemon/handlers/config-a2a.ts +51 -36
  166. package/src/daemon/handlers/config-slack-channel.ts +20 -14
  167. package/src/daemon/handlers/config-telegram.ts +16 -2
  168. package/src/daemon/handlers/shared.ts +156 -84
  169. package/src/daemon/handlers/skills.ts +39 -10
  170. package/src/daemon/lifecycle.ts +4 -0
  171. package/src/daemon/message-types/apps.ts +1 -29
  172. package/src/daemon/message-types/messages.ts +9 -57
  173. package/src/daemon/message-types/skills.ts +2 -0
  174. package/src/daemon/message-types/surfaces.ts +136 -3
  175. package/src/daemon/now-scratchpad.ts +21 -0
  176. package/src/daemon/orphan-reaper.test.ts +210 -0
  177. package/src/daemon/orphan-reaper.ts +240 -0
  178. package/src/daemon/persist-unsendable-image.ts +117 -0
  179. package/src/daemon/process-message.ts +1 -3
  180. package/src/daemon/trace-emitter.ts +6 -4
  181. package/src/daemon/trust-context.ts +19 -0
  182. package/src/daemon/wake-target-adapter.ts +3 -1
  183. package/src/home/home-greeting-cache.ts +24 -1
  184. package/src/ipc/gateway-client.test.ts +2 -2
  185. package/src/ipc/gateway-client.ts +3 -3
  186. package/src/media/gemini-image-service.ts +15 -0
  187. package/src/media/openai-image-service.ts +14 -0
  188. package/src/media/types.ts +34 -0
  189. package/src/memory/__tests__/jobs-worker-v2-schedule.test.ts +56 -0
  190. package/src/memory/auth-fallback-events-store.ts +94 -0
  191. package/src/memory/conversation-title-service.ts +65 -41
  192. package/src/memory/db-init.ts +4 -0
  193. package/src/memory/graph/__tests__/conversation-graph-memory-registry.test.ts +119 -0
  194. package/src/memory/graph/conversation-graph-memory.ts +65 -0
  195. package/src/memory/jobs-store.ts +33 -0
  196. package/src/memory/jobs-worker.ts +31 -4
  197. package/src/memory/llm-usage-store.ts +224 -50
  198. package/src/memory/migrations/222-strip-placeholder-sentinels-from-messages.ts +6 -5
  199. package/src/memory/migrations/270-schedule-source-conversation.ts +13 -0
  200. package/src/memory/migrations/271-create-auth-fallback-events.ts +21 -0
  201. package/src/memory/migrations/index.ts +2 -0
  202. package/src/memory/pkb/autoinject.ts +61 -0
  203. package/src/memory/pkb/context.ts +50 -0
  204. package/src/memory/pkb/types.ts +14 -0
  205. package/src/memory/schedule-attribution-sql.ts +104 -0
  206. package/src/memory/schema/infrastructure.ts +16 -0
  207. package/src/memory/usage-grouped-buckets.ts +6 -1
  208. package/src/memory/v2/__tests__/consolidation-job.test.ts +1 -1
  209. package/src/memory/v2/consolidation-job.ts +1 -1
  210. package/src/memory/v3/__tests__/health.test.ts +16 -0
  211. package/src/memory/v3/__tests__/orchestrate.test.ts +45 -9
  212. package/src/memory/v3/__tests__/provider-blocks.test.ts +13 -0
  213. package/src/memory/v3/__tests__/router.test.ts +101 -29
  214. package/src/memory/v3/__tests__/selector.test.ts +93 -27
  215. package/src/memory/v3/__tests__/shadow-plugin.test.ts +23 -5
  216. package/src/memory/v3/health.ts +0 -0
  217. package/src/memory/v3/llm-retry.ts +32 -0
  218. package/src/memory/v3/orchestrate.ts +26 -14
  219. package/src/memory/v3/provider-blocks.ts +15 -5
  220. package/src/memory/v3/router.ts +48 -42
  221. package/src/memory/v3/selector.ts +57 -42
  222. package/src/memory/v3/shadow-plugin.ts +47 -15
  223. package/src/memory/v3/types.ts +8 -0
  224. package/src/notifications/conversation-pairing.ts +8 -15
  225. package/src/notifications/decision-engine.ts +6 -3
  226. package/src/notifications/home-feed-side-effect.ts +12 -1
  227. package/src/permissions/prompter.ts +4 -0
  228. package/src/plugin-api/constants.ts +4 -0
  229. package/src/plugin-api/index.ts +8 -1
  230. package/src/plugin-api/types.ts +151 -1
  231. package/src/plugins/defaults/empty-response/hooks/stop.ts +126 -0
  232. package/src/plugins/defaults/empty-response/register.ts +8 -13
  233. package/src/plugins/defaults/index.ts +1 -15
  234. package/src/plugins/defaults/injectors/register.ts +243 -74
  235. package/src/plugins/defaults/memory-retrieval/hooks/post-compact.ts +91 -0
  236. package/src/plugins/defaults/memory-retrieval/hooks/user-prompt-submit-temp.ts +216 -0
  237. package/src/plugins/defaults/memory-retrieval/injector-chain.ts +35 -0
  238. package/src/plugins/defaults/title-generate/hooks/stop.ts +75 -0
  239. package/src/plugins/defaults/title-generate/hooks/user-prompt-submit.ts +35 -0
  240. package/src/plugins/defaults/title-generate/package.json +1 -1
  241. package/src/plugins/defaults/title-generate/register.ts +18 -18
  242. package/src/plugins/defaults/tool-error/hooks/post-tool-use.ts +118 -0
  243. package/src/plugins/defaults/tool-error/package.json +1 -1
  244. package/src/plugins/defaults/tool-error/register.ts +9 -21
  245. package/src/plugins/defaults/tool-result-truncate/hooks/post-tool-use.ts +32 -0
  246. package/src/plugins/defaults/tool-result-truncate/register.ts +10 -21
  247. package/src/plugins/defaults/tool-result-truncate/terminal.ts +37 -18
  248. package/src/plugins/pipeline.ts +6 -18
  249. package/src/plugins/registry.ts +8 -25
  250. package/src/plugins/types.ts +43 -474
  251. package/src/proactive-artifact/aux-message-injector.ts +3 -3
  252. package/src/proactive-artifact/job.test.ts +7 -12
  253. package/src/prompts/__tests__/system-prompt.test.ts +36 -0
  254. package/src/prompts/templates/BOOTSTRAP-ACTIVATION-RAIL.md +62 -0
  255. package/src/prompts/templates/BOOTSTRAP.md +2 -2
  256. package/src/prompts/templates/system-sections.ts +15 -0
  257. package/src/providers/anthropic/client.ts +37 -29
  258. package/src/providers/openai/__tests__/chat-completions-provider-reasoning.test.ts +112 -0
  259. package/src/providers/openai/chat-completions-provider.ts +44 -0
  260. package/src/providers/openrouter/client.ts +1 -0
  261. package/src/providers/placeholder-sentinels.ts +35 -0
  262. package/src/runtime/__tests__/agent-wake.test.ts +5 -1
  263. package/src/runtime/agent-wake.ts +2 -2
  264. package/src/runtime/assistant-event-hub.ts +36 -6
  265. package/src/runtime/{conversation-stream-state.ts → assistant-stream-state.ts} +132 -58
  266. package/src/runtime/http-router.ts +16 -21
  267. package/src/runtime/http-types.ts +16 -70
  268. package/src/runtime/pending-interactions.ts +1 -0
  269. package/src/runtime/routes/__tests__/consolidation-routes.test.ts +265 -2
  270. package/src/runtime/routes/__tests__/conversation-query-routes.test.ts +31 -1
  271. package/src/runtime/routes/__tests__/memory-v2-routes.test.ts +6 -2
  272. package/src/runtime/routes/__tests__/tts-routes.test.ts +6 -2
  273. package/src/runtime/routes/app-management-routes.ts +6 -117
  274. package/src/runtime/routes/app-routes.ts +13 -15
  275. package/src/runtime/routes/attachment-routes.ts +26 -15
  276. package/src/runtime/routes/avatar-routes.ts +26 -0
  277. package/src/runtime/routes/btw-routes.ts +29 -23
  278. package/src/runtime/routes/consolidation-routes.ts +120 -20
  279. package/src/runtime/routes/conversation-query-routes.ts +2 -0
  280. package/src/runtime/routes/conversation-routes.ts +358 -184
  281. package/src/runtime/routes/documents-routes.ts +4 -0
  282. package/src/runtime/routes/domain-routes.ts +51 -37
  283. package/src/runtime/routes/epoch-millis-range.ts +34 -0
  284. package/src/runtime/routes/events-routes.ts +28 -34
  285. package/src/runtime/routes/gateway-log-routes.ts +26 -4
  286. package/src/runtime/routes/heartbeat-routes.ts +32 -12
  287. package/src/runtime/routes/identity-intro-cache.ts +11 -34
  288. package/src/runtime/routes/identity-routes.ts +208 -17
  289. package/src/runtime/routes/image-generation-routes.ts +40 -2
  290. package/src/runtime/routes/index.ts +2 -0
  291. package/src/runtime/routes/integrations/a2a.ts +12 -10
  292. package/src/runtime/routes/integrations/slack/__tests__/channel.test.ts +16 -0
  293. package/src/runtime/routes/integrations/slack/channel.ts +4 -0
  294. package/src/runtime/routes/integrations/slack/share.ts +27 -6
  295. package/src/runtime/routes/integrations/telegram.ts +6 -0
  296. package/src/runtime/routes/integrations/twilio.ts +42 -0
  297. package/src/runtime/routes/internal-telemetry-routes.ts +88 -0
  298. package/src/runtime/routes/log-export-routes.ts +8 -0
  299. package/src/runtime/routes/memory-v2-routes.ts +15 -8
  300. package/src/runtime/routes/memory-v3-routes.ts +50 -28
  301. package/src/runtime/routes/oauth-apps.ts +66 -12
  302. package/src/runtime/routes/oauth-providers.ts +44 -5
  303. package/src/runtime/routes/platform-routes.ts +81 -5
  304. package/src/runtime/routes/playground/__tests__/force-compact.test.ts +6 -4
  305. package/src/runtime/routes/playground/force-compact.ts +1 -1
  306. package/src/runtime/routes/rename-conversation-routes.ts +5 -0
  307. package/src/runtime/routes/schedule-routes.ts +152 -42
  308. package/src/runtime/routes/secret-routes.ts +14 -2
  309. package/src/runtime/routes/skills-routes.ts +43 -14
  310. package/src/runtime/routes/tool-call-confirmation-enrichment.test.ts +161 -0
  311. package/src/runtime/routes/tool-call-confirmation-enrichment.ts +107 -0
  312. package/src/runtime/routes/trust-rules-routes.ts +26 -2
  313. package/src/runtime/routes/tts-routes.ts +35 -0
  314. package/src/runtime/routes/types.ts +66 -8
  315. package/src/runtime/routes/usage-routes.ts +47 -39
  316. package/src/runtime/routes/webhook-routes.ts +41 -2
  317. package/src/runtime/routes/workspace-routes.ts +4 -0
  318. package/src/runtime/services/__tests__/analyze-conversation.test.ts +6 -0
  319. package/src/runtime/services/analyze-conversation.ts +2 -2
  320. package/src/schedule/schedule-store.ts +20 -1
  321. package/src/schedule/schedule-usage-store.ts +83 -0
  322. package/src/schedule/scheduler.ts +12 -5
  323. package/src/skills/catalog-files.ts +2 -2
  324. package/src/skills/catalog-install.ts +3 -0
  325. package/src/skills/categories-cache.ts +118 -0
  326. package/src/skills/clawhub-files.ts +1 -2
  327. package/src/skills/skillssh-files.ts +1 -2
  328. package/src/telemetry/types.ts +29 -1
  329. package/src/telemetry/usage-telemetry-reporter.test.ts +112 -3
  330. package/src/telemetry/usage-telemetry-reporter.ts +57 -2
  331. package/src/tools/executor.ts +1 -53
  332. package/src/tools/network/__tests__/web-search-metadata.test.ts +7 -1
  333. package/src/tools/network/__tests__/web-search.test.ts +11 -3
  334. package/src/tools/network/web-search-error.test.ts +248 -0
  335. package/src/tools/network/web-search-error.ts +267 -0
  336. package/src/tools/network/web-search.ts +207 -48
  337. package/src/tools/schedule/create.ts +2 -0
  338. package/src/tools/terminal/safe-env.ts +10 -1
  339. package/src/tools/ui-surface/definitions.ts +9 -1
  340. package/src/tts/__tests__/provider-catalog-consistency.test.ts +85 -1
  341. package/src/tts/provider-catalog.ts +76 -1
  342. package/src/util/mutex.ts +47 -0
  343. package/src/workspace/git-service.ts +1 -42
  344. package/src/workspace/migrations/095-bump-heartbeat-interval-30m-to-60m.ts +51 -0
  345. package/src/workspace/migrations/096-reduce-quality-profile-effort.ts +72 -0
  346. package/src/workspace/migrations/097-enable-adaptive-thinking-managed-profiles.ts +93 -0
  347. package/src/workspace/migrations/registry.ts +6 -0
  348. package/src/__tests__/bootstrap-turn-cleanup.test.ts +0 -44
  349. package/src/__tests__/empty-response-pipeline.test.ts +0 -423
  350. package/src/__tests__/llm-call-pipeline.test.ts +0 -287
  351. package/src/__tests__/memory-retrieval-pipeline.test.ts +0 -418
  352. package/src/__tests__/persistence-pipeline.test.ts +0 -503
  353. package/src/__tests__/title-generate-pipeline.test.ts +0 -211
  354. package/src/__tests__/token-estimate-pipeline.test.ts +0 -479
  355. package/src/__tests__/tool-error-pipeline.test.ts +0 -241
  356. package/src/__tests__/tool-execute-pipeline.test.ts +0 -417
  357. package/src/__tests__/tool-result-truncate-pipeline.test.ts +0 -341
  358. package/src/daemon/bootstrap-turn-cleanup.ts +0 -45
  359. package/src/gallery/default-gallery.ts +0 -1359
  360. package/src/gallery/gallery-manifest.ts +0 -28
  361. package/src/home/feature-gate.ts +0 -22
  362. package/src/plugins/defaults/empty-response/middlewares/emptyResponse.ts +0 -22
  363. package/src/plugins/defaults/empty-response/terminal.ts +0 -106
  364. package/src/plugins/defaults/injectors/package.json +0 -15
  365. package/src/plugins/defaults/llm-call/middlewares/llmCall.ts +0 -17
  366. package/src/plugins/defaults/llm-call/package.json +0 -15
  367. package/src/plugins/defaults/llm-call/register.ts +0 -45
  368. package/src/plugins/defaults/memory-retrieval/middlewares/memoryRetrieval.ts +0 -17
  369. package/src/plugins/defaults/memory-retrieval/package.json +0 -15
  370. package/src/plugins/defaults/memory-retrieval/register.ts +0 -181
  371. package/src/plugins/defaults/persistence/middlewares/persistence.ts +0 -19
  372. package/src/plugins/defaults/persistence/package.json +0 -15
  373. package/src/plugins/defaults/persistence/register.ts +0 -38
  374. package/src/plugins/defaults/persistence/terminal.ts +0 -83
  375. package/src/plugins/defaults/title-generate/terminal.ts +0 -31
  376. package/src/plugins/defaults/token-estimate/middlewares/tokenEstimate.ts +0 -23
  377. package/src/plugins/defaults/token-estimate/package.json +0 -15
  378. package/src/plugins/defaults/token-estimate/register.ts +0 -34
  379. package/src/plugins/defaults/token-estimate/terminal.ts +0 -40
  380. package/src/plugins/defaults/tool-error/middlewares/toolError.ts +0 -21
  381. package/src/plugins/defaults/tool-error/terminal.ts +0 -47
  382. package/src/plugins/defaults/tool-execute/middlewares/toolExecute.ts +0 -23
  383. package/src/plugins/defaults/tool-execute/package.json +0 -15
  384. package/src/plugins/defaults/tool-execute/register.ts +0 -49
  385. package/src/plugins/defaults/tool-result-truncate/middlewares/toolResultTruncate.ts +0 -23
  386. package/src/plugins/defaults/tool-result-truncate/types.ts +0 -22
  387. package/src/skills/category-inference.ts +0 -111
@@ -14,27 +14,11 @@
14
14
  import { createRequire } from "node:module";
15
15
  import { afterAll, beforeEach, describe, expect, mock, test } from "bun:test";
16
16
 
17
- import { CompactionCircuit } from "../agent/compaction-circuit.js";
18
- import type {
19
- AgentEvent,
20
- AgentLoopRunOptions,
21
- AgentLoopRunResult,
22
- MidLoopCompaction,
23
- } from "../agent/loop.js";
17
+ import type { LoopToolExecutor } from "../agent/loop.js";
24
18
  import type { LLMConfig } from "../config/schemas/llm.js";
25
- import type { ContextWindowResult } from "../context/window-manager.js";
26
19
  import type { ServerMessage } from "../daemon/message-protocol.js";
27
- import { defaultCompactionTerminal } from "../plugins/defaults/compaction/terminal.js";
28
20
  import { resetPluginRegistryAndRegisterDefaults } from "../plugins/defaults/index.js";
29
- import { DEFAULT_TIMEOUTS, runPipeline } from "../plugins/pipeline.js";
30
- import { getMiddlewaresFor } from "../plugins/registry.js";
31
- import type {
32
- CompactionArgs,
33
- CompactionResult,
34
- TurnContext,
35
- } from "../plugins/types.js";
36
- import { PluginTimeoutError } from "../plugins/types.js";
37
- import type { ContentBlock, Message } from "../providers/types.js";
21
+ import type { Message, Provider, ToolDefinition } from "../providers/types.js";
38
22
 
39
23
  const conversationCrudRealSnapshot = {
40
24
  ...(createRequire(import.meta.url)(
@@ -103,6 +87,7 @@ mock.module("../config/loader.js", () => ({
103
87
  memory: { retrieval: { scratchpadInjection: { enabled: true } } },
104
88
  ui: {},
105
89
  compaction: { enabled: true, autoThreshold: 0.7 },
90
+ conversations: { skipAutoRetitling: true },
106
91
  }),
107
92
  loadRawConfig: () => ({}),
108
93
  saveRawConfig: () => {},
@@ -114,10 +99,10 @@ mock.module("../config/loader.js", () => ({
114
99
  // Token estimator — controllable per-test via mockEstimateTokens.
115
100
  // Can be a number (constant), a no-arg function, or a function that
116
101
  // receives the messages array for dynamic behavior based on content.
117
- // Both the calibrated entry point (`estimatePromptTokens`, used in the
118
- // convergence path) and the raw entry point (`estimatePromptTokensRaw`,
119
- // used by the default `tokenEstimate` plugin pipeline for preflight/mid-
120
- // loop) are stubbed so either call site can drive the test.
102
+ // Both the calibrated entry point (`estimatePromptTokens`, which backs the
103
+ // preflight overflow gate and the convergence path) and the raw entry point
104
+ // (`estimatePromptTokensRaw`, used by the pre-send calibration capture) are
105
+ // stubbed so either call site can drive the test.
121
106
  let mockEstimateTokens: number | ((msgs?: Message[]) => number) = 1000;
122
107
  mock.module("../context/token-estimator.js", () => ({
123
108
  estimatePromptTokens: (msgs: Message[]) =>
@@ -128,8 +113,16 @@ mock.module("../context/token-estimator.js", () => ({
128
113
  typeof mockEstimateTokens === "function"
129
114
  ? mockEstimateTokens(msgs)
130
115
  : mockEstimateTokens,
131
- // Default plugin multiplies-in tool tokens via this helper; 0 keeps the
132
- // stubbed raw value unchanged.
116
+ // The preflight overflow gate calls this calibrated wrapper directly, so it
117
+ // must honor `mockEstimateTokens` too — otherwise the real implementation
118
+ // (which sums tool tokens onto the real calibrated estimate) ignores the
119
+ // per-test value and the overflow scenarios below never trigger.
120
+ estimatePromptTokensWithTools: (history: Message[]) =>
121
+ typeof mockEstimateTokens === "function"
122
+ ? mockEstimateTokens(history)
123
+ : mockEstimateTokens,
124
+ // `estimatePromptTokensWithTools` folds tool tokens in via this helper; 0
125
+ // keeps the stubbed value unchanged.
133
126
  estimateToolsTokens: () => 0,
134
127
  // Conversation agent loop now calls this helper to canonicalize the
135
128
  // provider key shared with the calibration system. The tests here
@@ -281,15 +274,6 @@ mock.module("../daemon/conversation-runtime-assembly.js", () => ({
281
274
  blocks: {},
282
275
  }),
283
276
  stripInjectionsForCompaction: (msgs: Message[]) => msgs,
284
- findLastInjectedNowContent: () => null,
285
- readNowScratchpad: () => null,
286
- readPkbContext: () => null,
287
- getPkbAutoInjectList: () => [
288
- "INDEX.md",
289
- "essentials.md",
290
- "threads.md",
291
- "buffer.md",
292
- ],
293
277
  isSlackChannelConversation: () => false,
294
278
  getSlackCompactionWatermarkForPrefix: () => null,
295
279
  loadSlackChronologicalContext: () => null,
@@ -437,179 +421,55 @@ mock.module("../memory/archive-store.js", () => ({
437
421
 
438
422
  // ── Imports (after mocks) ────────────────────────────────────────────
439
423
 
424
+ import { AgentLoop } from "../agent/loop.js";
440
425
  import {
441
426
  type AgentLoopConversationContext,
442
427
  runAgentLoopImpl,
443
428
  } from "../daemon/conversation-agent-loop.js";
429
+ import {
430
+ createMockProvider,
431
+ type ScriptedResponse,
432
+ textResponse,
433
+ toolUseResponse,
434
+ } from "./helpers/mock-provider.js";
444
435
 
445
436
  // ── Test helpers ─────────────────────────────────────────────────────
446
437
 
447
- type AgentLoopRun = (
448
- messages: Message[],
449
- onEvent: (event: AgentEvent) => void,
450
- options?: AgentLoopRunOptions,
451
- ) => Promise<Message[]>;
452
-
453
- /**
454
- * Faithful re-implementation of `AgentLoop.compact()` for the mock loop: run
455
- * the compaction pipeline against the supplied turn context (which carries the
456
- * test's `contextWindowManager`), invoke the orchestrator-supplied hooks, and
457
- * return the continuation history — or `null` on timeout/exhaustion so the
458
- * caller yields "budget".
459
- */
460
- async function simulateInlineCompaction(
461
- compaction: MidLoopCompaction,
462
- history: Message[],
463
- turnContext: TurnContext | undefined,
464
- signal: AbortSignal | undefined,
465
- onEvent: (event: AgentEvent) => void | Promise<void>,
466
- compactionCircuit: CompactionCircuit,
467
- ): Promise<Message[] | null> {
468
- await onEvent({ type: "context_compacting" });
469
- const { rawHistory, options } = compaction.prepare(history);
470
- let result: CompactionResult;
471
- try {
472
- result = await runPipeline<CompactionArgs, CompactionResult>(
473
- "compaction",
474
- getMiddlewaresFor("compaction"),
475
- (args) => defaultCompactionTerminal(args, turnContext as TurnContext),
476
- { messages: rawHistory, signal, options },
477
- turnContext as TurnContext,
478
- DEFAULT_TIMEOUTS.compaction,
479
- );
480
- } catch (error) {
481
- if (error instanceof PluginTimeoutError) {
482
- await compactionCircuit.recordOutcome(
483
- {
484
- currentRequestId: turnContext?.requestId,
485
- currentTurnTrustContext: turnContext?.trust,
486
- turnCount: turnContext?.turnIndex ?? 0,
487
- },
488
- true,
489
- onEvent,
490
- );
491
- return null;
492
- }
493
- throw error;
494
- }
495
- const compactResult = result as ContextWindowResult;
496
- if (compactResult.summaryFailed !== undefined) {
497
- await compactionCircuit.recordOutcome(
498
- {
499
- currentRequestId: turnContext?.requestId,
500
- currentTurnTrustContext: turnContext?.trust,
501
- turnCount: turnContext?.turnIndex ?? 0,
502
- },
503
- compactResult.summaryFailed,
504
- onEvent,
505
- );
506
- }
507
- if (compactResult.compacted) {
508
- await compaction.applyResult(compactResult, rawHistory);
509
- }
510
- if (compactResult.exhausted ?? false) {
511
- return null;
512
- }
513
- return compaction.reinject();
514
- }
515
-
516
- /**
517
- * Adapt a `Message[]`-returning mock loop body into `run()`'s real result
518
- * shape. Mirrors the production loop: the pause-reason carried back is
519
- * whatever the most recent `onCheckpoint` call yielded with (null when it
520
- * never yielded), so the orchestrator derives its yield bookkeeping the same
521
- * way it does against the real loop.
522
- */
523
- const asAgentLoopRun = (
524
- fn: AgentLoopRun,
525
- compactionCircuit: CompactionCircuit,
526
- ): ((
527
- messages: Message[],
528
- onEvent: (event: AgentEvent) => void | Promise<void>,
529
- options?: AgentLoopRunOptions,
530
- ) => Promise<AgentLoopRunResult>) => {
531
- return async (messages, onEvent, options) => {
532
- let exitReason: AgentLoopRunResult["exitReason"] = null;
533
- let wrapped = options;
534
- if (options?.onCheckpoint) {
535
- const inner = options.onCheckpoint;
536
- wrapped = {
537
- ...options,
538
- onCheckpoint: async (info) => {
539
- // Handoff is offered first, mirroring the loop's ordering.
540
- const decision = await inner(info);
541
- if (decision !== "continue") {
542
- exitReason = decision;
543
- return decision;
544
- }
545
- // The mid-loop budget gate and inline compaction both live inside
546
- // `AgentLoop.run`. Replicate them here — same formula, stubbed
547
- // estimator, and the loop's own `compact()` ceremony — so these
548
- // orchestrator tests drive the real escalation path now that the
549
- // orchestrator's `onCheckpoint` is handoff-only and compaction
550
- // runs inline rather than via an orchestrator re-entry loop.
551
- const contextWindow = options.resolveContextWindow?.();
552
- if (contextWindow?.overflowRecovery.enabled) {
553
- const { maxInputTokens, overflowRecovery } = contextWindow;
554
- const safetyMargin =
555
- info.history.length > 50
556
- ? Math.max(overflowRecovery.safetyMarginRatio, 0.15)
557
- : overflowRecovery.safetyMarginRatio;
558
- const preflightBudget = Math.floor(
559
- maxInputTokens * (1 - safetyMargin),
560
- );
561
- const estimated =
562
- typeof mockEstimateTokens === "function"
563
- ? mockEstimateTokens(info.history)
564
- : mockEstimateTokens;
565
- if (estimated > preflightBudget * 0.85) {
566
- // Mirror `AgentLoop.compact()`: when a compaction path is
567
- // supplied, run it in place and continue; on timeout or
568
- // exhaustion it returns null, so the loop yields "budget".
569
- const compacted = options.compaction
570
- ? await simulateInlineCompaction(
571
- options.compaction,
572
- info.history,
573
- options.turnContext,
574
- options.signal,
575
- onEvent,
576
- compactionCircuit,
577
- )
578
- : null;
579
- if (compacted) {
580
- exitReason = null;
581
- return "continue";
582
- }
583
- exitReason = "budget";
584
- return "budget";
585
- }
586
- }
587
- exitReason = null;
588
- return "continue";
589
- },
590
- };
591
- }
592
- const history = await fn(messages, onEvent, wrapped);
593
- return { history, exitReason };
594
- };
595
- };
596
-
597
438
  function makeCtx(
598
439
  overrides?: Partial<AgentLoopConversationContext> & {
599
- agentLoopRun?: AgentLoopRun;
440
+ providerResponses?: ScriptedResponse[];
441
+ loopProvider?: Provider;
442
+ loopTools?: ToolDefinition[];
443
+ toolExecutor?: LoopToolExecutor;
600
444
  },
601
445
  ): AgentLoopConversationContext {
602
- const agentLoopRun =
603
- overrides?.agentLoopRun ??
604
- (async (messages: Message[]) => [
605
- ...messages,
606
- {
607
- role: "assistant" as const,
608
- content: [{ type: "text" as const, text: "response" }],
609
- },
610
- ]);
611
-
612
- const compactionCircuit = new CompactionCircuit("test-conv");
446
+ const {
447
+ providerResponses,
448
+ loopProvider,
449
+ loopTools,
450
+ toolExecutor,
451
+ ...ctxOverrides
452
+ } = overrides ?? {};
453
+ const conversationId = ctxOverrides.conversationId ?? "test-conv";
454
+
455
+ // Drive the real `AgentLoop` against a scripted provider, mocking only the
456
+ // provider HTTP boundary. The loop owns its mid-loop budget gate, inline
457
+ // compaction, and event emission, so these overflow tests exercise the real
458
+ // escalation/persistence path.
459
+ const loopProviderName =
460
+ (ctxOverrides.provider as { name?: string } | undefined)?.name ??
461
+ "mock-provider";
462
+ const provider =
463
+ loopProvider ??
464
+ createMockProvider(
465
+ providerResponses ?? [textResponse("response")],
466
+ loopProviderName,
467
+ ).provider;
468
+ const agentLoop = new AgentLoop(provider, "system prompt", {
469
+ conversationId,
470
+ tools: loopTools ?? [],
471
+ toolExecutor,
472
+ });
613
473
 
614
474
  return {
615
475
  conversationId: "test-conv",
@@ -617,19 +477,16 @@ function makeCtx(
617
477
  { role: "user", content: [{ type: "text", text: "Hello" }] },
618
478
  ] as Message[],
619
479
  processing: true,
480
+ isProcessing(this: { processing: boolean }) {
481
+ return this.processing;
482
+ },
483
+ setProcessing(this: { processing: boolean }, value: boolean) {
484
+ this.processing = value;
485
+ },
620
486
  abortController: new AbortController(),
621
487
  currentRequestId: "test-req",
622
488
 
623
- agentLoop: {
624
- run: asAgentLoopRun(agentLoopRun, compactionCircuit),
625
- getToolTokenBudget: () => 0,
626
- getResolvedTools: () => [],
627
- // Tests in this file don't exercise calibration, so returning
628
- // undefined is fine — the estimator falls back to the per-provider
629
- // aggregate key.
630
- getActiveModel: () => undefined,
631
- compactionCircuit,
632
- } as unknown as AgentLoopConversationContext["agentLoop"],
489
+ agentLoop,
633
490
  provider: {
634
491
  name: "mock-provider",
635
492
  sendMessage: async () => ({
@@ -722,9 +579,10 @@ function makeCtx(
722
579
  injectedTokens: 0,
723
580
  }),
724
581
  retrackCachedNodes: () => {},
582
+ recordPkbQueryVectors: () => {},
725
583
  } as unknown as AgentLoopConversationContext["graphMemory"],
726
584
 
727
- ...overrides,
585
+ ...ctxOverrides,
728
586
  } as AgentLoopConversationContext;
729
587
  }
730
588
 
@@ -802,6 +660,7 @@ beforeEach(() => {
802
660
 
803
661
  describe("session-agent-loop overflow recovery (JARVIS-110)", () => {
804
662
  test("usage update context max follows active main-agent profile budget", async () => {
663
+ // GIVEN an active main-agent profile that narrows the context budget
805
664
  mockLlmConfig = {
806
665
  ...structuredClone(defaultLlmConfig),
807
666
  activeProfile: "short-context",
@@ -813,27 +672,22 @@ describe("session-agent-loop overflow recovery (JARVIS-110)", () => {
813
672
  },
814
673
  };
815
674
 
675
+ // AND a provider turn that reports 12k input tokens of usage
816
676
  const ctx = makeCtx({
817
- agentLoopRun: async (messages, onEvent) => {
818
- onEvent({
819
- type: "usage",
820
- inputTokens: 12_000,
821
- outputTokens: 300,
677
+ providerResponses: [
678
+ {
679
+ content: [{ type: "text", text: "response" }],
822
680
  model: "mock-model",
823
- providerDurationMs: 25,
824
- });
825
- return [
826
- ...messages,
827
- {
828
- role: "assistant" as const,
829
- content: [{ type: "text" as const, text: "response" }],
830
- },
831
- ];
832
- },
681
+ usage: { inputTokens: 12_000, outputTokens: 300 },
682
+ stopReason: "end_turn",
683
+ },
684
+ ],
833
685
  });
834
686
 
687
+ // WHEN the turn runs to completion
835
688
  await runAgentLoopImpl(ctx, "hello", "msg-1", () => {});
836
689
 
690
+ // THEN the recorded main-agent usage carries the profile's max budget
837
691
  const mainAgentUsageCall = recordUsageMock.mock.calls.find(
838
692
  (call) => call[5] === "main_agent",
839
693
  );
@@ -846,10 +700,9 @@ describe("session-agent-loop overflow recovery (JARVIS-110)", () => {
846
700
 
847
701
  // ── Test 1 ────────────────────────────────────────────────────────
848
702
  // BUG: When the agent loop makes progress (adds messages to history)
849
- // before hitting context_too_large, the convergence loop at line 864
850
- // checks `updatedHistory.length === preRunHistoryLength` which is
851
- // false when progress was made. This means the reducer is never
852
- // invoked — the error is surfaced immediately at line 1163-1175
703
+ // before hitting context_too_large, the convergence loop's progress
704
+ // check must recognize that the loop appended messages. If it fails to,
705
+ // the reducer is never invoked — the error is surfaced immediately
853
706
  // without any compaction attempt.
854
707
  //
855
708
  // Expected behavior (PR 2 fix): After progress + context_too_large,
@@ -889,125 +742,31 @@ describe("session-agent-loop overflow recovery (JARVIS-110)", () => {
889
742
  };
890
743
  };
891
744
 
892
- let agentLoopCallCount = 0;
893
- const agentLoopRun: AgentLoopRun = async (messages, onEvent) => {
894
- // Prime the assistant row anchor — production code emits this from
895
- // `AgentLoop.run` just before `provider.sendMessage`.
896
- await onEvent({ type: "llm_call_started" });
897
- agentLoopCallCount++;
898
- if (agentLoopCallCount === 1) {
899
- // Simulate: agent makes progress (tool calls + results added)
900
- // then hits context_too_large on next LLM call
901
- const progressMessages: Message[] = [
902
- ...messages,
903
- {
904
- role: "assistant" as const,
905
- content: [
906
- { type: "text", text: "Let me check that." },
907
- {
908
- type: "tool_use",
909
- id: "tu-progress",
910
- name: "bash",
911
- input: { command: "ls" },
912
- },
913
- ] as ContentBlock[],
914
- },
915
- {
916
- role: "user" as const,
917
- content: [
918
- {
919
- type: "tool_result",
920
- tool_use_id: "tu-progress",
921
- content: "file1.ts\nfile2.ts",
922
- is_error: false,
923
- },
924
- ] as ContentBlock[],
925
- },
926
- ];
745
+ // Run 1 makes progress (a tool turn) then the following provider call
746
+ // rejects with a context_too_large error; after the convergence reducer
747
+ // compacts, the rerun recovers with plain text.
748
+ const { provider } = createMockProvider([
749
+ toolUseResponse("tu-progress", "bash", { command: "ls" }),
750
+ new Error("prompt is too long: 242201 tokens > 200000 maximum"),
751
+ textResponse("recovered after compaction"),
752
+ ]);
927
753
 
928
- // Emit events for the progress that was made
929
- onEvent({
930
- type: "tool_use",
931
- id: "tu-progress",
754
+ const ctx = makeCtx({
755
+ loopProvider: provider,
756
+ loopTools: [
757
+ {
932
758
  name: "bash",
933
- input: { command: "ls" },
934
- });
935
- onEvent({
936
- type: "tool_result",
937
- toolUseId: "tu-progress",
938
- content: "file1.ts\nfile2.ts",
939
- isError: false,
940
- });
941
- onEvent({
942
- type: "message_complete",
943
- message: {
944
- role: "assistant",
945
- content: [
946
- { type: "text", text: "Let me check that." },
947
- {
948
- type: "tool_use",
949
- id: "tu-progress",
950
- name: "bash",
951
- input: { command: "ls" },
952
- },
953
- ],
759
+ description: "Run a shell command",
760
+ input_schema: {
761
+ type: "object",
762
+ properties: { command: { type: "string" } },
954
763
  },
955
- });
956
- onEvent({
957
- type: "usage",
958
- inputTokens: 100,
959
- outputTokens: 50,
960
- model: "test-model",
961
- providerDurationMs: 100,
962
- });
963
-
964
- // Then context_too_large error occurs on the *next* LLM call
965
- onEvent({
966
- type: "error",
967
- error: new Error(
968
- "prompt is too long: 242201 tokens > 200000 maximum",
969
- ),
970
- });
971
- onEvent({
972
- type: "usage",
973
- inputTokens: 0,
974
- outputTokens: 0,
975
- model: "test-model",
976
- providerDurationMs: 10,
977
- });
978
-
979
- // Return the history WITH progress (more messages than input)
980
- return progressMessages;
981
- }
982
-
983
- // Second call (after compaction): succeed
984
- onEvent({
985
- type: "message_complete",
986
- message: {
987
- role: "assistant",
988
- content: [{ type: "text", text: "recovered after compaction" }],
989
- },
990
- });
991
- onEvent({
992
- type: "usage",
993
- inputTokens: 50,
994
- outputTokens: 25,
995
- model: "test-model",
996
- providerDurationMs: 100,
997
- });
998
- return [
999
- ...messages,
1000
- {
1001
- role: "assistant" as const,
1002
- content: [
1003
- { type: "text", text: "recovered after compaction" },
1004
- ] as ContentBlock[],
1005
764
  },
1006
- ];
1007
- };
1008
-
1009
- const ctx = makeCtx({
1010
- agentLoopRun,
765
+ ],
766
+ toolExecutor: async () => ({
767
+ content: "file1.ts\nfile2.ts",
768
+ isError: false,
769
+ }),
1011
770
  contextWindowManager: {
1012
771
  shouldCompact: () => ({ needed: false, estimatedTokens: 0 }),
1013
772
  maybeCompact: async () => ({ compacted: false }),
@@ -1036,13 +795,14 @@ describe("session-agent-loop overflow recovery (JARVIS-110)", () => {
1036
795
  // This test should PASS against current code (when no progress is made).
1037
796
  test("overflow recovery compacts below limit even when estimation underestimates", async () => {
1038
797
  const events: ServerMessage[] = [];
1039
- let callCount = 0;
1040
798
  let reducerCalled = false;
1041
799
 
1042
- // Estimator says 185k (below 190k budget = 200k * 0.95)
800
+ // GIVEN the estimator reports 185k — under the 190k preflight budget
801
+ // (200k * 0.95), so the turn proceeds to the provider rather than
802
+ // compacting up front.
1043
803
  mockEstimateTokens = 185_000;
1044
804
 
1045
- // Reducer successfully compacts
805
+ // AND the post-run convergence reducer successfully compacts
1046
806
  mockReducerStepFn = (msgs: Message[]) => {
1047
807
  reducerCalled = true;
1048
808
  return {
@@ -1072,96 +832,46 @@ describe("session-agent-loop overflow recovery (JARVIS-110)", () => {
1072
832
  };
1073
833
  };
1074
834
 
1075
- const agentLoopRun: AgentLoopRun = async (messages, onEvent) => {
1076
- // Prime the assistant row anchor — production code emits this from
1077
- // `AgentLoop.run` just before `provider.sendMessage`.
1078
- await onEvent({ type: "llm_call_started" });
1079
- callCount++;
1080
- if (callCount === 1) {
1081
- // Provider rejects with "prompt is too long: 242201 tokens > 200000"
1082
- // even though estimator said 185k
1083
- onEvent({
1084
- type: "error",
1085
- error: new Error(
1086
- "prompt is too long: 242201 tokens > 200000 maximum",
1087
- ),
1088
- });
1089
- onEvent({
1090
- type: "usage",
1091
- inputTokens: 0,
1092
- outputTokens: 0,
1093
- model: "test-model",
1094
- providerDurationMs: 10,
1095
- });
1096
- // No progress — return same messages
1097
- return messages;
1098
- }
1099
- // Second call succeeds
1100
- onEvent({
1101
- type: "message_complete",
1102
- message: {
1103
- role: "assistant",
1104
- content: [{ type: "text", text: "recovered" }],
1105
- },
1106
- });
1107
- onEvent({
1108
- type: "usage",
1109
- inputTokens: 80_000,
1110
- outputTokens: 200,
1111
- model: "test-model",
1112
- providerDurationMs: 500,
1113
- });
1114
- return [
1115
- ...messages,
1116
- {
1117
- role: "assistant" as const,
1118
- content: [{ type: "text", text: "recovered" }] as ContentBlock[],
1119
- },
1120
- ];
1121
- };
835
+ // AND a provider that rejects the first call as too long (revealing the
836
+ // real 242k count the estimator missed), then succeeds on the rerun.
837
+ const { provider, calls } = createMockProvider([
838
+ new Error("prompt is too long: 242201 tokens > 200000 maximum"),
839
+ textResponse("recovered"),
840
+ ]);
1122
841
 
1123
842
  const ctx = makeCtx({
1124
- agentLoopRun,
843
+ loopProvider: provider,
1125
844
  contextWindowManager: {
1126
845
  shouldCompact: () => ({ needed: false, estimatedTokens: 0 }),
1127
846
  maybeCompact: async () => ({ compacted: false }),
1128
847
  } as unknown as AgentLoopConversationContext["contextWindowManager"],
1129
848
  });
1130
849
 
850
+ // WHEN the turn runs
1131
851
  await runAgentLoopImpl(ctx, "hello", "msg-1", (msg) => events.push(msg));
1132
852
 
1133
- // The reducer should be called in the convergence loop
853
+ // THEN the convergence reducer ran and the rerun recovered without a
854
+ // user-facing conversation_error.
1134
855
  expect(reducerCalled).toBe(true);
1135
- // Should recover without conversation_error
1136
856
  const conversationError = events.find(
1137
857
  (e) => e.type === "conversation_error",
1138
858
  );
1139
859
  expect(conversationError).toBeUndefined();
1140
- expect(callCount).toBe(2);
860
+ expect(calls.length).toBe(2);
1141
861
  });
1142
862
 
1143
863
  // ── Test 3 ────────────────────────────────────────────────────────
1144
- // BUG: When the provider rejection reveals actual token count (e.g.,
1145
- // "242201 tokens > 200000"), the reducer should target a budget below
1146
- // the actual limit (not below the estimator's inaccurate budget).
1147
- // Currently the reducer always uses `preflightBudget` (190k) as the
1148
- // target, but the actual tokens were 242k so 190k is already too
1149
- // high relative to the real count. The target should be adjusted
1150
- // downward based on the observed mismatch.
1151
- //
1152
- // Expected behavior (PR 4 fix): `targetInputTokensOverride` should
1153
- // be adjusted based on the ratio between estimated and actual tokens.
1154
- // BUG: The targetTokens passed to the reducer is preflightBudget = 190k.
1155
- // But when the actual token count is 242k (1.31x the estimate of 185k),
1156
- // the target should be adjusted downward to account for the estimation
1157
- // inaccuracy. For example: 190k / 1.31 ≈ 145k.
1158
- // Planned fix: targetInputTokensOverride should be adjusted based on
1159
- // the ratio between estimated and actual tokens.
864
+ // When the provider rejection reveals the actual token count (e.g.,
865
+ // "242201 tokens > 200000"), the overflow reducer's `targetTokens`
866
+ // should be a budget below the actual limit, not below the estimator's
867
+ // inaccurate budget. With a preflightBudget of 190k but an actual count
868
+ // of 242k (1.31x the estimate of 185k), the target is adjusted downward
869
+ // based on the observed mismatch (190k / 1.31 ≈ 145k) so the reducer
870
+ // converges toward the real ceiling rather than the optimistic estimate.
1160
871
  test.todo(
1161
872
  "forced compaction targets a lower budget when estimation has been inaccurate",
1162
873
  async () => {
1163
874
  const events: ServerMessage[] = [];
1164
- let callCount = 0;
1165
875
  let capturedTargetTokens: number | undefined;
1166
876
 
1167
877
  // Estimator says 185k (below 190k budget = 200k * 0.95)
@@ -1197,55 +907,16 @@ describe("session-agent-loop overflow recovery (JARVIS-110)", () => {
1197
907
  };
1198
908
  };
1199
909
 
1200
- const agentLoopRun: AgentLoopRun = async (messages, onEvent) => {
1201
- // Prime the assistant row anchor — production code emits this from
1202
- // `AgentLoop.run` just before `provider.sendMessage`.
1203
- await onEvent({ type: "llm_call_started" });
1204
- callCount++;
1205
- if (callCount === 1) {
1206
- // Provider rejects: actual tokens 242201, way above estimate of 185k
1207
- onEvent({
1208
- type: "error",
1209
- error: new Error(
1210
- "prompt is too long: 242201 tokens > 200000 maximum",
1211
- ),
1212
- });
1213
- onEvent({
1214
- type: "usage",
1215
- inputTokens: 0,
1216
- outputTokens: 0,
1217
- model: "test-model",
1218
- providerDurationMs: 10,
1219
- });
1220
- // No progress — return same messages
1221
- return messages;
1222
- }
1223
- // Second call succeeds after compaction
1224
- onEvent({
1225
- type: "message_complete",
1226
- message: {
1227
- role: "assistant",
1228
- content: [{ type: "text", text: "recovered" }],
1229
- },
1230
- });
1231
- onEvent({
1232
- type: "usage",
1233
- inputTokens: 80_000,
1234
- outputTokens: 200,
1235
- model: "test-model",
1236
- providerDurationMs: 500,
1237
- });
1238
- return [
1239
- ...messages,
1240
- {
1241
- role: "assistant" as const,
1242
- content: [{ type: "text", text: "recovered" }] as ContentBlock[],
1243
- },
1244
- ];
1245
- };
910
+ // The provider rejects the first call with a context_too_large error
911
+ // (actual tokens 242201, far above the 185k estimate); after forced
912
+ // compaction re-targets a lower budget, the rerun recovers with text.
913
+ const { provider, calls } = createMockProvider([
914
+ new Error("prompt is too long: 242201 tokens > 200000 maximum"),
915
+ textResponse("recovered"),
916
+ ]);
1246
917
 
1247
918
  const ctx = makeCtx({
1248
- agentLoopRun,
919
+ loopProvider: provider,
1249
920
  contextWindowManager: {
1250
921
  shouldCompact: () => ({ needed: false, estimatedTokens: 0 }),
1251
922
  maybeCompact: async () => ({ compacted: false }),
@@ -1275,7 +946,7 @@ describe("session-agent-loop overflow recovery (JARVIS-110)", () => {
1275
946
  (e) => e.type === "conversation_error",
1276
947
  );
1277
948
  expect(conversationError).toBeUndefined();
1278
- expect(callCount).toBe(2);
949
+ expect(calls.length).toBe(2);
1279
950
  },
1280
951
  );
1281
952
 
@@ -1289,7 +960,6 @@ describe("session-agent-loop overflow recovery (JARVIS-110)", () => {
1289
960
  async () => {
1290
961
  const events: ServerMessage[] = [];
1291
962
  const longHistory = buildLongConversation(75);
1292
- let callCount = 0;
1293
963
  let reducerCalled = false;
1294
964
 
1295
965
  // Estimator says ~195k — just above budget so preflight reducer runs
@@ -1325,38 +995,14 @@ describe("session-agent-loop overflow recovery (JARVIS-110)", () => {
1325
995
  };
1326
996
  };
1327
997
 
1328
- const agentLoopRun: AgentLoopRun = async (messages, onEvent) => {
1329
- // Prime the assistant row anchor — production code emits this from
1330
- // `AgentLoop.run` just before `provider.sendMessage`.
1331
- await onEvent({ type: "llm_call_started" });
1332
- callCount++;
1333
- onEvent({
1334
- type: "message_complete",
1335
- message: {
1336
- role: "assistant",
1337
- content: [{ type: "text", text: "Here's the analysis..." }],
1338
- },
1339
- });
1340
- onEvent({
1341
- type: "usage",
1342
- inputTokens: 50_000,
1343
- outputTokens: 300,
1344
- model: "test-model",
1345
- providerDurationMs: 800,
1346
- });
1347
- return [
1348
- ...messages,
1349
- {
1350
- role: "assistant" as const,
1351
- content: [
1352
- { type: "text", text: "Here's the analysis..." },
1353
- ] as ContentBlock[],
1354
- },
1355
- ];
1356
- };
998
+ // After the preflight reducer compacts the long history under budget,
999
+ // a single provider call completes the turn with plain text.
1000
+ const { provider, calls } = createMockProvider([
1001
+ textResponse("Here's the analysis..."),
1002
+ ]);
1357
1003
 
1358
1004
  const ctx = makeCtx({
1359
- agentLoopRun,
1005
+ loopProvider: provider,
1360
1006
  messages: longHistory,
1361
1007
  contextWindowManager: {
1362
1008
  shouldCompact: () => ({ needed: false, estimatedTokens: 0 }),
@@ -1371,7 +1017,7 @@ describe("session-agent-loop overflow recovery (JARVIS-110)", () => {
1371
1017
  // Preflight should trigger the reducer since 195k > 190k budget
1372
1018
  expect(reducerCalled).toBe(true);
1373
1019
  // Should succeed
1374
- expect(callCount).toBe(1);
1020
+ expect(calls.length).toBe(1);
1375
1021
  const conversationError = events.find(
1376
1022
  (e) => e.type === "conversation_error",
1377
1023
  );
@@ -1415,118 +1061,31 @@ describe("session-agent-loop overflow recovery (JARVIS-110)", () => {
1415
1061
  };
1416
1062
  };
1417
1063
 
1418
- let agentLoopCallCount = 0;
1419
- const agentLoopRun: AgentLoopRun = async (messages, onEvent) => {
1420
- // Prime the assistant row anchor — production code emits this from
1421
- // `AgentLoop.run` just before `provider.sendMessage`.
1422
- await onEvent({ type: "llm_call_started" });
1423
- agentLoopCallCount++;
1424
- if (agentLoopCallCount === 1) {
1425
- // Agent makes progress (tool calls succeed, messages grow)
1426
- const progressMessages: Message[] = [
1427
- ...messages,
1428
- {
1429
- role: "assistant" as const,
1430
- content: [
1431
- { type: "text", text: "Running analysis..." },
1432
- {
1433
- type: "tool_use",
1434
- id: "tu-1",
1435
- name: "bash",
1436
- input: { command: "find . -name '*.ts'" },
1437
- },
1438
- ] as ContentBlock[],
1439
- },
1440
- {
1441
- role: "user" as const,
1442
- content: [
1443
- {
1444
- type: "tool_result",
1445
- tool_use_id: "tu-1",
1446
- content: "file1.ts\nfile2.ts\nfile3.ts",
1447
- is_error: false,
1448
- },
1449
- ] as ContentBlock[],
1450
- },
1451
- ];
1064
+ // Run 1 makes progress (a tool turn) then the following provider call
1065
+ // rejects with context_too_large; after emergency compaction the rerun
1066
+ // recovers with plain text.
1067
+ const { provider } = createMockProvider([
1068
+ toolUseResponse("tu-1", "bash", { command: "find . -name '*.ts'" }),
1069
+ new Error("context_length_exceeded"),
1070
+ textResponse("recovered"),
1071
+ ]);
1452
1072
 
1453
- onEvent({
1454
- type: "tool_use",
1455
- id: "tu-1",
1073
+ const ctx = makeCtx({
1074
+ loopProvider: provider,
1075
+ loopTools: [
1076
+ {
1456
1077
  name: "bash",
1457
- input: { command: "find . -name '*.ts'" },
1458
- });
1459
- onEvent({
1460
- type: "tool_result",
1461
- toolUseId: "tu-1",
1462
- content: "file1.ts\nfile2.ts\nfile3.ts",
1463
- isError: false,
1464
- });
1465
- onEvent({
1466
- type: "message_complete",
1467
- message: {
1468
- role: "assistant",
1469
- content: [
1470
- { type: "text", text: "Running analysis..." },
1471
- {
1472
- type: "tool_use",
1473
- id: "tu-1",
1474
- name: "bash",
1475
- input: { command: "find . -name '*.ts'" },
1476
- },
1477
- ],
1078
+ description: "Run a shell command",
1079
+ input_schema: {
1080
+ type: "object",
1081
+ properties: { command: { type: "string" } },
1478
1082
  },
1479
- });
1480
- onEvent({
1481
- type: "usage",
1482
- inputTokens: 190_000,
1483
- outputTokens: 100,
1484
- model: "test-model",
1485
- providerDurationMs: 200,
1486
- });
1487
-
1488
- // Then context_too_large on the next LLM call within the loop
1489
- onEvent({
1490
- type: "error",
1491
- error: new Error("context_length_exceeded"),
1492
- });
1493
- onEvent({
1494
- type: "usage",
1495
- inputTokens: 0,
1496
- outputTokens: 0,
1497
- model: "test-model",
1498
- providerDurationMs: 10,
1499
- });
1500
-
1501
- return progressMessages;
1502
- }
1503
-
1504
- // After emergency compaction, succeed
1505
- onEvent({
1506
- type: "message_complete",
1507
- message: {
1508
- role: "assistant",
1509
- content: [{ type: "text", text: "recovered" }],
1510
1083
  },
1511
- });
1512
- onEvent({
1513
- type: "usage",
1514
- inputTokens: 50_000,
1515
- outputTokens: 100,
1516
- model: "test-model",
1517
- providerDurationMs: 200,
1518
- });
1519
- return [
1520
- ...messages,
1521
- {
1522
- role: "assistant" as const,
1523
- content: [{ type: "text", text: "recovered" }] as ContentBlock[],
1524
- },
1525
- ];
1526
- };
1527
-
1528
- const ctx = makeCtx({
1529
- agentLoopRun,
1084
+ ],
1085
+ toolExecutor: async () => ({
1086
+ content: "file1.ts\nfile2.ts\nfile3.ts",
1087
+ isError: false,
1088
+ }),
1530
1089
  contextWindowManager: {
1531
1090
  shouldCompact: () => ({ needed: false, estimatedTokens: 0 }),
1532
1091
  maybeCompact: async (
@@ -1603,111 +1162,30 @@ describe("session-agent-loop overflow recovery (JARVIS-110)", () => {
1603
1162
  return 170_000;
1604
1163
  };
1605
1164
 
1606
- let agentLoopCallCount = 0;
1607
- const agentLoopRun: AgentLoopRun = async (messages, onEvent, options) => {
1608
- // Prime the assistant row anchor — production code emits this from
1609
- // `AgentLoop.run` just before `provider.sendMessage`.
1610
- await onEvent({ type: "llm_call_started" });
1611
- agentLoopCallCount++;
1612
-
1613
- if (agentLoopCallCount === 1) {
1614
- // Simulate a tool round: assistant calls a tool, results come back
1615
- const withProgress: Message[] = [
1616
- ...messages,
1617
- {
1618
- role: "assistant" as const,
1619
- content: [
1620
- { type: "text", text: "Let me check." },
1621
- {
1622
- type: "tool_use",
1623
- id: "tu-1",
1624
- name: "bash",
1625
- input: { command: "ls" },
1626
- },
1627
- ] as ContentBlock[],
1628
- },
1629
- {
1630
- role: "user" as const,
1631
- content: [
1632
- {
1633
- type: "tool_result",
1634
- tool_use_id: "tu-1",
1635
- content: "file1.ts\nfile2.ts",
1636
- is_error: false,
1637
- },
1638
- ] as ContentBlock[],
1639
- },
1640
- ];
1641
-
1642
- onEvent({
1643
- type: "message_complete",
1644
- message: {
1645
- role: "assistant",
1646
- content: [
1647
- { type: "text", text: "Let me check." },
1648
- {
1649
- type: "tool_use",
1650
- id: "tu-1",
1651
- name: "bash",
1652
- input: { command: "ls" },
1653
- },
1654
- ],
1655
- },
1656
- });
1657
- onEvent({
1658
- type: "usage",
1659
- inputTokens: 100,
1660
- outputTokens: 50,
1661
- model: "test-model",
1662
- providerDurationMs: 100,
1663
- });
1664
-
1665
- // Call onCheckpoint — this should trigger the mid-loop budget check
1666
- // which sees 170_000 > 161_500 and returns "yield"
1667
- if (options?.onCheckpoint) {
1668
- const decision = await options.onCheckpoint({
1669
- turnIndex: 0,
1670
- toolCount: 1,
1671
- hasToolUse: true,
1672
- history: withProgress,
1673
- });
1674
- if (decision !== "continue") {
1675
- // Agent loop stops when checkpoint yields
1676
- return withProgress;
1677
- }
1678
- }
1679
-
1680
- return withProgress;
1681
- }
1165
+ // A tool round trips the mid-loop budget gate (170k > 161_500); the
1166
+ // gate compacts in place (productive) and the loop continues, so the
1167
+ // post-compaction provider call completes the turn with plain text.
1168
+ const { provider, calls } = createMockProvider([
1169
+ toolUseResponse("tu-1", "bash", { command: "ls" }),
1170
+ textResponse("done after compaction"),
1171
+ ]);
1682
1172
 
1683
- // Second call (after compaction): complete successfully
1684
- onEvent({
1685
- type: "message_complete",
1686
- message: {
1687
- role: "assistant",
1688
- content: [{ type: "text", text: "done after compaction" }],
1689
- },
1690
- });
1691
- onEvent({
1692
- type: "usage",
1693
- inputTokens: 50,
1694
- outputTokens: 25,
1695
- model: "test-model",
1696
- providerDurationMs: 100,
1697
- });
1698
- return [
1699
- ...messages,
1173
+ const ctx = makeCtx({
1174
+ loopProvider: provider,
1175
+ loopTools: [
1700
1176
  {
1701
- role: "assistant" as const,
1702
- content: [
1703
- { type: "text", text: "done after compaction" },
1704
- ] as ContentBlock[],
1177
+ name: "bash",
1178
+ description: "Run a shell command",
1179
+ input_schema: {
1180
+ type: "object",
1181
+ properties: { command: { type: "string" } },
1182
+ },
1705
1183
  },
1706
- ];
1707
- };
1708
-
1709
- const ctx = makeCtx({
1710
- agentLoopRun,
1184
+ ],
1185
+ toolExecutor: async () => ({
1186
+ content: "file1.ts\nfile2.ts",
1187
+ isError: false,
1188
+ }),
1711
1189
  contextWindowManager: {
1712
1190
  shouldCompact: () => ({ needed: false, estimatedTokens: 0 }),
1713
1191
  maybeCompact: async () => {
@@ -1741,8 +1219,9 @@ describe("session-agent-loop overflow recovery (JARVIS-110)", () => {
1741
1219
  // The mid-loop budget check should have triggered compaction
1742
1220
  expect(compactionCalled).toBe(true);
1743
1221
 
1744
- // Agent loop should have been called twice: once before yield, once after compaction
1745
- expect(agentLoopCallCount).toBe(2);
1222
+ // Provider called twice: the tool turn that tripped the gate, then the
1223
+ // post-compaction turn that completed the run.
1224
+ expect(calls.length).toBe(2);
1746
1225
 
1747
1226
  // No conversation_error should be emitted
1748
1227
  const conversationError = events.find(
@@ -1783,104 +1262,36 @@ describe("session-agent-loop overflow recovery (JARVIS-110)", () => {
1783
1262
  return 175_000;
1784
1263
  };
1785
1264
 
1786
- let agentLoopCallCount = 0;
1787
1265
  let contextTooLargeEmitted = false;
1788
1266
 
1789
- const agentLoopRun: AgentLoopRun = async (messages, onEvent, options) => {
1790
- // Prime the assistant row anchor — production code emits this from
1791
- // `AgentLoop.run` just before `provider.sendMessage`.
1792
- await onEvent({ type: "llm_call_started" });
1793
- agentLoopCallCount++;
1794
-
1795
- if (agentLoopCallCount === 1) {
1796
- const currentHistory = [...messages];
1797
-
1798
- // Simulate 5 tool rounds — but the checkpoint should yield at round 3
1799
- for (let i = 0; i < 5; i++) {
1800
- const toolId = `tu-${i}`;
1801
- const assistantMsg: Message = {
1802
- role: "assistant" as const,
1803
- content: [
1804
- { type: "text", text: `Step ${i}` },
1805
- {
1806
- type: "tool_use",
1807
- id: toolId,
1808
- name: "bash",
1809
- input: { command: `cmd-${i}` },
1810
- },
1811
- ] as ContentBlock[],
1812
- };
1813
- const resultMsg: Message = {
1814
- role: "user" as const,
1815
- content: [
1816
- {
1817
- type: "tool_result",
1818
- tool_use_id: toolId,
1819
- content: "x".repeat(10_000),
1820
- is_error: false,
1821
- },
1822
- ] as ContentBlock[],
1823
- };
1824
- currentHistory.push(assistantMsg, resultMsg);
1825
-
1826
- onEvent({
1827
- type: "message_complete",
1828
- message: assistantMsg,
1829
- });
1830
- onEvent({
1831
- type: "usage",
1832
- inputTokens: 50_000 + i * 20_000,
1833
- outputTokens: 50,
1834
- model: "test-model",
1835
- providerDurationMs: 100,
1836
- });
1837
-
1838
- if (options?.onCheckpoint) {
1839
- const decision = await options.onCheckpoint({
1840
- turnIndex: i,
1841
- toolCount: 1,
1842
- hasToolUse: true,
1843
- history: currentHistory,
1844
- });
1845
- if (decision !== "continue") {
1846
- return currentHistory;
1847
- }
1848
- }
1849
- }
1267
+ // Each tool round produces a large result; the estimate grows with each
1268
+ // checkpoint until tool round 3 trips the mid-loop gate (175k > 161_500).
1269
+ // Compaction runs in place (productive) and the loop continues, so the
1270
+ // following plain-text provider call completes the turn. The provider
1271
+ // never rejects with context_too_large.
1272
+ const { provider, calls } = createMockProvider([
1273
+ toolUseResponse("tu-0", "bash", { command: "cmd-0" }),
1274
+ toolUseResponse("tu-1", "bash", { command: "cmd-1" }),
1275
+ toolUseResponse("tu-2", "bash", { command: "cmd-2" }),
1276
+ textResponse("completed after mid-loop compaction"),
1277
+ ]);
1850
1278
 
1851
- return currentHistory;
1852
- }
1853
-
1854
- // Second call (after compaction): complete
1855
- onEvent({
1856
- type: "message_complete",
1857
- message: {
1858
- role: "assistant",
1859
- content: [
1860
- { type: "text", text: "completed after mid-loop compaction" },
1861
- ],
1862
- },
1863
- });
1864
- onEvent({
1865
- type: "usage",
1866
- inputTokens: 60_000,
1867
- outputTokens: 100,
1868
- model: "test-model",
1869
- providerDurationMs: 200,
1870
- });
1871
- return [
1872
- ...messages,
1279
+ const ctx = makeCtx({
1280
+ loopProvider: provider,
1281
+ loopTools: [
1873
1282
  {
1874
- role: "assistant" as const,
1875
- content: [
1876
- { type: "text", text: "completed after mid-loop compaction" },
1877
- ] as ContentBlock[],
1283
+ name: "bash",
1284
+ description: "Run a shell command",
1285
+ input_schema: {
1286
+ type: "object",
1287
+ properties: { command: { type: "string" } },
1288
+ },
1878
1289
  },
1879
- ];
1880
- };
1881
-
1882
- const ctx = makeCtx({
1883
- agentLoopRun,
1290
+ ],
1291
+ toolExecutor: async () => ({
1292
+ content: "x".repeat(10_000),
1293
+ isError: false,
1294
+ }),
1884
1295
  contextWindowManager: {
1885
1296
  shouldCompact: () => ({ needed: false, estimatedTokens: 0 }),
1886
1297
  maybeCompact: async () => {
@@ -1927,8 +1338,9 @@ describe("session-agent-loop overflow recovery (JARVIS-110)", () => {
1927
1338
  // The provider should NEVER have rejected with context_too_large
1928
1339
  expect(contextTooLargeEmitted).toBe(false);
1929
1340
 
1930
- // Agent loop called twice: once (yielded at tool 3), once after compaction
1931
- expect(agentLoopCallCount).toBe(2);
1341
+ // Provider called four times: three tool rounds (the third trips the
1342
+ // mid-loop gate) plus the post-compaction text turn that completes.
1343
+ expect(calls.length).toBe(4);
1932
1344
 
1933
1345
  // No conversation_error
1934
1346
  const conversationError = events.find(
@@ -1957,82 +1369,7 @@ describe("session-agent-loop overflow recovery (JARVIS-110)", () => {
1957
1369
  return 170_000;
1958
1370
  };
1959
1371
 
1960
- let agentLoopCallCount = 0;
1961
- const agentLoopRun: AgentLoopRun = async (messages, onEvent, options) => {
1962
- // Prime the assistant row anchor — production code emits this from
1963
- // `AgentLoop.run` just before `provider.sendMessage`.
1964
- await onEvent({ type: "llm_call_started" });
1965
- agentLoopCallCount++;
1966
-
1967
- // Every call: simulate tool progress then yield at checkpoint
1968
- const withProgress: Message[] = [
1969
- ...messages,
1970
- {
1971
- role: "assistant" as const,
1972
- content: [
1973
- { type: "text", text: `Tool call ${agentLoopCallCount}` },
1974
- {
1975
- type: "tool_use",
1976
- id: `tu-${agentLoopCallCount}`,
1977
- name: "bash",
1978
- input: { command: "ls" },
1979
- },
1980
- ] as ContentBlock[],
1981
- },
1982
- {
1983
- role: "user" as const,
1984
- content: [
1985
- {
1986
- type: "tool_result",
1987
- tool_use_id: `tu-${agentLoopCallCount}`,
1988
- content: "output",
1989
- is_error: false,
1990
- },
1991
- ] as ContentBlock[],
1992
- },
1993
- ];
1994
-
1995
- onEvent({
1996
- type: "message_complete",
1997
- message: {
1998
- role: "assistant",
1999
- content: [
2000
- { type: "text", text: `Tool call ${agentLoopCallCount}` },
2001
- {
2002
- type: "tool_use",
2003
- id: `tu-${agentLoopCallCount}`,
2004
- name: "bash",
2005
- input: { command: "ls" },
2006
- },
2007
- ],
2008
- },
2009
- });
2010
- onEvent({
2011
- type: "usage",
2012
- inputTokens: 100,
2013
- outputTokens: 50,
2014
- model: "test-model",
2015
- providerDurationMs: 100,
2016
- });
2017
-
2018
- // Always yield at checkpoint — simulates compaction not helping
2019
- if (options?.onCheckpoint) {
2020
- const decision = await options.onCheckpoint({
2021
- turnIndex: 0,
2022
- toolCount: 1,
2023
- hasToolUse: true,
2024
- history: withProgress,
2025
- });
2026
- if (decision !== "continue") {
2027
- return withProgress;
2028
- }
2029
- }
2030
-
2031
- return withProgress;
2032
- };
2033
-
2034
- let compactionCallCount = 0;
2035
- // Convergence reducer: reduce tokens enough to succeed
1372
+ // The convergence reducer reduces tokens enough for the rerun to recover.
2036
1373
  let convergenceReducerCalled = false;
2037
1374
  mockReducerStepFn = (msgs: Message[]) => {
2038
1375
  convergenceReducerCalled = true;
@@ -2048,8 +1385,30 @@ describe("session-agent-loop overflow recovery (JARVIS-110)", () => {
2048
1385
  };
2049
1386
  };
2050
1387
 
1388
+ // Every provider call returns a tool_use, so each loop run does a tool
1389
+ // turn that trips the mid-loop budget gate. On the initial run the gate
1390
+ // calls compaction (which surfaces `exhausted: true`); the convergence
1391
+ // rerun runs without a compaction hook and yields "budget" directly.
1392
+ // With the reducer exhausted, the convergence loop terminates with the
1393
+ // turn still over budget and the orchestrator stamps `context_too_large`.
1394
+ const { provider, calls } = createMockProvider([
1395
+ toolUseResponse("tu-1", "bash", { command: "ls" }),
1396
+ ]);
1397
+
1398
+ let compactionCallCount = 0;
2051
1399
  const ctx = makeCtx({
2052
- agentLoopRun,
1400
+ loopProvider: provider,
1401
+ loopTools: [
1402
+ {
1403
+ name: "bash",
1404
+ description: "Run a shell command",
1405
+ input_schema: {
1406
+ type: "object",
1407
+ properties: { command: { type: "string" } },
1408
+ },
1409
+ },
1410
+ ],
1411
+ toolExecutor: async () => ({ content: "output", isError: false }),
2053
1412
  contextWindowManager: {
2054
1413
  shouldCompact: () => ({ needed: false, estimatedTokens: 0 }),
2055
1414
  maybeCompact: async () => {
@@ -2057,9 +1416,9 @@ describe("session-agent-loop overflow recovery (JARVIS-110)", () => {
2057
1416
  // Compaction's internal retry budget is exhausted — the
2058
1417
  // compactor itself ran maxAttempts passes and still couldn't
2059
1418
  // drop below the auto-threshold. `maybeCompact` surfaces this
2060
- // via `exhausted: true` so the orchestrator escalates
2061
- // straight to the convergence loop instead of looping on a
2062
- // stuck compactor.
1419
+ // via `exhausted: true` so the loop yields "budget" and the
1420
+ // orchestrator escalates straight to the convergence loop
1421
+ // instead of looping on a stuck compactor.
2063
1422
  return {
2064
1423
  compacted: true,
2065
1424
  exhausted: true,
@@ -2094,10 +1453,10 @@ describe("session-agent-loop overflow recovery (JARVIS-110)", () => {
2094
1453
  // `ContextWindowManager.maybeCompact`.
2095
1454
  expect(compactionCallCount).toBe(2);
2096
1455
 
2097
- // Agent loop: 1 initial + 1 convergence re-run = 2 calls. No
2098
- // mid-loop re-entries because the orchestrator broke out on
2099
- // `exhausted` before re-invoking the agent loop.
2100
- expect(agentLoopCallCount).toBe(2);
1456
+ // Provider calls: 1 initial tool turn (yields budget) + 1 convergence
1457
+ // rerun that recovers. No mid-loop re-entries because the orchestrator
1458
+ // broke out on `exhausted` before re-invoking the loop.
1459
+ expect(calls.length).toBe(2);
2101
1460
 
2102
1461
  // After the compactor exhausted itself, the convergence loop
2103
1462
  // should have been triggered (contextTooLargeDetected set to true)
@@ -2132,83 +1491,32 @@ describe("session-agent-loop overflow recovery (JARVIS-110)", () => {
2132
1491
  return 170_000;
2133
1492
  };
2134
1493
 
2135
- // A single tool round reaches one checkpoint; the in-loop budget
2136
- // gate trips there and compaction runs in place. The loop continues
2137
- // the run itself rather than handing control back, so the
2138
- // orchestrator invokes `run()` exactly once.
2139
- let agentLoopCallCount = 0;
2140
- const agentLoopRun: AgentLoopRun = async (messages, onEvent, options) => {
2141
- await onEvent({ type: "llm_call_started" });
2142
- agentLoopCallCount++;
2143
-
2144
- const withProgress: Message[] = [
2145
- ...messages,
2146
- {
2147
- role: "assistant" as const,
2148
- content: [
2149
- { type: "text", text: `Tool call ${agentLoopCallCount}` },
2150
- {
2151
- type: "tool_use",
2152
- id: `tu-${agentLoopCallCount}`,
2153
- name: "bash",
2154
- input: { command: "ls" },
2155
- },
2156
- ] as ContentBlock[],
2157
- },
2158
- {
2159
- role: "user" as const,
2160
- content: [
2161
- {
2162
- type: "tool_result",
2163
- tool_use_id: `tu-${agentLoopCallCount}`,
2164
- content: "output",
2165
- is_error: false,
2166
- },
2167
- ] as ContentBlock[],
2168
- },
2169
- ];
2170
-
2171
- onEvent({
2172
- type: "message_complete",
2173
- message: {
2174
- role: "assistant",
2175
- content: [
2176
- { type: "text", text: `Tool call ${agentLoopCallCount}` },
2177
- {
2178
- type: "tool_use",
2179
- id: `tu-${agentLoopCallCount}`,
2180
- name: "bash",
2181
- input: { command: "ls" },
2182
- },
2183
- ],
2184
- },
2185
- });
2186
- onEvent({
2187
- type: "usage",
2188
- inputTokens: 100,
2189
- outputTokens: 50,
2190
- model: "test-model",
2191
- providerDurationMs: 100,
2192
- });
2193
-
2194
- if (options?.onCheckpoint) {
2195
- await options.onCheckpoint({
2196
- turnIndex: 0,
2197
- toolCount: 1,
2198
- hasToolUse: true,
2199
- history: withProgress,
2200
- });
2201
- }
2202
-
2203
- return withProgress;
2204
- };
1494
+ // A single tool round reaches one checkpoint; the in-loop budget gate
1495
+ // trips there and compaction runs in place. The loop continues the run
1496
+ // itself — the following provider call returns plain text and the turn
1497
+ // completes — so the orchestrator never re-enters the convergence loop.
1498
+ const { provider, calls } = createMockProvider([
1499
+ toolUseResponse("tu-1", "bash", { command: "ls" }),
1500
+ textResponse("final answer"),
1501
+ ]);
2205
1502
 
2206
1503
  // Compaction reports `estimatedInputTokens` well below the 161_500
2207
1504
  // threshold — the "compaction is productive" signal (no `exhausted`
2208
1505
  // flag) that lets the loop continue in place.
2209
1506
  let compactionCallCount = 0;
2210
1507
  const ctx = makeCtx({
2211
- agentLoopRun,
1508
+ loopProvider: provider,
1509
+ loopTools: [
1510
+ {
1511
+ name: "bash",
1512
+ description: "Run a shell command",
1513
+ input_schema: {
1514
+ type: "object",
1515
+ properties: { command: { type: "string" } },
1516
+ },
1517
+ },
1518
+ ],
1519
+ toolExecutor: async () => ({ content: "output", isError: false }),
2212
1520
  contextWindowManager: {
2213
1521
  shouldCompact: () => ({ needed: false, estimatedTokens: 0 }),
2214
1522
  maybeCompact: async () => {
@@ -2239,18 +1547,20 @@ describe("session-agent-loop overflow recovery (JARVIS-110)", () => {
2239
1547
 
2240
1548
  await runAgentLoopImpl(ctx, "hello", "msg-1", (msg) => events.push(msg));
2241
1549
 
2242
- // 1 initial auto-compact + 1 productive mid-loop compaction. The
2243
- // loop continues in place after compacting, so the orchestrator
2244
- // never re-enters `run()` — it is invoked exactly once.
1550
+ // 1 initial auto-compact + 1 productive mid-loop compaction.
2245
1551
  expect(compactionCallCount).toBe(2);
2246
- expect(agentLoopCallCount).toBe(1);
1552
+ // The loop continued in place after compacting: a tool turn followed by
1553
+ // the post-compaction text turn, both within a single run.
1554
+ expect(calls.length).toBe(2);
2247
1555
 
2248
1556
  // No escalation to the convergence loop because the mid-loop
2249
- // `maybeCompact` returned productive (no `exhausted` flag).
1557
+ // `maybeCompact` returned productive (no `exhausted` flag), and the turn
1558
+ // completed normally.
2250
1559
  expect(setAgentLoopExitReasonOnLatestLogMock).not.toHaveBeenCalledWith(
2251
1560
  "test-conv",
2252
1561
  "context_too_large",
2253
1562
  );
1563
+ expect(events.find((e) => e.type === "conversation_error")).toBeUndefined();
2254
1564
  });
2255
1565
 
2256
1566
  // ── Test 9 ────────────────────────────────────────────────────────
@@ -2272,78 +1582,13 @@ describe("session-agent-loop overflow recovery (JARVIS-110)", () => {
2272
1582
  return 170_000;
2273
1583
  };
2274
1584
 
2275
- let agentLoopCallCount = 0;
2276
- const agentLoopRun: AgentLoopRun = async (messages, onEvent, options) => {
2277
- // Prime the assistant row anchor — production code emits this from
2278
- // `AgentLoop.run` just before `provider.sendMessage`.
2279
- await onEvent({ type: "llm_call_started" });
2280
- agentLoopCallCount++;
2281
-
2282
- const withProgress: Message[] = [
2283
- ...messages,
2284
- {
2285
- role: "assistant" as const,
2286
- content: [
2287
- { type: "text", text: `Tool call ${agentLoopCallCount}` },
2288
- {
2289
- type: "tool_use",
2290
- id: `tu-${agentLoopCallCount}`,
2291
- name: "bash",
2292
- input: { command: "ls" },
2293
- },
2294
- ] as ContentBlock[],
2295
- },
2296
- {
2297
- role: "user" as const,
2298
- content: [
2299
- {
2300
- type: "tool_result",
2301
- tool_use_id: `tu-${agentLoopCallCount}`,
2302
- content: "output",
2303
- is_error: false,
2304
- },
2305
- ] as ContentBlock[],
2306
- },
2307
- ];
2308
-
2309
- onEvent({
2310
- type: "message_complete",
2311
- message: {
2312
- role: "assistant",
2313
- content: [
2314
- { type: "text", text: `Tool call ${agentLoopCallCount}` },
2315
- {
2316
- type: "tool_use",
2317
- id: `tu-${agentLoopCallCount}`,
2318
- name: "bash",
2319
- input: { command: "ls" },
2320
- },
2321
- ],
2322
- },
2323
- });
2324
- onEvent({
2325
- type: "usage",
2326
- inputTokens: 100,
2327
- outputTokens: 50,
2328
- model: "test-model",
2329
- providerDurationMs: 100,
2330
- });
2331
-
2332
- // Always yield at checkpoint — simulates reduction not helping enough
2333
- if (options?.onCheckpoint) {
2334
- const decision = await options.onCheckpoint({
2335
- turnIndex: 0,
2336
- toolCount: 1,
2337
- hasToolUse: true,
2338
- history: withProgress,
2339
- });
2340
- if (decision !== "continue") {
2341
- return withProgress;
2342
- }
2343
- }
2344
-
2345
- return withProgress;
2346
- };
1585
+ // Every provider call returns a tool_use, so each loop run does a tool
1586
+ // turn that trips the mid-loop budget gate and yields "budget". The
1587
+ // initial run's gate calls compaction (exhausted); the convergence
1588
+ // reruns run without a compaction hook and yield directly.
1589
+ const { provider, calls } = createMockProvider([
1590
+ toolUseResponse("tu-1", "bash", { command: "ls" }),
1591
+ ]);
2347
1592
 
2348
1593
  // Convergence reducer: first call returns non-exhausted, second returns exhausted
2349
1594
  let reducerCallCount = 0;
@@ -2375,7 +1620,18 @@ describe("session-agent-loop overflow recovery (JARVIS-110)", () => {
2375
1620
  };
2376
1621
 
2377
1622
  const ctx = makeCtx({
2378
- agentLoopRun,
1623
+ loopProvider: provider,
1624
+ loopTools: [
1625
+ {
1626
+ name: "bash",
1627
+ description: "Run a shell command",
1628
+ input_schema: {
1629
+ type: "object",
1630
+ properties: { command: { type: "string" } },
1631
+ },
1632
+ },
1633
+ ],
1634
+ toolExecutor: async () => ({ content: "output", isError: false }),
2379
1635
  contextWindowManager: {
2380
1636
  shouldCompact: () => ({ needed: false, estimatedTokens: 0 }),
2381
1637
  // Under the new architecture (Compaction Re-homing Arc, Bullet 1)
@@ -2413,10 +1669,11 @@ describe("session-agent-loop overflow recovery (JARVIS-110)", () => {
2413
1669
  // once more after yieldedForBudget triggered re-entry
2414
1670
  expect(reducerCallCount).toBe(2);
2415
1671
 
2416
- // Agent loop: 1 initial + 2 convergence re-runs = 3 calls. The mid-loop
2417
- // no longer drives daemon-level retries — the manager owns its retry
2418
- // budget and signals exhaustion via the `exhausted` flag.
2419
- expect(agentLoopCallCount).toBe(3);
1672
+ // Provider calls: 1 initial run + 2 convergence reruns = 3 calls, each a
1673
+ // tool turn that yields "budget". The mid-loop no longer drives
1674
+ // daemon-level retries — the manager owns its retry budget and signals
1675
+ // exhaustion via the `exhausted` flag.
1676
+ expect(calls.length).toBe(3);
2420
1677
  expect(setAgentLoopExitReasonOnLatestLogMock).toHaveBeenCalledWith(
2421
1678
  "test-conv",
2422
1679
  "context_too_large",
@@ -2516,35 +1773,10 @@ describe("session-agent-loop overflow recovery (JARVIS-110)", () => {
2516
1773
  };
2517
1774
  };
2518
1775
 
2519
- const agentLoopRun: AgentLoopRun = async (messages, onEvent) => {
2520
- // Prime the assistant row anchor — production code emits this from
2521
- // `AgentLoop.run` just before `provider.sendMessage`.
2522
- await onEvent({ type: "llm_call_started" });
2523
- onEvent({
2524
- type: "message_complete",
2525
- message: {
2526
- role: "assistant",
2527
- content: [{ type: "text", text: "done" }],
2528
- },
2529
- });
2530
- onEvent({
2531
- type: "usage",
2532
- inputTokens: 170_000,
2533
- outputTokens: 200,
2534
- model: "test-model",
2535
- providerDurationMs: 500,
2536
- });
2537
- return [
2538
- ...messages,
2539
- {
2540
- role: "assistant" as const,
2541
- content: [{ type: "text", text: "done" }] as ContentBlock[],
2542
- },
2543
- ];
2544
- };
2545
-
1776
+ // The preflight overflow reducer runs in the orchestrator before the loop,
1777
+ // so a single successful provider turn is enough to drive the path.
2546
1778
  const ctx = makeCtx({
2547
- agentLoopRun,
1779
+ providerResponses: [textResponse("done")],
2548
1780
  contextWindowManager: {
2549
1781
  shouldCompact: () => ({ needed: false, estimatedTokens: 0 }),
2550
1782
  maybeCompact: async () => ({ compacted: false }),
@@ -2615,78 +1847,12 @@ describe("session-agent-loop overflow recovery (JARVIS-110)", () => {
2615
1847
  // emergency compaction + final agentLoop.run path executes.
2616
1848
  mockOverflowAction = "auto_compress_latest_turn";
2617
1849
 
2618
- let agentLoopCallCount = 0;
2619
- const agentLoopRun: AgentLoopRun = async (messages, onEvent, options) => {
2620
- // Prime the assistant row anchor — production code emits this from
2621
- // `AgentLoop.run` just before `provider.sendMessage`.
2622
- await onEvent({ type: "llm_call_started" });
2623
- agentLoopCallCount++;
2624
-
2625
- const withProgress: Message[] = [
2626
- ...messages,
2627
- {
2628
- role: "assistant" as const,
2629
- content: [
2630
- { type: "text", text: `tool call ${agentLoopCallCount}` },
2631
- {
2632
- type: "tool_use",
2633
- id: `tu-${agentLoopCallCount}`,
2634
- name: "bash",
2635
- input: { command: "ls" },
2636
- },
2637
- ] as ContentBlock[],
2638
- },
2639
- {
2640
- role: "user" as const,
2641
- content: [
2642
- {
2643
- type: "tool_result",
2644
- tool_use_id: `tu-${agentLoopCallCount}`,
2645
- content: "output",
2646
- is_error: false,
2647
- },
2648
- ] as ContentBlock[],
2649
- },
2650
- ];
2651
-
2652
- onEvent({
2653
- type: "message_complete",
2654
- message: {
2655
- role: "assistant",
2656
- content: [
2657
- { type: "text", text: `tool call ${agentLoopCallCount}` },
2658
- {
2659
- type: "tool_use",
2660
- id: `tu-${agentLoopCallCount}`,
2661
- name: "bash",
2662
- input: { command: "ls" },
2663
- },
2664
- ],
2665
- },
2666
- });
2667
- onEvent({
2668
- type: "usage",
2669
- inputTokens: 100,
2670
- outputTokens: 50,
2671
- model: "test-model",
2672
- providerDurationMs: 100,
2673
- });
2674
-
2675
- // Every checkpoint yields — including the final auto_compress rerun.
2676
- if (options?.onCheckpoint) {
2677
- const decision = await options.onCheckpoint({
2678
- turnIndex: 0,
2679
- toolCount: 1,
2680
- hasToolUse: true,
2681
- history: withProgress,
2682
- });
2683
- if (decision !== "continue") {
2684
- return withProgress;
2685
- }
2686
- }
2687
-
2688
- return withProgress;
2689
- };
1850
+ // Every provider call returns a tool_use, so each loop run does a tool
1851
+ // turn that trips the mid-loop budget gate and yields "budget" —
1852
+ // including the final auto_compress rerun.
1853
+ const { provider } = createMockProvider([
1854
+ toolUseResponse("tu-1", "bash", { command: "ls" }),
1855
+ ]);
2690
1856
 
2691
1857
  // `maybeCompact` is invoked through three distinct call sites:
2692
1858
  // 1. Start-of-turn compaction (no `force` option) — return a no-op
@@ -2702,7 +1868,18 @@ describe("session-agent-loop overflow recovery (JARVIS-110)", () => {
2702
1868
  // as BUDGET_YIELD_UNRECOVERED.
2703
1869
  let forcedMaybeCompactCallCount = 0;
2704
1870
  const ctx = makeCtx({
2705
- agentLoopRun,
1871
+ loopProvider: provider,
1872
+ loopTools: [
1873
+ {
1874
+ name: "bash",
1875
+ description: "Run a shell command",
1876
+ input_schema: {
1877
+ type: "object",
1878
+ properties: { command: { type: "string" } },
1879
+ },
1880
+ },
1881
+ ],
1882
+ toolExecutor: async () => ({ content: "output", isError: false }),
2706
1883
  contextWindowManager: {
2707
1884
  shouldCompact: () => ({ needed: false, estimatedTokens: 0 }),
2708
1885
  maybeCompact: async (