@vellumai/assistant 0.4.49 → 0.4.50

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (239)
  1. package/ARCHITECTURE.md +24 -33
  2. package/README.md +3 -3
  3. package/docs/architecture/memory.md +180 -119
  4. package/package.json +2 -2
  5. package/src/__tests__/agent-loop.test.ts +3 -1
  6. package/src/__tests__/anthropic-provider.test.ts +114 -23
  7. package/src/__tests__/approval-cascade.test.ts +1 -15
  8. package/src/__tests__/approval-routes-http.test.ts +2 -0
  9. package/src/__tests__/assistant-feature-flag-guard.test.ts +0 -23
  10. package/src/__tests__/canonical-guardian-store.test.ts +95 -0
  11. package/src/__tests__/checker.test.ts +13 -0
  12. package/src/__tests__/config-schema.test.ts +1 -68
  13. package/src/__tests__/context-memory-e2e.test.ts +11 -100
  14. package/src/__tests__/conversation-routes-guardian-reply.test.ts +8 -0
  15. package/src/__tests__/conversation-routes-slash-commands.test.ts +1 -0
  16. package/src/__tests__/credential-security-e2e.test.ts +1 -0
  17. package/src/__tests__/credential-vault-unit.test.ts +4 -0
  18. package/src/__tests__/credential-vault.test.ts +13 -1
  19. package/src/__tests__/cu-unified-flow.test.ts +532 -0
  20. package/src/__tests__/date-context.test.ts +93 -77
  21. package/src/__tests__/deterministic-verification-control-plane.test.ts +64 -0
  22. package/src/__tests__/guardian-routing-invariants.test.ts +93 -0
  23. package/src/__tests__/history-repair.test.ts +245 -0
  24. package/src/__tests__/host-cu-proxy.test.ts +165 -3
  25. package/src/__tests__/http-user-message-parity.test.ts +1 -0
  26. package/src/__tests__/invite-redemption-service.test.ts +65 -1
  27. package/src/__tests__/keychain-broker-client.test.ts +4 -4
  28. package/src/__tests__/memory-context-benchmark.benchmark.test.ts +56 -18
  29. package/src/__tests__/memory-lifecycle-e2e.test.ts +244 -387
  30. package/src/__tests__/memory-recall-quality.test.ts +244 -407
  31. package/src/__tests__/memory-regressions.experimental.test.ts +126 -101
  32. package/src/__tests__/memory-regressions.test.ts +477 -2841
  33. package/src/__tests__/memory-retrieval.benchmark.test.ts +33 -150
  34. package/src/__tests__/memory-upsert-concurrency.test.ts +5 -244
  35. package/src/__tests__/mime-builder.test.ts +28 -0
  36. package/src/__tests__/native-web-search.test.ts +1 -0
  37. package/src/__tests__/oauth-cli.test.ts +572 -5
  38. package/src/__tests__/oauth-store.test.ts +120 -6
  39. package/src/__tests__/qdrant-collection-migration.test.ts +53 -8
  40. package/src/__tests__/registry.test.ts +0 -1
  41. package/src/__tests__/relay-server.test.ts +46 -1
  42. package/src/__tests__/schedule-tools.test.ts +32 -0
  43. package/src/__tests__/script-proxy-certs.test.ts +1 -1
  44. package/src/__tests__/secret-onetime-send.test.ts +1 -0
  45. package/src/__tests__/secure-keys.test.ts +7 -2
  46. package/src/__tests__/send-endpoint-busy.test.ts +3 -0
  47. package/src/__tests__/session-abort-tool-results.test.ts +1 -14
  48. package/src/__tests__/session-agent-loop-overflow.test.ts +1583 -0
  49. package/src/__tests__/session-agent-loop.test.ts +19 -15
  50. package/src/__tests__/session-confirmation-signals.test.ts +1 -15
  51. package/src/__tests__/session-error.test.ts +124 -2
  52. package/src/__tests__/session-history-web-search.test.ts +918 -0
  53. package/src/__tests__/session-pre-run-repair.test.ts +1 -14
  54. package/src/__tests__/session-provider-retry-repair.test.ts +25 -28
  55. package/src/__tests__/session-queue.test.ts +37 -27
  56. package/src/__tests__/session-runtime-assembly.test.ts +54 -0
  57. package/src/__tests__/session-slash-known.test.ts +1 -15
  58. package/src/__tests__/session-slash-queue.test.ts +1 -15
  59. package/src/__tests__/session-slash-unknown.test.ts +1 -15
  60. package/src/__tests__/session-workspace-cache-state.test.ts +3 -33
  61. package/src/__tests__/session-workspace-injection.test.ts +3 -37
  62. package/src/__tests__/session-workspace-tool-tracking.test.ts +3 -37
  63. package/src/__tests__/skills-install-extract.test.ts +93 -0
  64. package/src/__tests__/skillssh-registry.test.ts +451 -0
  65. package/src/__tests__/trust-store.test.ts +15 -0
  66. package/src/__tests__/voice-invite-redemption.test.ts +32 -1
  67. package/src/agent/ax-tree-compaction.test.ts +51 -0
  68. package/src/agent/loop.ts +39 -12
  69. package/src/approvals/AGENTS.md +1 -1
  70. package/src/approvals/guardian-request-resolvers.ts +14 -2
  71. package/src/bundler/compiler-tools.ts +66 -2
  72. package/src/calls/call-domain.ts +132 -0
  73. package/src/calls/call-store.ts +6 -0
  74. package/src/calls/relay-server.ts +43 -5
  75. package/src/calls/relay-setup-router.ts +17 -1
  76. package/src/calls/twilio-config.ts +1 -1
  77. package/src/calls/types.ts +3 -1
  78. package/src/cli/commands/doctor.ts +4 -3
  79. package/src/cli/commands/mcp.ts +46 -59
  80. package/src/cli/commands/memory.ts +16 -165
  81. package/src/cli/commands/oauth/apps.ts +31 -2
  82. package/src/cli/commands/oauth/connections.ts +431 -97
  83. package/src/cli/commands/oauth/providers.ts +15 -1
  84. package/src/cli/commands/sessions.ts +5 -2
  85. package/src/cli/commands/skills.ts +173 -1
  86. package/src/cli/http-client.ts +0 -20
  87. package/src/cli/main-screen.tsx +2 -2
  88. package/src/cli/program.ts +5 -6
  89. package/src/cli.ts +4 -10
  90. package/src/config/bundled-skills/computer-use/TOOLS.json +1 -1
  91. package/src/config/bundled-skills/computer-use/tools/computer-use-observe.ts +12 -0
  92. package/src/config/bundled-tool-registry.ts +2 -5
  93. package/src/config/schema.ts +1 -12
  94. package/src/config/schemas/memory-lifecycle.ts +0 -9
  95. package/src/config/schemas/memory-processing.ts +0 -180
  96. package/src/config/schemas/memory-retrieval.ts +32 -104
  97. package/src/config/schemas/memory.ts +0 -10
  98. package/src/config/types.ts +0 -4
  99. package/src/context/window-manager.ts +4 -1
  100. package/src/daemon/config-watcher.ts +61 -3
  101. package/src/daemon/daemon-control.ts +1 -1
  102. package/src/daemon/date-context.ts +114 -31
  103. package/src/daemon/handlers/sessions.ts +18 -13
  104. package/src/daemon/handlers/skills.ts +20 -1
  105. package/src/daemon/history-repair.ts +72 -8
  106. package/src/daemon/host-cu-proxy.ts +55 -26
  107. package/src/daemon/lifecycle.ts +31 -3
  108. package/src/daemon/mcp-reload-service.ts +2 -2
  109. package/src/daemon/message-types/computer-use.ts +1 -12
  110. package/src/daemon/message-types/memory.ts +4 -16
  111. package/src/daemon/message-types/messages.ts +1 -0
  112. package/src/daemon/message-types/sessions.ts +4 -0
  113. package/src/daemon/server.ts +12 -1
  114. package/src/daemon/session-agent-loop-handlers.ts +38 -0
  115. package/src/daemon/session-agent-loop.ts +334 -48
  116. package/src/daemon/session-error.ts +89 -6
  117. package/src/daemon/session-history.ts +17 -7
  118. package/src/daemon/session-media-retry.ts +6 -2
  119. package/src/daemon/session-memory.ts +69 -149
  120. package/src/daemon/session-process.ts +10 -1
  121. package/src/daemon/session-runtime-assembly.ts +49 -19
  122. package/src/daemon/session-surfaces.ts +4 -1
  123. package/src/daemon/session-tool-setup.ts +7 -1
  124. package/src/daemon/session.ts +12 -2
  125. package/src/instrument.ts +61 -1
  126. package/src/memory/admin.ts +2 -191
  127. package/src/memory/canonical-guardian-store.ts +38 -2
  128. package/src/memory/conversation-crud.ts +0 -33
  129. package/src/memory/conversation-queries.ts +22 -3
  130. package/src/memory/db-init.ts +28 -0
  131. package/src/memory/embedding-backend.ts +84 -8
  132. package/src/memory/embedding-types.ts +9 -1
  133. package/src/memory/indexer.ts +7 -46
  134. package/src/memory/items-extractor.ts +274 -76
  135. package/src/memory/job-handlers/backfill.ts +2 -127
  136. package/src/memory/job-handlers/cleanup.ts +2 -16
  137. package/src/memory/job-handlers/extraction.ts +2 -138
  138. package/src/memory/job-handlers/index-maintenance.ts +1 -6
  139. package/src/memory/job-handlers/summarization.ts +3 -148
  140. package/src/memory/job-utils.ts +21 -59
  141. package/src/memory/jobs-store.ts +1 -159
  142. package/src/memory/jobs-worker.ts +9 -52
  143. package/src/memory/migrations/104-core-indexes.ts +3 -3
  144. package/src/memory/migrations/149-oauth-tables.ts +2 -0
  145. package/src/memory/migrations/150-oauth-apps-client-secret-path.ts +98 -0
  146. package/src/memory/migrations/151-oauth-providers-ping-url.ts +11 -0
  147. package/src/memory/migrations/152-memory-item-supersession.ts +44 -0
  148. package/src/memory/migrations/153-drop-entity-tables.ts +15 -0
  149. package/src/memory/migrations/154-drop-fts.ts +20 -0
  150. package/src/memory/migrations/155-drop-conflicts.ts +7 -0
  151. package/src/memory/migrations/156-call-session-invite-metadata.ts +24 -0
  152. package/src/memory/migrations/index.ts +7 -0
  153. package/src/memory/qdrant-client.ts +148 -51
  154. package/src/memory/raw-query.ts +1 -1
  155. package/src/memory/retriever.test.ts +294 -273
  156. package/src/memory/retriever.ts +421 -645
  157. package/src/memory/schema/calls.ts +2 -0
  158. package/src/memory/schema/memory-core.ts +3 -48
  159. package/src/memory/schema/oauth.ts +2 -0
  160. package/src/memory/search/formatting.ts +263 -176
  161. package/src/memory/search/lexical.ts +1 -254
  162. package/src/memory/search/ranking.ts +0 -455
  163. package/src/memory/search/semantic.ts +100 -14
  164. package/src/memory/search/staleness.ts +47 -0
  165. package/src/memory/search/tier-classifier.ts +21 -0
  166. package/src/memory/search/types.ts +15 -77
  167. package/src/memory/task-memory-cleanup.ts +4 -6
  168. package/src/messaging/providers/gmail/mime-builder.ts +17 -7
  169. package/src/oauth/byo-connection.test.ts +8 -1
  170. package/src/oauth/oauth-store.ts +113 -27
  171. package/src/oauth/seed-providers.ts +6 -0
  172. package/src/oauth/token-persistence.ts +11 -3
  173. package/src/permissions/defaults.ts +1 -0
  174. package/src/permissions/trust-store.ts +23 -1
  175. package/src/playbooks/playbook-compiler.ts +1 -1
  176. package/src/prompts/system-prompt.ts +18 -2
  177. package/src/providers/anthropic/client.ts +56 -126
  178. package/src/providers/types.ts +7 -1
  179. package/src/runtime/AGENTS.md +9 -0
  180. package/src/runtime/auth/route-policy.ts +6 -3
  181. package/src/runtime/guardian-reply-router.ts +24 -22
  182. package/src/runtime/http-server.ts +2 -2
  183. package/src/runtime/invite-redemption-service.ts +19 -1
  184. package/src/runtime/invite-service.ts +25 -0
  185. package/src/runtime/pending-interactions.ts +2 -2
  186. package/src/runtime/routes/brain-graph-routes.ts +10 -90
  187. package/src/runtime/routes/conversation-routes.ts +9 -1
  188. package/src/runtime/routes/inbound-stages/acl-enforcement.ts +21 -12
  189. package/src/runtime/routes/memory-item-routes.test.ts +754 -0
  190. package/src/runtime/routes/memory-item-routes.ts +503 -0
  191. package/src/runtime/routes/session-management-routes.ts +3 -3
  192. package/src/runtime/routes/settings-routes.ts +2 -2
  193. package/src/runtime/routes/trust-rules-routes.ts +14 -0
  194. package/src/runtime/routes/workspace-routes.ts +2 -1
  195. package/src/security/keychain-broker-client.ts +17 -4
  196. package/src/security/secure-keys.ts +25 -3
  197. package/src/security/token-manager.ts +36 -36
  198. package/src/skills/catalog-install.ts +74 -18
  199. package/src/skills/skillssh-registry.ts +503 -0
  200. package/src/tools/assets/search.ts +5 -1
  201. package/src/tools/computer-use/definitions.ts +0 -10
  202. package/src/tools/computer-use/registry.ts +1 -1
  203. package/src/tools/credentials/vault.ts +1 -3
  204. package/src/tools/memory/definitions.ts +4 -13
  205. package/src/tools/memory/handlers.test.ts +83 -103
  206. package/src/tools/memory/handlers.ts +50 -85
  207. package/src/tools/schedule/create.ts +8 -1
  208. package/src/tools/schedule/update.ts +8 -1
  209. package/src/tools/skills/load.ts +25 -2
  210. package/src/__tests__/clarification-resolver.test.ts +0 -193
  211. package/src/__tests__/conflict-intent-tokenization.test.ts +0 -160
  212. package/src/__tests__/conflict-policy.test.ts +0 -269
  213. package/src/__tests__/conflict-store.test.ts +0 -372
  214. package/src/__tests__/contradiction-checker.test.ts +0 -361
  215. package/src/__tests__/entity-extractor.test.ts +0 -211
  216. package/src/__tests__/entity-search.test.ts +0 -1117
  217. package/src/__tests__/profile-compiler.test.ts +0 -392
  218. package/src/__tests__/session-conflict-gate.test.ts +0 -1228
  219. package/src/__tests__/session-profile-injection.test.ts +0 -557
  220. package/src/config/bundled-skills/knowledge-graph/SKILL.md +0 -25
  221. package/src/config/bundled-skills/knowledge-graph/TOOLS.json +0 -66
  222. package/src/config/bundled-skills/knowledge-graph/tools/graph-query.ts +0 -211
  223. package/src/daemon/session-conflict-gate.ts +0 -167
  224. package/src/daemon/session-dynamic-profile.ts +0 -77
  225. package/src/memory/clarification-resolver.ts +0 -417
  226. package/src/memory/conflict-intent.ts +0 -205
  227. package/src/memory/conflict-policy.ts +0 -127
  228. package/src/memory/conflict-store.ts +0 -410
  229. package/src/memory/contradiction-checker.ts +0 -508
  230. package/src/memory/entity-extractor.ts +0 -535
  231. package/src/memory/format-recall.ts +0 -47
  232. package/src/memory/fts-reconciler.ts +0 -165
  233. package/src/memory/job-handlers/conflict.ts +0 -200
  234. package/src/memory/profile-compiler.ts +0 -195
  235. package/src/memory/recall-cache.ts +0 -117
  236. package/src/memory/search/entity.ts +0 -535
  237. package/src/memory/search/query-expansion.test.ts +0 -70
  238. package/src/memory/search/query-expansion.ts +0 -118
  239. package/src/runtime/routes/mcp-routes.ts +0 -20
@@ -0,0 +1,1583 @@
1
+ /**
2
+ * Overflow recovery test suite for JARVIS-110.
3
+ *
4
+ * Reproduces the failure modes observed in long conversations (75+ messages)
5
+ * where context overflow recovery fails because:
6
+ * 1. Progress during the agent loop bypasses the convergence retry
7
+ * 2. Token estimation significantly underestimates actual token count
8
+ * 3. No mid-loop budget check to prevent hitting the provider limit
9
+ *
10
+ * Tests 2, 3, and 4 pass against the current code.
11
+ * Tests 1, 5 fail (documenting bugs to be fixed in PR 2).
12
+ * Tests 6 and 7 are skipped (depend on mid-loop checkpoint changes in PR 3).
13
+ */
14
+ import { beforeEach, describe, expect, mock, test } from "bun:test";
15
+
16
+ import type {
17
+ AgentEvent,
18
+ CheckpointDecision,
19
+ CheckpointInfo,
20
+ } from "../agent/loop.js";
21
+ import type { ServerMessage } from "../daemon/message-protocol.js";
22
+ import type { ContentBlock, Message } from "../providers/types.js";
23
+
24
+ // ── Module mocks (must precede imports of the module under test) ─────
25
+
26
+ mock.module("../util/logger.js", () => ({
27
+ getLogger: () =>
28
+ new Proxy({} as Record<string, unknown>, { get: () => () => {} }),
29
+ }));
30
+
31
+ mock.module("../util/platform.js", () => ({
32
+ getDataDir: () => "/tmp",
33
+ }));
34
+
35
+ mock.module("../config/loader.js", () => ({
36
+ getConfig: () => ({
37
+ provider: "mock-provider",
38
+ maxTokens: 4096,
39
+ thinking: false,
40
+ contextWindow: {
41
+ maxInputTokens: 200_000,
42
+ thresholdTokens: 160_000,
43
+ preserveRecentMessages: 6,
44
+ summaryModel: "mock-model",
45
+ maxSummaryTokens: 512,
46
+ overflowRecovery: {
47
+ enabled: true,
48
+ safetyMarginRatio: 0.05,
49
+ maxAttempts: 3,
50
+ interactiveLatestTurnCompression: "summarize",
51
+ nonInteractiveLatestTurnCompression: "truncate",
52
+ },
53
+ },
54
+ rateLimit: { maxRequestsPerMinute: 0, maxTokensPerSession: 0 },
55
+ apiKeys: {},
56
+ workspaceGit: { turnCommitMaxWaitMs: 10 },
57
+ ui: {},
58
+ }),
59
+ loadRawConfig: () => ({}),
60
+ saveRawConfig: () => {},
61
+ invalidateConfigCache: () => {},
62
+ }));
63
+
64
+ // ── Overflow recovery mocks ──────────────────────────────────────────
65
+
66
+ // Token estimator — controllable per-test via mockEstimateTokens.
67
+ // Can be a number (constant) or a function for dynamic behavior.
68
+ let mockEstimateTokens: number | (() => number) = 1000;
69
+ mock.module("../context/token-estimator.js", () => ({
70
+ estimatePromptTokens: () =>
71
+ typeof mockEstimateTokens === "function"
72
+ ? mockEstimateTokens()
73
+ : mockEstimateTokens,
74
+ }));
75
+
76
+ // Reducer: by default returns the input untouched and marks exhausted
77
+ let mockReducerStepFn:
78
+ | ((msgs: Message[], cfg: unknown, state: unknown) => unknown)
79
+ | null = null;
80
+ mock.module("../daemon/context-overflow-reducer.js", () => ({
81
+ createInitialReducerState: () => ({
82
+ appliedTiers: [],
83
+ injectionMode: "full" as const,
84
+ exhausted: false,
85
+ }),
86
+ reduceContextOverflow: async (
87
+ msgs: Message[],
88
+ cfg: unknown,
89
+ state: unknown,
90
+ ) => {
91
+ if (mockReducerStepFn) return mockReducerStepFn(msgs, cfg, state);
92
+ return {
93
+ messages: msgs,
94
+ tier: "forced_compaction",
95
+ state: {
96
+ appliedTiers: [
97
+ "forced_compaction",
98
+ "tool_result_truncation",
99
+ "media_stubbing",
100
+ "injection_downgrade",
101
+ ],
102
+ injectionMode: "full",
103
+ exhausted: true,
104
+ },
105
+ estimatedTokens: 1000,
106
+ };
107
+ },
108
+ }));
109
+
110
+ // Policy: default to fail_gracefully
111
+ let mockOverflowAction: string = "fail_gracefully";
112
+ mock.module("../daemon/context-overflow-policy.js", () => ({
113
+ resolveOverflowAction: () => mockOverflowAction,
114
+ }));
115
+
116
+ // Approval: default to denied
117
+ let mockApprovalResult = { approved: false };
118
+ mock.module("../daemon/context-overflow-approval.js", () => ({
119
+ requestCompressionApproval: async () => mockApprovalResult,
120
+ CONTEXT_OVERFLOW_TOOL_NAME: "context_overflow_compression",
121
+ }));
122
+
123
+ let hookBlocked = false;
124
+ let hookBlockedBy = "";
125
+
126
+ mock.module("../hooks/manager.js", () => ({
127
+ getHookManager: () => ({
128
+ trigger: async (hookName: string) => {
129
+ if (hookName === "pre-message" && hookBlocked) {
130
+ return { blocked: true, blockedBy: hookBlockedBy };
131
+ }
132
+ return { blocked: false };
133
+ },
134
+ }),
135
+ }));
136
+
137
+ mock.module("../memory/conversation-crud.js", () => ({
138
+ getConversationThreadType: () => "default",
139
+ setConversationOriginChannelIfUnset: () => {},
140
+ updateConversationUsage: () => {},
141
+ getMessages: () => [],
142
+ getConversation: () => ({
143
+ id: "conv-1",
144
+ contextSummary: null,
145
+ contextCompactedMessageCount: 0,
146
+ totalInputTokens: 0,
147
+ totalOutputTokens: 0,
148
+ totalEstimatedCost: 0,
149
+ title: null,
150
+ }),
151
+ provenanceFromTrustContext: () => ({
152
+ source: "user",
153
+ trustContext: undefined,
154
+ }),
155
+ getConversationOriginInterface: () => null,
156
+ addMessage: () => ({ id: "mock-msg-id" }),
157
+ deleteMessageById: () => {},
158
+ updateConversationContextWindow: () => {},
159
+ updateConversationTitle: () => {},
160
+ getConversationOriginChannel: () => null,
161
+ getMessageById: () => null,
162
+ updateMessageContent: () => {},
163
+ }));
164
+
165
+ mock.module("../memory/retriever.js", () => ({
166
+ buildMemoryRecall: async () => ({
167
+ enabled: false,
168
+ degraded: false,
169
+ injectedText: "",
170
+
171
+ semanticHits: 0,
172
+ recencyHits: 0,
173
+ injectedTokens: 0,
174
+ latencyMs: 0,
175
+ }),
176
+ stripMemoryRecallMessages: (msgs: Message[]) => msgs,
177
+ }));
178
+
179
+ mock.module("../memory/app-store.js", () => ({
180
+ getApp: () => null,
181
+ listAppFiles: () => [],
182
+ getAppsDir: () => "/tmp/apps",
183
+ }));
184
+
185
+ mock.module("../memory/app-git-service.js", () => ({
186
+ commitAppTurnChanges: () => Promise.resolve(),
187
+ }));
188
+
189
+ mock.module("../daemon/session-memory.js", () => ({
190
+ prepareMemoryContext: async (
191
+ _ctx: unknown,
192
+ _content: string,
193
+ _id: string,
194
+ _signal: AbortSignal,
195
+ ) => ({
196
+ runMessages: [],
197
+ recall: {
198
+ enabled: false,
199
+ degraded: false,
200
+ injectedText: "",
201
+
202
+ semanticHits: 0,
203
+ recencyHits: 0,
204
+ injectedTokens: 0,
205
+ latencyMs: 0,
206
+ tier1Count: 0,
207
+ tier2Count: 0,
208
+ hybridSearchMs: 0,
209
+ },
210
+ }),
211
+ }));
212
+
213
+ mock.module("../daemon/session-runtime-assembly.js", () => ({
214
+ applyRuntimeInjections: (msgs: Message[]) => msgs,
215
+ stripInjectedContext: (msgs: Message[]) => msgs,
216
+ }));
217
+
218
+ mock.module("../daemon/date-context.js", () => ({
219
+ buildTemporalContext: () => null,
220
+ }));
221
+
222
+ mock.module("../daemon/history-repair.js", () => ({
223
+ repairHistory: (msgs: Message[]) => ({
224
+ messages: msgs,
225
+ stats: {
226
+ assistantToolResultsMigrated: 0,
227
+ missingToolResultsInserted: 0,
228
+ orphanToolResultsDowngraded: 0,
229
+ consecutiveSameRoleMerged: 0,
230
+ },
231
+ }),
232
+ deepRepairHistory: (msgs: Message[]) => ({ messages: msgs, stats: {} }),
233
+ }));
234
+
235
+ mock.module("../daemon/session-history.js", () => ({
236
+ consolidateAssistantMessages: () => {},
237
+ }));
238
+
239
+ const recordUsageMock = mock(() => {});
240
+ mock.module("../daemon/session-usage.js", () => ({
241
+ recordUsage: recordUsageMock,
242
+ }));
243
+
244
+ const resolveAssistantAttachmentsMock = mock(async () => ({
245
+ assistantAttachments: [],
246
+ emittedAttachments: [],
247
+ directiveWarnings: [],
248
+ }));
249
+ mock.module("../daemon/session-attachments.js", () => ({
250
+ resolveAssistantAttachments: resolveAssistantAttachmentsMock,
251
+ approveHostAttachmentRead: async () => true,
252
+ formatAttachmentWarnings: () => "",
253
+ }));
254
+
255
+ mock.module("../daemon/assistant-attachments.js", () => ({
256
+ cleanAssistantContent: (content: unknown[]) => ({
257
+ cleanedContent: content,
258
+ directives: [],
259
+ warnings: [],
260
+ }),
261
+ drainDirectiveDisplayBuffer: (buffer: string) => ({
262
+ emitText: buffer,
263
+ bufferedRemainder: "",
264
+ }),
265
+ }));
266
+
267
+ mock.module("../daemon/session-media-retry.js", () => ({
268
+ stripMediaPayloadsForRetry: (msgs: Message[]) => ({
269
+ messages: msgs,
270
+ modified: false,
271
+ replacedBlocks: 0,
272
+ latestUserIndex: null,
273
+ }),
274
+ raceWithTimeout: async () => "completed" as const,
275
+ }));
276
+
277
+ mock.module("../workspace/turn-commit.js", () => ({
278
+ commitTurnChanges: async () => {},
279
+ }));
280
+
281
+ mock.module("../workspace/git-service.js", () => ({
282
+ getWorkspaceGitService: () => ({
283
+ ensureInitialized: async () => {},
284
+ }),
285
+ }));
286
+
287
+ mock.module("../daemon/session-error.js", () => ({
288
+ classifySessionError: (_err: unknown, _ctx: unknown) => ({
289
+ code: "SESSION_PROCESSING_FAILED",
290
+ userMessage: "Something went wrong processing your message.",
291
+ retryable: false,
292
+ errorCategory: "processing_failed",
293
+ }),
294
+ isUserCancellation: (err: unknown, ctx: { aborted?: boolean }) => {
295
+ if (!ctx.aborted) return false;
296
+ if (err instanceof DOMException && err.name === "AbortError") return true;
297
+ if (err instanceof Error && err.name === "AbortError") return true;
298
+ return false;
299
+ },
300
+ buildSessionErrorMessage: (
301
+ sessionId: string,
302
+ classified: Record<string, unknown>,
303
+ ) => ({
304
+ type: "session_error",
305
+ sessionId,
306
+ ...classified,
307
+ }),
308
+ isContextTooLarge: (msg: string) =>
309
+ /context.?length.?exceeded|prompt.?is.?too.?long|too many.*input.*tokens/i.test(
310
+ msg,
311
+ ),
312
+ }));
313
+
314
+ mock.module("../daemon/session-slash.js", () => ({
315
+ isProviderOrderingError: (msg: string) =>
316
+ /ordering|before.*after|messages.*order/i.test(msg),
317
+ }));
318
+
319
+ mock.module("../util/truncate.js", () => ({
320
+ truncate: (s: string) => s,
321
+ }));
322
+
323
+ mock.module("../agent/message-types.js", () => ({
324
+ createAssistantMessage: (text: string) => ({
325
+ role: "assistant" as const,
326
+ content: [{ type: "text", text }],
327
+ }),
328
+ }));
329
+
330
+ mock.module("../memory/llm-request-log-store.js", () => ({
331
+ recordRequestLog: () => {},
332
+ }));
333
+
334
+ // ── Imports (after mocks) ────────────────────────────────────────────
335
+
336
+ import {
337
+ type AgentLoopSessionContext,
338
+ runAgentLoopImpl,
339
+ } from "../daemon/session-agent-loop.js";
340
+
341
+ // ── Test helpers ─────────────────────────────────────────────────────
342
+
343
+ type AgentLoopRun = (
344
+ messages: Message[],
345
+ onEvent: (event: AgentEvent) => void,
346
+ signal?: AbortSignal,
347
+ requestId?: string,
348
+ onCheckpoint?: (checkpoint: CheckpointInfo) => CheckpointDecision,
349
+ ) => Promise<Message[]>;
350
+
351
+ function makeCtx(
352
+ overrides?: Partial<AgentLoopSessionContext> & {
353
+ agentLoopRun?: AgentLoopRun;
354
+ },
355
+ ): AgentLoopSessionContext {
356
+ const agentLoopRun =
357
+ overrides?.agentLoopRun ??
358
+ (async (messages: Message[]) => [
359
+ ...messages,
360
+ {
361
+ role: "assistant" as const,
362
+ content: [{ type: "text" as const, text: "response" }],
363
+ },
364
+ ]);
365
+
366
+ return {
367
+ conversationId: "test-conv",
368
+ messages: [
369
+ { role: "user", content: [{ type: "text", text: "Hello" }] },
370
+ ] as Message[],
371
+ processing: true,
372
+ abortController: new AbortController(),
373
+ currentRequestId: "test-req",
374
+
375
+ agentLoop: {
376
+ run: agentLoopRun,
377
+ } as unknown as AgentLoopSessionContext["agentLoop"],
378
+ provider: {
379
+ name: "mock-provider",
380
+ sendMessage: async () => ({
381
+ content: [{ type: "text", text: "title" }],
382
+ model: "mock",
383
+ usage: { inputTokens: 0, outputTokens: 0 },
384
+ stopReason: "end_turn",
385
+ }),
386
+ } as unknown as AgentLoopSessionContext["provider"],
387
+ systemPrompt: "system prompt",
388
+
389
+ contextWindowManager: {
390
+ shouldCompact: () => ({ needed: false, estimatedTokens: 0 }),
391
+ maybeCompact: async () => ({ compacted: false }),
392
+ } as unknown as AgentLoopSessionContext["contextWindowManager"],
393
+ contextCompactedMessageCount: 0,
394
+ contextCompactedAt: null,
395
+
396
+ memoryPolicy: { scopeId: "default", includeDefaultFallback: true },
397
+
398
+ currentActiveSurfaceId: undefined,
399
+ currentPage: undefined,
400
+ surfaceState: new Map(),
401
+ pendingSurfaceActions: new Map(),
402
+ surfaceActionRequestIds: new Set<string>(),
403
+ currentTurnSurfaces: [],
404
+
405
+ workingDir: "/tmp",
406
+ workspaceTopLevelContext: null,
407
+ workspaceTopLevelDirty: false,
408
+ channelCapabilities: undefined,
409
+ commandIntent: undefined,
410
+ trustContext: undefined,
411
+
412
+ coreToolNames: new Set(),
413
+ allowedToolNames: undefined,
414
+ preactivatedSkillIds: undefined,
415
+ skillProjectionState: new Map(),
416
+ skillProjectionCache:
417
+ new Map() as unknown as AgentLoopSessionContext["skillProjectionCache"],
418
+
419
+ traceEmitter: {
420
+ emit: () => {},
421
+ } as unknown as AgentLoopSessionContext["traceEmitter"],
422
+ profiler: {
423
+ startRequest: () => {},
424
+ emitSummary: () => {},
425
+ } as unknown as AgentLoopSessionContext["profiler"],
426
+ usageStats: {
427
+ totalInputTokens: 0,
428
+ totalOutputTokens: 0,
429
+ totalEstimatedCost: 0,
430
+ model: "",
431
+ },
432
+ turnCount: 0,
433
+
434
+ lastAssistantAttachments: [],
435
+ lastAttachmentWarnings: [],
436
+
437
+ hasNoClient: false,
438
+ prompter: {} as unknown as AgentLoopSessionContext["prompter"],
439
+ queue: {} as unknown as AgentLoopSessionContext["queue"],
440
+
441
+ getWorkspaceGitService: () => ({ ensureInitialized: async () => {} }),
442
+ commitTurnChanges: async () => {},
443
+
444
+ refreshWorkspaceTopLevelContextIfNeeded: () => {},
445
+ markWorkspaceTopLevelDirty: () => {},
446
+ emitActivityState: () => {},
447
+ getQueueDepth: () => 0,
448
+ hasQueuedMessages: () => false,
449
+ canHandoffAtCheckpoint: () => false,
450
+ drainQueue: () => {},
451
+ getTurnInterfaceContext: () => null,
452
+ getTurnChannelContext: () => ({
453
+ userMessageChannel: "vellum" as const,
454
+ assistantMessageChannel: "vellum" as const,
455
+ }),
456
+
457
+ ...overrides,
458
+ } as AgentLoopSessionContext;
459
+ }
460
+
461
+ /**
462
+ * Build a realistic long conversation with interleaved tool calls.
463
+ * Returns an array of messages simulating a 75+ message conversation
464
+ * with a mix of text, tool_use, and tool_result blocks.
465
+ */
466
+ function buildLongConversation(messageCount: number): Message[] {
467
+ const messages: Message[] = [];
468
+ for (let i = 0; i < messageCount; i++) {
469
+ if (i % 3 === 0) {
470
+ // User text message
471
+ messages.push({
472
+ role: "user",
473
+ content: [
474
+ {
475
+ type: "text",
476
+ text: `User message ${i}: ${"x".repeat(200)} some detailed instructions about the task at hand`,
477
+ },
478
+ ],
479
+ });
480
+ } else if (i % 3 === 1) {
481
+ // Assistant with tool_use
482
+ messages.push({
483
+ role: "assistant",
484
+ content: [
485
+ { type: "text", text: `Thinking about step ${i}...` },
486
+ {
487
+ type: "tool_use",
488
+ id: `tool-${i}`,
489
+ name: i % 6 === 1 ? "bash" : "file_read",
490
+ input: {
491
+ command: `some command ${i}`,
492
+ path: `/path/to/file-${i}.ts`,
493
+ },
494
+ },
495
+ ],
496
+ });
497
+ } else {
498
+ // User with tool_result
499
+ messages.push({
500
+ role: "user",
501
+ content: [
502
+ {
503
+ type: "tool_result",
504
+ tool_use_id: `tool-${i - 1}`,
505
+ content: `Result of tool call ${i - 1}: ${"output data ".repeat(50)}`,
506
+ is_error: false,
507
+ },
508
+ ],
509
+ });
510
+ }
511
+ }
512
+ return messages as Message[];
513
+ }
514
+
515
+ // ── Tests ────────────────────────────────────────────────────────────
516
+
517
+ beforeEach(() => {
518
+ hookBlocked = false;
519
+ hookBlockedBy = "";
520
+ mockEstimateTokens = 1000;
521
+ mockReducerStepFn = null;
522
+ mockOverflowAction = "fail_gracefully";
523
+ mockApprovalResult = { approved: false };
524
+ recordUsageMock.mockClear();
525
+ });
526
+
527
+ describe("session-agent-loop overflow recovery (JARVIS-110)", () => {
528
+ // ── Test 1 ────────────────────────────────────────────────────────
529
+ // BUG: When the agent loop makes progress (adds messages to history)
530
+ // before hitting context_too_large, the convergence loop at line 864
531
+ // checks `updatedHistory.length === preRunHistoryLength` which is
532
+ // false when progress was made. This means the reducer is never
533
+ // invoked — the error is surfaced immediately at line 1163-1175
534
+ // without any compaction attempt.
535
+ //
536
+ // Expected behavior (PR 2 fix): After progress + context_too_large,
537
+ // the system should still attempt compaction before surfacing error.
538
+ test("context too large after progress triggers compaction retry instead of immediate failure", async () => {
539
+ const events: ServerMessage[] = [];
540
+ let reducerCalled = false;
541
+
542
+ mockReducerStepFn = (msgs: Message[]) => {
543
+ reducerCalled = true;
544
+ return {
545
+ messages: msgs,
546
+ tier: "forced_compaction",
547
+ state: {
548
+ appliedTiers: ["forced_compaction"],
549
+ injectionMode: "full",
550
+ exhausted: false,
551
+ },
552
+ estimatedTokens: 50_000,
553
+ compactionResult: {
554
+ compacted: true,
555
+ messages: msgs,
556
+ compactedPersistedMessages: 5,
557
+ summaryText: "Summary",
558
+ previousEstimatedInputTokens: 190_000,
559
+ estimatedInputTokens: 50_000,
560
+ maxInputTokens: 200_000,
561
+ thresholdTokens: 160_000,
562
+ compactedMessages: 10,
563
+ summaryCalls: 1,
564
+ summaryInputTokens: 500,
565
+ summaryOutputTokens: 200,
566
+ summaryModel: "mock-model",
567
+ },
568
+ };
569
+ };
570
+
571
+ let agentLoopCallCount = 0;
572
+ const agentLoopRun: AgentLoopRun = async (messages, onEvent) => {
573
+ agentLoopCallCount++;
574
+ if (agentLoopCallCount === 1) {
575
+ // Simulate: agent makes progress (tool calls + results added)
576
+ // then hits context_too_large on next LLM call
577
+ const progressMessages: Message[] = [
578
+ ...messages,
579
+ {
580
+ role: "assistant" as const,
581
+ content: [
582
+ { type: "text", text: "Let me check that." },
583
+ {
584
+ type: "tool_use",
585
+ id: "tu-progress",
586
+ name: "bash",
587
+ input: { command: "ls" },
588
+ },
589
+ ] as ContentBlock[],
590
+ },
591
+ {
592
+ role: "user" as const,
593
+ content: [
594
+ {
595
+ type: "tool_result",
596
+ tool_use_id: "tu-progress",
597
+ content: "file1.ts\nfile2.ts",
598
+ is_error: false,
599
+ },
600
+ ] as ContentBlock[],
601
+ },
602
+ ];
603
+
604
+ // Emit events for the progress that was made
605
+ onEvent({
606
+ type: "tool_use",
607
+ id: "tu-progress",
608
+ name: "bash",
609
+ input: { command: "ls" },
610
+ });
611
+ onEvent({
612
+ type: "tool_result",
613
+ toolUseId: "tu-progress",
614
+ content: "file1.ts\nfile2.ts",
615
+ isError: false,
616
+ });
617
+ onEvent({
618
+ type: "message_complete",
619
+ message: {
620
+ role: "assistant",
621
+ content: [
622
+ { type: "text", text: "Let me check that." },
623
+ {
624
+ type: "tool_use",
625
+ id: "tu-progress",
626
+ name: "bash",
627
+ input: { command: "ls" },
628
+ },
629
+ ],
630
+ },
631
+ });
632
+ onEvent({
633
+ type: "usage",
634
+ inputTokens: 100,
635
+ outputTokens: 50,
636
+ model: "test-model",
637
+ providerDurationMs: 100,
638
+ });
639
+
640
+ // Then context_too_large error occurs on the *next* LLM call
641
+ onEvent({
642
+ type: "error",
643
+ error: new Error(
644
+ "prompt is too long: 242201 tokens > 200000 maximum",
645
+ ),
646
+ });
647
+ onEvent({
648
+ type: "usage",
649
+ inputTokens: 0,
650
+ outputTokens: 0,
651
+ model: "test-model",
652
+ providerDurationMs: 10,
653
+ });
654
+
655
+ // Return the history WITH progress (more messages than input)
656
+ return progressMessages;
657
+ }
658
+
659
+ // Second call (after compaction): succeed
660
+ onEvent({
661
+ type: "message_complete",
662
+ message: {
663
+ role: "assistant",
664
+ content: [{ type: "text", text: "recovered after compaction" }],
665
+ },
666
+ });
667
+ onEvent({
668
+ type: "usage",
669
+ inputTokens: 50,
670
+ outputTokens: 25,
671
+ model: "test-model",
672
+ providerDurationMs: 100,
673
+ });
674
+ return [
675
+ ...messages,
676
+ {
677
+ role: "assistant" as const,
678
+ content: [
679
+ { type: "text", text: "recovered after compaction" },
680
+ ] as ContentBlock[],
681
+ },
682
+ ];
683
+ };
684
+
685
+ const ctx = makeCtx({
686
+ agentLoopRun,
687
+ contextWindowManager: {
688
+ shouldCompact: () => ({ needed: false, estimatedTokens: 0 }),
689
+ maybeCompact: async () => ({ compacted: false }),
690
+ } as unknown as AgentLoopSessionContext["contextWindowManager"],
691
+ });
692
+
693
+ await runAgentLoopImpl(ctx, "hello", "msg-1", (msg) => events.push(msg));
694
+
695
+ // BUG: Currently the reducer is NOT called when progress was made before
696
+ // context_too_large. The error is surfaced immediately.
697
+ // After PR 2 fix, the reducer SHOULD be called to attempt compaction.
698
+ expect(reducerCalled).toBe(true);
699
+
700
+ // BUG: Currently a session_error IS emitted instead of retrying.
701
+ // After PR 2 fix, there should be no session_error.
702
+ const sessionError = events.find((e) => e.type === "session_error");
703
+ expect(sessionError).toBeUndefined();
704
+ });
705
+
706
+ // ── Test 2 ────────────────────────────────────────────────────────
707
+ // When estimation says we're within budget but the provider rejects,
708
+ // the post-run convergence loop should kick in and recover.
709
+ // This test should PASS against current code (when no progress is made).
710
+ test("overflow recovery compacts below limit even when estimation underestimates", async () => {
711
+ const events: ServerMessage[] = [];
712
+ let callCount = 0;
713
+ let reducerCalled = false;
714
+
715
+ // Estimator says 185k (below 190k budget = 200k * 0.95)
716
+ mockEstimateTokens = 185_000;
717
+
718
+ // Reducer successfully compacts
719
+ mockReducerStepFn = (msgs: Message[]) => {
720
+ reducerCalled = true;
721
+ return {
722
+ messages: msgs,
723
+ tier: "forced_compaction",
724
+ state: {
725
+ appliedTiers: ["forced_compaction"],
726
+ injectionMode: "full",
727
+ exhausted: false,
728
+ },
729
+ estimatedTokens: 100_000,
730
+ compactionResult: {
731
+ compacted: true,
732
+ messages: msgs,
733
+ compactedPersistedMessages: 10,
734
+ summaryText: "Summary",
735
+ previousEstimatedInputTokens: 185_000,
736
+ estimatedInputTokens: 100_000,
737
+ maxInputTokens: 200_000,
738
+ thresholdTokens: 160_000,
739
+ compactedMessages: 20,
740
+ summaryCalls: 1,
741
+ summaryInputTokens: 800,
742
+ summaryOutputTokens: 300,
743
+ summaryModel: "mock-model",
744
+ },
745
+ };
746
+ };
747
+
748
+ const agentLoopRun: AgentLoopRun = async (messages, onEvent) => {
749
+ callCount++;
750
+ if (callCount === 1) {
751
+ // Provider rejects with "prompt is too long: 242201 tokens > 200000"
752
+ // even though estimator said 185k
753
+ onEvent({
754
+ type: "error",
755
+ error: new Error(
756
+ "prompt is too long: 242201 tokens > 200000 maximum",
757
+ ),
758
+ });
759
+ onEvent({
760
+ type: "usage",
761
+ inputTokens: 0,
762
+ outputTokens: 0,
763
+ model: "test-model",
764
+ providerDurationMs: 10,
765
+ });
766
+ // No progress — return same messages
767
+ return messages;
768
+ }
769
+ // Second call succeeds
770
+ onEvent({
771
+ type: "message_complete",
772
+ message: {
773
+ role: "assistant",
774
+ content: [{ type: "text", text: "recovered" }],
775
+ },
776
+ });
777
+ onEvent({
778
+ type: "usage",
779
+ inputTokens: 80_000,
780
+ outputTokens: 200,
781
+ model: "test-model",
782
+ providerDurationMs: 500,
783
+ });
784
+ return [
785
+ ...messages,
786
+ {
787
+ role: "assistant" as const,
788
+ content: [{ type: "text", text: "recovered" }] as ContentBlock[],
789
+ },
790
+ ];
791
+ };
792
+
793
+ const ctx = makeCtx({
794
+ agentLoopRun,
795
+ contextWindowManager: {
796
+ shouldCompact: () => ({ needed: false, estimatedTokens: 0 }),
797
+ maybeCompact: async () => ({ compacted: false }),
798
+ } as unknown as AgentLoopSessionContext["contextWindowManager"],
799
+ });
800
+
801
+ await runAgentLoopImpl(ctx, "hello", "msg-1", (msg) => events.push(msg));
802
+
803
+ // The reducer should be called in the convergence loop
804
+ expect(reducerCalled).toBe(true);
805
+ // Should recover without session_error
806
+ const sessionError = events.find((e) => e.type === "session_error");
807
+ expect(sessionError).toBeUndefined();
808
+ expect(callCount).toBe(2);
809
+ });
810
+
811
+ // ── Test 3 ────────────────────────────────────────────────────────
812
+ // BUG: When the provider rejection reveals actual token count (e.g.,
813
+ // "242201 tokens > 200000"), the reducer should target a budget below
814
+ // the actual limit (not below the estimator's inaccurate budget).
815
+ // Currently the reducer always uses `preflightBudget` (190k) as the
816
+ // target, but the actual tokens were 242k — so 190k is already too
817
+ // high relative to the real count. The target should be adjusted
818
+ // downward based on the observed mismatch.
819
+ //
820
+ // Expected behavior (PR 4 fix): `targetInputTokensOverride` should
821
+ // be adjusted based on the ratio between estimated and actual tokens.
822
+ // BUG: The targetTokens passed to the reducer is preflightBudget = 190k.
823
+ // But when the actual token count is 242k (1.31x the estimate of 185k),
824
+ // the target should be adjusted downward to account for the estimation
825
+ // inaccuracy. For example: 190k / 1.31 ≈ 145k.
826
+ // Planned fix: targetInputTokensOverride should be adjusted based on
827
+ // the ratio between estimated and actual tokens.
828
+ test("forced compaction targets a lower budget when estimation has been inaccurate", async () => {
829
+ const events: ServerMessage[] = [];
830
+ let callCount = 0;
831
+ let capturedTargetTokens: number | undefined;
832
+
833
+ // Estimator says 185k (below 190k budget = 200k * 0.95)
834
+ mockEstimateTokens = 185_000;
835
+
836
+ // Reducer captures the targetTokens from the config
837
+ mockReducerStepFn = (
838
+ msgs: Message[],
839
+ cfg: unknown,
840
+ ) => {
841
+ capturedTargetTokens = (cfg as { targetTokens: number }).targetTokens;
842
+ return {
843
+ messages: msgs,
844
+ tier: "forced_compaction",
845
+ state: {
846
+ appliedTiers: ["forced_compaction"],
847
+ injectionMode: "full",
848
+ exhausted: false,
849
+ },
850
+ estimatedTokens: 100_000,
851
+ compactionResult: {
852
+ compacted: true,
853
+ messages: msgs,
854
+ compactedPersistedMessages: 10,
855
+ summaryText: "Summary",
856
+ previousEstimatedInputTokens: 185_000,
857
+ estimatedInputTokens: 100_000,
858
+ maxInputTokens: 200_000,
859
+ thresholdTokens: 160_000,
860
+ compactedMessages: 20,
861
+ summaryCalls: 1,
862
+ summaryInputTokens: 800,
863
+ summaryOutputTokens: 300,
864
+ summaryModel: "mock-model",
865
+ },
866
+ };
867
+ };
868
+
869
+ const agentLoopRun: AgentLoopRun = async (messages, onEvent) => {
870
+ callCount++;
871
+ if (callCount === 1) {
872
+ // Provider rejects: actual tokens 242201, way above estimate of 185k
873
+ onEvent({
874
+ type: "error",
875
+ error: new Error(
876
+ "prompt is too long: 242201 tokens > 200000 maximum",
877
+ ),
878
+ });
879
+ onEvent({
880
+ type: "usage",
881
+ inputTokens: 0,
882
+ outputTokens: 0,
883
+ model: "test-model",
884
+ providerDurationMs: 10,
885
+ });
886
+ // No progress — return same messages
887
+ return messages;
888
+ }
889
+ // Second call succeeds after compaction
890
+ onEvent({
891
+ type: "message_complete",
892
+ message: {
893
+ role: "assistant",
894
+ content: [{ type: "text", text: "recovered" }],
895
+ },
896
+ });
897
+ onEvent({
898
+ type: "usage",
899
+ inputTokens: 80_000,
900
+ outputTokens: 200,
901
+ model: "test-model",
902
+ providerDurationMs: 500,
903
+ });
904
+ return [
905
+ ...messages,
906
+ {
907
+ role: "assistant" as const,
908
+ content: [{ type: "text", text: "recovered" }] as ContentBlock[],
909
+ },
910
+ ];
911
+ };
912
+
913
+ const ctx = makeCtx({
914
+ agentLoopRun,
915
+ contextWindowManager: {
916
+ shouldCompact: () => ({ needed: false, estimatedTokens: 0 }),
917
+ maybeCompact: async () => ({ compacted: false }),
918
+ } as unknown as AgentLoopSessionContext["contextWindowManager"],
919
+ });
920
+
921
+ await runAgentLoopImpl(ctx, "hello", "msg-1", (msg) => events.push(msg));
922
+
923
+ // The reducer should have been called with a corrected target
924
+ expect(capturedTargetTokens).toBeDefined();
925
+
926
+ // preflightBudget = 200_000 * 0.95 = 190_000
927
+ // estimationErrorRatio = 242201 / 185000 ≈ 1.309
928
+ // correctedTarget = floor(190000 / 1.309) ≈ 145_130
929
+ // The corrected target must be LESS than the uncorrected preflightBudget
930
+ const preflightBudget = 190_000;
931
+ expect(capturedTargetTokens!).toBeLessThan(preflightBudget);
932
+
933
+ // Verify the approximate corrected value (190000 / (242201/185000))
934
+ const expectedCorrectedTarget = Math.floor(
935
+ preflightBudget / (242201 / 185_000),
936
+ );
937
+ expect(capturedTargetTokens!).toBe(expectedCorrectedTarget);
938
+
939
+ // Should recover without session_error
940
+ const sessionError = events.find((e) => e.type === "session_error");
941
+ expect(sessionError).toBeUndefined();
942
+ expect(callCount).toBe(2);
943
+ });
944
+
945
+ // ── Test 4 ────────────────────────────────────────────────────────
946
+ // A realistic 75+ message conversation with many tool calls where
947
+ // token estimation underestimates. This test should PASS against
948
+ // current code because the agent loop returns same-length history
949
+ // (no progress), so the convergence loop kicks in.
950
+ test("overflow recovery succeeds for 75+ message conversation with many tool calls", async () => {
951
+ const events: ServerMessage[] = [];
952
+ const longHistory = buildLongConversation(75);
953
+ let callCount = 0;
954
+ let reducerCalled = false;
955
+
956
+ // Estimator says ~195k — just above budget so preflight reducer runs
957
+ mockEstimateTokens = 195_000;
958
+
959
+ // Reducer reduces to under budget
960
+ mockReducerStepFn = (msgs: Message[]) => {
961
+ reducerCalled = true;
962
+ return {
963
+ messages: msgs.slice(-10), // Keep only last 10 messages
964
+ tier: "forced_compaction",
965
+ state: {
966
+ appliedTiers: ["forced_compaction"],
967
+ injectionMode: "full",
968
+ exhausted: false,
969
+ },
970
+ estimatedTokens: 50_000,
971
+ compactionResult: {
972
+ compacted: true,
973
+ messages: msgs.slice(-10),
974
+ compactedPersistedMessages: msgs.length - 10,
975
+ summaryText: "Long conversation summary",
976
+ previousEstimatedInputTokens: 195_000,
977
+ estimatedInputTokens: 50_000,
978
+ maxInputTokens: 200_000,
979
+ thresholdTokens: 160_000,
980
+ compactedMessages: msgs.length - 10,
981
+ summaryCalls: 2,
982
+ summaryInputTokens: 2000,
983
+ summaryOutputTokens: 500,
984
+ summaryModel: "mock-model",
985
+ },
986
+ };
987
+ };
988
+
989
+ const agentLoopRun: AgentLoopRun = async (messages, onEvent) => {
990
+ callCount++;
991
+ onEvent({
992
+ type: "message_complete",
993
+ message: {
994
+ role: "assistant",
995
+ content: [{ type: "text", text: "Here's the analysis..." }],
996
+ },
997
+ });
998
+ onEvent({
999
+ type: "usage",
1000
+ inputTokens: 50_000,
1001
+ outputTokens: 300,
1002
+ model: "test-model",
1003
+ providerDurationMs: 800,
1004
+ });
1005
+ return [
1006
+ ...messages,
1007
+ {
1008
+ role: "assistant" as const,
1009
+ content: [
1010
+ { type: "text", text: "Here's the analysis..." },
1011
+ ] as ContentBlock[],
1012
+ },
1013
+ ];
1014
+ };
1015
+
1016
+ const ctx = makeCtx({
1017
+ agentLoopRun,
1018
+ messages: longHistory,
1019
+ contextWindowManager: {
1020
+ shouldCompact: () => ({ needed: false, estimatedTokens: 0 }),
1021
+ maybeCompact: async () => ({ compacted: false }),
1022
+ } as unknown as AgentLoopSessionContext["contextWindowManager"],
1023
+ });
1024
+
1025
+ await runAgentLoopImpl(ctx, "analyze this", "msg-1", (msg) =>
1026
+ events.push(msg),
1027
+ );
1028
+
1029
+ // Preflight should trigger the reducer since 195k > 190k budget
1030
+ expect(reducerCalled).toBe(true);
1031
+ // Should succeed
1032
+ expect(callCount).toBe(1);
1033
+ const sessionError = events.find((e) => e.type === "session_error");
1034
+ expect(sessionError).toBeUndefined();
1035
+ const complete = events.find((e) => e.type === "message_complete");
1036
+ expect(complete).toBeDefined();
1037
+ });
1038
+
1039
+ // ── Test 5 ────────────────────────────────────────────────────────
1040
+ // BUG: When all 4 reducer tiers have been applied, then the agent
1041
+ // makes progress and context_too_large fires again, no emergency
1042
+ // compaction is attempted. The `else if` at line 1163 just surfaces
1043
+ // the error.
1044
+ //
1045
+ // Expected behavior (PR 2 fix): Even after all tiers are exhausted,
1046
+ // if progress was made, attempt emergency compaction with
1047
+ // `minKeepRecentUserTurns: 0` as a last resort.
1048
+ test("exhausted reducer tiers with progress still attempts emergency compaction", async () => {
1049
+ const events: ServerMessage[] = [];
1050
+ let emergencyCompactCalled = false;
1051
+
1052
+ // Start with reducer already exhausted
1053
+ mockReducerStepFn = (msgs: Message[]) => {
1054
+ return {
1055
+ messages: msgs,
1056
+ tier: "injection_downgrade",
1057
+ state: {
1058
+ appliedTiers: [
1059
+ "forced_compaction",
1060
+ "tool_result_truncation",
1061
+ "media_stubbing",
1062
+ "injection_downgrade",
1063
+ ],
1064
+ injectionMode: "minimal",
1065
+ exhausted: true,
1066
+ },
1067
+ estimatedTokens: 195_000,
1068
+ };
1069
+ };
1070
+
1071
+ let agentLoopCallCount = 0;
1072
+ const agentLoopRun: AgentLoopRun = async (messages, onEvent) => {
1073
+ agentLoopCallCount++;
1074
+ if (agentLoopCallCount === 1) {
1075
+ // Agent makes progress (tool calls succeed, messages grow)
1076
+ const progressMessages: Message[] = [
1077
+ ...messages,
1078
+ {
1079
+ role: "assistant" as const,
1080
+ content: [
1081
+ { type: "text", text: "Running analysis..." },
1082
+ {
1083
+ type: "tool_use",
1084
+ id: "tu-1",
1085
+ name: "bash",
1086
+ input: { command: "find . -name '*.ts'" },
1087
+ },
1088
+ ] as ContentBlock[],
1089
+ },
1090
+ {
1091
+ role: "user" as const,
1092
+ content: [
1093
+ {
1094
+ type: "tool_result",
1095
+ tool_use_id: "tu-1",
1096
+ content: "file1.ts\nfile2.ts\nfile3.ts",
1097
+ is_error: false,
1098
+ },
1099
+ ] as ContentBlock[],
1100
+ },
1101
+ ];
1102
+
1103
+ onEvent({
1104
+ type: "tool_use",
1105
+ id: "tu-1",
1106
+ name: "bash",
1107
+ input: { command: "find . -name '*.ts'" },
1108
+ });
1109
+ onEvent({
1110
+ type: "tool_result",
1111
+ toolUseId: "tu-1",
1112
+ content: "file1.ts\nfile2.ts\nfile3.ts",
1113
+ isError: false,
1114
+ });
1115
+ onEvent({
1116
+ type: "message_complete",
1117
+ message: {
1118
+ role: "assistant",
1119
+ content: [
1120
+ { type: "text", text: "Running analysis..." },
1121
+ {
1122
+ type: "tool_use",
1123
+ id: "tu-1",
1124
+ name: "bash",
1125
+ input: { command: "find . -name '*.ts'" },
1126
+ },
1127
+ ],
1128
+ },
1129
+ });
1130
+ onEvent({
1131
+ type: "usage",
1132
+ inputTokens: 190_000,
1133
+ outputTokens: 100,
1134
+ model: "test-model",
1135
+ providerDurationMs: 200,
1136
+ });
1137
+
1138
+ // Then context_too_large on the next LLM call within the loop
1139
+ onEvent({
1140
+ type: "error",
1141
+ error: new Error("context_length_exceeded"),
1142
+ });
1143
+ onEvent({
1144
+ type: "usage",
1145
+ inputTokens: 0,
1146
+ outputTokens: 0,
1147
+ model: "test-model",
1148
+ providerDurationMs: 10,
1149
+ });
1150
+
1151
+ return progressMessages;
1152
+ }
1153
+
1154
+ // After emergency compaction, succeed
1155
+ onEvent({
1156
+ type: "message_complete",
1157
+ message: {
1158
+ role: "assistant",
1159
+ content: [{ type: "text", text: "recovered" }],
1160
+ },
1161
+ });
1162
+ onEvent({
1163
+ type: "usage",
1164
+ inputTokens: 50_000,
1165
+ outputTokens: 100,
1166
+ model: "test-model",
1167
+ providerDurationMs: 200,
1168
+ });
1169
+ return [
1170
+ ...messages,
1171
+ {
1172
+ role: "assistant" as const,
1173
+ content: [{ type: "text", text: "recovered" }] as ContentBlock[],
1174
+ },
1175
+ ];
1176
+ };
1177
+
1178
+ const ctx = makeCtx({
1179
+ agentLoopRun,
1180
+ contextWindowManager: {
1181
+ shouldCompact: () => ({ needed: false, estimatedTokens: 0 }),
1182
+ maybeCompact: async (
1183
+ _msgs: Message[],
1184
+ _signal: AbortSignal,
1185
+ opts?: Record<string, unknown>,
1186
+ ) => {
1187
+ if (opts?.force && opts?.minKeepRecentUserTurns === 0) {
1188
+ emergencyCompactCalled = true;
1189
+ return {
1190
+ compacted: true,
1191
+ messages: [
1192
+ {
1193
+ role: "user",
1194
+ content: [{ type: "text", text: "Hello" }],
1195
+ },
1196
+ ] as Message[],
1197
+ compactedPersistedMessages: 50,
1198
+ summaryText: "Emergency summary",
1199
+ previousEstimatedInputTokens: 195_000,
1200
+ estimatedInputTokens: 50_000,
1201
+ maxInputTokens: 200_000,
1202
+ thresholdTokens: 160_000,
1203
+ compactedMessages: 50,
1204
+ summaryCalls: 1,
1205
+ summaryInputTokens: 1000,
1206
+ summaryOutputTokens: 300,
1207
+ summaryModel: "mock-model",
1208
+ };
1209
+ }
1210
+ return { compacted: false };
1211
+ },
1212
+ } as unknown as AgentLoopSessionContext["contextWindowManager"],
1213
+ });
1214
+
1215
+ await runAgentLoopImpl(ctx, "hello", "msg-1", (msg) => events.push(msg));
1216
+
1217
+ // BUG: Currently when progress was made + all tiers exhausted,
1218
+ // emergency compaction is NOT attempted. The error is surfaced directly.
1219
+ // After PR 2 fix, emergency compaction should be attempted.
1220
+ expect(emergencyCompactCalled).toBe(true);
1221
+
1222
+ // BUG: Currently a session_error IS emitted.
1223
+ const sessionError = events.find((e) => e.type === "session_error");
1224
+ expect(sessionError).toBeUndefined();
1225
+ });
1226
+
1227
+ // ── Test 6 ────────────────────────────────────────────────────────
1228
+ // Tests mid-loop budget check via onCheckpoint.
1229
+ // The onCheckpoint callback estimates prompt tokens after each tool round.
1230
+ // When estimate exceeds the mid-loop threshold (85% of budget),
1231
+ // it returns "yield" to break the agent loop.
1232
+ // The session-agent-loop then runs compaction and re-enters the agent loop.
1233
+ test("onCheckpoint yields when token estimate exceeds mid-loop budget threshold", async () => {
1234
+ const events: ServerMessage[] = [];
1235
+ let compactionCalled = false;
1236
+
1237
+ // estimatePromptTokens is called:
1238
+ // 1. During preflight budget check (low value, below budget)
1239
+ // 2. During onCheckpoint mid-loop check (high value, above 85% threshold)
1240
+ // Budget = 200_000 * 0.95 = 190_000
1241
+ // Mid-loop threshold = 190_000 * 0.85 = 161_500
1242
+ let estimateCallCount = 0;
1243
+ mockEstimateTokens = () => {
1244
+ estimateCallCount++;
1245
+ // First call: preflight check — below budget
1246
+ if (estimateCallCount === 1) return 100_000;
1247
+ // Subsequent calls: mid-loop check — above 85% threshold
1248
+ return 170_000;
1249
+ };
1250
+
1251
+ let agentLoopCallCount = 0;
1252
+ const agentLoopRun: AgentLoopRun = async (
1253
+ messages,
1254
+ onEvent,
1255
+ _signal,
1256
+ _requestId,
1257
+ onCheckpoint,
1258
+ ) => {
1259
+ agentLoopCallCount++;
1260
+
1261
+ if (agentLoopCallCount === 1) {
1262
+ // Simulate a tool round: assistant calls a tool, results come back
1263
+ const withProgress: Message[] = [
1264
+ ...messages,
1265
+ {
1266
+ role: "assistant" as const,
1267
+ content: [
1268
+ { type: "text", text: "Let me check." },
1269
+ {
1270
+ type: "tool_use",
1271
+ id: "tu-1",
1272
+ name: "bash",
1273
+ input: { command: "ls" },
1274
+ },
1275
+ ] as ContentBlock[],
1276
+ },
1277
+ {
1278
+ role: "user" as const,
1279
+ content: [
1280
+ {
1281
+ type: "tool_result",
1282
+ tool_use_id: "tu-1",
1283
+ content: "file1.ts\nfile2.ts",
1284
+ is_error: false,
1285
+ },
1286
+ ] as ContentBlock[],
1287
+ },
1288
+ ];
1289
+
1290
+ onEvent({
1291
+ type: "message_complete",
1292
+ message: {
1293
+ role: "assistant",
1294
+ content: [
1295
+ { type: "text", text: "Let me check." },
1296
+ {
1297
+ type: "tool_use",
1298
+ id: "tu-1",
1299
+ name: "bash",
1300
+ input: { command: "ls" },
1301
+ },
1302
+ ],
1303
+ },
1304
+ });
1305
+ onEvent({
1306
+ type: "usage",
1307
+ inputTokens: 100,
1308
+ outputTokens: 50,
1309
+ model: "test-model",
1310
+ providerDurationMs: 100,
1311
+ });
1312
+
1313
+ // Call onCheckpoint — this should trigger the mid-loop budget check
1314
+ // which sees 170_000 > 161_500 and returns "yield"
1315
+ if (onCheckpoint) {
1316
+ const decision = onCheckpoint({
1317
+ turnIndex: 0,
1318
+ toolCount: 1,
1319
+ hasToolUse: true,
1320
+ history: withProgress,
1321
+ });
1322
+ if (decision === "yield") {
1323
+ // Agent loop stops when checkpoint yields
1324
+ return withProgress;
1325
+ }
1326
+ }
1327
+
1328
+ return withProgress;
1329
+ }
1330
+
1331
+ // Second call (after compaction): complete successfully
1332
+ onEvent({
1333
+ type: "message_complete",
1334
+ message: {
1335
+ role: "assistant",
1336
+ content: [{ type: "text", text: "done after compaction" }],
1337
+ },
1338
+ });
1339
+ onEvent({
1340
+ type: "usage",
1341
+ inputTokens: 50,
1342
+ outputTokens: 25,
1343
+ model: "test-model",
1344
+ providerDurationMs: 100,
1345
+ });
1346
+ return [
1347
+ ...messages,
1348
+ {
1349
+ role: "assistant" as const,
1350
+ content: [
1351
+ { type: "text", text: "done after compaction" },
1352
+ ] as ContentBlock[],
1353
+ },
1354
+ ];
1355
+ };
1356
+
1357
+ const ctx = makeCtx({
1358
+ agentLoopRun,
1359
+ contextWindowManager: {
1360
+ shouldCompact: () => ({ needed: false, estimatedTokens: 0 }),
1361
+ maybeCompact: async () => {
1362
+ compactionCalled = true;
1363
+ return {
1364
+ compacted: true,
1365
+ messages: [
1366
+ {
1367
+ role: "user" as const,
1368
+ content: [{ type: "text", text: "Hello" }],
1369
+ },
1370
+ ] as Message[],
1371
+ compactedPersistedMessages: 5,
1372
+ summaryText: "Mid-loop compaction summary",
1373
+ previousEstimatedInputTokens: 170_000,
1374
+ estimatedInputTokens: 80_000,
1375
+ maxInputTokens: 200_000,
1376
+ thresholdTokens: 160_000,
1377
+ compactedMessages: 10,
1378
+ summaryCalls: 1,
1379
+ summaryInputTokens: 500,
1380
+ summaryOutputTokens: 200,
1381
+ summaryModel: "mock-model",
1382
+ };
1383
+ },
1384
+ } as unknown as AgentLoopSessionContext["contextWindowManager"],
1385
+ });
1386
+
1387
+ await runAgentLoopImpl(ctx, "hello", "msg-1", (msg) => events.push(msg));
1388
+
1389
+ // The mid-loop budget check should have triggered compaction
1390
+ expect(compactionCalled).toBe(true);
1391
+
1392
+ // Agent loop should have been called twice: once before yield, once after compaction
1393
+ expect(agentLoopCallCount).toBe(2);
1394
+
1395
+ // No session_error should be emitted
1396
+ const sessionError = events.find((e) => e.type === "session_error");
1397
+ expect(sessionError).toBeUndefined();
1398
+
1399
+ // A context_compacted event should have been emitted
1400
+ const compacted = events.find((e) => e.type === "context_compacted");
1401
+ expect(compacted).toBeDefined();
1402
+ });
1403
+
1404
+ // ── Test 7 ────────────────────────────────────────────────────────
1405
+ // Tests that mid-loop budget check prevents context_too_large entirely.
1406
+ // Agent loop runs tool calls with growing history. After the estimate
1407
+ // exceeds the mid-loop threshold, the loop yields, compaction runs,
1408
+ // and the loop resumes. The provider NEVER rejects with context_too_large.
1409
+ test("mid-loop budget check prevents context_too_large when tools produce large results", async () => {
1410
+ const events: ServerMessage[] = [];
1411
+ let compactionCalled = false;
1412
+
1413
+ // Budget = 200_000 * 0.95 = 190_000
1414
+ // Mid-loop threshold = 190_000 * 0.85 = 161_500
1415
+ // Simulate token growth: preflight = 50k, then each checkpoint call
1416
+ // returns a growing estimate. By tool call 3, we exceed the threshold.
1417
+ let estimateCallCount = 0;
1418
+ mockEstimateTokens = () => {
1419
+ estimateCallCount++;
1420
+ // First call: preflight — well below budget
1421
+ if (estimateCallCount === 1) return 50_000;
1422
+ // Checkpoint calls grow with each tool round
1423
+ if (estimateCallCount === 2) return 100_000; // tool 1
1424
+ if (estimateCallCount === 3) return 140_000; // tool 2
1425
+ // Tool 3: exceeds 161_500 threshold
1426
+ return 175_000;
1427
+ };
1428
+
1429
+ let agentLoopCallCount = 0;
1430
+ let contextTooLargeEmitted = false;
1431
+
1432
+ const agentLoopRun: AgentLoopRun = async (
1433
+ messages,
1434
+ onEvent,
1435
+ _signal,
1436
+ _requestId,
1437
+ onCheckpoint,
1438
+ ) => {
1439
+ agentLoopCallCount++;
1440
+
1441
+ if (agentLoopCallCount === 1) {
1442
+ const currentHistory = [...messages];
1443
+
1444
+ // Simulate 5 tool rounds — but the checkpoint should yield at round 3
1445
+ for (let i = 0; i < 5; i++) {
1446
+ const toolId = `tu-${i}`;
1447
+ const assistantMsg: Message = {
1448
+ role: "assistant" as const,
1449
+ content: [
1450
+ { type: "text", text: `Step ${i}` },
1451
+ {
1452
+ type: "tool_use",
1453
+ id: toolId,
1454
+ name: "bash",
1455
+ input: { command: `cmd-${i}` },
1456
+ },
1457
+ ] as ContentBlock[],
1458
+ };
1459
+ const resultMsg: Message = {
1460
+ role: "user" as const,
1461
+ content: [
1462
+ {
1463
+ type: "tool_result",
1464
+ tool_use_id: toolId,
1465
+ content: "x".repeat(10_000),
1466
+ is_error: false,
1467
+ },
1468
+ ] as ContentBlock[],
1469
+ };
1470
+ currentHistory.push(assistantMsg, resultMsg);
1471
+
1472
+ onEvent({
1473
+ type: "message_complete",
1474
+ message: assistantMsg,
1475
+ });
1476
+ onEvent({
1477
+ type: "usage",
1478
+ inputTokens: 50_000 + i * 20_000,
1479
+ outputTokens: 50,
1480
+ model: "test-model",
1481
+ providerDurationMs: 100,
1482
+ });
1483
+
1484
+ if (onCheckpoint) {
1485
+ const decision = onCheckpoint({
1486
+ turnIndex: i,
1487
+ toolCount: 1,
1488
+ hasToolUse: true,
1489
+ history: currentHistory,
1490
+ });
1491
+ if (decision === "yield") {
1492
+ return currentHistory;
1493
+ }
1494
+ }
1495
+ }
1496
+
1497
+ return currentHistory;
1498
+ }
1499
+
1500
+ // Second call (after compaction): complete
1501
+ onEvent({
1502
+ type: "message_complete",
1503
+ message: {
1504
+ role: "assistant",
1505
+ content: [
1506
+ { type: "text", text: "completed after mid-loop compaction" },
1507
+ ],
1508
+ },
1509
+ });
1510
+ onEvent({
1511
+ type: "usage",
1512
+ inputTokens: 60_000,
1513
+ outputTokens: 100,
1514
+ model: "test-model",
1515
+ providerDurationMs: 200,
1516
+ });
1517
+ return [
1518
+ ...messages,
1519
+ {
1520
+ role: "assistant" as const,
1521
+ content: [
1522
+ { type: "text", text: "completed after mid-loop compaction" },
1523
+ ] as ContentBlock[],
1524
+ },
1525
+ ];
1526
+ };
1527
+
1528
+ const ctx = makeCtx({
1529
+ agentLoopRun,
1530
+ contextWindowManager: {
1531
+ shouldCompact: () => ({ needed: false, estimatedTokens: 0 }),
1532
+ maybeCompact: async () => {
1533
+ compactionCalled = true;
1534
+ return {
1535
+ compacted: true,
1536
+ messages: [
1537
+ {
1538
+ role: "user" as const,
1539
+ content: [{ type: "text", text: "Hello" }],
1540
+ },
1541
+ ] as Message[],
1542
+ compactedPersistedMessages: 8,
1543
+ summaryText: "Compacted large tool results",
1544
+ previousEstimatedInputTokens: 175_000,
1545
+ estimatedInputTokens: 60_000,
1546
+ maxInputTokens: 200_000,
1547
+ thresholdTokens: 160_000,
1548
+ compactedMessages: 15,
1549
+ summaryCalls: 1,
1550
+ summaryInputTokens: 800,
1551
+ summaryOutputTokens: 300,
1552
+ summaryModel: "mock-model",
1553
+ };
1554
+ },
1555
+ } as unknown as AgentLoopSessionContext["contextWindowManager"],
1556
+ });
1557
+
1558
+ await runAgentLoopImpl(ctx, "hello", "msg-1", (msg) => {
1559
+ events.push(msg);
1560
+ // Track if context_too_large was ever emitted
1561
+ if (
1562
+ msg.type === "session_error" &&
1563
+ "code" in msg &&
1564
+ msg.code === "SESSION_PROCESSING_FAILED"
1565
+ ) {
1566
+ contextTooLargeEmitted = true;
1567
+ }
1568
+ });
1569
+
1570
+ // Compaction should have been triggered by mid-loop budget check
1571
+ expect(compactionCalled).toBe(true);
1572
+
1573
+ // The provider should NEVER have rejected with context_too_large
1574
+ expect(contextTooLargeEmitted).toBe(false);
1575
+
1576
+ // Agent loop called twice: once (yielded at tool 3), once after compaction
1577
+ expect(agentLoopCallCount).toBe(2);
1578
+
1579
+ // No session_error
1580
+ const sessionError = events.find((e) => e.type === "session_error");
1581
+ expect(sessionError).toBeUndefined();
1582
+ });
1583
+ });