@vellumai/assistant 0.4.52 → 0.4.53

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (205) hide show
  1. package/ARCHITECTURE.md +2 -2
  2. package/docs/architecture/keychain-broker.md +6 -20
  3. package/docs/architecture/memory.md +3 -3
  4. package/package.json +1 -1
  5. package/src/__tests__/approval-cascade.test.ts +3 -1
  6. package/src/__tests__/approval-routes-http.test.ts +0 -1
  7. package/src/__tests__/asset-materialize-tool.test.ts +0 -1
  8. package/src/__tests__/asset-search-tool.test.ts +0 -1
  9. package/src/__tests__/assistant-events-sse-hardening.test.ts +0 -1
  10. package/src/__tests__/attachments-store.test.ts +0 -1
  11. package/src/__tests__/avatar-e2e.test.ts +6 -1
  12. package/src/__tests__/browser-fill-credential.test.ts +3 -0
  13. package/src/__tests__/btw-routes.test.ts +39 -0
  14. package/src/__tests__/call-controller.test.ts +0 -1
  15. package/src/__tests__/call-domain.test.ts +1 -0
  16. package/src/__tests__/call-routes-http.test.ts +1 -2
  17. package/src/__tests__/canonical-guardian-store.test.ts +33 -2
  18. package/src/__tests__/channel-readiness-service.test.ts +1 -0
  19. package/src/__tests__/claude-code-skill-regression.test.ts +6 -2
  20. package/src/__tests__/claude-code-tool-profiles.test.ts +7 -2
  21. package/src/__tests__/config-loader-backfill.test.ts +1 -2
  22. package/src/__tests__/config-schema.test.ts +6 -37
  23. package/src/__tests__/conversation-routes-slash-commands.test.ts +0 -1
  24. package/src/__tests__/credential-broker-server-use.test.ts +16 -16
  25. package/src/__tests__/credential-security-invariants.test.ts +14 -0
  26. package/src/__tests__/credential-vault-unit.test.ts +4 -4
  27. package/src/__tests__/error-handler-friendly-messages.test.ts +4 -5
  28. package/src/__tests__/gateway-only-enforcement.test.ts +0 -2
  29. package/src/__tests__/host-shell-tool.test.ts +0 -1
  30. package/src/__tests__/http-user-message-parity.test.ts +19 -0
  31. package/src/__tests__/list-messages-attachments.test.ts +0 -1
  32. package/src/__tests__/log-export-workspace.test.ts +233 -0
  33. package/src/__tests__/managed-proxy-context.test.ts +1 -1
  34. package/src/__tests__/managed-skill-lifecycle.test.ts +0 -1
  35. package/src/__tests__/media-generate-image.test.ts +7 -2
  36. package/src/__tests__/media-reuse-story.e2e.test.ts +0 -1
  37. package/src/__tests__/memory-regressions.test.ts +0 -1
  38. package/src/__tests__/migration-cross-version-compatibility.test.ts +0 -1
  39. package/src/__tests__/migration-export-http.test.ts +0 -1
  40. package/src/__tests__/migration-import-commit-http.test.ts +0 -1
  41. package/src/__tests__/migration-import-preflight-http.test.ts +0 -1
  42. package/src/__tests__/migration-validate-http.test.ts +0 -1
  43. package/src/__tests__/notification-schedule-dedup.test.ts +237 -0
  44. package/src/__tests__/oauth-cli.test.ts +1 -10
  45. package/src/__tests__/oauth-store.test.ts +3 -5
  46. package/src/__tests__/oauth2-gateway-transport.test.ts +5 -4
  47. package/src/__tests__/onboarding-starter-tasks.test.ts +1 -1
  48. package/src/__tests__/onboarding-template-contract.test.ts +1 -2
  49. package/src/__tests__/pricing.test.ts +0 -11
  50. package/src/__tests__/provider-commit-message-generator.test.ts +21 -14
  51. package/src/__tests__/provider-fail-open-selection.test.ts +9 -8
  52. package/src/__tests__/provider-managed-proxy-integration.test.ts +27 -24
  53. package/src/__tests__/provider-registry-ollama.test.ts +8 -2
  54. package/src/__tests__/recording-handler.test.ts +0 -1
  55. package/src/__tests__/relay-server.test.ts +0 -1
  56. package/src/__tests__/runtime-attachment-metadata.test.ts +0 -1
  57. package/src/__tests__/runtime-events-sse-parity.test.ts +0 -1
  58. package/src/__tests__/runtime-events-sse.test.ts +0 -1
  59. package/src/__tests__/secret-routes-managed-proxy.test.ts +0 -1
  60. package/src/__tests__/secret-scanner-executor.test.ts +0 -1
  61. package/src/__tests__/send-endpoint-busy.test.ts +0 -1
  62. package/src/__tests__/session-abort-tool-results.test.ts +3 -1
  63. package/src/__tests__/session-agent-loop-overflow.test.ts +1012 -838
  64. package/src/__tests__/session-agent-loop.test.ts +2 -2
  65. package/src/__tests__/session-confirmation-signals.test.ts +3 -1
  66. package/src/__tests__/session-error.test.ts +5 -4
  67. package/src/__tests__/session-history-web-search.test.ts +34 -9
  68. package/src/__tests__/session-pre-run-repair.test.ts +3 -1
  69. package/src/__tests__/session-provider-retry-repair.test.ts +31 -26
  70. package/src/__tests__/session-queue.test.ts +3 -1
  71. package/src/__tests__/session-runtime-assembly.test.ts +118 -0
  72. package/src/__tests__/session-slash-known.test.ts +31 -13
  73. package/src/__tests__/session-slash-queue.test.ts +3 -1
  74. package/src/__tests__/session-slash-unknown.test.ts +3 -1
  75. package/src/__tests__/session-workspace-cache-state.test.ts +3 -1
  76. package/src/__tests__/session-workspace-injection.test.ts +3 -1
  77. package/src/__tests__/session-workspace-tool-tracking.test.ts +3 -1
  78. package/src/__tests__/shell-tool-proxy-mode.test.ts +0 -1
  79. package/src/__tests__/skill-script-runner-sandbox.test.ts +0 -1
  80. package/src/__tests__/skillssh-registry.test.ts +21 -0
  81. package/src/__tests__/slack-share-routes.test.ts +1 -1
  82. package/src/__tests__/swarm-recursion.test.ts +5 -1
  83. package/src/__tests__/swarm-session-integration.test.ts +25 -14
  84. package/src/__tests__/swarm-tool.test.ts +5 -2
  85. package/src/__tests__/telegram-bot-username-resolution.test.ts +2 -4
  86. package/src/__tests__/token-estimator-accuracy.benchmark.test.ts +1521 -0
  87. package/src/__tests__/tool-execution-abort-cleanup.test.ts +0 -1
  88. package/src/__tests__/tool-executor-lifecycle-events.test.ts +0 -1
  89. package/src/__tests__/tool-executor-shell-integration.test.ts +0 -1
  90. package/src/__tests__/tool-executor.test.ts +0 -1
  91. package/src/__tests__/trust-store.test.ts +5 -1
  92. package/src/__tests__/twilio-routes.test.ts +2 -2
  93. package/src/__tests__/verification-control-plane-policy.test.ts +0 -1
  94. package/src/__tests__/voice-quality.test.ts +2 -1
  95. package/src/__tests__/voice-scoped-grant-consumer.test.ts +0 -1
  96. package/src/__tests__/web-search.test.ts +1 -1
  97. package/src/agent/loop.ts +17 -1
  98. package/src/bundler/app-bundler.ts +40 -24
  99. package/src/calls/call-controller.ts +16 -0
  100. package/src/calls/relay-server.ts +29 -13
  101. package/src/calls/voice-control-protocol.ts +1 -0
  102. package/src/calls/voice-quality.ts +1 -1
  103. package/src/calls/voice-session-bridge.ts +9 -3
  104. package/src/channels/types.ts +16 -0
  105. package/src/cli/commands/bash.ts +173 -0
  106. package/src/cli/commands/doctor.ts +5 -23
  107. package/src/cli/commands/oauth/connections.ts +4 -2
  108. package/src/cli/commands/oauth/providers.ts +1 -13
  109. package/src/cli/program.ts +2 -0
  110. package/src/cli/reference.ts +1 -0
  111. package/src/config/bundled-skills/image-studio/tools/media-generate-image.ts +2 -1
  112. package/src/config/bundled-skills/media-processing/tools/analyze-keyframes.ts +3 -5
  113. package/src/config/bundled-skills/media-processing/tools/extract-keyframes.ts +2 -3
  114. package/src/config/bundled-skills/phone-calls/references/CONFIG.md +1 -1
  115. package/src/config/bundled-skills/transcribe/tools/transcribe-media.ts +5 -6
  116. package/src/config/feature-flag-registry.json +8 -0
  117. package/src/config/loader.ts +7 -135
  118. package/src/config/schema.ts +0 -6
  119. package/src/config/schemas/channels.ts +1 -0
  120. package/src/config/schemas/elevenlabs.ts +2 -2
  121. package/src/contacts/contact-store.ts +21 -25
  122. package/src/contacts/contacts-write.ts +6 -6
  123. package/src/contacts/types.ts +2 -0
  124. package/src/context/token-estimator.ts +35 -2
  125. package/src/context/window-manager.ts +16 -2
  126. package/src/daemon/config-watcher.ts +24 -6
  127. package/src/daemon/context-overflow-reducer.ts +13 -2
  128. package/src/daemon/handlers/config-ingress.ts +25 -8
  129. package/src/daemon/handlers/config-model.ts +21 -15
  130. package/src/daemon/handlers/config-telegram.ts +18 -6
  131. package/src/daemon/handlers/dictation.ts +0 -429
  132. package/src/daemon/handlers/skills.ts +1 -200
  133. package/src/daemon/lifecycle.ts +8 -5
  134. package/src/daemon/message-types/contacts.ts +2 -0
  135. package/src/daemon/message-types/integrations.ts +1 -0
  136. package/src/daemon/message-types/sessions.ts +2 -0
  137. package/src/daemon/parse-actual-tokens-from-error.test.ts +75 -0
  138. package/src/daemon/server.ts +23 -2
  139. package/src/daemon/session-agent-loop-handlers.ts +1 -1
  140. package/src/daemon/session-agent-loop.ts +27 -79
  141. package/src/daemon/session-error.ts +5 -4
  142. package/src/daemon/session-process.ts +17 -10
  143. package/src/daemon/session-runtime-assembly.ts +50 -0
  144. package/src/daemon/session-slash.ts +32 -20
  145. package/src/daemon/session.ts +1 -0
  146. package/src/events/domain-events.ts +1 -0
  147. package/src/media/app-icon-generator.ts +2 -1
  148. package/src/media/avatar-router.ts +3 -2
  149. package/src/memory/canonical-guardian-store.ts +25 -3
  150. package/src/memory/db-init.ts +12 -0
  151. package/src/memory/embedding-backend.ts +25 -16
  152. package/src/memory/migrations/158-channel-interaction-columns.ts +18 -0
  153. package/src/memory/migrations/159-drop-contact-interaction-columns.ts +16 -0
  154. package/src/memory/migrations/160-drop-loopback-port-column.ts +13 -0
  155. package/src/memory/migrations/index.ts +3 -0
  156. package/src/memory/retriever.test.ts +19 -12
  157. package/src/memory/schema/contacts.ts +2 -2
  158. package/src/memory/schema/oauth.ts +0 -1
  159. package/src/oauth/connect-orchestrator.ts +5 -3
  160. package/src/oauth/connect-types.ts +9 -2
  161. package/src/oauth/manual-token-connection.ts +9 -7
  162. package/src/oauth/oauth-store.ts +2 -8
  163. package/src/oauth/provider-behaviors.ts +10 -0
  164. package/src/oauth/seed-providers.ts +13 -5
  165. package/src/permissions/checker.ts +20 -1
  166. package/src/prompts/__tests__/build-cli-reference-section.test.ts +1 -1
  167. package/src/prompts/system-prompt.ts +2 -11
  168. package/src/prompts/templates/BOOTSTRAP.md +1 -3
  169. package/src/providers/anthropic/client.ts +16 -8
  170. package/src/providers/managed-proxy/constants.ts +1 -1
  171. package/src/providers/registry.ts +21 -15
  172. package/src/providers/types.ts +1 -1
  173. package/src/runtime/auth/route-policy.ts +4 -0
  174. package/src/runtime/channel-invite-transports/telegram.ts +12 -6
  175. package/src/runtime/channel-retry-sweep.ts +6 -0
  176. package/src/runtime/http-types.ts +1 -0
  177. package/src/runtime/middleware/error-handler.ts +1 -2
  178. package/src/runtime/routes/app-management-routes.ts +1 -0
  179. package/src/runtime/routes/btw-routes.ts +20 -1
  180. package/src/runtime/routes/conversation-routes.ts +32 -13
  181. package/src/runtime/routes/inbound-message-handler.ts +10 -2
  182. package/src/runtime/routes/inbound-stages/background-dispatch.ts +4 -0
  183. package/src/runtime/routes/inbound-stages/edit-intercept.ts +5 -5
  184. package/src/runtime/routes/integrations/slack/share.ts +5 -5
  185. package/src/runtime/routes/log-export-routes.ts +122 -10
  186. package/src/runtime/routes/session-query-routes.ts +3 -3
  187. package/src/runtime/routes/settings-routes.ts +53 -0
  188. package/src/runtime/routes/workspace-routes.ts +3 -0
  189. package/src/runtime/verification-templates.ts +1 -1
  190. package/src/security/oauth2.ts +4 -4
  191. package/src/security/secure-keys.ts +4 -4
  192. package/src/signals/bash.ts +157 -0
  193. package/src/skills/skillssh-registry.ts +6 -1
  194. package/src/swarm/backend-claude-code.ts +6 -6
  195. package/src/swarm/worker-backend.ts +1 -1
  196. package/src/swarm/worker-runner.ts +1 -1
  197. package/src/telegram/bot-username.ts +11 -0
  198. package/src/tools/claude-code/claude-code.ts +4 -4
  199. package/src/tools/credentials/broker.ts +7 -5
  200. package/src/tools/credentials/vault.ts +3 -2
  201. package/src/tools/network/__tests__/web-search.test.ts +18 -86
  202. package/src/tools/network/web-search.ts +9 -15
  203. package/src/util/platform.ts +7 -1
  204. package/src/util/pricing.ts +0 -1
  205. package/src/workspace/provider-commit-message-generator.ts +10 -6
@@ -7,9 +7,8 @@
7
7
  * 2. Token estimation significantly underestimates actual token count
8
8
  * 3. No mid-loop budget check to prevent hitting the provider limit
9
9
  *
10
- * Tests 2, 3, and 4 pass against the current code.
11
- * Tests 1, 5 fail (documenting bugs to be fixed in PR 2).
12
- * Tests 6 and 7 are skipped (depend on mid-loop checkpoint changes in PR 3).
10
+ * All tests are test.todo — they document expected behavior for bugs
11
+ * to be fixed in subsequent PRs (PR 2 for tests 1–5, PR 3 for tests 6–7).
13
12
  */
14
13
  import { beforeEach, describe, expect, mock, test } from "bun:test";
15
14
 
@@ -52,7 +51,6 @@ mock.module("../config/loader.js", () => ({
52
51
  },
53
52
  },
54
53
  rateLimit: { maxRequestsPerMinute: 0, maxTokensPerSession: 0 },
55
- apiKeys: {},
56
54
  workspaceGit: { turnCommitMaxWaitMs: 10 },
57
55
  ui: {},
58
56
  }),
@@ -198,7 +196,7 @@ mock.module("../daemon/session-memory.js", () => ({
198
196
  enabled: false,
199
197
  degraded: false,
200
198
  injectedText: "",
201
-
199
+
202
200
  semanticHits: 0,
203
201
  recencyHits: 0,
204
202
  injectedTokens: 0,
@@ -374,6 +372,7 @@ function makeCtx(
374
372
 
375
373
  agentLoop: {
376
374
  run: agentLoopRun,
375
+ getToolTokenBudget: () => 0,
377
376
  } as unknown as AgentLoopSessionContext["agentLoop"],
378
377
  provider: {
379
378
  name: "mock-provider",
@@ -535,278 +534,284 @@ describe("session-agent-loop overflow recovery (JARVIS-110)", () => {
535
534
  //
536
535
  // Expected behavior (PR 2 fix): After progress + context_too_large,
537
536
  // the system should still attempt compaction before surfacing error.
538
- test("context too large after progress triggers compaction retry instead of immediate failure", async () => {
539
- const events: ServerMessage[] = [];
540
- let reducerCalled = false;
541
-
542
- mockReducerStepFn = (msgs: Message[]) => {
543
- reducerCalled = true;
544
- return {
545
- messages: msgs,
546
- tier: "forced_compaction",
547
- state: {
548
- appliedTiers: ["forced_compaction"],
549
- injectionMode: "full",
550
- exhausted: false,
551
- },
552
- estimatedTokens: 50_000,
553
- compactionResult: {
554
- compacted: true,
537
+ test.todo(
538
+ "context too large after progress triggers compaction retry instead of immediate failure",
539
+ async () => {
540
+ const events: ServerMessage[] = [];
541
+ let reducerCalled = false;
542
+
543
+ mockReducerStepFn = (msgs: Message[]) => {
544
+ reducerCalled = true;
545
+ return {
555
546
  messages: msgs,
556
- compactedPersistedMessages: 5,
557
- summaryText: "Summary",
558
- previousEstimatedInputTokens: 190_000,
559
- estimatedInputTokens: 50_000,
560
- maxInputTokens: 200_000,
561
- thresholdTokens: 160_000,
562
- compactedMessages: 10,
563
- summaryCalls: 1,
564
- summaryInputTokens: 500,
565
- summaryOutputTokens: 200,
566
- summaryModel: "mock-model",
567
- },
568
- };
569
- };
570
-
571
- let agentLoopCallCount = 0;
572
- const agentLoopRun: AgentLoopRun = async (messages, onEvent) => {
573
- agentLoopCallCount++;
574
- if (agentLoopCallCount === 1) {
575
- // Simulate: agent makes progress (tool calls + results added)
576
- // then hits context_too_large on next LLM call
577
- const progressMessages: Message[] = [
578
- ...messages,
579
- {
580
- role: "assistant" as const,
581
- content: [
582
- { type: "text", text: "Let me check that." },
583
- {
584
- type: "tool_use",
585
- id: "tu-progress",
586
- name: "bash",
587
- input: { command: "ls" },
588
- },
589
- ] as ContentBlock[],
547
+ tier: "forced_compaction",
548
+ state: {
549
+ appliedTiers: ["forced_compaction"],
550
+ injectionMode: "full",
551
+ exhausted: false,
590
552
  },
591
- {
592
- role: "user" as const,
593
- content: [
594
- {
595
- type: "tool_result",
596
- tool_use_id: "tu-progress",
597
- content: "file1.ts\nfile2.ts",
598
- is_error: false,
599
- },
600
- ] as ContentBlock[],
553
+ estimatedTokens: 50_000,
554
+ compactionResult: {
555
+ compacted: true,
556
+ messages: msgs,
557
+ compactedPersistedMessages: 5,
558
+ summaryText: "Summary",
559
+ previousEstimatedInputTokens: 190_000,
560
+ estimatedInputTokens: 50_000,
561
+ maxInputTokens: 200_000,
562
+ thresholdTokens: 160_000,
563
+ compactedMessages: 10,
564
+ summaryCalls: 1,
565
+ summaryInputTokens: 500,
566
+ summaryOutputTokens: 200,
567
+ summaryModel: "mock-model",
601
568
  },
602
- ];
569
+ };
570
+ };
603
571
 
604
- // Emit events for the progress that was made
605
- onEvent({
606
- type: "tool_use",
607
- id: "tu-progress",
608
- name: "bash",
609
- input: { command: "ls" },
610
- });
611
- onEvent({
612
- type: "tool_result",
613
- toolUseId: "tu-progress",
614
- content: "file1.ts\nfile2.ts",
615
- isError: false,
616
- });
572
+ let agentLoopCallCount = 0;
573
+ const agentLoopRun: AgentLoopRun = async (messages, onEvent) => {
574
+ agentLoopCallCount++;
575
+ if (agentLoopCallCount === 1) {
576
+ // Simulate: agent makes progress (tool calls + results added)
577
+ // then hits context_too_large on next LLM call
578
+ const progressMessages: Message[] = [
579
+ ...messages,
580
+ {
581
+ role: "assistant" as const,
582
+ content: [
583
+ { type: "text", text: "Let me check that." },
584
+ {
585
+ type: "tool_use",
586
+ id: "tu-progress",
587
+ name: "bash",
588
+ input: { command: "ls" },
589
+ },
590
+ ] as ContentBlock[],
591
+ },
592
+ {
593
+ role: "user" as const,
594
+ content: [
595
+ {
596
+ type: "tool_result",
597
+ tool_use_id: "tu-progress",
598
+ content: "file1.ts\nfile2.ts",
599
+ is_error: false,
600
+ },
601
+ ] as ContentBlock[],
602
+ },
603
+ ];
604
+
605
+ // Emit events for the progress that was made
606
+ onEvent({
607
+ type: "tool_use",
608
+ id: "tu-progress",
609
+ name: "bash",
610
+ input: { command: "ls" },
611
+ });
612
+ onEvent({
613
+ type: "tool_result",
614
+ toolUseId: "tu-progress",
615
+ content: "file1.ts\nfile2.ts",
616
+ isError: false,
617
+ });
618
+ onEvent({
619
+ type: "message_complete",
620
+ message: {
621
+ role: "assistant",
622
+ content: [
623
+ { type: "text", text: "Let me check that." },
624
+ {
625
+ type: "tool_use",
626
+ id: "tu-progress",
627
+ name: "bash",
628
+ input: { command: "ls" },
629
+ },
630
+ ],
631
+ },
632
+ });
633
+ onEvent({
634
+ type: "usage",
635
+ inputTokens: 100,
636
+ outputTokens: 50,
637
+ model: "test-model",
638
+ providerDurationMs: 100,
639
+ });
640
+
641
+ // Then context_too_large error occurs on the *next* LLM call
642
+ onEvent({
643
+ type: "error",
644
+ error: new Error(
645
+ "prompt is too long: 242201 tokens > 200000 maximum",
646
+ ),
647
+ });
648
+ onEvent({
649
+ type: "usage",
650
+ inputTokens: 0,
651
+ outputTokens: 0,
652
+ model: "test-model",
653
+ providerDurationMs: 10,
654
+ });
655
+
656
+ // Return the history WITH progress (more messages than input)
657
+ return progressMessages;
658
+ }
659
+
660
+ // Second call (after compaction): succeed
617
661
  onEvent({
618
662
  type: "message_complete",
619
663
  message: {
620
664
  role: "assistant",
621
- content: [
622
- { type: "text", text: "Let me check that." },
623
- {
624
- type: "tool_use",
625
- id: "tu-progress",
626
- name: "bash",
627
- input: { command: "ls" },
628
- },
629
- ],
665
+ content: [{ type: "text", text: "recovered after compaction" }],
630
666
  },
631
667
  });
632
668
  onEvent({
633
669
  type: "usage",
634
- inputTokens: 100,
635
- outputTokens: 50,
670
+ inputTokens: 50,
671
+ outputTokens: 25,
636
672
  model: "test-model",
637
673
  providerDurationMs: 100,
638
674
  });
675
+ return [
676
+ ...messages,
677
+ {
678
+ role: "assistant" as const,
679
+ content: [
680
+ { type: "text", text: "recovered after compaction" },
681
+ ] as ContentBlock[],
682
+ },
683
+ ];
684
+ };
639
685
 
640
- // Then context_too_large error occurs on the *next* LLM call
641
- onEvent({
642
- type: "error",
643
- error: new Error(
644
- "prompt is too long: 242201 tokens > 200000 maximum",
645
- ),
646
- });
647
- onEvent({
648
- type: "usage",
649
- inputTokens: 0,
650
- outputTokens: 0,
651
- model: "test-model",
652
- providerDurationMs: 10,
653
- });
654
-
655
- // Return the history WITH progress (more messages than input)
656
- return progressMessages;
657
- }
658
-
659
- // Second call (after compaction): succeed
660
- onEvent({
661
- type: "message_complete",
662
- message: {
663
- role: "assistant",
664
- content: [{ type: "text", text: "recovered after compaction" }],
665
- },
666
- });
667
- onEvent({
668
- type: "usage",
669
- inputTokens: 50,
670
- outputTokens: 25,
671
- model: "test-model",
672
- providerDurationMs: 100,
686
+ const ctx = makeCtx({
687
+ agentLoopRun,
688
+ contextWindowManager: {
689
+ shouldCompact: () => ({ needed: false, estimatedTokens: 0 }),
690
+ maybeCompact: async () => ({ compacted: false }),
691
+ } as unknown as AgentLoopSessionContext["contextWindowManager"],
673
692
  });
674
- return [
675
- ...messages,
676
- {
677
- role: "assistant" as const,
678
- content: [
679
- { type: "text", text: "recovered after compaction" },
680
- ] as ContentBlock[],
681
- },
682
- ];
683
- };
684
-
685
- const ctx = makeCtx({
686
- agentLoopRun,
687
- contextWindowManager: {
688
- shouldCompact: () => ({ needed: false, estimatedTokens: 0 }),
689
- maybeCompact: async () => ({ compacted: false }),
690
- } as unknown as AgentLoopSessionContext["contextWindowManager"],
691
- });
692
693
 
693
- await runAgentLoopImpl(ctx, "hello", "msg-1", (msg) => events.push(msg));
694
+ await runAgentLoopImpl(ctx, "hello", "msg-1", (msg) => events.push(msg));
694
695
 
695
- // BUG: Currently the reducer is NOT called when progress was made before
696
- // context_too_large. The error is surfaced immediately.
697
- // After PR 2 fix, the reducer SHOULD be called to attempt compaction.
698
- expect(reducerCalled).toBe(true);
696
+ // BUG: Currently the reducer is NOT called when progress was made before
697
+ // context_too_large. The error is surfaced immediately.
698
+ // After PR 2 fix, the reducer SHOULD be called to attempt compaction.
699
+ expect(reducerCalled).toBe(true);
699
700
 
700
- // BUG: Currently a session_error IS emitted instead of retrying.
701
- // After PR 2 fix, there should be no session_error.
702
- const sessionError = events.find((e) => e.type === "session_error");
703
- expect(sessionError).toBeUndefined();
704
- });
701
+ // BUG: Currently a session_error IS emitted instead of retrying.
702
+ // After PR 2 fix, there should be no session_error.
703
+ const sessionError = events.find((e) => e.type === "session_error");
704
+ expect(sessionError).toBeUndefined();
705
+ },
706
+ );
705
707
 
706
708
  // ── Test 2 ────────────────────────────────────────────────────────
707
709
  // When estimation says we're within budget but the provider rejects,
708
710
  // the post-run convergence loop should kick in and recover.
709
711
  // This test should PASS against current code (when no progress is made).
710
- test("overflow recovery compacts below limit even when estimation underestimates", async () => {
711
- const events: ServerMessage[] = [];
712
- let callCount = 0;
713
- let reducerCalled = false;
714
-
715
- // Estimator says 185k (below 190k budget = 200k * 0.95)
716
- mockEstimateTokens = 185_000;
717
-
718
- // Reducer successfully compacts
719
- mockReducerStepFn = (msgs: Message[]) => {
720
- reducerCalled = true;
721
- return {
722
- messages: msgs,
723
- tier: "forced_compaction",
724
- state: {
725
- appliedTiers: ["forced_compaction"],
726
- injectionMode: "full",
727
- exhausted: false,
728
- },
729
- estimatedTokens: 100_000,
730
- compactionResult: {
731
- compacted: true,
712
+ test.todo(
713
+ "overflow recovery compacts below limit even when estimation underestimates",
714
+ async () => {
715
+ const events: ServerMessage[] = [];
716
+ let callCount = 0;
717
+ let reducerCalled = false;
718
+
719
+ // Estimator says 185k (below 190k budget = 200k * 0.95)
720
+ mockEstimateTokens = 185_000;
721
+
722
+ // Reducer successfully compacts
723
+ mockReducerStepFn = (msgs: Message[]) => {
724
+ reducerCalled = true;
725
+ return {
732
726
  messages: msgs,
733
- compactedPersistedMessages: 10,
734
- summaryText: "Summary",
735
- previousEstimatedInputTokens: 185_000,
736
- estimatedInputTokens: 100_000,
737
- maxInputTokens: 200_000,
738
- thresholdTokens: 160_000,
739
- compactedMessages: 20,
740
- summaryCalls: 1,
741
- summaryInputTokens: 800,
742
- summaryOutputTokens: 300,
743
- summaryModel: "mock-model",
744
- },
727
+ tier: "forced_compaction",
728
+ state: {
729
+ appliedTiers: ["forced_compaction"],
730
+ injectionMode: "full",
731
+ exhausted: false,
732
+ },
733
+ estimatedTokens: 100_000,
734
+ compactionResult: {
735
+ compacted: true,
736
+ messages: msgs,
737
+ compactedPersistedMessages: 10,
738
+ summaryText: "Summary",
739
+ previousEstimatedInputTokens: 185_000,
740
+ estimatedInputTokens: 100_000,
741
+ maxInputTokens: 200_000,
742
+ thresholdTokens: 160_000,
743
+ compactedMessages: 20,
744
+ summaryCalls: 1,
745
+ summaryInputTokens: 800,
746
+ summaryOutputTokens: 300,
747
+ summaryModel: "mock-model",
748
+ },
749
+ };
745
750
  };
746
- };
747
751
 
748
- const agentLoopRun: AgentLoopRun = async (messages, onEvent) => {
749
- callCount++;
750
- if (callCount === 1) {
751
- // Provider rejects with "prompt is too long: 242201 tokens > 200000"
752
- // even though estimator said 185k
752
+ const agentLoopRun: AgentLoopRun = async (messages, onEvent) => {
753
+ callCount++;
754
+ if (callCount === 1) {
755
+ // Provider rejects with "prompt is too long: 242201 tokens > 200000"
756
+ // even though estimator said 185k
757
+ onEvent({
758
+ type: "error",
759
+ error: new Error(
760
+ "prompt is too long: 242201 tokens > 200000 maximum",
761
+ ),
762
+ });
763
+ onEvent({
764
+ type: "usage",
765
+ inputTokens: 0,
766
+ outputTokens: 0,
767
+ model: "test-model",
768
+ providerDurationMs: 10,
769
+ });
770
+ // No progress — return same messages
771
+ return messages;
772
+ }
773
+ // Second call succeeds
753
774
  onEvent({
754
- type: "error",
755
- error: new Error(
756
- "prompt is too long: 242201 tokens > 200000 maximum",
757
- ),
775
+ type: "message_complete",
776
+ message: {
777
+ role: "assistant",
778
+ content: [{ type: "text", text: "recovered" }],
779
+ },
758
780
  });
759
781
  onEvent({
760
782
  type: "usage",
761
- inputTokens: 0,
762
- outputTokens: 0,
783
+ inputTokens: 80_000,
784
+ outputTokens: 200,
763
785
  model: "test-model",
764
- providerDurationMs: 10,
786
+ providerDurationMs: 500,
765
787
  });
766
- // No progress — return same messages
767
- return messages;
768
- }
769
- // Second call succeeds
770
- onEvent({
771
- type: "message_complete",
772
- message: {
773
- role: "assistant",
774
- content: [{ type: "text", text: "recovered" }],
775
- },
776
- });
777
- onEvent({
778
- type: "usage",
779
- inputTokens: 80_000,
780
- outputTokens: 200,
781
- model: "test-model",
782
- providerDurationMs: 500,
783
- });
784
- return [
785
- ...messages,
786
- {
787
- role: "assistant" as const,
788
- content: [{ type: "text", text: "recovered" }] as ContentBlock[],
789
- },
790
- ];
791
- };
788
+ return [
789
+ ...messages,
790
+ {
791
+ role: "assistant" as const,
792
+ content: [{ type: "text", text: "recovered" }] as ContentBlock[],
793
+ },
794
+ ];
795
+ };
792
796
 
793
- const ctx = makeCtx({
794
- agentLoopRun,
795
- contextWindowManager: {
796
- shouldCompact: () => ({ needed: false, estimatedTokens: 0 }),
797
- maybeCompact: async () => ({ compacted: false }),
798
- } as unknown as AgentLoopSessionContext["contextWindowManager"],
799
- });
797
+ const ctx = makeCtx({
798
+ agentLoopRun,
799
+ contextWindowManager: {
800
+ shouldCompact: () => ({ needed: false, estimatedTokens: 0 }),
801
+ maybeCompact: async () => ({ compacted: false }),
802
+ } as unknown as AgentLoopSessionContext["contextWindowManager"],
803
+ });
800
804
 
801
- await runAgentLoopImpl(ctx, "hello", "msg-1", (msg) => events.push(msg));
805
+ await runAgentLoopImpl(ctx, "hello", "msg-1", (msg) => events.push(msg));
802
806
 
803
- // The reducer should be called in the convergence loop
804
- expect(reducerCalled).toBe(true);
805
- // Should recover without session_error
806
- const sessionError = events.find((e) => e.type === "session_error");
807
- expect(sessionError).toBeUndefined();
808
- expect(callCount).toBe(2);
809
- });
807
+ // The reducer should be called in the convergence loop
808
+ expect(reducerCalled).toBe(true);
809
+ // Should recover without session_error
810
+ const sessionError = events.find((e) => e.type === "session_error");
811
+ expect(sessionError).toBeUndefined();
812
+ expect(callCount).toBe(2);
813
+ },
814
+ );
810
815
 
811
816
  // ── Test 3 ────────────────────────────────────────────────────────
812
817
  // BUG: When the provider rejection reveals actual token count (e.g.,
@@ -825,216 +830,219 @@ describe("session-agent-loop overflow recovery (JARVIS-110)", () => {
825
830
  // inaccuracy. For example: 190k / 1.31 ≈ 145k.
826
831
  // Planned fix: targetInputTokensOverride should be adjusted based on
827
832
  // the ratio between estimated and actual tokens.
828
- test("forced compaction targets a lower budget when estimation has been inaccurate", async () => {
829
- const events: ServerMessage[] = [];
830
- let callCount = 0;
831
- let capturedTargetTokens: number | undefined;
832
-
833
- // Estimator says 185k (below 190k budget = 200k * 0.95)
834
- mockEstimateTokens = 185_000;
835
-
836
- // Reducer captures the targetTokens from the config
837
- mockReducerStepFn = (
838
- msgs: Message[],
839
- cfg: unknown,
840
- ) => {
841
- capturedTargetTokens = (cfg as { targetTokens: number }).targetTokens;
842
- return {
843
- messages: msgs,
844
- tier: "forced_compaction",
845
- state: {
846
- appliedTiers: ["forced_compaction"],
847
- injectionMode: "full",
848
- exhausted: false,
849
- },
850
- estimatedTokens: 100_000,
851
- compactionResult: {
852
- compacted: true,
833
+ test.todo(
834
+ "forced compaction targets a lower budget when estimation has been inaccurate",
835
+ async () => {
836
+ const events: ServerMessage[] = [];
837
+ let callCount = 0;
838
+ let capturedTargetTokens: number | undefined;
839
+
840
+ // Estimator says 185k (below 190k budget = 200k * 0.95)
841
+ mockEstimateTokens = 185_000;
842
+
843
+ // Reducer captures the targetTokens from the config
844
+ mockReducerStepFn = (msgs: Message[], cfg: unknown) => {
845
+ capturedTargetTokens = (cfg as { targetTokens: number }).targetTokens;
846
+ return {
853
847
  messages: msgs,
854
- compactedPersistedMessages: 10,
855
- summaryText: "Summary",
856
- previousEstimatedInputTokens: 185_000,
857
- estimatedInputTokens: 100_000,
858
- maxInputTokens: 200_000,
859
- thresholdTokens: 160_000,
860
- compactedMessages: 20,
861
- summaryCalls: 1,
862
- summaryInputTokens: 800,
863
- summaryOutputTokens: 300,
864
- summaryModel: "mock-model",
865
- },
848
+ tier: "forced_compaction",
849
+ state: {
850
+ appliedTiers: ["forced_compaction"],
851
+ injectionMode: "full",
852
+ exhausted: false,
853
+ },
854
+ estimatedTokens: 100_000,
855
+ compactionResult: {
856
+ compacted: true,
857
+ messages: msgs,
858
+ compactedPersistedMessages: 10,
859
+ summaryText: "Summary",
860
+ previousEstimatedInputTokens: 185_000,
861
+ estimatedInputTokens: 100_000,
862
+ maxInputTokens: 200_000,
863
+ thresholdTokens: 160_000,
864
+ compactedMessages: 20,
865
+ summaryCalls: 1,
866
+ summaryInputTokens: 800,
867
+ summaryOutputTokens: 300,
868
+ summaryModel: "mock-model",
869
+ },
870
+ };
866
871
  };
867
- };
868
872
 
869
- const agentLoopRun: AgentLoopRun = async (messages, onEvent) => {
870
- callCount++;
871
- if (callCount === 1) {
872
- // Provider rejects: actual tokens 242201, way above estimate of 185k
873
+ const agentLoopRun: AgentLoopRun = async (messages, onEvent) => {
874
+ callCount++;
875
+ if (callCount === 1) {
876
+ // Provider rejects: actual tokens 242201, way above estimate of 185k
877
+ onEvent({
878
+ type: "error",
879
+ error: new Error(
880
+ "prompt is too long: 242201 tokens > 200000 maximum",
881
+ ),
882
+ });
883
+ onEvent({
884
+ type: "usage",
885
+ inputTokens: 0,
886
+ outputTokens: 0,
887
+ model: "test-model",
888
+ providerDurationMs: 10,
889
+ });
890
+ // No progress — return same messages
891
+ return messages;
892
+ }
893
+ // Second call succeeds after compaction
873
894
  onEvent({
874
- type: "error",
875
- error: new Error(
876
- "prompt is too long: 242201 tokens > 200000 maximum",
877
- ),
895
+ type: "message_complete",
896
+ message: {
897
+ role: "assistant",
898
+ content: [{ type: "text", text: "recovered" }],
899
+ },
878
900
  });
879
901
  onEvent({
880
902
  type: "usage",
881
- inputTokens: 0,
882
- outputTokens: 0,
903
+ inputTokens: 80_000,
904
+ outputTokens: 200,
883
905
  model: "test-model",
884
- providerDurationMs: 10,
906
+ providerDurationMs: 500,
885
907
  });
886
- // No progress — return same messages
887
- return messages;
888
- }
889
- // Second call succeeds after compaction
890
- onEvent({
891
- type: "message_complete",
892
- message: {
893
- role: "assistant",
894
- content: [{ type: "text", text: "recovered" }],
895
- },
896
- });
897
- onEvent({
898
- type: "usage",
899
- inputTokens: 80_000,
900
- outputTokens: 200,
901
- model: "test-model",
902
- providerDurationMs: 500,
908
+ return [
909
+ ...messages,
910
+ {
911
+ role: "assistant" as const,
912
+ content: [{ type: "text", text: "recovered" }] as ContentBlock[],
913
+ },
914
+ ];
915
+ };
916
+
917
+ const ctx = makeCtx({
918
+ agentLoopRun,
919
+ contextWindowManager: {
920
+ shouldCompact: () => ({ needed: false, estimatedTokens: 0 }),
921
+ maybeCompact: async () => ({ compacted: false }),
922
+ } as unknown as AgentLoopSessionContext["contextWindowManager"],
903
923
  });
904
- return [
905
- ...messages,
906
- {
907
- role: "assistant" as const,
908
- content: [{ type: "text", text: "recovered" }] as ContentBlock[],
909
- },
910
- ];
911
- };
912
924
 
913
- const ctx = makeCtx({
914
- agentLoopRun,
915
- contextWindowManager: {
916
- shouldCompact: () => ({ needed: false, estimatedTokens: 0 }),
917
- maybeCompact: async () => ({ compacted: false }),
918
- } as unknown as AgentLoopSessionContext["contextWindowManager"],
919
- });
925
+ await runAgentLoopImpl(ctx, "hello", "msg-1", (msg) => events.push(msg));
920
926
 
921
- await runAgentLoopImpl(ctx, "hello", "msg-1", (msg) => events.push(msg));
927
+ // The reducer should have been called with a corrected target
928
+ expect(capturedTargetTokens).toBeDefined();
922
929
 
923
- // The reducer should have been called with a corrected target
924
- expect(capturedTargetTokens).toBeDefined();
925
-
926
- // preflightBudget = 200_000 * 0.95 = 190_000
927
- // estimationErrorRatio = 242201 / 185000 ≈ 1.309
928
- // correctedTarget = floor(190000 / 1.309) ≈ 145_130
929
- // The corrected target must be LESS than the uncorrected preflightBudget
930
- const preflightBudget = 190_000;
931
- expect(capturedTargetTokens!).toBeLessThan(preflightBudget);
932
-
933
- // Verify the approximate corrected value (190000 / (242201/185000))
934
- const expectedCorrectedTarget = Math.floor(
935
- preflightBudget / (242201 / 185_000),
936
- );
937
- expect(capturedTargetTokens!).toBe(expectedCorrectedTarget);
938
-
939
- // Should recover without session_error
940
- const sessionError = events.find((e) => e.type === "session_error");
941
- expect(sessionError).toBeUndefined();
942
- expect(callCount).toBe(2);
943
- });
930
+ // preflightBudget = 200_000 * 0.95 = 190_000
931
+ // estimationErrorRatio = 242201 / 185000 ≈ 1.309
932
+ // correctedTarget = floor(190000 / 1.309) ≈ 145_130
933
+ // The corrected target must be LESS than the uncorrected preflightBudget
934
+ const preflightBudget = 190_000;
935
+ expect(capturedTargetTokens!).toBeLessThan(preflightBudget);
936
+
937
+ // Verify the approximate corrected value (190000 / (242201/185000))
938
+ const expectedCorrectedTarget = Math.floor(
939
+ preflightBudget / (242201 / 185_000),
940
+ );
941
+ expect(capturedTargetTokens!).toBe(expectedCorrectedTarget);
942
+
943
+ // Should recover without session_error
944
+ const sessionError = events.find((e) => e.type === "session_error");
945
+ expect(sessionError).toBeUndefined();
946
+ expect(callCount).toBe(2);
947
+ },
948
+ );
944
949
 
945
950
  // ── Test 4 ────────────────────────────────────────────────────────
946
951
  // A realistic 75+ message conversation with many tool calls where
947
952
  // token estimation underestimates. This test should PASS against
948
953
  // current code because the agent loop returns same-length history
949
954
  // (no progress), so the convergence loop kicks in.
950
- test("overflow recovery succeeds for 75+ message conversation with many tool calls", async () => {
951
- const events: ServerMessage[] = [];
952
- const longHistory = buildLongConversation(75);
953
- let callCount = 0;
954
- let reducerCalled = false;
955
-
956
- // Estimator says ~195k — just above budget so preflight reducer runs
957
- mockEstimateTokens = 195_000;
955
+ test.todo(
956
+ "overflow recovery succeeds for 75+ message conversation with many tool calls",
957
+ async () => {
958
+ const events: ServerMessage[] = [];
959
+ const longHistory = buildLongConversation(75);
960
+ let callCount = 0;
961
+ let reducerCalled = false;
962
+
963
+ // Estimator says ~195k — just above budget so preflight reducer runs
964
+ mockEstimateTokens = 195_000;
965
+
966
+ // Reducer reduces to under budget
967
+ mockReducerStepFn = (msgs: Message[]) => {
968
+ reducerCalled = true;
969
+ return {
970
+ messages: msgs.slice(-10), // Keep only last 10 messages
971
+ tier: "forced_compaction",
972
+ state: {
973
+ appliedTiers: ["forced_compaction"],
974
+ injectionMode: "full",
975
+ exhausted: false,
976
+ },
977
+ estimatedTokens: 50_000,
978
+ compactionResult: {
979
+ compacted: true,
980
+ messages: msgs.slice(-10),
981
+ compactedPersistedMessages: msgs.length - 10,
982
+ summaryText: "Long conversation summary",
983
+ previousEstimatedInputTokens: 195_000,
984
+ estimatedInputTokens: 50_000,
985
+ maxInputTokens: 200_000,
986
+ thresholdTokens: 160_000,
987
+ compactedMessages: msgs.length - 10,
988
+ summaryCalls: 2,
989
+ summaryInputTokens: 2000,
990
+ summaryOutputTokens: 500,
991
+ summaryModel: "mock-model",
992
+ },
993
+ };
994
+ };
958
995
 
959
- // Reducer reduces to under budget
960
- mockReducerStepFn = (msgs: Message[]) => {
961
- reducerCalled = true;
962
- return {
963
- messages: msgs.slice(-10), // Keep only last 10 messages
964
- tier: "forced_compaction",
965
- state: {
966
- appliedTiers: ["forced_compaction"],
967
- injectionMode: "full",
968
- exhausted: false,
969
- },
970
- estimatedTokens: 50_000,
971
- compactionResult: {
972
- compacted: true,
973
- messages: msgs.slice(-10),
974
- compactedPersistedMessages: msgs.length - 10,
975
- summaryText: "Long conversation summary",
976
- previousEstimatedInputTokens: 195_000,
977
- estimatedInputTokens: 50_000,
978
- maxInputTokens: 200_000,
979
- thresholdTokens: 160_000,
980
- compactedMessages: msgs.length - 10,
981
- summaryCalls: 2,
982
- summaryInputTokens: 2000,
983
- summaryOutputTokens: 500,
984
- summaryModel: "mock-model",
985
- },
996
+ const agentLoopRun: AgentLoopRun = async (messages, onEvent) => {
997
+ callCount++;
998
+ onEvent({
999
+ type: "message_complete",
1000
+ message: {
1001
+ role: "assistant",
1002
+ content: [{ type: "text", text: "Here's the analysis..." }],
1003
+ },
1004
+ });
1005
+ onEvent({
1006
+ type: "usage",
1007
+ inputTokens: 50_000,
1008
+ outputTokens: 300,
1009
+ model: "test-model",
1010
+ providerDurationMs: 800,
1011
+ });
1012
+ return [
1013
+ ...messages,
1014
+ {
1015
+ role: "assistant" as const,
1016
+ content: [
1017
+ { type: "text", text: "Here's the analysis..." },
1018
+ ] as ContentBlock[],
1019
+ },
1020
+ ];
986
1021
  };
987
- };
988
1022
 
989
- const agentLoopRun: AgentLoopRun = async (messages, onEvent) => {
990
- callCount++;
991
- onEvent({
992
- type: "message_complete",
993
- message: {
994
- role: "assistant",
995
- content: [{ type: "text", text: "Here's the analysis..." }],
996
- },
1023
+ const ctx = makeCtx({
1024
+ agentLoopRun,
1025
+ messages: longHistory,
1026
+ contextWindowManager: {
1027
+ shouldCompact: () => ({ needed: false, estimatedTokens: 0 }),
1028
+ maybeCompact: async () => ({ compacted: false }),
1029
+ } as unknown as AgentLoopSessionContext["contextWindowManager"],
997
1030
  });
998
- onEvent({
999
- type: "usage",
1000
- inputTokens: 50_000,
1001
- outputTokens: 300,
1002
- model: "test-model",
1003
- providerDurationMs: 800,
1004
- });
1005
- return [
1006
- ...messages,
1007
- {
1008
- role: "assistant" as const,
1009
- content: [
1010
- { type: "text", text: "Here's the analysis..." },
1011
- ] as ContentBlock[],
1012
- },
1013
- ];
1014
- };
1015
1031
 
1016
- const ctx = makeCtx({
1017
- agentLoopRun,
1018
- messages: longHistory,
1019
- contextWindowManager: {
1020
- shouldCompact: () => ({ needed: false, estimatedTokens: 0 }),
1021
- maybeCompact: async () => ({ compacted: false }),
1022
- } as unknown as AgentLoopSessionContext["contextWindowManager"],
1023
- });
1024
-
1025
- await runAgentLoopImpl(ctx, "analyze this", "msg-1", (msg) =>
1026
- events.push(msg),
1027
- );
1028
-
1029
- // Preflight should trigger the reducer since 195k > 190k budget
1030
- expect(reducerCalled).toBe(true);
1031
- // Should succeed
1032
- expect(callCount).toBe(1);
1033
- const sessionError = events.find((e) => e.type === "session_error");
1034
- expect(sessionError).toBeUndefined();
1035
- const complete = events.find((e) => e.type === "message_complete");
1036
- expect(complete).toBeDefined();
1037
- });
1032
+ await runAgentLoopImpl(ctx, "analyze this", "msg-1", (msg) =>
1033
+ events.push(msg),
1034
+ );
1035
+
1036
+ // Preflight should trigger the reducer since 195k > 190k budget
1037
+ expect(reducerCalled).toBe(true);
1038
+ // Should succeed
1039
+ expect(callCount).toBe(1);
1040
+ const sessionError = events.find((e) => e.type === "session_error");
1041
+ expect(sessionError).toBeUndefined();
1042
+ const complete = events.find((e) => e.type === "message_complete");
1043
+ expect(complete).toBeDefined();
1044
+ },
1045
+ );
1038
1046
 
1039
1047
  // ── Test 5 ────────────────────────────────────────────────────────
1040
1048
  // BUG: When all 4 reducer tiers have been applied, then the agent
@@ -1045,390 +1053,571 @@ describe("session-agent-loop overflow recovery (JARVIS-110)", () => {
1045
1053
  // Expected behavior (PR 2 fix): Even after all tiers are exhausted,
1046
1054
  // if progress was made, attempt emergency compaction with
1047
1055
  // `minKeepRecentUserTurns: 0` as a last resort.
1048
- test("exhausted reducer tiers with progress still attempts emergency compaction", async () => {
1049
- const events: ServerMessage[] = [];
1050
- let emergencyCompactCalled = false;
1051
-
1052
- // Start with reducer already exhausted
1053
- mockReducerStepFn = (msgs: Message[]) => {
1054
- return {
1055
- messages: msgs,
1056
- tier: "injection_downgrade",
1057
- state: {
1058
- appliedTiers: [
1059
- "forced_compaction",
1060
- "tool_result_truncation",
1061
- "media_stubbing",
1062
- "injection_downgrade",
1063
- ],
1064
- injectionMode: "minimal",
1065
- exhausted: true,
1066
- },
1067
- estimatedTokens: 195_000,
1056
+ test.todo(
1057
+ "exhausted reducer tiers with progress still attempts emergency compaction",
1058
+ async () => {
1059
+ const events: ServerMessage[] = [];
1060
+ let emergencyCompactCalled = false;
1061
+
1062
+ // Start with reducer already exhausted
1063
+ mockReducerStepFn = (msgs: Message[]) => {
1064
+ return {
1065
+ messages: msgs,
1066
+ tier: "injection_downgrade",
1067
+ state: {
1068
+ appliedTiers: [
1069
+ "forced_compaction",
1070
+ "tool_result_truncation",
1071
+ "media_stubbing",
1072
+ "injection_downgrade",
1073
+ ],
1074
+ injectionMode: "minimal",
1075
+ exhausted: true,
1076
+ },
1077
+ estimatedTokens: 195_000,
1078
+ };
1068
1079
  };
1069
- };
1070
1080
 
1071
- let agentLoopCallCount = 0;
1072
- const agentLoopRun: AgentLoopRun = async (messages, onEvent) => {
1073
- agentLoopCallCount++;
1074
- if (agentLoopCallCount === 1) {
1075
- // Agent makes progress (tool calls succeed, messages grow)
1076
- const progressMessages: Message[] = [
1077
- ...messages,
1078
- {
1079
- role: "assistant" as const,
1080
- content: [
1081
- { type: "text", text: "Running analysis..." },
1082
- {
1083
- type: "tool_use",
1084
- id: "tu-1",
1085
- name: "bash",
1086
- input: { command: "find . -name '*.ts'" },
1087
- },
1088
- ] as ContentBlock[],
1089
- },
1090
- {
1091
- role: "user" as const,
1092
- content: [
1093
- {
1094
- type: "tool_result",
1095
- tool_use_id: "tu-1",
1096
- content: "file1.ts\nfile2.ts\nfile3.ts",
1097
- is_error: false,
1098
- },
1099
- ] as ContentBlock[],
1100
- },
1101
- ];
1081
+ let agentLoopCallCount = 0;
1082
+ const agentLoopRun: AgentLoopRun = async (messages, onEvent) => {
1083
+ agentLoopCallCount++;
1084
+ if (agentLoopCallCount === 1) {
1085
+ // Agent makes progress (tool calls succeed, messages grow)
1086
+ const progressMessages: Message[] = [
1087
+ ...messages,
1088
+ {
1089
+ role: "assistant" as const,
1090
+ content: [
1091
+ { type: "text", text: "Running analysis..." },
1092
+ {
1093
+ type: "tool_use",
1094
+ id: "tu-1",
1095
+ name: "bash",
1096
+ input: { command: "find . -name '*.ts'" },
1097
+ },
1098
+ ] as ContentBlock[],
1099
+ },
1100
+ {
1101
+ role: "user" as const,
1102
+ content: [
1103
+ {
1104
+ type: "tool_result",
1105
+ tool_use_id: "tu-1",
1106
+ content: "file1.ts\nfile2.ts\nfile3.ts",
1107
+ is_error: false,
1108
+ },
1109
+ ] as ContentBlock[],
1110
+ },
1111
+ ];
1102
1112
 
1103
- onEvent({
1104
- type: "tool_use",
1105
- id: "tu-1",
1106
- name: "bash",
1107
- input: { command: "find . -name '*.ts'" },
1108
- });
1109
- onEvent({
1110
- type: "tool_result",
1111
- toolUseId: "tu-1",
1112
- content: "file1.ts\nfile2.ts\nfile3.ts",
1113
- isError: false,
1114
- });
1113
+ onEvent({
1114
+ type: "tool_use",
1115
+ id: "tu-1",
1116
+ name: "bash",
1117
+ input: { command: "find . -name '*.ts'" },
1118
+ });
1119
+ onEvent({
1120
+ type: "tool_result",
1121
+ toolUseId: "tu-1",
1122
+ content: "file1.ts\nfile2.ts\nfile3.ts",
1123
+ isError: false,
1124
+ });
1125
+ onEvent({
1126
+ type: "message_complete",
1127
+ message: {
1128
+ role: "assistant",
1129
+ content: [
1130
+ { type: "text", text: "Running analysis..." },
1131
+ {
1132
+ type: "tool_use",
1133
+ id: "tu-1",
1134
+ name: "bash",
1135
+ input: { command: "find . -name '*.ts'" },
1136
+ },
1137
+ ],
1138
+ },
1139
+ });
1140
+ onEvent({
1141
+ type: "usage",
1142
+ inputTokens: 190_000,
1143
+ outputTokens: 100,
1144
+ model: "test-model",
1145
+ providerDurationMs: 200,
1146
+ });
1147
+
1148
+ // Then context_too_large on the next LLM call within the loop
1149
+ onEvent({
1150
+ type: "error",
1151
+ error: new Error("context_length_exceeded"),
1152
+ });
1153
+ onEvent({
1154
+ type: "usage",
1155
+ inputTokens: 0,
1156
+ outputTokens: 0,
1157
+ model: "test-model",
1158
+ providerDurationMs: 10,
1159
+ });
1160
+
1161
+ return progressMessages;
1162
+ }
1163
+
1164
+ // After emergency compaction, succeed
1115
1165
  onEvent({
1116
1166
  type: "message_complete",
1117
1167
  message: {
1118
1168
  role: "assistant",
1119
- content: [
1120
- { type: "text", text: "Running analysis..." },
1121
- {
1122
- type: "tool_use",
1123
- id: "tu-1",
1124
- name: "bash",
1125
- input: { command: "find . -name '*.ts'" },
1126
- },
1127
- ],
1169
+ content: [{ type: "text", text: "recovered" }],
1128
1170
  },
1129
1171
  });
1130
1172
  onEvent({
1131
1173
  type: "usage",
1132
- inputTokens: 190_000,
1174
+ inputTokens: 50_000,
1133
1175
  outputTokens: 100,
1134
1176
  model: "test-model",
1135
1177
  providerDurationMs: 200,
1136
1178
  });
1179
+ return [
1180
+ ...messages,
1181
+ {
1182
+ role: "assistant" as const,
1183
+ content: [{ type: "text", text: "recovered" }] as ContentBlock[],
1184
+ },
1185
+ ];
1186
+ };
1187
+
1188
+ const ctx = makeCtx({
1189
+ agentLoopRun,
1190
+ contextWindowManager: {
1191
+ shouldCompact: () => ({ needed: false, estimatedTokens: 0 }),
1192
+ maybeCompact: async (
1193
+ _msgs: Message[],
1194
+ _signal: AbortSignal,
1195
+ opts?: Record<string, unknown>,
1196
+ ) => {
1197
+ if (opts?.force && opts?.minKeepRecentUserTurns === 0) {
1198
+ emergencyCompactCalled = true;
1199
+ return {
1200
+ compacted: true,
1201
+ messages: [
1202
+ {
1203
+ role: "user",
1204
+ content: [{ type: "text", text: "Hello" }],
1205
+ },
1206
+ ] as Message[],
1207
+ compactedPersistedMessages: 50,
1208
+ summaryText: "Emergency summary",
1209
+ previousEstimatedInputTokens: 195_000,
1210
+ estimatedInputTokens: 50_000,
1211
+ maxInputTokens: 200_000,
1212
+ thresholdTokens: 160_000,
1213
+ compactedMessages: 50,
1214
+ summaryCalls: 1,
1215
+ summaryInputTokens: 1000,
1216
+ summaryOutputTokens: 300,
1217
+ summaryModel: "mock-model",
1218
+ };
1219
+ }
1220
+ return { compacted: false };
1221
+ },
1222
+ } as unknown as AgentLoopSessionContext["contextWindowManager"],
1223
+ });
1224
+
1225
+ await runAgentLoopImpl(ctx, "hello", "msg-1", (msg) => events.push(msg));
1137
1226
 
1138
- // Then context_too_large on the next LLM call within the loop
1227
+ // BUG: Currently when progress was made + all tiers exhausted,
1228
+ // emergency compaction is NOT attempted. The error is surfaced directly.
1229
+ // After PR 2 fix, emergency compaction should be attempted.
1230
+ expect(emergencyCompactCalled).toBe(true);
1231
+
1232
+ // BUG: Currently a session_error IS emitted.
1233
+ const sessionError = events.find((e) => e.type === "session_error");
1234
+ expect(sessionError).toBeUndefined();
1235
+ },
1236
+ );
1237
+
1238
+ // ── Test 6 ────────────────────────────────────────────────────────
1239
+ // Tests mid-loop budget check via onCheckpoint.
1240
+ // The onCheckpoint callback estimates prompt tokens after each tool round.
1241
+ // When estimate exceeds the mid-loop threshold (85% of budget),
1242
+ // it returns "yield" to break the agent loop.
1243
+ // The session-agent-loop then runs compaction and re-enters the agent loop.
1244
+ test.todo(
1245
+ "onCheckpoint yields when token estimate exceeds mid-loop budget threshold",
1246
+ async () => {
1247
+ const events: ServerMessage[] = [];
1248
+ let compactionCalled = false;
1249
+
1250
+ // estimatePromptTokens is called:
1251
+ // 1. During preflight budget check (low value, below budget)
1252
+ // 2. During onCheckpoint mid-loop check (high value, above 85% threshold)
1253
+ // Budget = 200_000 * 0.95 = 190_000
1254
+ // Mid-loop threshold = 190_000 * 0.85 = 161_500
1255
+ let estimateCallCount = 0;
1256
+ mockEstimateTokens = () => {
1257
+ estimateCallCount++;
1258
+ // First call: preflight check — below budget
1259
+ if (estimateCallCount === 1) return 100_000;
1260
+ // Subsequent calls: mid-loop check — above 85% threshold
1261
+ return 170_000;
1262
+ };
1263
+
1264
+ let agentLoopCallCount = 0;
1265
+ const agentLoopRun: AgentLoopRun = async (
1266
+ messages,
1267
+ onEvent,
1268
+ _signal,
1269
+ _requestId,
1270
+ onCheckpoint,
1271
+ ) => {
1272
+ agentLoopCallCount++;
1273
+
1274
+ if (agentLoopCallCount === 1) {
1275
+ // Simulate a tool round: assistant calls a tool, results come back
1276
+ const withProgress: Message[] = [
1277
+ ...messages,
1278
+ {
1279
+ role: "assistant" as const,
1280
+ content: [
1281
+ { type: "text", text: "Let me check." },
1282
+ {
1283
+ type: "tool_use",
1284
+ id: "tu-1",
1285
+ name: "bash",
1286
+ input: { command: "ls" },
1287
+ },
1288
+ ] as ContentBlock[],
1289
+ },
1290
+ {
1291
+ role: "user" as const,
1292
+ content: [
1293
+ {
1294
+ type: "tool_result",
1295
+ tool_use_id: "tu-1",
1296
+ content: "file1.ts\nfile2.ts",
1297
+ is_error: false,
1298
+ },
1299
+ ] as ContentBlock[],
1300
+ },
1301
+ ];
1302
+
1303
+ onEvent({
1304
+ type: "message_complete",
1305
+ message: {
1306
+ role: "assistant",
1307
+ content: [
1308
+ { type: "text", text: "Let me check." },
1309
+ {
1310
+ type: "tool_use",
1311
+ id: "tu-1",
1312
+ name: "bash",
1313
+ input: { command: "ls" },
1314
+ },
1315
+ ],
1316
+ },
1317
+ });
1318
+ onEvent({
1319
+ type: "usage",
1320
+ inputTokens: 100,
1321
+ outputTokens: 50,
1322
+ model: "test-model",
1323
+ providerDurationMs: 100,
1324
+ });
1325
+
1326
+ // Call onCheckpoint — this should trigger the mid-loop budget check
1327
+ // which sees 170_000 > 161_500 and returns "yield"
1328
+ if (onCheckpoint) {
1329
+ const decision = onCheckpoint({
1330
+ turnIndex: 0,
1331
+ toolCount: 1,
1332
+ hasToolUse: true,
1333
+ history: withProgress,
1334
+ });
1335
+ if (decision === "yield") {
1336
+ // Agent loop stops when checkpoint yields
1337
+ return withProgress;
1338
+ }
1339
+ }
1340
+
1341
+ return withProgress;
1342
+ }
1343
+
1344
+ // Second call (after compaction): complete successfully
1139
1345
  onEvent({
1140
- type: "error",
1141
- error: new Error("context_length_exceeded"),
1346
+ type: "message_complete",
1347
+ message: {
1348
+ role: "assistant",
1349
+ content: [{ type: "text", text: "done after compaction" }],
1350
+ },
1142
1351
  });
1143
1352
  onEvent({
1144
1353
  type: "usage",
1145
- inputTokens: 0,
1146
- outputTokens: 0,
1354
+ inputTokens: 50,
1355
+ outputTokens: 25,
1147
1356
  model: "test-model",
1148
- providerDurationMs: 10,
1357
+ providerDurationMs: 100,
1149
1358
  });
1359
+ return [
1360
+ ...messages,
1361
+ {
1362
+ role: "assistant" as const,
1363
+ content: [
1364
+ { type: "text", text: "done after compaction" },
1365
+ ] as ContentBlock[],
1366
+ },
1367
+ ];
1368
+ };
1150
1369
 
1151
- return progressMessages;
1152
- }
1153
-
1154
- // After emergency compaction, succeed
1155
- onEvent({
1156
- type: "message_complete",
1157
- message: {
1158
- role: "assistant",
1159
- content: [{ type: "text", text: "recovered" }],
1160
- },
1161
- });
1162
- onEvent({
1163
- type: "usage",
1164
- inputTokens: 50_000,
1165
- outputTokens: 100,
1166
- model: "test-model",
1167
- providerDurationMs: 200,
1168
- });
1169
- return [
1170
- ...messages,
1171
- {
1172
- role: "assistant" as const,
1173
- content: [{ type: "text", text: "recovered" }] as ContentBlock[],
1174
- },
1175
- ];
1176
- };
1177
-
1178
- const ctx = makeCtx({
1179
- agentLoopRun,
1180
- contextWindowManager: {
1181
- shouldCompact: () => ({ needed: false, estimatedTokens: 0 }),
1182
- maybeCompact: async (
1183
- _msgs: Message[],
1184
- _signal: AbortSignal,
1185
- opts?: Record<string, unknown>,
1186
- ) => {
1187
- if (opts?.force && opts?.minKeepRecentUserTurns === 0) {
1188
- emergencyCompactCalled = true;
1370
+ const ctx = makeCtx({
1371
+ agentLoopRun,
1372
+ contextWindowManager: {
1373
+ shouldCompact: () => ({ needed: false, estimatedTokens: 0 }),
1374
+ maybeCompact: async () => {
1375
+ compactionCalled = true;
1189
1376
  return {
1190
1377
  compacted: true,
1191
1378
  messages: [
1192
1379
  {
1193
- role: "user",
1380
+ role: "user" as const,
1194
1381
  content: [{ type: "text", text: "Hello" }],
1195
1382
  },
1196
1383
  ] as Message[],
1197
- compactedPersistedMessages: 50,
1198
- summaryText: "Emergency summary",
1199
- previousEstimatedInputTokens: 195_000,
1200
- estimatedInputTokens: 50_000,
1384
+ compactedPersistedMessages: 5,
1385
+ summaryText: "Mid-loop compaction summary",
1386
+ previousEstimatedInputTokens: 170_000,
1387
+ estimatedInputTokens: 80_000,
1201
1388
  maxInputTokens: 200_000,
1202
1389
  thresholdTokens: 160_000,
1203
- compactedMessages: 50,
1390
+ compactedMessages: 10,
1204
1391
  summaryCalls: 1,
1205
- summaryInputTokens: 1000,
1206
- summaryOutputTokens: 300,
1392
+ summaryInputTokens: 500,
1393
+ summaryOutputTokens: 200,
1207
1394
  summaryModel: "mock-model",
1208
1395
  };
1209
- }
1210
- return { compacted: false };
1211
- },
1212
- } as unknown as AgentLoopSessionContext["contextWindowManager"],
1213
- });
1396
+ },
1397
+ } as unknown as AgentLoopSessionContext["contextWindowManager"],
1398
+ });
1214
1399
 
1215
- await runAgentLoopImpl(ctx, "hello", "msg-1", (msg) => events.push(msg));
1400
+ await runAgentLoopImpl(ctx, "hello", "msg-1", (msg) => events.push(msg));
1216
1401
 
1217
- // BUG: Currently when progress was made + all tiers exhausted,
1218
- // emergency compaction is NOT attempted. The error is surfaced directly.
1219
- // After PR 2 fix, emergency compaction should be attempted.
1220
- expect(emergencyCompactCalled).toBe(true);
1402
+ // The mid-loop budget check should have triggered compaction
1403
+ expect(compactionCalled).toBe(true);
1221
1404
 
1222
- // BUG: Currently a session_error IS emitted.
1223
- const sessionError = events.find((e) => e.type === "session_error");
1224
- expect(sessionError).toBeUndefined();
1225
- });
1405
+ // Agent loop should have been called twice: once before yield, once after compaction
1406
+ expect(agentLoopCallCount).toBe(2);
1226
1407
 
1227
- // ── Test 6 ────────────────────────────────────────────────────────
1228
- // Tests mid-loop budget check via onCheckpoint.
1229
- // The onCheckpoint callback estimates prompt tokens after each tool round.
1230
- // When estimate exceeds the mid-loop threshold (85% of budget),
1231
- // it returns "yield" to break the agent loop.
1232
- // The session-agent-loop then runs compaction and re-enters the agent loop.
1233
- test("onCheckpoint yields when token estimate exceeds mid-loop budget threshold", async () => {
1234
- const events: ServerMessage[] = [];
1235
- let compactionCalled = false;
1408
+ // No session_error should be emitted
1409
+ const sessionError = events.find((e) => e.type === "session_error");
1410
+ expect(sessionError).toBeUndefined();
1236
1411
 
1237
- // estimatePromptTokens is called:
1238
- // 1. During preflight budget check (low value, below budget)
1239
- // 2. During onCheckpoint mid-loop check (high value, above 85% threshold)
1240
- // Budget = 200_000 * 0.95 = 190_000
1241
- // Mid-loop threshold = 190_000 * 0.85 = 161_500
1242
- let estimateCallCount = 0;
1243
- mockEstimateTokens = () => {
1244
- estimateCallCount++;
1245
- // First call: preflight check — below budget
1246
- if (estimateCallCount === 1) return 100_000;
1247
- // Subsequent calls: mid-loop check — above 85% threshold
1248
- return 170_000;
1249
- };
1412
+ // A context_compacted event should have been emitted
1413
+ const compacted = events.find((e) => e.type === "context_compacted");
1414
+ expect(compacted).toBeDefined();
1415
+ },
1416
+ );
1250
1417
 
1251
- let agentLoopCallCount = 0;
1252
- const agentLoopRun: AgentLoopRun = async (
1253
- messages,
1254
- onEvent,
1255
- _signal,
1256
- _requestId,
1257
- onCheckpoint,
1258
- ) => {
1259
- agentLoopCallCount++;
1418
+ // ── Test 7 ────────────────────────────────────────────────────────
1419
+ // Tests that mid-loop budget check prevents context_too_large entirely.
1420
+ // Agent loop runs tool calls with growing history. After the estimate
1421
+ // exceeds the mid-loop threshold, the loop yields, compaction runs,
1422
+ // and the loop resumes. The provider NEVER rejects with context_too_large.
1423
+ test.todo(
1424
+ "mid-loop budget check prevents context_too_large when tools produce large results",
1425
+ async () => {
1426
+ const events: ServerMessage[] = [];
1427
+ let compactionCalled = false;
1428
+
1429
+ // Budget = 200_000 * 0.95 = 190_000
1430
+ // Mid-loop threshold = 190_000 * 0.85 = 161_500
1431
+ // Simulate token growth: preflight = 50k, then each checkpoint call
1432
+ // returns a growing estimate. By tool call 3, we exceed the threshold.
1433
+ let estimateCallCount = 0;
1434
+ mockEstimateTokens = () => {
1435
+ estimateCallCount++;
1436
+ // First call: preflight — well below budget
1437
+ if (estimateCallCount === 1) return 50_000;
1438
+ // Checkpoint calls grow with each tool round
1439
+ if (estimateCallCount === 2) return 100_000; // tool 1
1440
+ if (estimateCallCount === 3) return 140_000; // tool 2
1441
+ // Tool 3: exceeds 161_500 threshold
1442
+ return 175_000;
1443
+ };
1260
1444
 
1261
- if (agentLoopCallCount === 1) {
1262
- // Simulate a tool round: assistant calls a tool, results come back
1263
- const withProgress: Message[] = [
1264
- ...messages,
1265
- {
1266
- role: "assistant" as const,
1267
- content: [
1268
- { type: "text", text: "Let me check." },
1269
- {
1270
- type: "tool_use",
1271
- id: "tu-1",
1272
- name: "bash",
1273
- input: { command: "ls" },
1274
- },
1275
- ] as ContentBlock[],
1276
- },
1277
- {
1278
- role: "user" as const,
1279
- content: [
1280
- {
1281
- type: "tool_result",
1282
- tool_use_id: "tu-1",
1283
- content: "file1.ts\nfile2.ts",
1284
- is_error: false,
1285
- },
1286
- ] as ContentBlock[],
1287
- },
1288
- ];
1445
+ let agentLoopCallCount = 0;
1446
+ let contextTooLargeEmitted = false;
1447
+
1448
+ const agentLoopRun: AgentLoopRun = async (
1449
+ messages,
1450
+ onEvent,
1451
+ _signal,
1452
+ _requestId,
1453
+ onCheckpoint,
1454
+ ) => {
1455
+ agentLoopCallCount++;
1456
+
1457
+ if (agentLoopCallCount === 1) {
1458
+ const currentHistory = [...messages];
1459
+
1460
+ // Simulate 5 tool rounds — but the checkpoint should yield at round 3
1461
+ for (let i = 0; i < 5; i++) {
1462
+ const toolId = `tu-${i}`;
1463
+ const assistantMsg: Message = {
1464
+ role: "assistant" as const,
1465
+ content: [
1466
+ { type: "text", text: `Step ${i}` },
1467
+ {
1468
+ type: "tool_use",
1469
+ id: toolId,
1470
+ name: "bash",
1471
+ input: { command: `cmd-${i}` },
1472
+ },
1473
+ ] as ContentBlock[],
1474
+ };
1475
+ const resultMsg: Message = {
1476
+ role: "user" as const,
1477
+ content: [
1478
+ {
1479
+ type: "tool_result",
1480
+ tool_use_id: toolId,
1481
+ content: "x".repeat(10_000),
1482
+ is_error: false,
1483
+ },
1484
+ ] as ContentBlock[],
1485
+ };
1486
+ currentHistory.push(assistantMsg, resultMsg);
1289
1487
 
1488
+ onEvent({
1489
+ type: "message_complete",
1490
+ message: assistantMsg,
1491
+ });
1492
+ onEvent({
1493
+ type: "usage",
1494
+ inputTokens: 50_000 + i * 20_000,
1495
+ outputTokens: 50,
1496
+ model: "test-model",
1497
+ providerDurationMs: 100,
1498
+ });
1499
+
1500
+ if (onCheckpoint) {
1501
+ const decision = onCheckpoint({
1502
+ turnIndex: i,
1503
+ toolCount: 1,
1504
+ hasToolUse: true,
1505
+ history: currentHistory,
1506
+ });
1507
+ if (decision === "yield") {
1508
+ return currentHistory;
1509
+ }
1510
+ }
1511
+ }
1512
+
1513
+ return currentHistory;
1514
+ }
1515
+
1516
+ // Second call (after compaction): complete
1290
1517
  onEvent({
1291
1518
  type: "message_complete",
1292
1519
  message: {
1293
1520
  role: "assistant",
1294
1521
  content: [
1295
- { type: "text", text: "Let me check." },
1296
- {
1297
- type: "tool_use",
1298
- id: "tu-1",
1299
- name: "bash",
1300
- input: { command: "ls" },
1301
- },
1522
+ { type: "text", text: "completed after mid-loop compaction" },
1302
1523
  ],
1303
1524
  },
1304
1525
  });
1305
1526
  onEvent({
1306
1527
  type: "usage",
1307
- inputTokens: 100,
1308
- outputTokens: 50,
1528
+ inputTokens: 60_000,
1529
+ outputTokens: 100,
1309
1530
  model: "test-model",
1310
- providerDurationMs: 100,
1531
+ providerDurationMs: 200,
1311
1532
  });
1533
+ return [
1534
+ ...messages,
1535
+ {
1536
+ role: "assistant" as const,
1537
+ content: [
1538
+ { type: "text", text: "completed after mid-loop compaction" },
1539
+ ] as ContentBlock[],
1540
+ },
1541
+ ];
1542
+ };
1312
1543
 
1313
- // Call onCheckpoint — this should trigger the mid-loop budget check
1314
- // which sees 170_000 > 161_500 and returns "yield"
1315
- if (onCheckpoint) {
1316
- const decision = onCheckpoint({
1317
- turnIndex: 0,
1318
- toolCount: 1,
1319
- hasToolUse: true,
1320
- history: withProgress,
1321
- });
1322
- if (decision === "yield") {
1323
- // Agent loop stops when checkpoint yields
1324
- return withProgress;
1325
- }
1326
- }
1327
-
1328
- return withProgress;
1329
- }
1330
-
1331
- // Second call (after compaction): complete successfully
1332
- onEvent({
1333
- type: "message_complete",
1334
- message: {
1335
- role: "assistant",
1336
- content: [{ type: "text", text: "done after compaction" }],
1337
- },
1338
- });
1339
- onEvent({
1340
- type: "usage",
1341
- inputTokens: 50,
1342
- outputTokens: 25,
1343
- model: "test-model",
1344
- providerDurationMs: 100,
1544
+ const ctx = makeCtx({
1545
+ agentLoopRun,
1546
+ contextWindowManager: {
1547
+ shouldCompact: () => ({ needed: false, estimatedTokens: 0 }),
1548
+ maybeCompact: async () => {
1549
+ compactionCalled = true;
1550
+ return {
1551
+ compacted: true,
1552
+ messages: [
1553
+ {
1554
+ role: "user" as const,
1555
+ content: [{ type: "text", text: "Hello" }],
1556
+ },
1557
+ ] as Message[],
1558
+ compactedPersistedMessages: 8,
1559
+ summaryText: "Compacted large tool results",
1560
+ previousEstimatedInputTokens: 175_000,
1561
+ estimatedInputTokens: 60_000,
1562
+ maxInputTokens: 200_000,
1563
+ thresholdTokens: 160_000,
1564
+ compactedMessages: 15,
1565
+ summaryCalls: 1,
1566
+ summaryInputTokens: 800,
1567
+ summaryOutputTokens: 300,
1568
+ summaryModel: "mock-model",
1569
+ };
1570
+ },
1571
+ } as unknown as AgentLoopSessionContext["contextWindowManager"],
1345
1572
  });
1346
- return [
1347
- ...messages,
1348
- {
1349
- role: "assistant" as const,
1350
- content: [
1351
- { type: "text", text: "done after compaction" },
1352
- ] as ContentBlock[],
1353
- },
1354
- ];
1355
- };
1356
-
1357
- const ctx = makeCtx({
1358
- agentLoopRun,
1359
- contextWindowManager: {
1360
- shouldCompact: () => ({ needed: false, estimatedTokens: 0 }),
1361
- maybeCompact: async () => {
1362
- compactionCalled = true;
1363
- return {
1364
- compacted: true,
1365
- messages: [
1366
- {
1367
- role: "user" as const,
1368
- content: [{ type: "text", text: "Hello" }],
1369
- },
1370
- ] as Message[],
1371
- compactedPersistedMessages: 5,
1372
- summaryText: "Mid-loop compaction summary",
1373
- previousEstimatedInputTokens: 170_000,
1374
- estimatedInputTokens: 80_000,
1375
- maxInputTokens: 200_000,
1376
- thresholdTokens: 160_000,
1377
- compactedMessages: 10,
1378
- summaryCalls: 1,
1379
- summaryInputTokens: 500,
1380
- summaryOutputTokens: 200,
1381
- summaryModel: "mock-model",
1382
- };
1383
- },
1384
- } as unknown as AgentLoopSessionContext["contextWindowManager"],
1385
- });
1386
-
1387
- await runAgentLoopImpl(ctx, "hello", "msg-1", (msg) => events.push(msg));
1388
1573
 
1389
- // The mid-loop budget check should have triggered compaction
1390
- expect(compactionCalled).toBe(true);
1574
+ await runAgentLoopImpl(ctx, "hello", "msg-1", (msg) => {
1575
+ events.push(msg);
1576
+ // Track if context_too_large was ever emitted
1577
+ if (
1578
+ msg.type === "session_error" &&
1579
+ "code" in msg &&
1580
+ msg.code === "SESSION_PROCESSING_FAILED"
1581
+ ) {
1582
+ contextTooLargeEmitted = true;
1583
+ }
1584
+ });
1391
1585
 
1392
- // Agent loop should have been called twice: once before yield, once after compaction
1393
- expect(agentLoopCallCount).toBe(2);
1586
+ // Compaction should have been triggered by mid-loop budget check
1587
+ expect(compactionCalled).toBe(true);
1394
1588
 
1395
- // No session_error should be emitted
1396
- const sessionError = events.find((e) => e.type === "session_error");
1397
- expect(sessionError).toBeUndefined();
1589
+ // The provider should NEVER have rejected with context_too_large
1590
+ expect(contextTooLargeEmitted).toBe(false);
1398
1591
 
1399
- // A context_compacted event should have been emitted
1400
- const compacted = events.find((e) => e.type === "context_compacted");
1401
- expect(compacted).toBeDefined();
1402
- });
1592
+ // Agent loop called twice: once (yielded at tool 3), once after compaction
1593
+ expect(agentLoopCallCount).toBe(2);
1403
1594
 
1404
- // ── Test 7 ────────────────────────────────────────────────────────
1405
- // Tests that mid-loop budget check prevents context_too_large entirely.
1406
- // Agent loop runs tool calls with growing history. After the estimate
1407
- // exceeds the mid-loop threshold, the loop yields, compaction runs,
1408
- // and the loop resumes. The provider NEVER rejects with context_too_large.
1409
- test("mid-loop budget check prevents context_too_large when tools produce large results", async () => {
1595
+ // No session_error
1596
+ const sessionError = events.find((e) => e.type === "session_error");
1597
+ expect(sessionError).toBeUndefined();
1598
+ },
1599
+ );
1600
+
1601
+ // ── Test 8 ────────────────────────────────────────────────────────
1602
+ // When mid-loop compaction exhausts maxAttempts but the agent loop
1603
+ // still yields (yieldedForBudget remains true), the incomplete turn
1604
+ // must escalate to the convergence loop instead of being silently
1605
+ // treated as a completed turn.
1606
+ test("exhausted mid-loop compaction attempts escalate to convergence loop", async () => {
1410
1607
  const events: ServerMessage[] = [];
1411
- let compactionCalled = false;
1412
1608
 
1413
1609
  // Budget = 200_000 * 0.95 = 190_000
1414
1610
  // Mid-loop threshold = 190_000 * 0.85 = 161_500
1415
- // Simulate token growth: preflight = 50k, then each checkpoint call
1416
- // returns a growing estimate. By tool call 3, we exceed the threshold.
1417
1611
  let estimateCallCount = 0;
1418
1612
  mockEstimateTokens = () => {
1419
1613
  estimateCallCount++;
1420
- // First call: preflight — well below budget
1421
- if (estimateCallCount === 1) return 50_000;
1422
- // Checkpoint calls grow with each tool round
1423
- if (estimateCallCount === 2) return 100_000; // tool 1
1424
- if (estimateCallCount === 3) return 140_000; // tool 2
1425
- // Tool 3: exceeds 161_500 threshold
1426
- return 175_000;
1614
+ // Preflight: below budget
1615
+ if (estimateCallCount === 1) return 100_000;
1616
+ // Every checkpoint call: above threshold always triggers yield
1617
+ return 170_000;
1427
1618
  };
1428
1619
 
1429
1620
  let agentLoopCallCount = 0;
1430
- let contextTooLargeEmitted = false;
1431
-
1432
1621
  const agentLoopRun: AgentLoopRun = async (
1433
1622
  messages,
1434
1623
  onEvent,
@@ -1438,91 +1627,88 @@ describe("session-agent-loop overflow recovery (JARVIS-110)", () => {
1438
1627
  ) => {
1439
1628
  agentLoopCallCount++;
1440
1629
 
1441
- if (agentLoopCallCount === 1) {
1442
- const currentHistory = [...messages];
1443
-
1444
- // Simulate 5 tool rounds — but the checkpoint should yield at round 3
1445
- for (let i = 0; i < 5; i++) {
1446
- const toolId = `tu-${i}`;
1447
- const assistantMsg: Message = {
1448
- role: "assistant" as const,
1449
- content: [
1450
- { type: "text", text: `Step ${i}` },
1451
- {
1452
- type: "tool_use",
1453
- id: toolId,
1454
- name: "bash",
1455
- input: { command: `cmd-${i}` },
1456
- },
1457
- ] as ContentBlock[],
1458
- };
1459
- const resultMsg: Message = {
1460
- role: "user" as const,
1461
- content: [
1462
- {
1463
- type: "tool_result",
1464
- tool_use_id: toolId,
1465
- content: "x".repeat(10_000),
1466
- is_error: false,
1467
- },
1468
- ] as ContentBlock[],
1469
- };
1470
- currentHistory.push(assistantMsg, resultMsg);
1471
-
1472
- onEvent({
1473
- type: "message_complete",
1474
- message: assistantMsg,
1475
- });
1476
- onEvent({
1477
- type: "usage",
1478
- inputTokens: 50_000 + i * 20_000,
1479
- outputTokens: 50,
1480
- model: "test-model",
1481
- providerDurationMs: 100,
1482
- });
1483
-
1484
- if (onCheckpoint) {
1485
- const decision = onCheckpoint({
1486
- turnIndex: i,
1487
- toolCount: 1,
1488
- hasToolUse: true,
1489
- history: currentHistory,
1490
- });
1491
- if (decision === "yield") {
1492
- return currentHistory;
1493
- }
1494
- }
1495
- }
1496
-
1497
- return currentHistory;
1498
- }
1630
+ // Every call: simulate tool progress then yield at checkpoint
1631
+ const withProgress: Message[] = [
1632
+ ...messages,
1633
+ {
1634
+ role: "assistant" as const,
1635
+ content: [
1636
+ { type: "text", text: `Tool call ${agentLoopCallCount}` },
1637
+ {
1638
+ type: "tool_use",
1639
+ id: `tu-${agentLoopCallCount}`,
1640
+ name: "bash",
1641
+ input: { command: "ls" },
1642
+ },
1643
+ ] as ContentBlock[],
1644
+ },
1645
+ {
1646
+ role: "user" as const,
1647
+ content: [
1648
+ {
1649
+ type: "tool_result",
1650
+ tool_use_id: `tu-${agentLoopCallCount}`,
1651
+ content: "output",
1652
+ is_error: false,
1653
+ },
1654
+ ] as ContentBlock[],
1655
+ },
1656
+ ];
1499
1657
 
1500
- // Second call (after compaction): complete
1501
1658
  onEvent({
1502
1659
  type: "message_complete",
1503
1660
  message: {
1504
1661
  role: "assistant",
1505
1662
  content: [
1506
- { type: "text", text: "completed after mid-loop compaction" },
1663
+ { type: "text", text: `Tool call ${agentLoopCallCount}` },
1664
+ {
1665
+ type: "tool_use",
1666
+ id: `tu-${agentLoopCallCount}`,
1667
+ name: "bash",
1668
+ input: { command: "ls" },
1669
+ },
1507
1670
  ],
1508
1671
  },
1509
1672
  });
1510
1673
  onEvent({
1511
1674
  type: "usage",
1512
- inputTokens: 60_000,
1513
- outputTokens: 100,
1675
+ inputTokens: 100,
1676
+ outputTokens: 50,
1514
1677
  model: "test-model",
1515
- providerDurationMs: 200,
1678
+ providerDurationMs: 100,
1516
1679
  });
1517
- return [
1518
- ...messages,
1519
- {
1520
- role: "assistant" as const,
1521
- content: [
1522
- { type: "text", text: "completed after mid-loop compaction" },
1523
- ] as ContentBlock[],
1680
+
1681
+ // Always yield at checkpoint — simulates compaction not helping
1682
+ if (onCheckpoint) {
1683
+ const decision = onCheckpoint({
1684
+ turnIndex: 0,
1685
+ toolCount: 1,
1686
+ hasToolUse: true,
1687
+ history: withProgress,
1688
+ });
1689
+ if (decision === "yield") {
1690
+ return withProgress;
1691
+ }
1692
+ }
1693
+
1694
+ return withProgress;
1695
+ };
1696
+
1697
+ let compactionCallCount = 0;
1698
+ // Convergence reducer: reduce tokens enough to succeed
1699
+ let convergenceReducerCalled = false;
1700
+ mockReducerStepFn = (msgs: Message[]) => {
1701
+ convergenceReducerCalled = true;
1702
+ return {
1703
+ messages: msgs,
1704
+ tier: "forced_compaction",
1705
+ state: {
1706
+ appliedTiers: ["forced_compaction"],
1707
+ injectionMode: "full",
1708
+ exhausted: true,
1524
1709
  },
1525
- ];
1710
+ estimatedTokens: 80_000,
1711
+ };
1526
1712
  };
1527
1713
 
1528
1714
  const ctx = makeCtx({
@@ -1530,7 +1716,8 @@ describe("session-agent-loop overflow recovery (JARVIS-110)", () => {
1530
1716
  contextWindowManager: {
1531
1717
  shouldCompact: () => ({ needed: false, estimatedTokens: 0 }),
1532
1718
  maybeCompact: async () => {
1533
- compactionCalled = true;
1719
+ compactionCallCount++;
1720
+ // Compaction "succeeds" but doesn't actually shrink enough
1534
1721
  return {
1535
1722
  compacted: true,
1536
1723
  messages: [
@@ -1539,45 +1726,32 @@ describe("session-agent-loop overflow recovery (JARVIS-110)", () => {
1539
1726
  content: [{ type: "text", text: "Hello" }],
1540
1727
  },
1541
1728
  ] as Message[],
1542
- compactedPersistedMessages: 8,
1543
- summaryText: "Compacted large tool results",
1544
- previousEstimatedInputTokens: 175_000,
1545
- estimatedInputTokens: 60_000,
1729
+ compactedPersistedMessages: 5,
1730
+ summaryText: "Compaction summary",
1731
+ previousEstimatedInputTokens: 170_000,
1732
+ estimatedInputTokens: 165_000, // barely reduced
1546
1733
  maxInputTokens: 200_000,
1547
1734
  thresholdTokens: 160_000,
1548
- compactedMessages: 15,
1735
+ compactedMessages: 10,
1549
1736
  summaryCalls: 1,
1550
- summaryInputTokens: 800,
1551
- summaryOutputTokens: 300,
1737
+ summaryInputTokens: 500,
1738
+ summaryOutputTokens: 200,
1552
1739
  summaryModel: "mock-model",
1553
1740
  };
1554
1741
  },
1555
1742
  } as unknown as AgentLoopSessionContext["contextWindowManager"],
1556
1743
  });
1557
1744
 
1558
- await runAgentLoopImpl(ctx, "hello", "msg-1", (msg) => {
1559
- events.push(msg);
1560
- // Track if context_too_large was ever emitted
1561
- if (
1562
- msg.type === "session_error" &&
1563
- "code" in msg &&
1564
- msg.code === "SESSION_PROCESSING_FAILED"
1565
- ) {
1566
- contextTooLargeEmitted = true;
1567
- }
1568
- });
1569
-
1570
- // Compaction should have been triggered by mid-loop budget check
1571
- expect(compactionCalled).toBe(true);
1745
+ await runAgentLoopImpl(ctx, "hello", "msg-1", (msg) => events.push(msg));
1572
1746
 
1573
- // The provider should NEVER have rejected with context_too_large
1574
- expect(contextTooLargeEmitted).toBe(false);
1747
+ // 1 initial auto-compact + 3 mid-loop compaction attempts = 4 total
1748
+ expect(compactionCallCount).toBe(4);
1575
1749
 
1576
- // Agent loop called twice: once (yielded at tool 3), once after compaction
1577
- expect(agentLoopCallCount).toBe(2);
1750
+ // Agent loop: 1 initial + 3 mid-loop re-entries + 1 convergence re-run = 5 calls
1751
+ expect(agentLoopCallCount).toBe(5);
1578
1752
 
1579
- // No session_error
1580
- const sessionError = events.find((e) => e.type === "session_error");
1581
- expect(sessionError).toBeUndefined();
1753
+ // After exhausting mid-loop attempts, the convergence loop should
1754
+ // have been triggered (contextTooLargeDetected set to true)
1755
+ expect(convergenceReducerCalled).toBe(true);
1582
1756
  });
1583
1757
  });