@vellumai/assistant 0.4.52 → 0.4.53
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/ARCHITECTURE.md +2 -2
- package/docs/architecture/keychain-broker.md +6 -20
- package/docs/architecture/memory.md +3 -3
- package/package.json +1 -1
- package/src/__tests__/approval-cascade.test.ts +3 -1
- package/src/__tests__/approval-routes-http.test.ts +0 -1
- package/src/__tests__/asset-materialize-tool.test.ts +0 -1
- package/src/__tests__/asset-search-tool.test.ts +0 -1
- package/src/__tests__/assistant-events-sse-hardening.test.ts +0 -1
- package/src/__tests__/attachments-store.test.ts +0 -1
- package/src/__tests__/avatar-e2e.test.ts +6 -1
- package/src/__tests__/browser-fill-credential.test.ts +3 -0
- package/src/__tests__/btw-routes.test.ts +39 -0
- package/src/__tests__/call-controller.test.ts +0 -1
- package/src/__tests__/call-domain.test.ts +1 -0
- package/src/__tests__/call-routes-http.test.ts +1 -2
- package/src/__tests__/canonical-guardian-store.test.ts +33 -2
- package/src/__tests__/channel-readiness-service.test.ts +1 -0
- package/src/__tests__/claude-code-skill-regression.test.ts +6 -2
- package/src/__tests__/claude-code-tool-profiles.test.ts +7 -2
- package/src/__tests__/config-loader-backfill.test.ts +1 -2
- package/src/__tests__/config-schema.test.ts +6 -37
- package/src/__tests__/conversation-routes-slash-commands.test.ts +0 -1
- package/src/__tests__/credential-broker-server-use.test.ts +16 -16
- package/src/__tests__/credential-security-invariants.test.ts +14 -0
- package/src/__tests__/credential-vault-unit.test.ts +4 -4
- package/src/__tests__/error-handler-friendly-messages.test.ts +4 -5
- package/src/__tests__/gateway-only-enforcement.test.ts +0 -2
- package/src/__tests__/host-shell-tool.test.ts +0 -1
- package/src/__tests__/http-user-message-parity.test.ts +19 -0
- package/src/__tests__/list-messages-attachments.test.ts +0 -1
- package/src/__tests__/log-export-workspace.test.ts +233 -0
- package/src/__tests__/managed-proxy-context.test.ts +1 -1
- package/src/__tests__/managed-skill-lifecycle.test.ts +0 -1
- package/src/__tests__/media-generate-image.test.ts +7 -2
- package/src/__tests__/media-reuse-story.e2e.test.ts +0 -1
- package/src/__tests__/memory-regressions.test.ts +0 -1
- package/src/__tests__/migration-cross-version-compatibility.test.ts +0 -1
- package/src/__tests__/migration-export-http.test.ts +0 -1
- package/src/__tests__/migration-import-commit-http.test.ts +0 -1
- package/src/__tests__/migration-import-preflight-http.test.ts +0 -1
- package/src/__tests__/migration-validate-http.test.ts +0 -1
- package/src/__tests__/notification-schedule-dedup.test.ts +237 -0
- package/src/__tests__/oauth-cli.test.ts +1 -10
- package/src/__tests__/oauth-store.test.ts +3 -5
- package/src/__tests__/oauth2-gateway-transport.test.ts +5 -4
- package/src/__tests__/onboarding-starter-tasks.test.ts +1 -1
- package/src/__tests__/onboarding-template-contract.test.ts +1 -2
- package/src/__tests__/pricing.test.ts +0 -11
- package/src/__tests__/provider-commit-message-generator.test.ts +21 -14
- package/src/__tests__/provider-fail-open-selection.test.ts +9 -8
- package/src/__tests__/provider-managed-proxy-integration.test.ts +27 -24
- package/src/__tests__/provider-registry-ollama.test.ts +8 -2
- package/src/__tests__/recording-handler.test.ts +0 -1
- package/src/__tests__/relay-server.test.ts +0 -1
- package/src/__tests__/runtime-attachment-metadata.test.ts +0 -1
- package/src/__tests__/runtime-events-sse-parity.test.ts +0 -1
- package/src/__tests__/runtime-events-sse.test.ts +0 -1
- package/src/__tests__/secret-routes-managed-proxy.test.ts +0 -1
- package/src/__tests__/secret-scanner-executor.test.ts +0 -1
- package/src/__tests__/send-endpoint-busy.test.ts +0 -1
- package/src/__tests__/session-abort-tool-results.test.ts +3 -1
- package/src/__tests__/session-agent-loop-overflow.test.ts +1012 -838
- package/src/__tests__/session-agent-loop.test.ts +2 -2
- package/src/__tests__/session-confirmation-signals.test.ts +3 -1
- package/src/__tests__/session-error.test.ts +5 -4
- package/src/__tests__/session-history-web-search.test.ts +34 -9
- package/src/__tests__/session-pre-run-repair.test.ts +3 -1
- package/src/__tests__/session-provider-retry-repair.test.ts +31 -26
- package/src/__tests__/session-queue.test.ts +3 -1
- package/src/__tests__/session-runtime-assembly.test.ts +118 -0
- package/src/__tests__/session-slash-known.test.ts +31 -13
- package/src/__tests__/session-slash-queue.test.ts +3 -1
- package/src/__tests__/session-slash-unknown.test.ts +3 -1
- package/src/__tests__/session-workspace-cache-state.test.ts +3 -1
- package/src/__tests__/session-workspace-injection.test.ts +3 -1
- package/src/__tests__/session-workspace-tool-tracking.test.ts +3 -1
- package/src/__tests__/shell-tool-proxy-mode.test.ts +0 -1
- package/src/__tests__/skill-script-runner-sandbox.test.ts +0 -1
- package/src/__tests__/skillssh-registry.test.ts +21 -0
- package/src/__tests__/slack-share-routes.test.ts +1 -1
- package/src/__tests__/swarm-recursion.test.ts +5 -1
- package/src/__tests__/swarm-session-integration.test.ts +25 -14
- package/src/__tests__/swarm-tool.test.ts +5 -2
- package/src/__tests__/telegram-bot-username-resolution.test.ts +2 -4
- package/src/__tests__/token-estimator-accuracy.benchmark.test.ts +1521 -0
- package/src/__tests__/tool-execution-abort-cleanup.test.ts +0 -1
- package/src/__tests__/tool-executor-lifecycle-events.test.ts +0 -1
- package/src/__tests__/tool-executor-shell-integration.test.ts +0 -1
- package/src/__tests__/tool-executor.test.ts +0 -1
- package/src/__tests__/trust-store.test.ts +5 -1
- package/src/__tests__/twilio-routes.test.ts +2 -2
- package/src/__tests__/verification-control-plane-policy.test.ts +0 -1
- package/src/__tests__/voice-quality.test.ts +2 -1
- package/src/__tests__/voice-scoped-grant-consumer.test.ts +0 -1
- package/src/__tests__/web-search.test.ts +1 -1
- package/src/agent/loop.ts +17 -1
- package/src/bundler/app-bundler.ts +40 -24
- package/src/calls/call-controller.ts +16 -0
- package/src/calls/relay-server.ts +29 -13
- package/src/calls/voice-control-protocol.ts +1 -0
- package/src/calls/voice-quality.ts +1 -1
- package/src/calls/voice-session-bridge.ts +9 -3
- package/src/channels/types.ts +16 -0
- package/src/cli/commands/bash.ts +173 -0
- package/src/cli/commands/doctor.ts +5 -23
- package/src/cli/commands/oauth/connections.ts +4 -2
- package/src/cli/commands/oauth/providers.ts +1 -13
- package/src/cli/program.ts +2 -0
- package/src/cli/reference.ts +1 -0
- package/src/config/bundled-skills/image-studio/tools/media-generate-image.ts +2 -1
- package/src/config/bundled-skills/media-processing/tools/analyze-keyframes.ts +3 -5
- package/src/config/bundled-skills/media-processing/tools/extract-keyframes.ts +2 -3
- package/src/config/bundled-skills/phone-calls/references/CONFIG.md +1 -1
- package/src/config/bundled-skills/transcribe/tools/transcribe-media.ts +5 -6
- package/src/config/feature-flag-registry.json +8 -0
- package/src/config/loader.ts +7 -135
- package/src/config/schema.ts +0 -6
- package/src/config/schemas/channels.ts +1 -0
- package/src/config/schemas/elevenlabs.ts +2 -2
- package/src/contacts/contact-store.ts +21 -25
- package/src/contacts/contacts-write.ts +6 -6
- package/src/contacts/types.ts +2 -0
- package/src/context/token-estimator.ts +35 -2
- package/src/context/window-manager.ts +16 -2
- package/src/daemon/config-watcher.ts +24 -6
- package/src/daemon/context-overflow-reducer.ts +13 -2
- package/src/daemon/handlers/config-ingress.ts +25 -8
- package/src/daemon/handlers/config-model.ts +21 -15
- package/src/daemon/handlers/config-telegram.ts +18 -6
- package/src/daemon/handlers/dictation.ts +0 -429
- package/src/daemon/handlers/skills.ts +1 -200
- package/src/daemon/lifecycle.ts +8 -5
- package/src/daemon/message-types/contacts.ts +2 -0
- package/src/daemon/message-types/integrations.ts +1 -0
- package/src/daemon/message-types/sessions.ts +2 -0
- package/src/daemon/parse-actual-tokens-from-error.test.ts +75 -0
- package/src/daemon/server.ts +23 -2
- package/src/daemon/session-agent-loop-handlers.ts +1 -1
- package/src/daemon/session-agent-loop.ts +27 -79
- package/src/daemon/session-error.ts +5 -4
- package/src/daemon/session-process.ts +17 -10
- package/src/daemon/session-runtime-assembly.ts +50 -0
- package/src/daemon/session-slash.ts +32 -20
- package/src/daemon/session.ts +1 -0
- package/src/events/domain-events.ts +1 -0
- package/src/media/app-icon-generator.ts +2 -1
- package/src/media/avatar-router.ts +3 -2
- package/src/memory/canonical-guardian-store.ts +25 -3
- package/src/memory/db-init.ts +12 -0
- package/src/memory/embedding-backend.ts +25 -16
- package/src/memory/migrations/158-channel-interaction-columns.ts +18 -0
- package/src/memory/migrations/159-drop-contact-interaction-columns.ts +16 -0
- package/src/memory/migrations/160-drop-loopback-port-column.ts +13 -0
- package/src/memory/migrations/index.ts +3 -0
- package/src/memory/retriever.test.ts +19 -12
- package/src/memory/schema/contacts.ts +2 -2
- package/src/memory/schema/oauth.ts +0 -1
- package/src/oauth/connect-orchestrator.ts +5 -3
- package/src/oauth/connect-types.ts +9 -2
- package/src/oauth/manual-token-connection.ts +9 -7
- package/src/oauth/oauth-store.ts +2 -8
- package/src/oauth/provider-behaviors.ts +10 -0
- package/src/oauth/seed-providers.ts +13 -5
- package/src/permissions/checker.ts +20 -1
- package/src/prompts/__tests__/build-cli-reference-section.test.ts +1 -1
- package/src/prompts/system-prompt.ts +2 -11
- package/src/prompts/templates/BOOTSTRAP.md +1 -3
- package/src/providers/anthropic/client.ts +16 -8
- package/src/providers/managed-proxy/constants.ts +1 -1
- package/src/providers/registry.ts +21 -15
- package/src/providers/types.ts +1 -1
- package/src/runtime/auth/route-policy.ts +4 -0
- package/src/runtime/channel-invite-transports/telegram.ts +12 -6
- package/src/runtime/channel-retry-sweep.ts +6 -0
- package/src/runtime/http-types.ts +1 -0
- package/src/runtime/middleware/error-handler.ts +1 -2
- package/src/runtime/routes/app-management-routes.ts +1 -0
- package/src/runtime/routes/btw-routes.ts +20 -1
- package/src/runtime/routes/conversation-routes.ts +32 -13
- package/src/runtime/routes/inbound-message-handler.ts +10 -2
- package/src/runtime/routes/inbound-stages/background-dispatch.ts +4 -0
- package/src/runtime/routes/inbound-stages/edit-intercept.ts +5 -5
- package/src/runtime/routes/integrations/slack/share.ts +5 -5
- package/src/runtime/routes/log-export-routes.ts +122 -10
- package/src/runtime/routes/session-query-routes.ts +3 -3
- package/src/runtime/routes/settings-routes.ts +53 -0
- package/src/runtime/routes/workspace-routes.ts +3 -0
- package/src/runtime/verification-templates.ts +1 -1
- package/src/security/oauth2.ts +4 -4
- package/src/security/secure-keys.ts +4 -4
- package/src/signals/bash.ts +157 -0
- package/src/skills/skillssh-registry.ts +6 -1
- package/src/swarm/backend-claude-code.ts +6 -6
- package/src/swarm/worker-backend.ts +1 -1
- package/src/swarm/worker-runner.ts +1 -1
- package/src/telegram/bot-username.ts +11 -0
- package/src/tools/claude-code/claude-code.ts +4 -4
- package/src/tools/credentials/broker.ts +7 -5
- package/src/tools/credentials/vault.ts +3 -2
- package/src/tools/network/__tests__/web-search.test.ts +18 -86
- package/src/tools/network/web-search.ts +9 -15
- package/src/util/platform.ts +7 -1
- package/src/util/pricing.ts +0 -1
- package/src/workspace/provider-commit-message-generator.ts +10 -6
|
@@ -7,9 +7,8 @@
|
|
|
7
7
|
* 2. Token estimation significantly underestimates actual token count
|
|
8
8
|
* 3. No mid-loop budget check to prevent hitting the provider limit
|
|
9
9
|
*
|
|
10
|
-
*
|
|
11
|
-
*
|
|
12
|
-
* Tests 6 and 7 are skipped (depend on mid-loop checkpoint changes in PR 3).
|
|
10
|
+
* All tests are test.todo — they document expected behavior for bugs
|
|
11
|
+
* to be fixed in subsequent PRs (PR 2 for tests 1–5, PR 3 for tests 6–7).
|
|
13
12
|
*/
|
|
14
13
|
import { beforeEach, describe, expect, mock, test } from "bun:test";
|
|
15
14
|
|
|
@@ -52,7 +51,6 @@ mock.module("../config/loader.js", () => ({
|
|
|
52
51
|
},
|
|
53
52
|
},
|
|
54
53
|
rateLimit: { maxRequestsPerMinute: 0, maxTokensPerSession: 0 },
|
|
55
|
-
apiKeys: {},
|
|
56
54
|
workspaceGit: { turnCommitMaxWaitMs: 10 },
|
|
57
55
|
ui: {},
|
|
58
56
|
}),
|
|
@@ -198,7 +196,7 @@ mock.module("../daemon/session-memory.js", () => ({
|
|
|
198
196
|
enabled: false,
|
|
199
197
|
degraded: false,
|
|
200
198
|
injectedText: "",
|
|
201
|
-
|
|
199
|
+
|
|
202
200
|
semanticHits: 0,
|
|
203
201
|
recencyHits: 0,
|
|
204
202
|
injectedTokens: 0,
|
|
@@ -374,6 +372,7 @@ function makeCtx(
|
|
|
374
372
|
|
|
375
373
|
agentLoop: {
|
|
376
374
|
run: agentLoopRun,
|
|
375
|
+
getToolTokenBudget: () => 0,
|
|
377
376
|
} as unknown as AgentLoopSessionContext["agentLoop"],
|
|
378
377
|
provider: {
|
|
379
378
|
name: "mock-provider",
|
|
@@ -535,278 +534,284 @@ describe("session-agent-loop overflow recovery (JARVIS-110)", () => {
|
|
|
535
534
|
//
|
|
536
535
|
// Expected behavior (PR 2 fix): After progress + context_too_large,
|
|
537
536
|
// the system should still attempt compaction before surfacing error.
|
|
538
|
-
test(
|
|
539
|
-
|
|
540
|
-
|
|
541
|
-
|
|
542
|
-
|
|
543
|
-
|
|
544
|
-
|
|
545
|
-
|
|
546
|
-
|
|
547
|
-
state: {
|
|
548
|
-
appliedTiers: ["forced_compaction"],
|
|
549
|
-
injectionMode: "full",
|
|
550
|
-
exhausted: false,
|
|
551
|
-
},
|
|
552
|
-
estimatedTokens: 50_000,
|
|
553
|
-
compactionResult: {
|
|
554
|
-
compacted: true,
|
|
537
|
+
test.todo(
|
|
538
|
+
"context too large after progress triggers compaction retry instead of immediate failure",
|
|
539
|
+
async () => {
|
|
540
|
+
const events: ServerMessage[] = [];
|
|
541
|
+
let reducerCalled = false;
|
|
542
|
+
|
|
543
|
+
mockReducerStepFn = (msgs: Message[]) => {
|
|
544
|
+
reducerCalled = true;
|
|
545
|
+
return {
|
|
555
546
|
messages: msgs,
|
|
556
|
-
|
|
557
|
-
|
|
558
|
-
|
|
559
|
-
|
|
560
|
-
|
|
561
|
-
thresholdTokens: 160_000,
|
|
562
|
-
compactedMessages: 10,
|
|
563
|
-
summaryCalls: 1,
|
|
564
|
-
summaryInputTokens: 500,
|
|
565
|
-
summaryOutputTokens: 200,
|
|
566
|
-
summaryModel: "mock-model",
|
|
567
|
-
},
|
|
568
|
-
};
|
|
569
|
-
};
|
|
570
|
-
|
|
571
|
-
let agentLoopCallCount = 0;
|
|
572
|
-
const agentLoopRun: AgentLoopRun = async (messages, onEvent) => {
|
|
573
|
-
agentLoopCallCount++;
|
|
574
|
-
if (agentLoopCallCount === 1) {
|
|
575
|
-
// Simulate: agent makes progress (tool calls + results added)
|
|
576
|
-
// then hits context_too_large on next LLM call
|
|
577
|
-
const progressMessages: Message[] = [
|
|
578
|
-
...messages,
|
|
579
|
-
{
|
|
580
|
-
role: "assistant" as const,
|
|
581
|
-
content: [
|
|
582
|
-
{ type: "text", text: "Let me check that." },
|
|
583
|
-
{
|
|
584
|
-
type: "tool_use",
|
|
585
|
-
id: "tu-progress",
|
|
586
|
-
name: "bash",
|
|
587
|
-
input: { command: "ls" },
|
|
588
|
-
},
|
|
589
|
-
] as ContentBlock[],
|
|
547
|
+
tier: "forced_compaction",
|
|
548
|
+
state: {
|
|
549
|
+
appliedTiers: ["forced_compaction"],
|
|
550
|
+
injectionMode: "full",
|
|
551
|
+
exhausted: false,
|
|
590
552
|
},
|
|
591
|
-
|
|
592
|
-
|
|
593
|
-
|
|
594
|
-
|
|
595
|
-
|
|
596
|
-
|
|
597
|
-
|
|
598
|
-
|
|
599
|
-
|
|
600
|
-
|
|
553
|
+
estimatedTokens: 50_000,
|
|
554
|
+
compactionResult: {
|
|
555
|
+
compacted: true,
|
|
556
|
+
messages: msgs,
|
|
557
|
+
compactedPersistedMessages: 5,
|
|
558
|
+
summaryText: "Summary",
|
|
559
|
+
previousEstimatedInputTokens: 190_000,
|
|
560
|
+
estimatedInputTokens: 50_000,
|
|
561
|
+
maxInputTokens: 200_000,
|
|
562
|
+
thresholdTokens: 160_000,
|
|
563
|
+
compactedMessages: 10,
|
|
564
|
+
summaryCalls: 1,
|
|
565
|
+
summaryInputTokens: 500,
|
|
566
|
+
summaryOutputTokens: 200,
|
|
567
|
+
summaryModel: "mock-model",
|
|
601
568
|
},
|
|
602
|
-
|
|
569
|
+
};
|
|
570
|
+
};
|
|
603
571
|
|
|
604
|
-
|
|
605
|
-
|
|
606
|
-
|
|
607
|
-
|
|
608
|
-
|
|
609
|
-
|
|
610
|
-
|
|
611
|
-
|
|
612
|
-
|
|
613
|
-
|
|
614
|
-
|
|
615
|
-
|
|
616
|
-
|
|
572
|
+
let agentLoopCallCount = 0;
|
|
573
|
+
const agentLoopRun: AgentLoopRun = async (messages, onEvent) => {
|
|
574
|
+
agentLoopCallCount++;
|
|
575
|
+
if (agentLoopCallCount === 1) {
|
|
576
|
+
// Simulate: agent makes progress (tool calls + results added)
|
|
577
|
+
// then hits context_too_large on next LLM call
|
|
578
|
+
const progressMessages: Message[] = [
|
|
579
|
+
...messages,
|
|
580
|
+
{
|
|
581
|
+
role: "assistant" as const,
|
|
582
|
+
content: [
|
|
583
|
+
{ type: "text", text: "Let me check that." },
|
|
584
|
+
{
|
|
585
|
+
type: "tool_use",
|
|
586
|
+
id: "tu-progress",
|
|
587
|
+
name: "bash",
|
|
588
|
+
input: { command: "ls" },
|
|
589
|
+
},
|
|
590
|
+
] as ContentBlock[],
|
|
591
|
+
},
|
|
592
|
+
{
|
|
593
|
+
role: "user" as const,
|
|
594
|
+
content: [
|
|
595
|
+
{
|
|
596
|
+
type: "tool_result",
|
|
597
|
+
tool_use_id: "tu-progress",
|
|
598
|
+
content: "file1.ts\nfile2.ts",
|
|
599
|
+
is_error: false,
|
|
600
|
+
},
|
|
601
|
+
] as ContentBlock[],
|
|
602
|
+
},
|
|
603
|
+
];
|
|
604
|
+
|
|
605
|
+
// Emit events for the progress that was made
|
|
606
|
+
onEvent({
|
|
607
|
+
type: "tool_use",
|
|
608
|
+
id: "tu-progress",
|
|
609
|
+
name: "bash",
|
|
610
|
+
input: { command: "ls" },
|
|
611
|
+
});
|
|
612
|
+
onEvent({
|
|
613
|
+
type: "tool_result",
|
|
614
|
+
toolUseId: "tu-progress",
|
|
615
|
+
content: "file1.ts\nfile2.ts",
|
|
616
|
+
isError: false,
|
|
617
|
+
});
|
|
618
|
+
onEvent({
|
|
619
|
+
type: "message_complete",
|
|
620
|
+
message: {
|
|
621
|
+
role: "assistant",
|
|
622
|
+
content: [
|
|
623
|
+
{ type: "text", text: "Let me check that." },
|
|
624
|
+
{
|
|
625
|
+
type: "tool_use",
|
|
626
|
+
id: "tu-progress",
|
|
627
|
+
name: "bash",
|
|
628
|
+
input: { command: "ls" },
|
|
629
|
+
},
|
|
630
|
+
],
|
|
631
|
+
},
|
|
632
|
+
});
|
|
633
|
+
onEvent({
|
|
634
|
+
type: "usage",
|
|
635
|
+
inputTokens: 100,
|
|
636
|
+
outputTokens: 50,
|
|
637
|
+
model: "test-model",
|
|
638
|
+
providerDurationMs: 100,
|
|
639
|
+
});
|
|
640
|
+
|
|
641
|
+
// Then context_too_large error occurs on the *next* LLM call
|
|
642
|
+
onEvent({
|
|
643
|
+
type: "error",
|
|
644
|
+
error: new Error(
|
|
645
|
+
"prompt is too long: 242201 tokens > 200000 maximum",
|
|
646
|
+
),
|
|
647
|
+
});
|
|
648
|
+
onEvent({
|
|
649
|
+
type: "usage",
|
|
650
|
+
inputTokens: 0,
|
|
651
|
+
outputTokens: 0,
|
|
652
|
+
model: "test-model",
|
|
653
|
+
providerDurationMs: 10,
|
|
654
|
+
});
|
|
655
|
+
|
|
656
|
+
// Return the history WITH progress (more messages than input)
|
|
657
|
+
return progressMessages;
|
|
658
|
+
}
|
|
659
|
+
|
|
660
|
+
// Second call (after compaction): succeed
|
|
617
661
|
onEvent({
|
|
618
662
|
type: "message_complete",
|
|
619
663
|
message: {
|
|
620
664
|
role: "assistant",
|
|
621
|
-
content: [
|
|
622
|
-
{ type: "text", text: "Let me check that." },
|
|
623
|
-
{
|
|
624
|
-
type: "tool_use",
|
|
625
|
-
id: "tu-progress",
|
|
626
|
-
name: "bash",
|
|
627
|
-
input: { command: "ls" },
|
|
628
|
-
},
|
|
629
|
-
],
|
|
665
|
+
content: [{ type: "text", text: "recovered after compaction" }],
|
|
630
666
|
},
|
|
631
667
|
});
|
|
632
668
|
onEvent({
|
|
633
669
|
type: "usage",
|
|
634
|
-
inputTokens:
|
|
635
|
-
outputTokens:
|
|
670
|
+
inputTokens: 50,
|
|
671
|
+
outputTokens: 25,
|
|
636
672
|
model: "test-model",
|
|
637
673
|
providerDurationMs: 100,
|
|
638
674
|
});
|
|
675
|
+
return [
|
|
676
|
+
...messages,
|
|
677
|
+
{
|
|
678
|
+
role: "assistant" as const,
|
|
679
|
+
content: [
|
|
680
|
+
{ type: "text", text: "recovered after compaction" },
|
|
681
|
+
] as ContentBlock[],
|
|
682
|
+
},
|
|
683
|
+
];
|
|
684
|
+
};
|
|
639
685
|
|
|
640
|
-
|
|
641
|
-
|
|
642
|
-
|
|
643
|
-
|
|
644
|
-
|
|
645
|
-
|
|
646
|
-
});
|
|
647
|
-
onEvent({
|
|
648
|
-
type: "usage",
|
|
649
|
-
inputTokens: 0,
|
|
650
|
-
outputTokens: 0,
|
|
651
|
-
model: "test-model",
|
|
652
|
-
providerDurationMs: 10,
|
|
653
|
-
});
|
|
654
|
-
|
|
655
|
-
// Return the history WITH progress (more messages than input)
|
|
656
|
-
return progressMessages;
|
|
657
|
-
}
|
|
658
|
-
|
|
659
|
-
// Second call (after compaction): succeed
|
|
660
|
-
onEvent({
|
|
661
|
-
type: "message_complete",
|
|
662
|
-
message: {
|
|
663
|
-
role: "assistant",
|
|
664
|
-
content: [{ type: "text", text: "recovered after compaction" }],
|
|
665
|
-
},
|
|
666
|
-
});
|
|
667
|
-
onEvent({
|
|
668
|
-
type: "usage",
|
|
669
|
-
inputTokens: 50,
|
|
670
|
-
outputTokens: 25,
|
|
671
|
-
model: "test-model",
|
|
672
|
-
providerDurationMs: 100,
|
|
686
|
+
const ctx = makeCtx({
|
|
687
|
+
agentLoopRun,
|
|
688
|
+
contextWindowManager: {
|
|
689
|
+
shouldCompact: () => ({ needed: false, estimatedTokens: 0 }),
|
|
690
|
+
maybeCompact: async () => ({ compacted: false }),
|
|
691
|
+
} as unknown as AgentLoopSessionContext["contextWindowManager"],
|
|
673
692
|
});
|
|
674
|
-
return [
|
|
675
|
-
...messages,
|
|
676
|
-
{
|
|
677
|
-
role: "assistant" as const,
|
|
678
|
-
content: [
|
|
679
|
-
{ type: "text", text: "recovered after compaction" },
|
|
680
|
-
] as ContentBlock[],
|
|
681
|
-
},
|
|
682
|
-
];
|
|
683
|
-
};
|
|
684
|
-
|
|
685
|
-
const ctx = makeCtx({
|
|
686
|
-
agentLoopRun,
|
|
687
|
-
contextWindowManager: {
|
|
688
|
-
shouldCompact: () => ({ needed: false, estimatedTokens: 0 }),
|
|
689
|
-
maybeCompact: async () => ({ compacted: false }),
|
|
690
|
-
} as unknown as AgentLoopSessionContext["contextWindowManager"],
|
|
691
|
-
});
|
|
692
693
|
|
|
693
|
-
|
|
694
|
+
await runAgentLoopImpl(ctx, "hello", "msg-1", (msg) => events.push(msg));
|
|
694
695
|
|
|
695
|
-
|
|
696
|
-
|
|
697
|
-
|
|
698
|
-
|
|
696
|
+
// BUG: Currently the reducer is NOT called when progress was made before
|
|
697
|
+
// context_too_large. The error is surfaced immediately.
|
|
698
|
+
// After PR 2 fix, the reducer SHOULD be called to attempt compaction.
|
|
699
|
+
expect(reducerCalled).toBe(true);
|
|
699
700
|
|
|
700
|
-
|
|
701
|
-
|
|
702
|
-
|
|
703
|
-
|
|
704
|
-
|
|
701
|
+
// BUG: Currently a session_error IS emitted instead of retrying.
|
|
702
|
+
// After PR 2 fix, there should be no session_error.
|
|
703
|
+
const sessionError = events.find((e) => e.type === "session_error");
|
|
704
|
+
expect(sessionError).toBeUndefined();
|
|
705
|
+
},
|
|
706
|
+
);
|
|
705
707
|
|
|
706
708
|
// ── Test 2 ────────────────────────────────────────────────────────
|
|
707
709
|
// When estimation says we're within budget but the provider rejects,
|
|
708
710
|
// the post-run convergence loop should kick in and recover.
|
|
709
711
|
// This test should PASS against current code (when no progress is made).
|
|
710
|
-
test(
|
|
711
|
-
|
|
712
|
-
|
|
713
|
-
|
|
714
|
-
|
|
715
|
-
|
|
716
|
-
|
|
717
|
-
|
|
718
|
-
|
|
719
|
-
|
|
720
|
-
|
|
721
|
-
|
|
722
|
-
|
|
723
|
-
|
|
724
|
-
state: {
|
|
725
|
-
appliedTiers: ["forced_compaction"],
|
|
726
|
-
injectionMode: "full",
|
|
727
|
-
exhausted: false,
|
|
728
|
-
},
|
|
729
|
-
estimatedTokens: 100_000,
|
|
730
|
-
compactionResult: {
|
|
731
|
-
compacted: true,
|
|
712
|
+
test.todo(
|
|
713
|
+
"overflow recovery compacts below limit even when estimation underestimates",
|
|
714
|
+
async () => {
|
|
715
|
+
const events: ServerMessage[] = [];
|
|
716
|
+
let callCount = 0;
|
|
717
|
+
let reducerCalled = false;
|
|
718
|
+
|
|
719
|
+
// Estimator says 185k (below 190k budget = 200k * 0.95)
|
|
720
|
+
mockEstimateTokens = 185_000;
|
|
721
|
+
|
|
722
|
+
// Reducer successfully compacts
|
|
723
|
+
mockReducerStepFn = (msgs: Message[]) => {
|
|
724
|
+
reducerCalled = true;
|
|
725
|
+
return {
|
|
732
726
|
messages: msgs,
|
|
733
|
-
|
|
734
|
-
|
|
735
|
-
|
|
736
|
-
|
|
737
|
-
|
|
738
|
-
|
|
739
|
-
|
|
740
|
-
|
|
741
|
-
|
|
742
|
-
|
|
743
|
-
|
|
744
|
-
|
|
727
|
+
tier: "forced_compaction",
|
|
728
|
+
state: {
|
|
729
|
+
appliedTiers: ["forced_compaction"],
|
|
730
|
+
injectionMode: "full",
|
|
731
|
+
exhausted: false,
|
|
732
|
+
},
|
|
733
|
+
estimatedTokens: 100_000,
|
|
734
|
+
compactionResult: {
|
|
735
|
+
compacted: true,
|
|
736
|
+
messages: msgs,
|
|
737
|
+
compactedPersistedMessages: 10,
|
|
738
|
+
summaryText: "Summary",
|
|
739
|
+
previousEstimatedInputTokens: 185_000,
|
|
740
|
+
estimatedInputTokens: 100_000,
|
|
741
|
+
maxInputTokens: 200_000,
|
|
742
|
+
thresholdTokens: 160_000,
|
|
743
|
+
compactedMessages: 20,
|
|
744
|
+
summaryCalls: 1,
|
|
745
|
+
summaryInputTokens: 800,
|
|
746
|
+
summaryOutputTokens: 300,
|
|
747
|
+
summaryModel: "mock-model",
|
|
748
|
+
},
|
|
749
|
+
};
|
|
745
750
|
};
|
|
746
|
-
};
|
|
747
751
|
|
|
748
|
-
|
|
749
|
-
|
|
750
|
-
|
|
751
|
-
|
|
752
|
-
|
|
752
|
+
const agentLoopRun: AgentLoopRun = async (messages, onEvent) => {
|
|
753
|
+
callCount++;
|
|
754
|
+
if (callCount === 1) {
|
|
755
|
+
// Provider rejects with "prompt is too long: 242201 tokens > 200000"
|
|
756
|
+
// even though estimator said 185k
|
|
757
|
+
onEvent({
|
|
758
|
+
type: "error",
|
|
759
|
+
error: new Error(
|
|
760
|
+
"prompt is too long: 242201 tokens > 200000 maximum",
|
|
761
|
+
),
|
|
762
|
+
});
|
|
763
|
+
onEvent({
|
|
764
|
+
type: "usage",
|
|
765
|
+
inputTokens: 0,
|
|
766
|
+
outputTokens: 0,
|
|
767
|
+
model: "test-model",
|
|
768
|
+
providerDurationMs: 10,
|
|
769
|
+
});
|
|
770
|
+
// No progress — return same messages
|
|
771
|
+
return messages;
|
|
772
|
+
}
|
|
773
|
+
// Second call succeeds
|
|
753
774
|
onEvent({
|
|
754
|
-
type: "
|
|
755
|
-
|
|
756
|
-
|
|
757
|
-
|
|
775
|
+
type: "message_complete",
|
|
776
|
+
message: {
|
|
777
|
+
role: "assistant",
|
|
778
|
+
content: [{ type: "text", text: "recovered" }],
|
|
779
|
+
},
|
|
758
780
|
});
|
|
759
781
|
onEvent({
|
|
760
782
|
type: "usage",
|
|
761
|
-
inputTokens:
|
|
762
|
-
outputTokens:
|
|
783
|
+
inputTokens: 80_000,
|
|
784
|
+
outputTokens: 200,
|
|
763
785
|
model: "test-model",
|
|
764
|
-
providerDurationMs:
|
|
786
|
+
providerDurationMs: 500,
|
|
765
787
|
});
|
|
766
|
-
|
|
767
|
-
|
|
768
|
-
|
|
769
|
-
|
|
770
|
-
|
|
771
|
-
|
|
772
|
-
|
|
773
|
-
|
|
774
|
-
content: [{ type: "text", text: "recovered" }],
|
|
775
|
-
},
|
|
776
|
-
});
|
|
777
|
-
onEvent({
|
|
778
|
-
type: "usage",
|
|
779
|
-
inputTokens: 80_000,
|
|
780
|
-
outputTokens: 200,
|
|
781
|
-
model: "test-model",
|
|
782
|
-
providerDurationMs: 500,
|
|
783
|
-
});
|
|
784
|
-
return [
|
|
785
|
-
...messages,
|
|
786
|
-
{
|
|
787
|
-
role: "assistant" as const,
|
|
788
|
-
content: [{ type: "text", text: "recovered" }] as ContentBlock[],
|
|
789
|
-
},
|
|
790
|
-
];
|
|
791
|
-
};
|
|
788
|
+
return [
|
|
789
|
+
...messages,
|
|
790
|
+
{
|
|
791
|
+
role: "assistant" as const,
|
|
792
|
+
content: [{ type: "text", text: "recovered" }] as ContentBlock[],
|
|
793
|
+
},
|
|
794
|
+
];
|
|
795
|
+
};
|
|
792
796
|
|
|
793
|
-
|
|
794
|
-
|
|
795
|
-
|
|
796
|
-
|
|
797
|
-
|
|
798
|
-
|
|
799
|
-
|
|
797
|
+
const ctx = makeCtx({
|
|
798
|
+
agentLoopRun,
|
|
799
|
+
contextWindowManager: {
|
|
800
|
+
shouldCompact: () => ({ needed: false, estimatedTokens: 0 }),
|
|
801
|
+
maybeCompact: async () => ({ compacted: false }),
|
|
802
|
+
} as unknown as AgentLoopSessionContext["contextWindowManager"],
|
|
803
|
+
});
|
|
800
804
|
|
|
801
|
-
|
|
805
|
+
await runAgentLoopImpl(ctx, "hello", "msg-1", (msg) => events.push(msg));
|
|
802
806
|
|
|
803
|
-
|
|
804
|
-
|
|
805
|
-
|
|
806
|
-
|
|
807
|
-
|
|
808
|
-
|
|
809
|
-
|
|
807
|
+
// The reducer should be called in the convergence loop
|
|
808
|
+
expect(reducerCalled).toBe(true);
|
|
809
|
+
// Should recover without session_error
|
|
810
|
+
const sessionError = events.find((e) => e.type === "session_error");
|
|
811
|
+
expect(sessionError).toBeUndefined();
|
|
812
|
+
expect(callCount).toBe(2);
|
|
813
|
+
},
|
|
814
|
+
);
|
|
810
815
|
|
|
811
816
|
// ── Test 3 ────────────────────────────────────────────────────────
|
|
812
817
|
// BUG: When the provider rejection reveals actual token count (e.g.,
|
|
@@ -825,216 +830,219 @@ describe("session-agent-loop overflow recovery (JARVIS-110)", () => {
|
|
|
825
830
|
// inaccuracy. For example: 190k / 1.31 ≈ 145k.
|
|
826
831
|
// Planned fix: targetInputTokensOverride should be adjusted based on
|
|
827
832
|
// the ratio between estimated and actual tokens.
|
|
828
|
-
test(
|
|
829
|
-
|
|
830
|
-
|
|
831
|
-
|
|
832
|
-
|
|
833
|
-
|
|
834
|
-
|
|
835
|
-
|
|
836
|
-
|
|
837
|
-
|
|
838
|
-
|
|
839
|
-
cfg: unknown
|
|
840
|
-
|
|
841
|
-
|
|
842
|
-
return {
|
|
843
|
-
messages: msgs,
|
|
844
|
-
tier: "forced_compaction",
|
|
845
|
-
state: {
|
|
846
|
-
appliedTiers: ["forced_compaction"],
|
|
847
|
-
injectionMode: "full",
|
|
848
|
-
exhausted: false,
|
|
849
|
-
},
|
|
850
|
-
estimatedTokens: 100_000,
|
|
851
|
-
compactionResult: {
|
|
852
|
-
compacted: true,
|
|
833
|
+
test.todo(
|
|
834
|
+
"forced compaction targets a lower budget when estimation has been inaccurate",
|
|
835
|
+
async () => {
|
|
836
|
+
const events: ServerMessage[] = [];
|
|
837
|
+
let callCount = 0;
|
|
838
|
+
let capturedTargetTokens: number | undefined;
|
|
839
|
+
|
|
840
|
+
// Estimator says 185k (below 190k budget = 200k * 0.95)
|
|
841
|
+
mockEstimateTokens = 185_000;
|
|
842
|
+
|
|
843
|
+
// Reducer captures the targetTokens from the config
|
|
844
|
+
mockReducerStepFn = (msgs: Message[], cfg: unknown) => {
|
|
845
|
+
capturedTargetTokens = (cfg as { targetTokens: number }).targetTokens;
|
|
846
|
+
return {
|
|
853
847
|
messages: msgs,
|
|
854
|
-
|
|
855
|
-
|
|
856
|
-
|
|
857
|
-
|
|
858
|
-
|
|
859
|
-
|
|
860
|
-
|
|
861
|
-
|
|
862
|
-
|
|
863
|
-
|
|
864
|
-
|
|
865
|
-
|
|
848
|
+
tier: "forced_compaction",
|
|
849
|
+
state: {
|
|
850
|
+
appliedTiers: ["forced_compaction"],
|
|
851
|
+
injectionMode: "full",
|
|
852
|
+
exhausted: false,
|
|
853
|
+
},
|
|
854
|
+
estimatedTokens: 100_000,
|
|
855
|
+
compactionResult: {
|
|
856
|
+
compacted: true,
|
|
857
|
+
messages: msgs,
|
|
858
|
+
compactedPersistedMessages: 10,
|
|
859
|
+
summaryText: "Summary",
|
|
860
|
+
previousEstimatedInputTokens: 185_000,
|
|
861
|
+
estimatedInputTokens: 100_000,
|
|
862
|
+
maxInputTokens: 200_000,
|
|
863
|
+
thresholdTokens: 160_000,
|
|
864
|
+
compactedMessages: 20,
|
|
865
|
+
summaryCalls: 1,
|
|
866
|
+
summaryInputTokens: 800,
|
|
867
|
+
summaryOutputTokens: 300,
|
|
868
|
+
summaryModel: "mock-model",
|
|
869
|
+
},
|
|
870
|
+
};
|
|
866
871
|
};
|
|
867
|
-
};
|
|
868
872
|
|
|
869
|
-
|
|
870
|
-
|
|
871
|
-
|
|
872
|
-
|
|
873
|
+
const agentLoopRun: AgentLoopRun = async (messages, onEvent) => {
|
|
874
|
+
callCount++;
|
|
875
|
+
if (callCount === 1) {
|
|
876
|
+
// Provider rejects: actual tokens 242201, way above estimate of 185k
|
|
877
|
+
onEvent({
|
|
878
|
+
type: "error",
|
|
879
|
+
error: new Error(
|
|
880
|
+
"prompt is too long: 242201 tokens > 200000 maximum",
|
|
881
|
+
),
|
|
882
|
+
});
|
|
883
|
+
onEvent({
|
|
884
|
+
type: "usage",
|
|
885
|
+
inputTokens: 0,
|
|
886
|
+
outputTokens: 0,
|
|
887
|
+
model: "test-model",
|
|
888
|
+
providerDurationMs: 10,
|
|
889
|
+
});
|
|
890
|
+
// No progress — return same messages
|
|
891
|
+
return messages;
|
|
892
|
+
}
|
|
893
|
+
// Second call succeeds after compaction
|
|
873
894
|
onEvent({
|
|
874
|
-
type: "
|
|
875
|
-
|
|
876
|
-
|
|
877
|
-
|
|
895
|
+
type: "message_complete",
|
|
896
|
+
message: {
|
|
897
|
+
role: "assistant",
|
|
898
|
+
content: [{ type: "text", text: "recovered" }],
|
|
899
|
+
},
|
|
878
900
|
});
|
|
879
901
|
onEvent({
|
|
880
902
|
type: "usage",
|
|
881
|
-
inputTokens:
|
|
882
|
-
outputTokens:
|
|
903
|
+
inputTokens: 80_000,
|
|
904
|
+
outputTokens: 200,
|
|
883
905
|
model: "test-model",
|
|
884
|
-
providerDurationMs:
|
|
906
|
+
providerDurationMs: 500,
|
|
885
907
|
});
|
|
886
|
-
|
|
887
|
-
|
|
888
|
-
|
|
889
|
-
|
|
890
|
-
|
|
891
|
-
|
|
892
|
-
|
|
893
|
-
|
|
894
|
-
|
|
895
|
-
|
|
896
|
-
|
|
897
|
-
|
|
898
|
-
|
|
899
|
-
|
|
900
|
-
|
|
901
|
-
model: "test-model",
|
|
902
|
-
providerDurationMs: 500,
|
|
908
|
+
return [
|
|
909
|
+
...messages,
|
|
910
|
+
{
|
|
911
|
+
role: "assistant" as const,
|
|
912
|
+
content: [{ type: "text", text: "recovered" }] as ContentBlock[],
|
|
913
|
+
},
|
|
914
|
+
];
|
|
915
|
+
};
|
|
916
|
+
|
|
917
|
+
const ctx = makeCtx({
|
|
918
|
+
agentLoopRun,
|
|
919
|
+
contextWindowManager: {
|
|
920
|
+
shouldCompact: () => ({ needed: false, estimatedTokens: 0 }),
|
|
921
|
+
maybeCompact: async () => ({ compacted: false }),
|
|
922
|
+
} as unknown as AgentLoopSessionContext["contextWindowManager"],
|
|
903
923
|
});
|
|
904
|
-
return [
|
|
905
|
-
...messages,
|
|
906
|
-
{
|
|
907
|
-
role: "assistant" as const,
|
|
908
|
-
content: [{ type: "text", text: "recovered" }] as ContentBlock[],
|
|
909
|
-
},
|
|
910
|
-
];
|
|
911
|
-
};
|
|
912
924
|
|
|
913
|
-
|
|
914
|
-
agentLoopRun,
|
|
915
|
-
contextWindowManager: {
|
|
916
|
-
shouldCompact: () => ({ needed: false, estimatedTokens: 0 }),
|
|
917
|
-
maybeCompact: async () => ({ compacted: false }),
|
|
918
|
-
} as unknown as AgentLoopSessionContext["contextWindowManager"],
|
|
919
|
-
});
|
|
925
|
+
await runAgentLoopImpl(ctx, "hello", "msg-1", (msg) => events.push(msg));
|
|
920
926
|
|
|
921
|
-
|
|
927
|
+
// The reducer should have been called with a corrected target
|
|
928
|
+
expect(capturedTargetTokens).toBeDefined();
|
|
922
929
|
|
|
923
|
-
|
|
924
|
-
|
|
925
|
-
|
|
926
|
-
|
|
927
|
-
|
|
928
|
-
|
|
929
|
-
|
|
930
|
-
|
|
931
|
-
|
|
932
|
-
|
|
933
|
-
|
|
934
|
-
|
|
935
|
-
|
|
936
|
-
|
|
937
|
-
|
|
938
|
-
|
|
939
|
-
|
|
940
|
-
|
|
941
|
-
|
|
942
|
-
expect(callCount).toBe(2);
|
|
943
|
-
});
|
|
930
|
+
// preflightBudget = 200_000 * 0.95 = 190_000
|
|
931
|
+
// estimationErrorRatio = 242201 / 185000 ≈ 1.309
|
|
932
|
+
// correctedTarget = floor(190000 / 1.309) ≈ 145_130
|
|
933
|
+
// The corrected target must be LESS than the uncorrected preflightBudget
|
|
934
|
+
const preflightBudget = 190_000;
|
|
935
|
+
expect(capturedTargetTokens!).toBeLessThan(preflightBudget);
|
|
936
|
+
|
|
937
|
+
// Verify the approximate corrected value (190000 / (242201/185000))
|
|
938
|
+
const expectedCorrectedTarget = Math.floor(
|
|
939
|
+
preflightBudget / (242201 / 185_000),
|
|
940
|
+
);
|
|
941
|
+
expect(capturedTargetTokens!).toBe(expectedCorrectedTarget);
|
|
942
|
+
|
|
943
|
+
// Should recover without session_error
|
|
944
|
+
const sessionError = events.find((e) => e.type === "session_error");
|
|
945
|
+
expect(sessionError).toBeUndefined();
|
|
946
|
+
expect(callCount).toBe(2);
|
|
947
|
+
},
|
|
948
|
+
);
|
|
944
949
|
|
|
945
950
|
// ── Test 4 ────────────────────────────────────────────────────────
|
|
946
951
|
// A realistic 75+ message conversation with many tool calls where
|
|
947
952
|
// token estimation underestimates. This test should PASS against
|
|
948
953
|
// current code because the agent loop returns same-length history
|
|
949
954
|
// (no progress), so the convergence loop kicks in.
|
|
950
|
-
test(
|
|
951
|
-
|
|
952
|
-
|
|
953
|
-
|
|
954
|
-
|
|
955
|
-
|
|
956
|
-
|
|
957
|
-
|
|
955
|
+
test.todo(
|
|
956
|
+
"overflow recovery succeeds for 75+ message conversation with many tool calls",
|
|
957
|
+
async () => {
|
|
958
|
+
const events: ServerMessage[] = [];
|
|
959
|
+
const longHistory = buildLongConversation(75);
|
|
960
|
+
let callCount = 0;
|
|
961
|
+
let reducerCalled = false;
|
|
962
|
+
|
|
963
|
+
// Estimator says ~195k — just above budget so preflight reducer runs
|
|
964
|
+
mockEstimateTokens = 195_000;
|
|
965
|
+
|
|
966
|
+
// Reducer reduces to under budget
|
|
967
|
+
mockReducerStepFn = (msgs: Message[]) => {
|
|
968
|
+
reducerCalled = true;
|
|
969
|
+
return {
|
|
970
|
+
messages: msgs.slice(-10), // Keep only last 10 messages
|
|
971
|
+
tier: "forced_compaction",
|
|
972
|
+
state: {
|
|
973
|
+
appliedTiers: ["forced_compaction"],
|
|
974
|
+
injectionMode: "full",
|
|
975
|
+
exhausted: false,
|
|
976
|
+
},
|
|
977
|
+
estimatedTokens: 50_000,
|
|
978
|
+
compactionResult: {
|
|
979
|
+
compacted: true,
|
|
980
|
+
messages: msgs.slice(-10),
|
|
981
|
+
compactedPersistedMessages: msgs.length - 10,
|
|
982
|
+
summaryText: "Long conversation summary",
|
|
983
|
+
previousEstimatedInputTokens: 195_000,
|
|
984
|
+
estimatedInputTokens: 50_000,
|
|
985
|
+
maxInputTokens: 200_000,
|
|
986
|
+
thresholdTokens: 160_000,
|
|
987
|
+
compactedMessages: msgs.length - 10,
|
|
988
|
+
summaryCalls: 2,
|
|
989
|
+
summaryInputTokens: 2000,
|
|
990
|
+
summaryOutputTokens: 500,
|
|
991
|
+
summaryModel: "mock-model",
|
|
992
|
+
},
|
|
993
|
+
};
|
|
994
|
+
};
|
|
958
995
|
|
|
959
|
-
|
|
960
|
-
|
|
961
|
-
|
|
962
|
-
|
|
963
|
-
|
|
964
|
-
|
|
965
|
-
|
|
966
|
-
|
|
967
|
-
|
|
968
|
-
|
|
969
|
-
|
|
970
|
-
|
|
971
|
-
|
|
972
|
-
|
|
973
|
-
|
|
974
|
-
|
|
975
|
-
|
|
976
|
-
|
|
977
|
-
|
|
978
|
-
|
|
979
|
-
|
|
980
|
-
|
|
981
|
-
|
|
982
|
-
|
|
983
|
-
|
|
984
|
-
summaryModel: "mock-model",
|
|
985
|
-
},
|
|
996
|
+
const agentLoopRun: AgentLoopRun = async (messages, onEvent) => {
|
|
997
|
+
callCount++;
|
|
998
|
+
onEvent({
|
|
999
|
+
type: "message_complete",
|
|
1000
|
+
message: {
|
|
1001
|
+
role: "assistant",
|
|
1002
|
+
content: [{ type: "text", text: "Here's the analysis..." }],
|
|
1003
|
+
},
|
|
1004
|
+
});
|
|
1005
|
+
onEvent({
|
|
1006
|
+
type: "usage",
|
|
1007
|
+
inputTokens: 50_000,
|
|
1008
|
+
outputTokens: 300,
|
|
1009
|
+
model: "test-model",
|
|
1010
|
+
providerDurationMs: 800,
|
|
1011
|
+
});
|
|
1012
|
+
return [
|
|
1013
|
+
...messages,
|
|
1014
|
+
{
|
|
1015
|
+
role: "assistant" as const,
|
|
1016
|
+
content: [
|
|
1017
|
+
{ type: "text", text: "Here's the analysis..." },
|
|
1018
|
+
] as ContentBlock[],
|
|
1019
|
+
},
|
|
1020
|
+
];
|
|
986
1021
|
};
|
|
987
|
-
};
|
|
988
1022
|
|
|
989
|
-
|
|
990
|
-
|
|
991
|
-
|
|
992
|
-
|
|
993
|
-
|
|
994
|
-
|
|
995
|
-
|
|
996
|
-
},
|
|
1023
|
+
const ctx = makeCtx({
|
|
1024
|
+
agentLoopRun,
|
|
1025
|
+
messages: longHistory,
|
|
1026
|
+
contextWindowManager: {
|
|
1027
|
+
shouldCompact: () => ({ needed: false, estimatedTokens: 0 }),
|
|
1028
|
+
maybeCompact: async () => ({ compacted: false }),
|
|
1029
|
+
} as unknown as AgentLoopSessionContext["contextWindowManager"],
|
|
997
1030
|
});
|
|
998
|
-
onEvent({
|
|
999
|
-
type: "usage",
|
|
1000
|
-
inputTokens: 50_000,
|
|
1001
|
-
outputTokens: 300,
|
|
1002
|
-
model: "test-model",
|
|
1003
|
-
providerDurationMs: 800,
|
|
1004
|
-
});
|
|
1005
|
-
return [
|
|
1006
|
-
...messages,
|
|
1007
|
-
{
|
|
1008
|
-
role: "assistant" as const,
|
|
1009
|
-
content: [
|
|
1010
|
-
{ type: "text", text: "Here's the analysis..." },
|
|
1011
|
-
] as ContentBlock[],
|
|
1012
|
-
},
|
|
1013
|
-
];
|
|
1014
|
-
};
|
|
1015
1031
|
|
|
1016
|
-
|
|
1017
|
-
|
|
1018
|
-
|
|
1019
|
-
|
|
1020
|
-
|
|
1021
|
-
|
|
1022
|
-
|
|
1023
|
-
|
|
1024
|
-
|
|
1025
|
-
|
|
1026
|
-
events.
|
|
1027
|
-
|
|
1028
|
-
|
|
1029
|
-
|
|
1030
|
-
expect(reducerCalled).toBe(true);
|
|
1031
|
-
// Should succeed
|
|
1032
|
-
expect(callCount).toBe(1);
|
|
1033
|
-
const sessionError = events.find((e) => e.type === "session_error");
|
|
1034
|
-
expect(sessionError).toBeUndefined();
|
|
1035
|
-
const complete = events.find((e) => e.type === "message_complete");
|
|
1036
|
-
expect(complete).toBeDefined();
|
|
1037
|
-
});
|
|
1032
|
+
await runAgentLoopImpl(ctx, "analyze this", "msg-1", (msg) =>
|
|
1033
|
+
events.push(msg),
|
|
1034
|
+
);
|
|
1035
|
+
|
|
1036
|
+
// Preflight should trigger the reducer since 195k > 190k budget
|
|
1037
|
+
expect(reducerCalled).toBe(true);
|
|
1038
|
+
// Should succeed
|
|
1039
|
+
expect(callCount).toBe(1);
|
|
1040
|
+
const sessionError = events.find((e) => e.type === "session_error");
|
|
1041
|
+
expect(sessionError).toBeUndefined();
|
|
1042
|
+
const complete = events.find((e) => e.type === "message_complete");
|
|
1043
|
+
expect(complete).toBeDefined();
|
|
1044
|
+
},
|
|
1045
|
+
);
|
|
1038
1046
|
|
|
1039
1047
|
// ── Test 5 ────────────────────────────────────────────────────────
|
|
1040
1048
|
// BUG: When all 4 reducer tiers have been applied, then the agent
|
|
@@ -1045,390 +1053,571 @@ describe("session-agent-loop overflow recovery (JARVIS-110)", () => {
|
|
|
1045
1053
|
// Expected behavior (PR 2 fix): Even after all tiers are exhausted,
|
|
1046
1054
|
// if progress was made, attempt emergency compaction with
|
|
1047
1055
|
// `minKeepRecentUserTurns: 0` as a last resort.
|
|
1048
|
-
test(
|
|
1049
|
-
|
|
1050
|
-
|
|
1051
|
-
|
|
1052
|
-
|
|
1053
|
-
|
|
1054
|
-
|
|
1055
|
-
|
|
1056
|
-
|
|
1057
|
-
|
|
1058
|
-
|
|
1059
|
-
|
|
1060
|
-
|
|
1061
|
-
|
|
1062
|
-
|
|
1063
|
-
|
|
1064
|
-
|
|
1065
|
-
|
|
1066
|
-
|
|
1067
|
-
|
|
1056
|
+
test.todo(
|
|
1057
|
+
"exhausted reducer tiers with progress still attempts emergency compaction",
|
|
1058
|
+
async () => {
|
|
1059
|
+
const events: ServerMessage[] = [];
|
|
1060
|
+
let emergencyCompactCalled = false;
|
|
1061
|
+
|
|
1062
|
+
// Start with reducer already exhausted
|
|
1063
|
+
mockReducerStepFn = (msgs: Message[]) => {
|
|
1064
|
+
return {
|
|
1065
|
+
messages: msgs,
|
|
1066
|
+
tier: "injection_downgrade",
|
|
1067
|
+
state: {
|
|
1068
|
+
appliedTiers: [
|
|
1069
|
+
"forced_compaction",
|
|
1070
|
+
"tool_result_truncation",
|
|
1071
|
+
"media_stubbing",
|
|
1072
|
+
"injection_downgrade",
|
|
1073
|
+
],
|
|
1074
|
+
injectionMode: "minimal",
|
|
1075
|
+
exhausted: true,
|
|
1076
|
+
},
|
|
1077
|
+
estimatedTokens: 195_000,
|
|
1078
|
+
};
|
|
1068
1079
|
};
|
|
1069
|
-
};
|
|
1070
1080
|
|
|
1071
|
-
|
|
1072
|
-
|
|
1073
|
-
|
|
1074
|
-
|
|
1075
|
-
|
|
1076
|
-
|
|
1077
|
-
|
|
1078
|
-
|
|
1079
|
-
|
|
1080
|
-
|
|
1081
|
-
|
|
1082
|
-
|
|
1083
|
-
|
|
1084
|
-
|
|
1085
|
-
|
|
1086
|
-
|
|
1087
|
-
|
|
1088
|
-
|
|
1089
|
-
|
|
1090
|
-
|
|
1091
|
-
|
|
1092
|
-
|
|
1093
|
-
|
|
1094
|
-
|
|
1095
|
-
|
|
1096
|
-
|
|
1097
|
-
|
|
1098
|
-
|
|
1099
|
-
|
|
1100
|
-
|
|
1101
|
-
|
|
1081
|
+
let agentLoopCallCount = 0;
|
|
1082
|
+
const agentLoopRun: AgentLoopRun = async (messages, onEvent) => {
|
|
1083
|
+
agentLoopCallCount++;
|
|
1084
|
+
if (agentLoopCallCount === 1) {
|
|
1085
|
+
// Agent makes progress (tool calls succeed, messages grow)
|
|
1086
|
+
const progressMessages: Message[] = [
|
|
1087
|
+
...messages,
|
|
1088
|
+
{
|
|
1089
|
+
role: "assistant" as const,
|
|
1090
|
+
content: [
|
|
1091
|
+
{ type: "text", text: "Running analysis..." },
|
|
1092
|
+
{
|
|
1093
|
+
type: "tool_use",
|
|
1094
|
+
id: "tu-1",
|
|
1095
|
+
name: "bash",
|
|
1096
|
+
input: { command: "find . -name '*.ts'" },
|
|
1097
|
+
},
|
|
1098
|
+
] as ContentBlock[],
|
|
1099
|
+
},
|
|
1100
|
+
{
|
|
1101
|
+
role: "user" as const,
|
|
1102
|
+
content: [
|
|
1103
|
+
{
|
|
1104
|
+
type: "tool_result",
|
|
1105
|
+
tool_use_id: "tu-1",
|
|
1106
|
+
content: "file1.ts\nfile2.ts\nfile3.ts",
|
|
1107
|
+
is_error: false,
|
|
1108
|
+
},
|
|
1109
|
+
] as ContentBlock[],
|
|
1110
|
+
},
|
|
1111
|
+
];
|
|
1102
1112
|
|
|
1103
|
-
|
|
1104
|
-
|
|
1105
|
-
|
|
1106
|
-
|
|
1107
|
-
|
|
1108
|
-
|
|
1109
|
-
|
|
1110
|
-
|
|
1111
|
-
|
|
1112
|
-
|
|
1113
|
-
|
|
1114
|
-
|
|
1113
|
+
onEvent({
|
|
1114
|
+
type: "tool_use",
|
|
1115
|
+
id: "tu-1",
|
|
1116
|
+
name: "bash",
|
|
1117
|
+
input: { command: "find . -name '*.ts'" },
|
|
1118
|
+
});
|
|
1119
|
+
onEvent({
|
|
1120
|
+
type: "tool_result",
|
|
1121
|
+
toolUseId: "tu-1",
|
|
1122
|
+
content: "file1.ts\nfile2.ts\nfile3.ts",
|
|
1123
|
+
isError: false,
|
|
1124
|
+
});
|
|
1125
|
+
onEvent({
|
|
1126
|
+
type: "message_complete",
|
|
1127
|
+
message: {
|
|
1128
|
+
role: "assistant",
|
|
1129
|
+
content: [
|
|
1130
|
+
{ type: "text", text: "Running analysis..." },
|
|
1131
|
+
{
|
|
1132
|
+
type: "tool_use",
|
|
1133
|
+
id: "tu-1",
|
|
1134
|
+
name: "bash",
|
|
1135
|
+
input: { command: "find . -name '*.ts'" },
|
|
1136
|
+
},
|
|
1137
|
+
],
|
|
1138
|
+
},
|
|
1139
|
+
});
|
|
1140
|
+
onEvent({
|
|
1141
|
+
type: "usage",
|
|
1142
|
+
inputTokens: 190_000,
|
|
1143
|
+
outputTokens: 100,
|
|
1144
|
+
model: "test-model",
|
|
1145
|
+
providerDurationMs: 200,
|
|
1146
|
+
});
|
|
1147
|
+
|
|
1148
|
+
// Then context_too_large on the next LLM call within the loop
|
|
1149
|
+
onEvent({
|
|
1150
|
+
type: "error",
|
|
1151
|
+
error: new Error("context_length_exceeded"),
|
|
1152
|
+
});
|
|
1153
|
+
onEvent({
|
|
1154
|
+
type: "usage",
|
|
1155
|
+
inputTokens: 0,
|
|
1156
|
+
outputTokens: 0,
|
|
1157
|
+
model: "test-model",
|
|
1158
|
+
providerDurationMs: 10,
|
|
1159
|
+
});
|
|
1160
|
+
|
|
1161
|
+
return progressMessages;
|
|
1162
|
+
}
|
|
1163
|
+
|
|
1164
|
+
// After emergency compaction, succeed
|
|
1115
1165
|
onEvent({
|
|
1116
1166
|
type: "message_complete",
|
|
1117
1167
|
message: {
|
|
1118
1168
|
role: "assistant",
|
|
1119
|
-
content: [
|
|
1120
|
-
{ type: "text", text: "Running analysis..." },
|
|
1121
|
-
{
|
|
1122
|
-
type: "tool_use",
|
|
1123
|
-
id: "tu-1",
|
|
1124
|
-
name: "bash",
|
|
1125
|
-
input: { command: "find . -name '*.ts'" },
|
|
1126
|
-
},
|
|
1127
|
-
],
|
|
1169
|
+
content: [{ type: "text", text: "recovered" }],
|
|
1128
1170
|
},
|
|
1129
1171
|
});
|
|
1130
1172
|
onEvent({
|
|
1131
1173
|
type: "usage",
|
|
1132
|
-
inputTokens:
|
|
1174
|
+
inputTokens: 50_000,
|
|
1133
1175
|
outputTokens: 100,
|
|
1134
1176
|
model: "test-model",
|
|
1135
1177
|
providerDurationMs: 200,
|
|
1136
1178
|
});
|
|
1179
|
+
return [
|
|
1180
|
+
...messages,
|
|
1181
|
+
{
|
|
1182
|
+
role: "assistant" as const,
|
|
1183
|
+
content: [{ type: "text", text: "recovered" }] as ContentBlock[],
|
|
1184
|
+
},
|
|
1185
|
+
];
|
|
1186
|
+
};
|
|
1187
|
+
|
|
1188
|
+
const ctx = makeCtx({
|
|
1189
|
+
agentLoopRun,
|
|
1190
|
+
contextWindowManager: {
|
|
1191
|
+
shouldCompact: () => ({ needed: false, estimatedTokens: 0 }),
|
|
1192
|
+
maybeCompact: async (
|
|
1193
|
+
_msgs: Message[],
|
|
1194
|
+
_signal: AbortSignal,
|
|
1195
|
+
opts?: Record<string, unknown>,
|
|
1196
|
+
) => {
|
|
1197
|
+
if (opts?.force && opts?.minKeepRecentUserTurns === 0) {
|
|
1198
|
+
emergencyCompactCalled = true;
|
|
1199
|
+
return {
|
|
1200
|
+
compacted: true,
|
|
1201
|
+
messages: [
|
|
1202
|
+
{
|
|
1203
|
+
role: "user",
|
|
1204
|
+
content: [{ type: "text", text: "Hello" }],
|
|
1205
|
+
},
|
|
1206
|
+
] as Message[],
|
|
1207
|
+
compactedPersistedMessages: 50,
|
|
1208
|
+
summaryText: "Emergency summary",
|
|
1209
|
+
previousEstimatedInputTokens: 195_000,
|
|
1210
|
+
estimatedInputTokens: 50_000,
|
|
1211
|
+
maxInputTokens: 200_000,
|
|
1212
|
+
thresholdTokens: 160_000,
|
|
1213
|
+
compactedMessages: 50,
|
|
1214
|
+
summaryCalls: 1,
|
|
1215
|
+
summaryInputTokens: 1000,
|
|
1216
|
+
summaryOutputTokens: 300,
|
|
1217
|
+
summaryModel: "mock-model",
|
|
1218
|
+
};
|
|
1219
|
+
}
|
|
1220
|
+
return { compacted: false };
|
|
1221
|
+
},
|
|
1222
|
+
} as unknown as AgentLoopSessionContext["contextWindowManager"],
|
|
1223
|
+
});
|
|
1224
|
+
|
|
1225
|
+
await runAgentLoopImpl(ctx, "hello", "msg-1", (msg) => events.push(msg));
|
|
1137
1226
|
|
|
1138
|
-
|
|
1227
|
+
// BUG: Currently when progress was made + all tiers exhausted,
|
|
1228
|
+
// emergency compaction is NOT attempted. The error is surfaced directly.
|
|
1229
|
+
// After PR 2 fix, emergency compaction should be attempted.
|
|
1230
|
+
expect(emergencyCompactCalled).toBe(true);
|
|
1231
|
+
|
|
1232
|
+
// BUG: Currently a session_error IS emitted.
|
|
1233
|
+
const sessionError = events.find((e) => e.type === "session_error");
|
|
1234
|
+
expect(sessionError).toBeUndefined();
|
|
1235
|
+
},
|
|
1236
|
+
);
|
|
1237
|
+
|
|
1238
|
+
// ── Test 6 ────────────────────────────────────────────────────────
|
|
1239
|
+
// Tests mid-loop budget check via onCheckpoint.
|
|
1240
|
+
// The onCheckpoint callback estimates prompt tokens after each tool round.
|
|
1241
|
+
// When estimate exceeds the mid-loop threshold (85% of budget),
|
|
1242
|
+
// it returns "yield" to break the agent loop.
|
|
1243
|
+
// The session-agent-loop then runs compaction and re-enters the agent loop.
|
|
1244
|
+
test.todo(
|
|
1245
|
+
"onCheckpoint yields when token estimate exceeds mid-loop budget threshold",
|
|
1246
|
+
async () => {
|
|
1247
|
+
const events: ServerMessage[] = [];
|
|
1248
|
+
let compactionCalled = false;
|
|
1249
|
+
|
|
1250
|
+
// estimatePromptTokens is called:
|
|
1251
|
+
// 1. During preflight budget check (low value, below budget)
|
|
1252
|
+
// 2. During onCheckpoint mid-loop check (high value, above 85% threshold)
|
|
1253
|
+
// Budget = 200_000 * 0.95 = 190_000
|
|
1254
|
+
// Mid-loop threshold = 190_000 * 0.85 = 161_500
|
|
1255
|
+
let estimateCallCount = 0;
|
|
1256
|
+
mockEstimateTokens = () => {
|
|
1257
|
+
estimateCallCount++;
|
|
1258
|
+
// First call: preflight check — below budget
|
|
1259
|
+
if (estimateCallCount === 1) return 100_000;
|
|
1260
|
+
// Subsequent calls: mid-loop check — above 85% threshold
|
|
1261
|
+
return 170_000;
|
|
1262
|
+
};
|
|
1263
|
+
|
|
1264
|
+
let agentLoopCallCount = 0;
|
|
1265
|
+
const agentLoopRun: AgentLoopRun = async (
|
|
1266
|
+
messages,
|
|
1267
|
+
onEvent,
|
|
1268
|
+
_signal,
|
|
1269
|
+
_requestId,
|
|
1270
|
+
onCheckpoint,
|
|
1271
|
+
) => {
|
|
1272
|
+
agentLoopCallCount++;
|
|
1273
|
+
|
|
1274
|
+
if (agentLoopCallCount === 1) {
|
|
1275
|
+
// Simulate a tool round: assistant calls a tool, results come back
|
|
1276
|
+
const withProgress: Message[] = [
|
|
1277
|
+
...messages,
|
|
1278
|
+
{
|
|
1279
|
+
role: "assistant" as const,
|
|
1280
|
+
content: [
|
|
1281
|
+
{ type: "text", text: "Let me check." },
|
|
1282
|
+
{
|
|
1283
|
+
type: "tool_use",
|
|
1284
|
+
id: "tu-1",
|
|
1285
|
+
name: "bash",
|
|
1286
|
+
input: { command: "ls" },
|
|
1287
|
+
},
|
|
1288
|
+
] as ContentBlock[],
|
|
1289
|
+
},
|
|
1290
|
+
{
|
|
1291
|
+
role: "user" as const,
|
|
1292
|
+
content: [
|
|
1293
|
+
{
|
|
1294
|
+
type: "tool_result",
|
|
1295
|
+
tool_use_id: "tu-1",
|
|
1296
|
+
content: "file1.ts\nfile2.ts",
|
|
1297
|
+
is_error: false,
|
|
1298
|
+
},
|
|
1299
|
+
] as ContentBlock[],
|
|
1300
|
+
},
|
|
1301
|
+
];
|
|
1302
|
+
|
|
1303
|
+
onEvent({
|
|
1304
|
+
type: "message_complete",
|
|
1305
|
+
message: {
|
|
1306
|
+
role: "assistant",
|
|
1307
|
+
content: [
|
|
1308
|
+
{ type: "text", text: "Let me check." },
|
|
1309
|
+
{
|
|
1310
|
+
type: "tool_use",
|
|
1311
|
+
id: "tu-1",
|
|
1312
|
+
name: "bash",
|
|
1313
|
+
input: { command: "ls" },
|
|
1314
|
+
},
|
|
1315
|
+
],
|
|
1316
|
+
},
|
|
1317
|
+
});
|
|
1318
|
+
onEvent({
|
|
1319
|
+
type: "usage",
|
|
1320
|
+
inputTokens: 100,
|
|
1321
|
+
outputTokens: 50,
|
|
1322
|
+
model: "test-model",
|
|
1323
|
+
providerDurationMs: 100,
|
|
1324
|
+
});
|
|
1325
|
+
|
|
1326
|
+
// Call onCheckpoint — this should trigger the mid-loop budget check
|
|
1327
|
+
// which sees 170_000 > 161_500 and returns "yield"
|
|
1328
|
+
if (onCheckpoint) {
|
|
1329
|
+
const decision = onCheckpoint({
|
|
1330
|
+
turnIndex: 0,
|
|
1331
|
+
toolCount: 1,
|
|
1332
|
+
hasToolUse: true,
|
|
1333
|
+
history: withProgress,
|
|
1334
|
+
});
|
|
1335
|
+
if (decision === "yield") {
|
|
1336
|
+
// Agent loop stops when checkpoint yields
|
|
1337
|
+
return withProgress;
|
|
1338
|
+
}
|
|
1339
|
+
}
|
|
1340
|
+
|
|
1341
|
+
return withProgress;
|
|
1342
|
+
}
|
|
1343
|
+
|
|
1344
|
+
// Second call (after compaction): complete successfully
|
|
1139
1345
|
onEvent({
|
|
1140
|
-
type: "
|
|
1141
|
-
|
|
1346
|
+
type: "message_complete",
|
|
1347
|
+
message: {
|
|
1348
|
+
role: "assistant",
|
|
1349
|
+
content: [{ type: "text", text: "done after compaction" }],
|
|
1350
|
+
},
|
|
1142
1351
|
});
|
|
1143
1352
|
onEvent({
|
|
1144
1353
|
type: "usage",
|
|
1145
|
-
inputTokens:
|
|
1146
|
-
outputTokens:
|
|
1354
|
+
inputTokens: 50,
|
|
1355
|
+
outputTokens: 25,
|
|
1147
1356
|
model: "test-model",
|
|
1148
|
-
providerDurationMs:
|
|
1357
|
+
providerDurationMs: 100,
|
|
1149
1358
|
});
|
|
1359
|
+
return [
|
|
1360
|
+
...messages,
|
|
1361
|
+
{
|
|
1362
|
+
role: "assistant" as const,
|
|
1363
|
+
content: [
|
|
1364
|
+
{ type: "text", text: "done after compaction" },
|
|
1365
|
+
] as ContentBlock[],
|
|
1366
|
+
},
|
|
1367
|
+
];
|
|
1368
|
+
};
|
|
1150
1369
|
|
|
1151
|
-
|
|
1152
|
-
|
|
1153
|
-
|
|
1154
|
-
|
|
1155
|
-
|
|
1156
|
-
|
|
1157
|
-
message: {
|
|
1158
|
-
role: "assistant",
|
|
1159
|
-
content: [{ type: "text", text: "recovered" }],
|
|
1160
|
-
},
|
|
1161
|
-
});
|
|
1162
|
-
onEvent({
|
|
1163
|
-
type: "usage",
|
|
1164
|
-
inputTokens: 50_000,
|
|
1165
|
-
outputTokens: 100,
|
|
1166
|
-
model: "test-model",
|
|
1167
|
-
providerDurationMs: 200,
|
|
1168
|
-
});
|
|
1169
|
-
return [
|
|
1170
|
-
...messages,
|
|
1171
|
-
{
|
|
1172
|
-
role: "assistant" as const,
|
|
1173
|
-
content: [{ type: "text", text: "recovered" }] as ContentBlock[],
|
|
1174
|
-
},
|
|
1175
|
-
];
|
|
1176
|
-
};
|
|
1177
|
-
|
|
1178
|
-
const ctx = makeCtx({
|
|
1179
|
-
agentLoopRun,
|
|
1180
|
-
contextWindowManager: {
|
|
1181
|
-
shouldCompact: () => ({ needed: false, estimatedTokens: 0 }),
|
|
1182
|
-
maybeCompact: async (
|
|
1183
|
-
_msgs: Message[],
|
|
1184
|
-
_signal: AbortSignal,
|
|
1185
|
-
opts?: Record<string, unknown>,
|
|
1186
|
-
) => {
|
|
1187
|
-
if (opts?.force && opts?.minKeepRecentUserTurns === 0) {
|
|
1188
|
-
emergencyCompactCalled = true;
|
|
1370
|
+
const ctx = makeCtx({
|
|
1371
|
+
agentLoopRun,
|
|
1372
|
+
contextWindowManager: {
|
|
1373
|
+
shouldCompact: () => ({ needed: false, estimatedTokens: 0 }),
|
|
1374
|
+
maybeCompact: async () => {
|
|
1375
|
+
compactionCalled = true;
|
|
1189
1376
|
return {
|
|
1190
1377
|
compacted: true,
|
|
1191
1378
|
messages: [
|
|
1192
1379
|
{
|
|
1193
|
-
role: "user",
|
|
1380
|
+
role: "user" as const,
|
|
1194
1381
|
content: [{ type: "text", text: "Hello" }],
|
|
1195
1382
|
},
|
|
1196
1383
|
] as Message[],
|
|
1197
|
-
compactedPersistedMessages:
|
|
1198
|
-
summaryText: "
|
|
1199
|
-
previousEstimatedInputTokens:
|
|
1200
|
-
estimatedInputTokens:
|
|
1384
|
+
compactedPersistedMessages: 5,
|
|
1385
|
+
summaryText: "Mid-loop compaction summary",
|
|
1386
|
+
previousEstimatedInputTokens: 170_000,
|
|
1387
|
+
estimatedInputTokens: 80_000,
|
|
1201
1388
|
maxInputTokens: 200_000,
|
|
1202
1389
|
thresholdTokens: 160_000,
|
|
1203
|
-
compactedMessages:
|
|
1390
|
+
compactedMessages: 10,
|
|
1204
1391
|
summaryCalls: 1,
|
|
1205
|
-
summaryInputTokens:
|
|
1206
|
-
summaryOutputTokens:
|
|
1392
|
+
summaryInputTokens: 500,
|
|
1393
|
+
summaryOutputTokens: 200,
|
|
1207
1394
|
summaryModel: "mock-model",
|
|
1208
1395
|
};
|
|
1209
|
-
}
|
|
1210
|
-
|
|
1211
|
-
|
|
1212
|
-
} as unknown as AgentLoopSessionContext["contextWindowManager"],
|
|
1213
|
-
});
|
|
1396
|
+
},
|
|
1397
|
+
} as unknown as AgentLoopSessionContext["contextWindowManager"],
|
|
1398
|
+
});
|
|
1214
1399
|
|
|
1215
|
-
|
|
1400
|
+
await runAgentLoopImpl(ctx, "hello", "msg-1", (msg) => events.push(msg));
|
|
1216
1401
|
|
|
1217
|
-
|
|
1218
|
-
|
|
1219
|
-
// After PR 2 fix, emergency compaction should be attempted.
|
|
1220
|
-
expect(emergencyCompactCalled).toBe(true);
|
|
1402
|
+
// The mid-loop budget check should have triggered compaction
|
|
1403
|
+
expect(compactionCalled).toBe(true);
|
|
1221
1404
|
|
|
1222
|
-
|
|
1223
|
-
|
|
1224
|
-
expect(sessionError).toBeUndefined();
|
|
1225
|
-
});
|
|
1405
|
+
// Agent loop should have been called twice: once before yield, once after compaction
|
|
1406
|
+
expect(agentLoopCallCount).toBe(2);
|
|
1226
1407
|
|
|
1227
|
-
|
|
1228
|
-
|
|
1229
|
-
|
|
1230
|
-
// When estimate exceeds the mid-loop threshold (85% of budget),
|
|
1231
|
-
// it returns "yield" to break the agent loop.
|
|
1232
|
-
// The session-agent-loop then runs compaction and re-enters the agent loop.
|
|
1233
|
-
test("onCheckpoint yields when token estimate exceeds mid-loop budget threshold", async () => {
|
|
1234
|
-
const events: ServerMessage[] = [];
|
|
1235
|
-
let compactionCalled = false;
|
|
1408
|
+
// No session_error should be emitted
|
|
1409
|
+
const sessionError = events.find((e) => e.type === "session_error");
|
|
1410
|
+
expect(sessionError).toBeUndefined();
|
|
1236
1411
|
|
|
1237
|
-
|
|
1238
|
-
|
|
1239
|
-
|
|
1240
|
-
|
|
1241
|
-
|
|
1242
|
-
let estimateCallCount = 0;
|
|
1243
|
-
mockEstimateTokens = () => {
|
|
1244
|
-
estimateCallCount++;
|
|
1245
|
-
// First call: preflight check — below budget
|
|
1246
|
-
if (estimateCallCount === 1) return 100_000;
|
|
1247
|
-
// Subsequent calls: mid-loop check — above 85% threshold
|
|
1248
|
-
return 170_000;
|
|
1249
|
-
};
|
|
1412
|
+
// A context_compacted event should have been emitted
|
|
1413
|
+
const compacted = events.find((e) => e.type === "context_compacted");
|
|
1414
|
+
expect(compacted).toBeDefined();
|
|
1415
|
+
},
|
|
1416
|
+
);
|
|
1250
1417
|
|
|
1251
|
-
|
|
1252
|
-
|
|
1253
|
-
|
|
1254
|
-
|
|
1255
|
-
|
|
1256
|
-
|
|
1257
|
-
|
|
1258
|
-
) => {
|
|
1259
|
-
|
|
1418
|
+
// ── Test 7 ────────────────────────────────────────────────────────
|
|
1419
|
+
// Tests that mid-loop budget check prevents context_too_large entirely.
|
|
1420
|
+
// Agent loop runs tool calls with growing history. After the estimate
|
|
1421
|
+
// exceeds the mid-loop threshold, the loop yields, compaction runs,
|
|
1422
|
+
// and the loop resumes. The provider NEVER rejects with context_too_large.
|
|
1423
|
+
test.todo(
|
|
1424
|
+
"mid-loop budget check prevents context_too_large when tools produce large results",
|
|
1425
|
+
async () => {
|
|
1426
|
+
const events: ServerMessage[] = [];
|
|
1427
|
+
let compactionCalled = false;
|
|
1428
|
+
|
|
1429
|
+
// Budget = 200_000 * 0.95 = 190_000
|
|
1430
|
+
// Mid-loop threshold = 190_000 * 0.85 = 161_500
|
|
1431
|
+
// Simulate token growth: preflight = 50k, then each checkpoint call
|
|
1432
|
+
// returns a growing estimate. By tool call 3, we exceed the threshold.
|
|
1433
|
+
let estimateCallCount = 0;
|
|
1434
|
+
mockEstimateTokens = () => {
|
|
1435
|
+
estimateCallCount++;
|
|
1436
|
+
// First call: preflight — well below budget
|
|
1437
|
+
if (estimateCallCount === 1) return 50_000;
|
|
1438
|
+
// Checkpoint calls grow with each tool round
|
|
1439
|
+
if (estimateCallCount === 2) return 100_000; // tool 1
|
|
1440
|
+
if (estimateCallCount === 3) return 140_000; // tool 2
|
|
1441
|
+
// Tool 3: exceeds 161_500 threshold
|
|
1442
|
+
return 175_000;
|
|
1443
|
+
};
|
|
1260
1444
|
|
|
1261
|
-
|
|
1262
|
-
|
|
1263
|
-
|
|
1264
|
-
|
|
1265
|
-
|
|
1266
|
-
|
|
1267
|
-
|
|
1268
|
-
|
|
1269
|
-
|
|
1270
|
-
|
|
1271
|
-
|
|
1272
|
-
|
|
1273
|
-
|
|
1274
|
-
|
|
1275
|
-
|
|
1276
|
-
|
|
1277
|
-
{
|
|
1278
|
-
|
|
1279
|
-
|
|
1280
|
-
|
|
1281
|
-
|
|
1282
|
-
|
|
1283
|
-
|
|
1284
|
-
|
|
1285
|
-
|
|
1286
|
-
|
|
1287
|
-
|
|
1288
|
-
|
|
1445
|
+
let agentLoopCallCount = 0;
|
|
1446
|
+
let contextTooLargeEmitted = false;
|
|
1447
|
+
|
|
1448
|
+
const agentLoopRun: AgentLoopRun = async (
|
|
1449
|
+
messages,
|
|
1450
|
+
onEvent,
|
|
1451
|
+
_signal,
|
|
1452
|
+
_requestId,
|
|
1453
|
+
onCheckpoint,
|
|
1454
|
+
) => {
|
|
1455
|
+
agentLoopCallCount++;
|
|
1456
|
+
|
|
1457
|
+
if (agentLoopCallCount === 1) {
|
|
1458
|
+
const currentHistory = [...messages];
|
|
1459
|
+
|
|
1460
|
+
// Simulate 5 tool rounds — but the checkpoint should yield at round 3
|
|
1461
|
+
for (let i = 0; i < 5; i++) {
|
|
1462
|
+
const toolId = `tu-${i}`;
|
|
1463
|
+
const assistantMsg: Message = {
|
|
1464
|
+
role: "assistant" as const,
|
|
1465
|
+
content: [
|
|
1466
|
+
{ type: "text", text: `Step ${i}` },
|
|
1467
|
+
{
|
|
1468
|
+
type: "tool_use",
|
|
1469
|
+
id: toolId,
|
|
1470
|
+
name: "bash",
|
|
1471
|
+
input: { command: `cmd-${i}` },
|
|
1472
|
+
},
|
|
1473
|
+
] as ContentBlock[],
|
|
1474
|
+
};
|
|
1475
|
+
const resultMsg: Message = {
|
|
1476
|
+
role: "user" as const,
|
|
1477
|
+
content: [
|
|
1478
|
+
{
|
|
1479
|
+
type: "tool_result",
|
|
1480
|
+
tool_use_id: toolId,
|
|
1481
|
+
content: "x".repeat(10_000),
|
|
1482
|
+
is_error: false,
|
|
1483
|
+
},
|
|
1484
|
+
] as ContentBlock[],
|
|
1485
|
+
};
|
|
1486
|
+
currentHistory.push(assistantMsg, resultMsg);
|
|
1289
1487
|
|
|
1488
|
+
onEvent({
|
|
1489
|
+
type: "message_complete",
|
|
1490
|
+
message: assistantMsg,
|
|
1491
|
+
});
|
|
1492
|
+
onEvent({
|
|
1493
|
+
type: "usage",
|
|
1494
|
+
inputTokens: 50_000 + i * 20_000,
|
|
1495
|
+
outputTokens: 50,
|
|
1496
|
+
model: "test-model",
|
|
1497
|
+
providerDurationMs: 100,
|
|
1498
|
+
});
|
|
1499
|
+
|
|
1500
|
+
if (onCheckpoint) {
|
|
1501
|
+
const decision = onCheckpoint({
|
|
1502
|
+
turnIndex: i,
|
|
1503
|
+
toolCount: 1,
|
|
1504
|
+
hasToolUse: true,
|
|
1505
|
+
history: currentHistory,
|
|
1506
|
+
});
|
|
1507
|
+
if (decision === "yield") {
|
|
1508
|
+
return currentHistory;
|
|
1509
|
+
}
|
|
1510
|
+
}
|
|
1511
|
+
}
|
|
1512
|
+
|
|
1513
|
+
return currentHistory;
|
|
1514
|
+
}
|
|
1515
|
+
|
|
1516
|
+
// Second call (after compaction): complete
|
|
1290
1517
|
onEvent({
|
|
1291
1518
|
type: "message_complete",
|
|
1292
1519
|
message: {
|
|
1293
1520
|
role: "assistant",
|
|
1294
1521
|
content: [
|
|
1295
|
-
{ type: "text", text: "
|
|
1296
|
-
{
|
|
1297
|
-
type: "tool_use",
|
|
1298
|
-
id: "tu-1",
|
|
1299
|
-
name: "bash",
|
|
1300
|
-
input: { command: "ls" },
|
|
1301
|
-
},
|
|
1522
|
+
{ type: "text", text: "completed after mid-loop compaction" },
|
|
1302
1523
|
],
|
|
1303
1524
|
},
|
|
1304
1525
|
});
|
|
1305
1526
|
onEvent({
|
|
1306
1527
|
type: "usage",
|
|
1307
|
-
inputTokens:
|
|
1308
|
-
outputTokens:
|
|
1528
|
+
inputTokens: 60_000,
|
|
1529
|
+
outputTokens: 100,
|
|
1309
1530
|
model: "test-model",
|
|
1310
|
-
providerDurationMs:
|
|
1531
|
+
providerDurationMs: 200,
|
|
1311
1532
|
});
|
|
1533
|
+
return [
|
|
1534
|
+
...messages,
|
|
1535
|
+
{
|
|
1536
|
+
role: "assistant" as const,
|
|
1537
|
+
content: [
|
|
1538
|
+
{ type: "text", text: "completed after mid-loop compaction" },
|
|
1539
|
+
] as ContentBlock[],
|
|
1540
|
+
},
|
|
1541
|
+
];
|
|
1542
|
+
};
|
|
1312
1543
|
|
|
1313
|
-
|
|
1314
|
-
|
|
1315
|
-
|
|
1316
|
-
|
|
1317
|
-
|
|
1318
|
-
|
|
1319
|
-
|
|
1320
|
-
|
|
1321
|
-
|
|
1322
|
-
|
|
1323
|
-
|
|
1324
|
-
|
|
1325
|
-
|
|
1326
|
-
|
|
1327
|
-
|
|
1328
|
-
|
|
1329
|
-
|
|
1330
|
-
|
|
1331
|
-
|
|
1332
|
-
|
|
1333
|
-
|
|
1334
|
-
|
|
1335
|
-
|
|
1336
|
-
|
|
1337
|
-
|
|
1338
|
-
|
|
1339
|
-
|
|
1340
|
-
|
|
1341
|
-
inputTokens: 50,
|
|
1342
|
-
outputTokens: 25,
|
|
1343
|
-
model: "test-model",
|
|
1344
|
-
providerDurationMs: 100,
|
|
1544
|
+
const ctx = makeCtx({
|
|
1545
|
+
agentLoopRun,
|
|
1546
|
+
contextWindowManager: {
|
|
1547
|
+
shouldCompact: () => ({ needed: false, estimatedTokens: 0 }),
|
|
1548
|
+
maybeCompact: async () => {
|
|
1549
|
+
compactionCalled = true;
|
|
1550
|
+
return {
|
|
1551
|
+
compacted: true,
|
|
1552
|
+
messages: [
|
|
1553
|
+
{
|
|
1554
|
+
role: "user" as const,
|
|
1555
|
+
content: [{ type: "text", text: "Hello" }],
|
|
1556
|
+
},
|
|
1557
|
+
] as Message[],
|
|
1558
|
+
compactedPersistedMessages: 8,
|
|
1559
|
+
summaryText: "Compacted large tool results",
|
|
1560
|
+
previousEstimatedInputTokens: 175_000,
|
|
1561
|
+
estimatedInputTokens: 60_000,
|
|
1562
|
+
maxInputTokens: 200_000,
|
|
1563
|
+
thresholdTokens: 160_000,
|
|
1564
|
+
compactedMessages: 15,
|
|
1565
|
+
summaryCalls: 1,
|
|
1566
|
+
summaryInputTokens: 800,
|
|
1567
|
+
summaryOutputTokens: 300,
|
|
1568
|
+
summaryModel: "mock-model",
|
|
1569
|
+
};
|
|
1570
|
+
},
|
|
1571
|
+
} as unknown as AgentLoopSessionContext["contextWindowManager"],
|
|
1345
1572
|
});
|
|
1346
|
-
return [
|
|
1347
|
-
...messages,
|
|
1348
|
-
{
|
|
1349
|
-
role: "assistant" as const,
|
|
1350
|
-
content: [
|
|
1351
|
-
{ type: "text", text: "done after compaction" },
|
|
1352
|
-
] as ContentBlock[],
|
|
1353
|
-
},
|
|
1354
|
-
];
|
|
1355
|
-
};
|
|
1356
|
-
|
|
1357
|
-
const ctx = makeCtx({
|
|
1358
|
-
agentLoopRun,
|
|
1359
|
-
contextWindowManager: {
|
|
1360
|
-
shouldCompact: () => ({ needed: false, estimatedTokens: 0 }),
|
|
1361
|
-
maybeCompact: async () => {
|
|
1362
|
-
compactionCalled = true;
|
|
1363
|
-
return {
|
|
1364
|
-
compacted: true,
|
|
1365
|
-
messages: [
|
|
1366
|
-
{
|
|
1367
|
-
role: "user" as const,
|
|
1368
|
-
content: [{ type: "text", text: "Hello" }],
|
|
1369
|
-
},
|
|
1370
|
-
] as Message[],
|
|
1371
|
-
compactedPersistedMessages: 5,
|
|
1372
|
-
summaryText: "Mid-loop compaction summary",
|
|
1373
|
-
previousEstimatedInputTokens: 170_000,
|
|
1374
|
-
estimatedInputTokens: 80_000,
|
|
1375
|
-
maxInputTokens: 200_000,
|
|
1376
|
-
thresholdTokens: 160_000,
|
|
1377
|
-
compactedMessages: 10,
|
|
1378
|
-
summaryCalls: 1,
|
|
1379
|
-
summaryInputTokens: 500,
|
|
1380
|
-
summaryOutputTokens: 200,
|
|
1381
|
-
summaryModel: "mock-model",
|
|
1382
|
-
};
|
|
1383
|
-
},
|
|
1384
|
-
} as unknown as AgentLoopSessionContext["contextWindowManager"],
|
|
1385
|
-
});
|
|
1386
|
-
|
|
1387
|
-
await runAgentLoopImpl(ctx, "hello", "msg-1", (msg) => events.push(msg));
|
|
1388
1573
|
|
|
1389
|
-
|
|
1390
|
-
|
|
1574
|
+
await runAgentLoopImpl(ctx, "hello", "msg-1", (msg) => {
|
|
1575
|
+
events.push(msg);
|
|
1576
|
+
// Track if context_too_large was ever emitted
|
|
1577
|
+
if (
|
|
1578
|
+
msg.type === "session_error" &&
|
|
1579
|
+
"code" in msg &&
|
|
1580
|
+
msg.code === "SESSION_PROCESSING_FAILED"
|
|
1581
|
+
) {
|
|
1582
|
+
contextTooLargeEmitted = true;
|
|
1583
|
+
}
|
|
1584
|
+
});
|
|
1391
1585
|
|
|
1392
|
-
|
|
1393
|
-
|
|
1586
|
+
// Compaction should have been triggered by mid-loop budget check
|
|
1587
|
+
expect(compactionCalled).toBe(true);
|
|
1394
1588
|
|
|
1395
|
-
|
|
1396
|
-
|
|
1397
|
-
expect(sessionError).toBeUndefined();
|
|
1589
|
+
// The provider should NEVER have rejected with context_too_large
|
|
1590
|
+
expect(contextTooLargeEmitted).toBe(false);
|
|
1398
1591
|
|
|
1399
|
-
|
|
1400
|
-
|
|
1401
|
-
expect(compacted).toBeDefined();
|
|
1402
|
-
});
|
|
1592
|
+
// Agent loop called twice: once (yielded at tool 3), once after compaction
|
|
1593
|
+
expect(agentLoopCallCount).toBe(2);
|
|
1403
1594
|
|
|
1404
|
-
|
|
1405
|
-
|
|
1406
|
-
|
|
1407
|
-
|
|
1408
|
-
|
|
1409
|
-
|
|
1595
|
+
// No session_error
|
|
1596
|
+
const sessionError = events.find((e) => e.type === "session_error");
|
|
1597
|
+
expect(sessionError).toBeUndefined();
|
|
1598
|
+
},
|
|
1599
|
+
);
|
|
1600
|
+
|
|
1601
|
+
// ── Test 8 ────────────────────────────────────────────────────────
|
|
1602
|
+
// When mid-loop compaction exhausts maxAttempts but the agent loop
|
|
1603
|
+
// still yields (yieldedForBudget remains true), the incomplete turn
|
|
1604
|
+
// must escalate to the convergence loop instead of being silently
|
|
1605
|
+
// treated as a completed turn.
|
|
1606
|
+
test("exhausted mid-loop compaction attempts escalate to convergence loop", async () => {
|
|
1410
1607
|
const events: ServerMessage[] = [];
|
|
1411
|
-
let compactionCalled = false;
|
|
1412
1608
|
|
|
1413
1609
|
// Budget = 200_000 * 0.95 = 190_000
|
|
1414
1610
|
// Mid-loop threshold = 190_000 * 0.85 = 161_500
|
|
1415
|
-
// Simulate token growth: preflight = 50k, then each checkpoint call
|
|
1416
|
-
// returns a growing estimate. By tool call 3, we exceed the threshold.
|
|
1417
1611
|
let estimateCallCount = 0;
|
|
1418
1612
|
mockEstimateTokens = () => {
|
|
1419
1613
|
estimateCallCount++;
|
|
1420
|
-
//
|
|
1421
|
-
if (estimateCallCount === 1) return
|
|
1422
|
-
//
|
|
1423
|
-
|
|
1424
|
-
if (estimateCallCount === 3) return 140_000; // tool 2
|
|
1425
|
-
// Tool 3: exceeds 161_500 threshold
|
|
1426
|
-
return 175_000;
|
|
1614
|
+
// Preflight: below budget
|
|
1615
|
+
if (estimateCallCount === 1) return 100_000;
|
|
1616
|
+
// Every checkpoint call: above threshold — always triggers yield
|
|
1617
|
+
return 170_000;
|
|
1427
1618
|
};
|
|
1428
1619
|
|
|
1429
1620
|
let agentLoopCallCount = 0;
|
|
1430
|
-
let contextTooLargeEmitted = false;
|
|
1431
|
-
|
|
1432
1621
|
const agentLoopRun: AgentLoopRun = async (
|
|
1433
1622
|
messages,
|
|
1434
1623
|
onEvent,
|
|
@@ -1438,91 +1627,88 @@ describe("session-agent-loop overflow recovery (JARVIS-110)", () => {
|
|
|
1438
1627
|
) => {
|
|
1439
1628
|
agentLoopCallCount++;
|
|
1440
1629
|
|
|
1441
|
-
|
|
1442
|
-
|
|
1443
|
-
|
|
1444
|
-
|
|
1445
|
-
|
|
1446
|
-
|
|
1447
|
-
|
|
1448
|
-
|
|
1449
|
-
|
|
1450
|
-
|
|
1451
|
-
|
|
1452
|
-
|
|
1453
|
-
|
|
1454
|
-
|
|
1455
|
-
|
|
1456
|
-
|
|
1457
|
-
|
|
1458
|
-
|
|
1459
|
-
|
|
1460
|
-
|
|
1461
|
-
|
|
1462
|
-
|
|
1463
|
-
|
|
1464
|
-
|
|
1465
|
-
|
|
1466
|
-
|
|
1467
|
-
|
|
1468
|
-
] as ContentBlock[],
|
|
1469
|
-
};
|
|
1470
|
-
currentHistory.push(assistantMsg, resultMsg);
|
|
1471
|
-
|
|
1472
|
-
onEvent({
|
|
1473
|
-
type: "message_complete",
|
|
1474
|
-
message: assistantMsg,
|
|
1475
|
-
});
|
|
1476
|
-
onEvent({
|
|
1477
|
-
type: "usage",
|
|
1478
|
-
inputTokens: 50_000 + i * 20_000,
|
|
1479
|
-
outputTokens: 50,
|
|
1480
|
-
model: "test-model",
|
|
1481
|
-
providerDurationMs: 100,
|
|
1482
|
-
});
|
|
1483
|
-
|
|
1484
|
-
if (onCheckpoint) {
|
|
1485
|
-
const decision = onCheckpoint({
|
|
1486
|
-
turnIndex: i,
|
|
1487
|
-
toolCount: 1,
|
|
1488
|
-
hasToolUse: true,
|
|
1489
|
-
history: currentHistory,
|
|
1490
|
-
});
|
|
1491
|
-
if (decision === "yield") {
|
|
1492
|
-
return currentHistory;
|
|
1493
|
-
}
|
|
1494
|
-
}
|
|
1495
|
-
}
|
|
1496
|
-
|
|
1497
|
-
return currentHistory;
|
|
1498
|
-
}
|
|
1630
|
+
// Every call: simulate tool progress then yield at checkpoint
|
|
1631
|
+
const withProgress: Message[] = [
|
|
1632
|
+
...messages,
|
|
1633
|
+
{
|
|
1634
|
+
role: "assistant" as const,
|
|
1635
|
+
content: [
|
|
1636
|
+
{ type: "text", text: `Tool call ${agentLoopCallCount}` },
|
|
1637
|
+
{
|
|
1638
|
+
type: "tool_use",
|
|
1639
|
+
id: `tu-${agentLoopCallCount}`,
|
|
1640
|
+
name: "bash",
|
|
1641
|
+
input: { command: "ls" },
|
|
1642
|
+
},
|
|
1643
|
+
] as ContentBlock[],
|
|
1644
|
+
},
|
|
1645
|
+
{
|
|
1646
|
+
role: "user" as const,
|
|
1647
|
+
content: [
|
|
1648
|
+
{
|
|
1649
|
+
type: "tool_result",
|
|
1650
|
+
tool_use_id: `tu-${agentLoopCallCount}`,
|
|
1651
|
+
content: "output",
|
|
1652
|
+
is_error: false,
|
|
1653
|
+
},
|
|
1654
|
+
] as ContentBlock[],
|
|
1655
|
+
},
|
|
1656
|
+
];
|
|
1499
1657
|
|
|
1500
|
-
// Second call (after compaction): complete
|
|
1501
1658
|
onEvent({
|
|
1502
1659
|
type: "message_complete",
|
|
1503
1660
|
message: {
|
|
1504
1661
|
role: "assistant",
|
|
1505
1662
|
content: [
|
|
1506
|
-
{ type: "text", text:
|
|
1663
|
+
{ type: "text", text: `Tool call ${agentLoopCallCount}` },
|
|
1664
|
+
{
|
|
1665
|
+
type: "tool_use",
|
|
1666
|
+
id: `tu-${agentLoopCallCount}`,
|
|
1667
|
+
name: "bash",
|
|
1668
|
+
input: { command: "ls" },
|
|
1669
|
+
},
|
|
1507
1670
|
],
|
|
1508
1671
|
},
|
|
1509
1672
|
});
|
|
1510
1673
|
onEvent({
|
|
1511
1674
|
type: "usage",
|
|
1512
|
-
inputTokens:
|
|
1513
|
-
outputTokens:
|
|
1675
|
+
inputTokens: 100,
|
|
1676
|
+
outputTokens: 50,
|
|
1514
1677
|
model: "test-model",
|
|
1515
|
-
providerDurationMs:
|
|
1678
|
+
providerDurationMs: 100,
|
|
1516
1679
|
});
|
|
1517
|
-
|
|
1518
|
-
|
|
1519
|
-
|
|
1520
|
-
|
|
1521
|
-
|
|
1522
|
-
|
|
1523
|
-
|
|
1680
|
+
|
|
1681
|
+
// Always yield at checkpoint — simulates compaction not helping
|
|
1682
|
+
if (onCheckpoint) {
|
|
1683
|
+
const decision = onCheckpoint({
|
|
1684
|
+
turnIndex: 0,
|
|
1685
|
+
toolCount: 1,
|
|
1686
|
+
hasToolUse: true,
|
|
1687
|
+
history: withProgress,
|
|
1688
|
+
});
|
|
1689
|
+
if (decision === "yield") {
|
|
1690
|
+
return withProgress;
|
|
1691
|
+
}
|
|
1692
|
+
}
|
|
1693
|
+
|
|
1694
|
+
return withProgress;
|
|
1695
|
+
};
|
|
1696
|
+
|
|
1697
|
+
let compactionCallCount = 0;
|
|
1698
|
+
// Convergence reducer: reduce tokens enough to succeed
|
|
1699
|
+
let convergenceReducerCalled = false;
|
|
1700
|
+
mockReducerStepFn = (msgs: Message[]) => {
|
|
1701
|
+
convergenceReducerCalled = true;
|
|
1702
|
+
return {
|
|
1703
|
+
messages: msgs,
|
|
1704
|
+
tier: "forced_compaction",
|
|
1705
|
+
state: {
|
|
1706
|
+
appliedTiers: ["forced_compaction"],
|
|
1707
|
+
injectionMode: "full",
|
|
1708
|
+
exhausted: true,
|
|
1524
1709
|
},
|
|
1525
|
-
|
|
1710
|
+
estimatedTokens: 80_000,
|
|
1711
|
+
};
|
|
1526
1712
|
};
|
|
1527
1713
|
|
|
1528
1714
|
const ctx = makeCtx({
|
|
@@ -1530,7 +1716,8 @@ describe("session-agent-loop overflow recovery (JARVIS-110)", () => {
|
|
|
1530
1716
|
contextWindowManager: {
|
|
1531
1717
|
shouldCompact: () => ({ needed: false, estimatedTokens: 0 }),
|
|
1532
1718
|
maybeCompact: async () => {
|
|
1533
|
-
|
|
1719
|
+
compactionCallCount++;
|
|
1720
|
+
// Compaction "succeeds" but doesn't actually shrink enough
|
|
1534
1721
|
return {
|
|
1535
1722
|
compacted: true,
|
|
1536
1723
|
messages: [
|
|
@@ -1539,45 +1726,32 @@ describe("session-agent-loop overflow recovery (JARVIS-110)", () => {
|
|
|
1539
1726
|
content: [{ type: "text", text: "Hello" }],
|
|
1540
1727
|
},
|
|
1541
1728
|
] as Message[],
|
|
1542
|
-
compactedPersistedMessages:
|
|
1543
|
-
summaryText: "
|
|
1544
|
-
previousEstimatedInputTokens:
|
|
1545
|
-
estimatedInputTokens:
|
|
1729
|
+
compactedPersistedMessages: 5,
|
|
1730
|
+
summaryText: "Compaction summary",
|
|
1731
|
+
previousEstimatedInputTokens: 170_000,
|
|
1732
|
+
estimatedInputTokens: 165_000, // barely reduced
|
|
1546
1733
|
maxInputTokens: 200_000,
|
|
1547
1734
|
thresholdTokens: 160_000,
|
|
1548
|
-
compactedMessages:
|
|
1735
|
+
compactedMessages: 10,
|
|
1549
1736
|
summaryCalls: 1,
|
|
1550
|
-
summaryInputTokens:
|
|
1551
|
-
summaryOutputTokens:
|
|
1737
|
+
summaryInputTokens: 500,
|
|
1738
|
+
summaryOutputTokens: 200,
|
|
1552
1739
|
summaryModel: "mock-model",
|
|
1553
1740
|
};
|
|
1554
1741
|
},
|
|
1555
1742
|
} as unknown as AgentLoopSessionContext["contextWindowManager"],
|
|
1556
1743
|
});
|
|
1557
1744
|
|
|
1558
|
-
await runAgentLoopImpl(ctx, "hello", "msg-1", (msg) =>
|
|
1559
|
-
events.push(msg);
|
|
1560
|
-
// Track if context_too_large was ever emitted
|
|
1561
|
-
if (
|
|
1562
|
-
msg.type === "session_error" &&
|
|
1563
|
-
"code" in msg &&
|
|
1564
|
-
msg.code === "SESSION_PROCESSING_FAILED"
|
|
1565
|
-
) {
|
|
1566
|
-
contextTooLargeEmitted = true;
|
|
1567
|
-
}
|
|
1568
|
-
});
|
|
1569
|
-
|
|
1570
|
-
// Compaction should have been triggered by mid-loop budget check
|
|
1571
|
-
expect(compactionCalled).toBe(true);
|
|
1745
|
+
await runAgentLoopImpl(ctx, "hello", "msg-1", (msg) => events.push(msg));
|
|
1572
1746
|
|
|
1573
|
-
//
|
|
1574
|
-
expect(
|
|
1747
|
+
// 1 initial auto-compact + 3 mid-loop compaction attempts = 4 total
|
|
1748
|
+
expect(compactionCallCount).toBe(4);
|
|
1575
1749
|
|
|
1576
|
-
// Agent loop
|
|
1577
|
-
expect(agentLoopCallCount).toBe(
|
|
1750
|
+
// Agent loop: 1 initial + 3 mid-loop re-entries + 1 convergence re-run = 5 calls
|
|
1751
|
+
expect(agentLoopCallCount).toBe(5);
|
|
1578
1752
|
|
|
1579
|
-
//
|
|
1580
|
-
|
|
1581
|
-
expect(
|
|
1753
|
+
// After exhausting mid-loop attempts, the convergence loop should
|
|
1754
|
+
// have been triggered (contextTooLargeDetected set to true)
|
|
1755
|
+
expect(convergenceReducerCalled).toBe(true);
|
|
1582
1756
|
});
|
|
1583
1757
|
});
|