@vellumai/assistant 0.4.49 → 0.4.50
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/ARCHITECTURE.md +24 -33
- package/README.md +3 -3
- package/docs/architecture/memory.md +180 -119
- package/package.json +2 -2
- package/src/__tests__/agent-loop.test.ts +3 -1
- package/src/__tests__/anthropic-provider.test.ts +114 -23
- package/src/__tests__/approval-cascade.test.ts +1 -15
- package/src/__tests__/approval-routes-http.test.ts +2 -0
- package/src/__tests__/assistant-feature-flag-guard.test.ts +0 -23
- package/src/__tests__/canonical-guardian-store.test.ts +95 -0
- package/src/__tests__/checker.test.ts +13 -0
- package/src/__tests__/config-schema.test.ts +1 -68
- package/src/__tests__/context-memory-e2e.test.ts +11 -100
- package/src/__tests__/conversation-routes-guardian-reply.test.ts +8 -0
- package/src/__tests__/conversation-routes-slash-commands.test.ts +1 -0
- package/src/__tests__/credential-security-e2e.test.ts +1 -0
- package/src/__tests__/credential-vault-unit.test.ts +4 -0
- package/src/__tests__/credential-vault.test.ts +13 -1
- package/src/__tests__/cu-unified-flow.test.ts +532 -0
- package/src/__tests__/date-context.test.ts +93 -77
- package/src/__tests__/deterministic-verification-control-plane.test.ts +64 -0
- package/src/__tests__/guardian-routing-invariants.test.ts +93 -0
- package/src/__tests__/history-repair.test.ts +245 -0
- package/src/__tests__/host-cu-proxy.test.ts +165 -3
- package/src/__tests__/http-user-message-parity.test.ts +1 -0
- package/src/__tests__/invite-redemption-service.test.ts +65 -1
- package/src/__tests__/keychain-broker-client.test.ts +4 -4
- package/src/__tests__/memory-context-benchmark.benchmark.test.ts +56 -18
- package/src/__tests__/memory-lifecycle-e2e.test.ts +244 -387
- package/src/__tests__/memory-recall-quality.test.ts +244 -407
- package/src/__tests__/memory-regressions.experimental.test.ts +126 -101
- package/src/__tests__/memory-regressions.test.ts +477 -2841
- package/src/__tests__/memory-retrieval.benchmark.test.ts +33 -150
- package/src/__tests__/memory-upsert-concurrency.test.ts +5 -244
- package/src/__tests__/mime-builder.test.ts +28 -0
- package/src/__tests__/native-web-search.test.ts +1 -0
- package/src/__tests__/oauth-cli.test.ts +572 -5
- package/src/__tests__/oauth-store.test.ts +120 -6
- package/src/__tests__/qdrant-collection-migration.test.ts +53 -8
- package/src/__tests__/registry.test.ts +0 -1
- package/src/__tests__/relay-server.test.ts +46 -1
- package/src/__tests__/schedule-tools.test.ts +32 -0
- package/src/__tests__/script-proxy-certs.test.ts +1 -1
- package/src/__tests__/secret-onetime-send.test.ts +1 -0
- package/src/__tests__/secure-keys.test.ts +7 -2
- package/src/__tests__/send-endpoint-busy.test.ts +3 -0
- package/src/__tests__/session-abort-tool-results.test.ts +1 -14
- package/src/__tests__/session-agent-loop-overflow.test.ts +1583 -0
- package/src/__tests__/session-agent-loop.test.ts +19 -15
- package/src/__tests__/session-confirmation-signals.test.ts +1 -15
- package/src/__tests__/session-error.test.ts +124 -2
- package/src/__tests__/session-history-web-search.test.ts +918 -0
- package/src/__tests__/session-pre-run-repair.test.ts +1 -14
- package/src/__tests__/session-provider-retry-repair.test.ts +25 -28
- package/src/__tests__/session-queue.test.ts +37 -27
- package/src/__tests__/session-runtime-assembly.test.ts +54 -0
- package/src/__tests__/session-slash-known.test.ts +1 -15
- package/src/__tests__/session-slash-queue.test.ts +1 -15
- package/src/__tests__/session-slash-unknown.test.ts +1 -15
- package/src/__tests__/session-workspace-cache-state.test.ts +3 -33
- package/src/__tests__/session-workspace-injection.test.ts +3 -37
- package/src/__tests__/session-workspace-tool-tracking.test.ts +3 -37
- package/src/__tests__/skills-install-extract.test.ts +93 -0
- package/src/__tests__/skillssh-registry.test.ts +451 -0
- package/src/__tests__/trust-store.test.ts +15 -0
- package/src/__tests__/voice-invite-redemption.test.ts +32 -1
- package/src/agent/ax-tree-compaction.test.ts +51 -0
- package/src/agent/loop.ts +39 -12
- package/src/approvals/AGENTS.md +1 -1
- package/src/approvals/guardian-request-resolvers.ts +14 -2
- package/src/bundler/compiler-tools.ts +66 -2
- package/src/calls/call-domain.ts +132 -0
- package/src/calls/call-store.ts +6 -0
- package/src/calls/relay-server.ts +43 -5
- package/src/calls/relay-setup-router.ts +17 -1
- package/src/calls/twilio-config.ts +1 -1
- package/src/calls/types.ts +3 -1
- package/src/cli/commands/doctor.ts +4 -3
- package/src/cli/commands/mcp.ts +46 -59
- package/src/cli/commands/memory.ts +16 -165
- package/src/cli/commands/oauth/apps.ts +31 -2
- package/src/cli/commands/oauth/connections.ts +431 -97
- package/src/cli/commands/oauth/providers.ts +15 -1
- package/src/cli/commands/sessions.ts +5 -2
- package/src/cli/commands/skills.ts +173 -1
- package/src/cli/http-client.ts +0 -20
- package/src/cli/main-screen.tsx +2 -2
- package/src/cli/program.ts +5 -6
- package/src/cli.ts +4 -10
- package/src/config/bundled-skills/computer-use/TOOLS.json +1 -1
- package/src/config/bundled-skills/computer-use/tools/computer-use-observe.ts +12 -0
- package/src/config/bundled-tool-registry.ts +2 -5
- package/src/config/schema.ts +1 -12
- package/src/config/schemas/memory-lifecycle.ts +0 -9
- package/src/config/schemas/memory-processing.ts +0 -180
- package/src/config/schemas/memory-retrieval.ts +32 -104
- package/src/config/schemas/memory.ts +0 -10
- package/src/config/types.ts +0 -4
- package/src/context/window-manager.ts +4 -1
- package/src/daemon/config-watcher.ts +61 -3
- package/src/daemon/daemon-control.ts +1 -1
- package/src/daemon/date-context.ts +114 -31
- package/src/daemon/handlers/sessions.ts +18 -13
- package/src/daemon/handlers/skills.ts +20 -1
- package/src/daemon/history-repair.ts +72 -8
- package/src/daemon/host-cu-proxy.ts +55 -26
- package/src/daemon/lifecycle.ts +31 -3
- package/src/daemon/mcp-reload-service.ts +2 -2
- package/src/daemon/message-types/computer-use.ts +1 -12
- package/src/daemon/message-types/memory.ts +4 -16
- package/src/daemon/message-types/messages.ts +1 -0
- package/src/daemon/message-types/sessions.ts +4 -0
- package/src/daemon/server.ts +12 -1
- package/src/daemon/session-agent-loop-handlers.ts +38 -0
- package/src/daemon/session-agent-loop.ts +334 -48
- package/src/daemon/session-error.ts +89 -6
- package/src/daemon/session-history.ts +17 -7
- package/src/daemon/session-media-retry.ts +6 -2
- package/src/daemon/session-memory.ts +69 -149
- package/src/daemon/session-process.ts +10 -1
- package/src/daemon/session-runtime-assembly.ts +49 -19
- package/src/daemon/session-surfaces.ts +4 -1
- package/src/daemon/session-tool-setup.ts +7 -1
- package/src/daemon/session.ts +12 -2
- package/src/instrument.ts +61 -1
- package/src/memory/admin.ts +2 -191
- package/src/memory/canonical-guardian-store.ts +38 -2
- package/src/memory/conversation-crud.ts +0 -33
- package/src/memory/conversation-queries.ts +22 -3
- package/src/memory/db-init.ts +28 -0
- package/src/memory/embedding-backend.ts +84 -8
- package/src/memory/embedding-types.ts +9 -1
- package/src/memory/indexer.ts +7 -46
- package/src/memory/items-extractor.ts +274 -76
- package/src/memory/job-handlers/backfill.ts +2 -127
- package/src/memory/job-handlers/cleanup.ts +2 -16
- package/src/memory/job-handlers/extraction.ts +2 -138
- package/src/memory/job-handlers/index-maintenance.ts +1 -6
- package/src/memory/job-handlers/summarization.ts +3 -148
- package/src/memory/job-utils.ts +21 -59
- package/src/memory/jobs-store.ts +1 -159
- package/src/memory/jobs-worker.ts +9 -52
- package/src/memory/migrations/104-core-indexes.ts +3 -3
- package/src/memory/migrations/149-oauth-tables.ts +2 -0
- package/src/memory/migrations/150-oauth-apps-client-secret-path.ts +98 -0
- package/src/memory/migrations/151-oauth-providers-ping-url.ts +11 -0
- package/src/memory/migrations/152-memory-item-supersession.ts +44 -0
- package/src/memory/migrations/153-drop-entity-tables.ts +15 -0
- package/src/memory/migrations/154-drop-fts.ts +20 -0
- package/src/memory/migrations/155-drop-conflicts.ts +7 -0
- package/src/memory/migrations/156-call-session-invite-metadata.ts +24 -0
- package/src/memory/migrations/index.ts +7 -0
- package/src/memory/qdrant-client.ts +148 -51
- package/src/memory/raw-query.ts +1 -1
- package/src/memory/retriever.test.ts +294 -273
- package/src/memory/retriever.ts +421 -645
- package/src/memory/schema/calls.ts +2 -0
- package/src/memory/schema/memory-core.ts +3 -48
- package/src/memory/schema/oauth.ts +2 -0
- package/src/memory/search/formatting.ts +263 -176
- package/src/memory/search/lexical.ts +1 -254
- package/src/memory/search/ranking.ts +0 -455
- package/src/memory/search/semantic.ts +100 -14
- package/src/memory/search/staleness.ts +47 -0
- package/src/memory/search/tier-classifier.ts +21 -0
- package/src/memory/search/types.ts +15 -77
- package/src/memory/task-memory-cleanup.ts +4 -6
- package/src/messaging/providers/gmail/mime-builder.ts +17 -7
- package/src/oauth/byo-connection.test.ts +8 -1
- package/src/oauth/oauth-store.ts +113 -27
- package/src/oauth/seed-providers.ts +6 -0
- package/src/oauth/token-persistence.ts +11 -3
- package/src/permissions/defaults.ts +1 -0
- package/src/permissions/trust-store.ts +23 -1
- package/src/playbooks/playbook-compiler.ts +1 -1
- package/src/prompts/system-prompt.ts +18 -2
- package/src/providers/anthropic/client.ts +56 -126
- package/src/providers/types.ts +7 -1
- package/src/runtime/AGENTS.md +9 -0
- package/src/runtime/auth/route-policy.ts +6 -3
- package/src/runtime/guardian-reply-router.ts +24 -22
- package/src/runtime/http-server.ts +2 -2
- package/src/runtime/invite-redemption-service.ts +19 -1
- package/src/runtime/invite-service.ts +25 -0
- package/src/runtime/pending-interactions.ts +2 -2
- package/src/runtime/routes/brain-graph-routes.ts +10 -90
- package/src/runtime/routes/conversation-routes.ts +9 -1
- package/src/runtime/routes/inbound-stages/acl-enforcement.ts +21 -12
- package/src/runtime/routes/memory-item-routes.test.ts +754 -0
- package/src/runtime/routes/memory-item-routes.ts +503 -0
- package/src/runtime/routes/session-management-routes.ts +3 -3
- package/src/runtime/routes/settings-routes.ts +2 -2
- package/src/runtime/routes/trust-rules-routes.ts +14 -0
- package/src/runtime/routes/workspace-routes.ts +2 -1
- package/src/security/keychain-broker-client.ts +17 -4
- package/src/security/secure-keys.ts +25 -3
- package/src/security/token-manager.ts +36 -36
- package/src/skills/catalog-install.ts +74 -18
- package/src/skills/skillssh-registry.ts +503 -0
- package/src/tools/assets/search.ts +5 -1
- package/src/tools/computer-use/definitions.ts +0 -10
- package/src/tools/computer-use/registry.ts +1 -1
- package/src/tools/credentials/vault.ts +1 -3
- package/src/tools/memory/definitions.ts +4 -13
- package/src/tools/memory/handlers.test.ts +83 -103
- package/src/tools/memory/handlers.ts +50 -85
- package/src/tools/schedule/create.ts +8 -1
- package/src/tools/schedule/update.ts +8 -1
- package/src/tools/skills/load.ts +25 -2
- package/src/__tests__/clarification-resolver.test.ts +0 -193
- package/src/__tests__/conflict-intent-tokenization.test.ts +0 -160
- package/src/__tests__/conflict-policy.test.ts +0 -269
- package/src/__tests__/conflict-store.test.ts +0 -372
- package/src/__tests__/contradiction-checker.test.ts +0 -361
- package/src/__tests__/entity-extractor.test.ts +0 -211
- package/src/__tests__/entity-search.test.ts +0 -1117
- package/src/__tests__/profile-compiler.test.ts +0 -392
- package/src/__tests__/session-conflict-gate.test.ts +0 -1228
- package/src/__tests__/session-profile-injection.test.ts +0 -557
- package/src/config/bundled-skills/knowledge-graph/SKILL.md +0 -25
- package/src/config/bundled-skills/knowledge-graph/TOOLS.json +0 -66
- package/src/config/bundled-skills/knowledge-graph/tools/graph-query.ts +0 -211
- package/src/daemon/session-conflict-gate.ts +0 -167
- package/src/daemon/session-dynamic-profile.ts +0 -77
- package/src/memory/clarification-resolver.ts +0 -417
- package/src/memory/conflict-intent.ts +0 -205
- package/src/memory/conflict-policy.ts +0 -127
- package/src/memory/conflict-store.ts +0 -410
- package/src/memory/contradiction-checker.ts +0 -508
- package/src/memory/entity-extractor.ts +0 -535
- package/src/memory/format-recall.ts +0 -47
- package/src/memory/fts-reconciler.ts +0 -165
- package/src/memory/job-handlers/conflict.ts +0 -200
- package/src/memory/profile-compiler.ts +0 -195
- package/src/memory/recall-cache.ts +0 -117
- package/src/memory/search/entity.ts +0 -535
- package/src/memory/search/query-expansion.test.ts +0 -70
- package/src/memory/search/query-expansion.ts +0 -118
- package/src/runtime/routes/mcp-routes.ts +0 -20
|
@@ -0,0 +1,1583 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Overflow recovery test suite for JARVIS-110.
|
|
3
|
+
*
|
|
4
|
+
* Reproduces the failure modes observed in long conversations (75+ messages)
|
|
5
|
+
* where context overflow recovery fails because:
|
|
6
|
+
* 1. Progress during the agent loop bypasses the convergence retry
|
|
7
|
+
* 2. Token estimation significantly underestimates actual token count
|
|
8
|
+
* 3. No mid-loop budget check to prevent hitting the provider limit
|
|
9
|
+
*
|
|
10
|
+
* Tests 2, 3, and 4 pass against the current code.
|
|
11
|
+
* Tests 1 and 5 fail (documenting bugs to be fixed in PR 2).
|
|
12
|
+
* Tests 6 and 7 are skipped (depend on mid-loop checkpoint changes in PR 3).
|
|
13
|
+
*/
|
|
14
|
+
import { beforeEach, describe, expect, mock, test } from "bun:test";
|
|
15
|
+
|
|
16
|
+
import type {
|
|
17
|
+
AgentEvent,
|
|
18
|
+
CheckpointDecision,
|
|
19
|
+
CheckpointInfo,
|
|
20
|
+
} from "../agent/loop.js";
|
|
21
|
+
import type { ServerMessage } from "../daemon/message-protocol.js";
|
|
22
|
+
import type { ContentBlock, Message } from "../providers/types.js";
|
|
23
|
+
|
|
24
|
+
// ── Module mocks (must precede imports of the module under test) ─────
|
|
25
|
+
|
|
26
|
+
mock.module("../util/logger.js", () => ({
|
|
27
|
+
getLogger: () =>
|
|
28
|
+
new Proxy({} as Record<string, unknown>, { get: () => () => {} }),
|
|
29
|
+
}));
|
|
30
|
+
|
|
31
|
+
mock.module("../util/platform.js", () => ({
|
|
32
|
+
getDataDir: () => "/tmp",
|
|
33
|
+
}));
|
|
34
|
+
|
|
35
|
+
mock.module("../config/loader.js", () => ({
|
|
36
|
+
getConfig: () => ({
|
|
37
|
+
provider: "mock-provider",
|
|
38
|
+
maxTokens: 4096,
|
|
39
|
+
thinking: false,
|
|
40
|
+
contextWindow: {
|
|
41
|
+
maxInputTokens: 200_000,
|
|
42
|
+
thresholdTokens: 160_000,
|
|
43
|
+
preserveRecentMessages: 6,
|
|
44
|
+
summaryModel: "mock-model",
|
|
45
|
+
maxSummaryTokens: 512,
|
|
46
|
+
overflowRecovery: {
|
|
47
|
+
enabled: true,
|
|
48
|
+
safetyMarginRatio: 0.05,
|
|
49
|
+
maxAttempts: 3,
|
|
50
|
+
interactiveLatestTurnCompression: "summarize",
|
|
51
|
+
nonInteractiveLatestTurnCompression: "truncate",
|
|
52
|
+
},
|
|
53
|
+
},
|
|
54
|
+
rateLimit: { maxRequestsPerMinute: 0, maxTokensPerSession: 0 },
|
|
55
|
+
apiKeys: {},
|
|
56
|
+
workspaceGit: { turnCommitMaxWaitMs: 10 },
|
|
57
|
+
ui: {},
|
|
58
|
+
}),
|
|
59
|
+
loadRawConfig: () => ({}),
|
|
60
|
+
saveRawConfig: () => {},
|
|
61
|
+
invalidateConfigCache: () => {},
|
|
62
|
+
}));
|
|
63
|
+
|
|
64
|
+
// ── Overflow recovery mocks ──────────────────────────────────────────
|
|
65
|
+
|
|
66
|
+
// Token estimator stub. Each test steers it through `mockEstimateTokens`,
// which holds either a fixed token count or a thunk that is evaluated on
// every estimate.
let mockEstimateTokens: number | (() => number) = 1000;
mock.module("../context/token-estimator.js", () => {
  const estimatePromptTokens = (): number => {
    // Read the current setting at call time so tests can swap it mid-run.
    if (typeof mockEstimateTokens === "function") {
      return mockEstimateTokens();
    }
    return mockEstimateTokens;
  };

  return { estimatePromptTokens };
});
|
|
75
|
+
|
|
76
|
+
// Reducer stub. A test may install `mockReducerStepFn` to script a reduction
// step; when none is installed the stub returns the messages unchanged and
// reports an exhausted state that lists all four tiers as applied.
let mockReducerStepFn:
  | ((msgs: Message[], cfg: unknown, state: unknown) => unknown)
  | null = null;
mock.module("../daemon/context-overflow-reducer.js", () => {
  // Default outcome, built fresh on every call.
  const buildExhaustedResult = (msgs: Message[]) => ({
    messages: msgs,
    tier: "forced_compaction",
    state: {
      appliedTiers: [
        "forced_compaction",
        "tool_result_truncation",
        "media_stubbing",
        "injection_downgrade",
      ],
      injectionMode: "full",
      exhausted: true,
    },
    estimatedTokens: 1000,
  });

  return {
    createInitialReducerState: () => {
      return {
        appliedTiers: [],
        injectionMode: "full" as const,
        exhausted: false,
      };
    },
    reduceContextOverflow: async (
      msgs: Message[],
      cfg: unknown,
      state: unknown,
    ) => {
      // A scripted step, when present, fully replaces the default outcome.
      const scriptedStep = mockReducerStepFn;
      return scriptedStep
        ? scriptedStep(msgs, cfg, state)
        : buildExhaustedResult(msgs);
    },
  };
});
|
|
109
|
+
|
|
110
|
+
// Policy: default to fail_gracefully
|
|
111
|
+
let mockOverflowAction: string = "fail_gracefully";
|
|
112
|
+
mock.module("../daemon/context-overflow-policy.js", () => ({
|
|
113
|
+
resolveOverflowAction: () => mockOverflowAction,
|
|
114
|
+
}));
|
|
115
|
+
|
|
116
|
+
// Approval: default to denied
|
|
117
|
+
let mockApprovalResult = { approved: false };
|
|
118
|
+
mock.module("../daemon/context-overflow-approval.js", () => ({
|
|
119
|
+
requestCompressionApproval: async () => mockApprovalResult,
|
|
120
|
+
CONTEXT_OVERFLOW_TOOL_NAME: "context_overflow_compression",
|
|
121
|
+
}));
|
|
122
|
+
|
|
123
|
+
let hookBlocked = false;
|
|
124
|
+
let hookBlockedBy = "";
|
|
125
|
+
|
|
126
|
+
mock.module("../hooks/manager.js", () => ({
|
|
127
|
+
getHookManager: () => ({
|
|
128
|
+
trigger: async (hookName: string) => {
|
|
129
|
+
if (hookName === "pre-message" && hookBlocked) {
|
|
130
|
+
return { blocked: true, blockedBy: hookBlockedBy };
|
|
131
|
+
}
|
|
132
|
+
return { blocked: false };
|
|
133
|
+
},
|
|
134
|
+
}),
|
|
135
|
+
}));
|
|
136
|
+
|
|
137
|
+
mock.module("../memory/conversation-crud.js", () => ({
|
|
138
|
+
getConversationThreadType: () => "default",
|
|
139
|
+
setConversationOriginChannelIfUnset: () => {},
|
|
140
|
+
updateConversationUsage: () => {},
|
|
141
|
+
getMessages: () => [],
|
|
142
|
+
getConversation: () => ({
|
|
143
|
+
id: "conv-1",
|
|
144
|
+
contextSummary: null,
|
|
145
|
+
contextCompactedMessageCount: 0,
|
|
146
|
+
totalInputTokens: 0,
|
|
147
|
+
totalOutputTokens: 0,
|
|
148
|
+
totalEstimatedCost: 0,
|
|
149
|
+
title: null,
|
|
150
|
+
}),
|
|
151
|
+
provenanceFromTrustContext: () => ({
|
|
152
|
+
source: "user",
|
|
153
|
+
trustContext: undefined,
|
|
154
|
+
}),
|
|
155
|
+
getConversationOriginInterface: () => null,
|
|
156
|
+
addMessage: () => ({ id: "mock-msg-id" }),
|
|
157
|
+
deleteMessageById: () => {},
|
|
158
|
+
updateConversationContextWindow: () => {},
|
|
159
|
+
updateConversationTitle: () => {},
|
|
160
|
+
getConversationOriginChannel: () => null,
|
|
161
|
+
getMessageById: () => null,
|
|
162
|
+
updateMessageContent: () => {},
|
|
163
|
+
}));
|
|
164
|
+
|
|
165
|
+
mock.module("../memory/retriever.js", () => ({
|
|
166
|
+
buildMemoryRecall: async () => ({
|
|
167
|
+
enabled: false,
|
|
168
|
+
degraded: false,
|
|
169
|
+
injectedText: "",
|
|
170
|
+
|
|
171
|
+
semanticHits: 0,
|
|
172
|
+
recencyHits: 0,
|
|
173
|
+
injectedTokens: 0,
|
|
174
|
+
latencyMs: 0,
|
|
175
|
+
}),
|
|
176
|
+
stripMemoryRecallMessages: (msgs: Message[]) => msgs,
|
|
177
|
+
}));
|
|
178
|
+
|
|
179
|
+
mock.module("../memory/app-store.js", () => ({
|
|
180
|
+
getApp: () => null,
|
|
181
|
+
listAppFiles: () => [],
|
|
182
|
+
getAppsDir: () => "/tmp/apps",
|
|
183
|
+
}));
|
|
184
|
+
|
|
185
|
+
mock.module("../memory/app-git-service.js", () => ({
|
|
186
|
+
commitAppTurnChanges: () => Promise.resolve(),
|
|
187
|
+
}));
|
|
188
|
+
|
|
189
|
+
mock.module("../daemon/session-memory.js", () => ({
|
|
190
|
+
prepareMemoryContext: async (
|
|
191
|
+
_ctx: unknown,
|
|
192
|
+
_content: string,
|
|
193
|
+
_id: string,
|
|
194
|
+
_signal: AbortSignal,
|
|
195
|
+
) => ({
|
|
196
|
+
runMessages: [],
|
|
197
|
+
recall: {
|
|
198
|
+
enabled: false,
|
|
199
|
+
degraded: false,
|
|
200
|
+
injectedText: "",
|
|
201
|
+
|
|
202
|
+
semanticHits: 0,
|
|
203
|
+
recencyHits: 0,
|
|
204
|
+
injectedTokens: 0,
|
|
205
|
+
latencyMs: 0,
|
|
206
|
+
tier1Count: 0,
|
|
207
|
+
tier2Count: 0,
|
|
208
|
+
hybridSearchMs: 0,
|
|
209
|
+
},
|
|
210
|
+
}),
|
|
211
|
+
}));
|
|
212
|
+
|
|
213
|
+
mock.module("../daemon/session-runtime-assembly.js", () => ({
|
|
214
|
+
applyRuntimeInjections: (msgs: Message[]) => msgs,
|
|
215
|
+
stripInjectedContext: (msgs: Message[]) => msgs,
|
|
216
|
+
}));
|
|
217
|
+
|
|
218
|
+
mock.module("../daemon/date-context.js", () => ({
|
|
219
|
+
buildTemporalContext: () => null,
|
|
220
|
+
}));
|
|
221
|
+
|
|
222
|
+
mock.module("../daemon/history-repair.js", () => ({
|
|
223
|
+
repairHistory: (msgs: Message[]) => ({
|
|
224
|
+
messages: msgs,
|
|
225
|
+
stats: {
|
|
226
|
+
assistantToolResultsMigrated: 0,
|
|
227
|
+
missingToolResultsInserted: 0,
|
|
228
|
+
orphanToolResultsDowngraded: 0,
|
|
229
|
+
consecutiveSameRoleMerged: 0,
|
|
230
|
+
},
|
|
231
|
+
}),
|
|
232
|
+
deepRepairHistory: (msgs: Message[]) => ({ messages: msgs, stats: {} }),
|
|
233
|
+
}));
|
|
234
|
+
|
|
235
|
+
mock.module("../daemon/session-history.js", () => ({
|
|
236
|
+
consolidateAssistantMessages: () => {},
|
|
237
|
+
}));
|
|
238
|
+
|
|
239
|
+
const recordUsageMock = mock(() => {});
|
|
240
|
+
mock.module("../daemon/session-usage.js", () => ({
|
|
241
|
+
recordUsage: recordUsageMock,
|
|
242
|
+
}));
|
|
243
|
+
|
|
244
|
+
const resolveAssistantAttachmentsMock = mock(async () => ({
|
|
245
|
+
assistantAttachments: [],
|
|
246
|
+
emittedAttachments: [],
|
|
247
|
+
directiveWarnings: [],
|
|
248
|
+
}));
|
|
249
|
+
mock.module("../daemon/session-attachments.js", () => ({
|
|
250
|
+
resolveAssistantAttachments: resolveAssistantAttachmentsMock,
|
|
251
|
+
approveHostAttachmentRead: async () => true,
|
|
252
|
+
formatAttachmentWarnings: () => "",
|
|
253
|
+
}));
|
|
254
|
+
|
|
255
|
+
mock.module("../daemon/assistant-attachments.js", () => ({
|
|
256
|
+
cleanAssistantContent: (content: unknown[]) => ({
|
|
257
|
+
cleanedContent: content,
|
|
258
|
+
directives: [],
|
|
259
|
+
warnings: [],
|
|
260
|
+
}),
|
|
261
|
+
drainDirectiveDisplayBuffer: (buffer: string) => ({
|
|
262
|
+
emitText: buffer,
|
|
263
|
+
bufferedRemainder: "",
|
|
264
|
+
}),
|
|
265
|
+
}));
|
|
266
|
+
|
|
267
|
+
mock.module("../daemon/session-media-retry.js", () => ({
|
|
268
|
+
stripMediaPayloadsForRetry: (msgs: Message[]) => ({
|
|
269
|
+
messages: msgs,
|
|
270
|
+
modified: false,
|
|
271
|
+
replacedBlocks: 0,
|
|
272
|
+
latestUserIndex: null,
|
|
273
|
+
}),
|
|
274
|
+
raceWithTimeout: async () => "completed" as const,
|
|
275
|
+
}));
|
|
276
|
+
|
|
277
|
+
mock.module("../workspace/turn-commit.js", () => ({
|
|
278
|
+
commitTurnChanges: async () => {},
|
|
279
|
+
}));
|
|
280
|
+
|
|
281
|
+
mock.module("../workspace/git-service.js", () => ({
|
|
282
|
+
getWorkspaceGitService: () => ({
|
|
283
|
+
ensureInitialized: async () => {},
|
|
284
|
+
}),
|
|
285
|
+
}));
|
|
286
|
+
|
|
287
|
+
// Stand-in for the session-error module: classification always yields the
// same generic processing failure, while the cancellation and
// context-too-large predicates are implemented inline.
mock.module("../daemon/session-error.js", () => {
  // Provider error texts that the stub treats as "context too large".
  const contextTooLargePattern =
    /context.?length.?exceeded|prompt.?is.?too.?long|too many.*input.*tokens/i;

  const classifySessionError = (_err: unknown, _ctx: unknown) => {
    return {
      code: "SESSION_PROCESSING_FAILED",
      userMessage: "Something went wrong processing your message.",
      retryable: false,
      errorCategory: "processing_failed",
    };
  };

  const isUserCancellation = (err: unknown, ctx: { aborted?: boolean }) => {
    // Only an aborted context can count as a user cancellation.
    if (!ctx.aborted) {
      return false;
    }
    return (
      (err instanceof DOMException || err instanceof Error) &&
      err.name === "AbortError"
    );
  };

  const buildSessionErrorMessage = (
    sessionId: string,
    classified: Record<string, unknown>,
  ) => {
    // Classified fields are spread last, so they win on key collisions.
    return { type: "session_error", sessionId, ...classified };
  };

  const isContextTooLarge = (msg: string) => contextTooLargePattern.test(msg);

  return {
    classifySessionError,
    isUserCancellation,
    buildSessionErrorMessage,
    isContextTooLarge,
  };
});
|
|
313
|
+
|
|
314
|
+
mock.module("../daemon/session-slash.js", () => ({
|
|
315
|
+
isProviderOrderingError: (msg: string) =>
|
|
316
|
+
/ordering|before.*after|messages.*order/i.test(msg),
|
|
317
|
+
}));
|
|
318
|
+
|
|
319
|
+
mock.module("../util/truncate.js", () => ({
|
|
320
|
+
truncate: (s: string) => s,
|
|
321
|
+
}));
|
|
322
|
+
|
|
323
|
+
mock.module("../agent/message-types.js", () => ({
|
|
324
|
+
createAssistantMessage: (text: string) => ({
|
|
325
|
+
role: "assistant" as const,
|
|
326
|
+
content: [{ type: "text", text }],
|
|
327
|
+
}),
|
|
328
|
+
}));
|
|
329
|
+
|
|
330
|
+
mock.module("../memory/llm-request-log-store.js", () => ({
|
|
331
|
+
recordRequestLog: () => {},
|
|
332
|
+
}));
|
|
333
|
+
|
|
334
|
+
// ── Imports (after mocks) ────────────────────────────────────────────
|
|
335
|
+
|
|
336
|
+
import {
|
|
337
|
+
type AgentLoopSessionContext,
|
|
338
|
+
runAgentLoopImpl,
|
|
339
|
+
} from "../daemon/session-agent-loop.js";
|
|
340
|
+
|
|
341
|
+
// ── Test helpers ─────────────────────────────────────────────────────
|
|
342
|
+
|
|
343
|
+
type AgentLoopRun = (
|
|
344
|
+
messages: Message[],
|
|
345
|
+
onEvent: (event: AgentEvent) => void,
|
|
346
|
+
signal?: AbortSignal,
|
|
347
|
+
requestId?: string,
|
|
348
|
+
onCheckpoint?: (checkpoint: CheckpointInfo) => CheckpointDecision,
|
|
349
|
+
) => Promise<Message[]>;
|
|
350
|
+
|
|
351
|
+
/**
 * Assemble a stubbed `AgentLoopSessionContext` for the overflow tests.
 *
 * Every collaborator is replaced by an inert stand-in. The agent loop's `run`
 * defaults to appending a single assistant text message ("response") to the
 * messages it receives.
 *
 * @param overrides Fields spread last over the defaults, so they replace any
 *   default with the same key. `agentLoopRun`, when given, becomes the
 *   `agentLoop.run` implementation.
 * @returns The stub object, cast to `AgentLoopSessionContext`.
 */
function makeCtx(
  overrides?: Partial<AgentLoopSessionContext> & {
    agentLoopRun?: AgentLoopRun;
  },
): AgentLoopSessionContext {
  // Default loop behaviour: echo the history plus one assistant reply.
  const echoWithReply = async (messages: Message[]) => [
    ...messages,
    {
      role: "assistant" as const,
      content: [{ type: "text" as const, text: "response" }],
    },
  ];
  const agentLoopRun = overrides?.agentLoopRun ?? echoWithReply;

  const seedMessages = [
    { role: "user", content: [{ type: "text", text: "Hello" }] },
  ] as Message[];

  const agentLoopStub = {
    run: agentLoopRun,
  } as unknown as AgentLoopSessionContext["agentLoop"];

  const providerStub = {
    name: "mock-provider",
    sendMessage: async () => ({
      content: [{ type: "text", text: "title" }],
      model: "mock",
      usage: { inputTokens: 0, outputTokens: 0 },
      stopReason: "end_turn",
    }),
  } as unknown as AgentLoopSessionContext["provider"];

  // Never asks for compaction and never performs one.
  const contextWindowManagerStub = {
    shouldCompact: () => ({ needed: false, estimatedTokens: 0 }),
    maybeCompact: async () => ({ compacted: false }),
  } as unknown as AgentLoopSessionContext["contextWindowManager"];

  const traceEmitterStub = {
    emit: () => {},
  } as unknown as AgentLoopSessionContext["traceEmitter"];

  const profilerStub = {
    startRequest: () => {},
    emitSummary: () => {},
  } as unknown as AgentLoopSessionContext["profiler"];

  return {
    conversationId: "test-conv",
    messages: seedMessages,
    processing: true,
    abortController: new AbortController(),
    currentRequestId: "test-req",

    agentLoop: agentLoopStub,
    provider: providerStub,
    systemPrompt: "system prompt",

    contextWindowManager: contextWindowManagerStub,
    contextCompactedMessageCount: 0,
    contextCompactedAt: null,

    memoryPolicy: { scopeId: "default", includeDefaultFallback: true },

    currentActiveSurfaceId: undefined,
    currentPage: undefined,
    surfaceState: new Map(),
    pendingSurfaceActions: new Map(),
    surfaceActionRequestIds: new Set<string>(),
    currentTurnSurfaces: [],

    workingDir: "/tmp",
    workspaceTopLevelContext: null,
    workspaceTopLevelDirty: false,
    channelCapabilities: undefined,
    commandIntent: undefined,
    trustContext: undefined,

    coreToolNames: new Set(),
    allowedToolNames: undefined,
    preactivatedSkillIds: undefined,
    skillProjectionState: new Map(),
    skillProjectionCache:
      new Map() as unknown as AgentLoopSessionContext["skillProjectionCache"],

    traceEmitter: traceEmitterStub,
    profiler: profilerStub,
    usageStats: {
      totalInputTokens: 0,
      totalOutputTokens: 0,
      totalEstimatedCost: 0,
      model: "",
    },
    turnCount: 0,

    lastAssistantAttachments: [],
    lastAttachmentWarnings: [],

    hasNoClient: false,
    prompter: {} as unknown as AgentLoopSessionContext["prompter"],
    queue: {} as unknown as AgentLoopSessionContext["queue"],

    getWorkspaceGitService: () => ({ ensureInitialized: async () => {} }),
    commitTurnChanges: async () => {},

    refreshWorkspaceTopLevelContextIfNeeded: () => {},
    markWorkspaceTopLevelDirty: () => {},
    emitActivityState: () => {},
    getQueueDepth: () => 0,
    hasQueuedMessages: () => false,
    canHandoffAtCheckpoint: () => false,
    drainQueue: () => {},
    getTurnInterfaceContext: () => null,
    getTurnChannelContext: () => ({
      userMessageChannel: "vellum" as const,
      assistantMessageChannel: "vellum" as const,
    }),

    // Caller-supplied fields go last so they take precedence over the defaults.
    ...overrides,
  } as AgentLoopSessionContext;
}
|
|
460
|
+
|
|
461
|
+
/**
 * Build a synthetic long conversation with interleaved tool calls.
 *
 * Messages cycle in groups of three, keyed on the message index `i`:
 *   - `i % 3 === 0`: a user text message,
 *   - `i % 3 === 1`: an assistant message with a text block and a `tool_use`
 *     block (id `tool-${i}`; tool name alternates between "bash" and
 *     "file_read" from one group to the next),
 *   - `i % 3 === 2`: a user message carrying the `tool_result` for the
 *     preceding `tool_use`.
 *
 * Note: when `messageCount % 3 === 2` the conversation ends on an assistant
 * `tool_use` that has no matching `tool_result`.
 *
 * @param messageCount Number of messages to generate; values <= 0 yield `[]`.
 * @returns The generated messages, in order.
 */
function buildLongConversation(messageCount: number): Message[] {
  const messages: Message[] = [];
  for (let i = 0; i < messageCount; i++) {
    switch (i % 3) {
      case 0:
        // User text message
        messages.push({
          role: "user",
          content: [
            {
              type: "text",
              text: `User message ${i}: ${"x".repeat(200)} some detailed instructions about the task at hand`,
            },
          ],
        });
        break;
      case 1:
        // Assistant with tool_use
        messages.push({
          role: "assistant",
          content: [
            { type: "text", text: `Thinking about step ${i}...` },
            {
              type: "tool_use",
              id: `tool-${i}`,
              name: i % 6 === 1 ? "bash" : "file_read",
              input: {
                command: `some command ${i}`,
                path: `/path/to/file-${i}.ts`,
              },
            },
          ],
        });
        break;
      default:
        // User with tool_result answering the tool_use one index earlier
        messages.push({
          role: "user",
          content: [
            {
              type: "tool_result",
              tool_use_id: `tool-${i - 1}`,
              content: `Result of tool call ${i - 1}: ${"output data ".repeat(50)}`,
              is_error: false,
            },
          ],
        });
        break;
    }
  }
  // `messages` is already declared as Message[], so no type assertion is needed.
  return messages;
}
|
|
514
|
+
|
|
515
|
+
// ── Tests ────────────────────────────────────────────────────────────
|
|
516
|
+
|
|
517
|
+
// Put every module-level mock knob back to its baseline before each test.
beforeEach(() => {
  // Clear the call history on the usage spy.
  recordUsageMock.mockClear();

  // Hook, approval and overflow behaviour.
  hookBlocked = false;
  hookBlockedBy = "";
  mockApprovalResult = { approved: false };
  mockOverflowAction = "fail_gracefully";

  // Token estimation and reducer stubs.
  mockEstimateTokens = 1000;
  mockReducerStepFn = null;
});
|
|
526
|
+
|
|
527
|
+
describe("session-agent-loop overflow recovery (JARVIS-110)", () => {
|
|
528
|
+
// ── Test 1 ────────────────────────────────────────────────────────
|
|
529
|
+
// BUG: When the agent loop makes progress (adds messages to history)
|
|
530
|
+
// before hitting context_too_large, the convergence loop at line 864
|
|
531
|
+
// checks `updatedHistory.length === preRunHistoryLength` which is
|
|
532
|
+
// false when progress was made. This means the reducer is never
|
|
533
|
+
// invoked — the error is surfaced immediately at line 1163-1175
|
|
534
|
+
// without any compaction attempt.
|
|
535
|
+
//
|
|
536
|
+
// Expected behavior (PR 2 fix): After progress + context_too_large,
|
|
537
|
+
// the system should still attempt compaction before surfacing error.
|
|
538
|
+
test("context too large after progress triggers compaction retry instead of immediate failure", async () => {
|
|
539
|
+
const events: ServerMessage[] = [];
|
|
540
|
+
let reducerCalled = false;
|
|
541
|
+
|
|
542
|
+
mockReducerStepFn = (msgs: Message[]) => {
|
|
543
|
+
reducerCalled = true;
|
|
544
|
+
return {
|
|
545
|
+
messages: msgs,
|
|
546
|
+
tier: "forced_compaction",
|
|
547
|
+
state: {
|
|
548
|
+
appliedTiers: ["forced_compaction"],
|
|
549
|
+
injectionMode: "full",
|
|
550
|
+
exhausted: false,
|
|
551
|
+
},
|
|
552
|
+
estimatedTokens: 50_000,
|
|
553
|
+
compactionResult: {
|
|
554
|
+
compacted: true,
|
|
555
|
+
messages: msgs,
|
|
556
|
+
compactedPersistedMessages: 5,
|
|
557
|
+
summaryText: "Summary",
|
|
558
|
+
previousEstimatedInputTokens: 190_000,
|
|
559
|
+
estimatedInputTokens: 50_000,
|
|
560
|
+
maxInputTokens: 200_000,
|
|
561
|
+
thresholdTokens: 160_000,
|
|
562
|
+
compactedMessages: 10,
|
|
563
|
+
summaryCalls: 1,
|
|
564
|
+
summaryInputTokens: 500,
|
|
565
|
+
summaryOutputTokens: 200,
|
|
566
|
+
summaryModel: "mock-model",
|
|
567
|
+
},
|
|
568
|
+
};
|
|
569
|
+
};
|
|
570
|
+
|
|
571
|
+
let agentLoopCallCount = 0;
|
|
572
|
+
const agentLoopRun: AgentLoopRun = async (messages, onEvent) => {
|
|
573
|
+
agentLoopCallCount++;
|
|
574
|
+
if (agentLoopCallCount === 1) {
|
|
575
|
+
// Simulate: agent makes progress (tool calls + results added)
|
|
576
|
+
// then hits context_too_large on next LLM call
|
|
577
|
+
const progressMessages: Message[] = [
|
|
578
|
+
...messages,
|
|
579
|
+
{
|
|
580
|
+
role: "assistant" as const,
|
|
581
|
+
content: [
|
|
582
|
+
{ type: "text", text: "Let me check that." },
|
|
583
|
+
{
|
|
584
|
+
type: "tool_use",
|
|
585
|
+
id: "tu-progress",
|
|
586
|
+
name: "bash",
|
|
587
|
+
input: { command: "ls" },
|
|
588
|
+
},
|
|
589
|
+
] as ContentBlock[],
|
|
590
|
+
},
|
|
591
|
+
{
|
|
592
|
+
role: "user" as const,
|
|
593
|
+
content: [
|
|
594
|
+
{
|
|
595
|
+
type: "tool_result",
|
|
596
|
+
tool_use_id: "tu-progress",
|
|
597
|
+
content: "file1.ts\nfile2.ts",
|
|
598
|
+
is_error: false,
|
|
599
|
+
},
|
|
600
|
+
] as ContentBlock[],
|
|
601
|
+
},
|
|
602
|
+
];
|
|
603
|
+
|
|
604
|
+
// Emit events for the progress that was made
|
|
605
|
+
onEvent({
|
|
606
|
+
type: "tool_use",
|
|
607
|
+
id: "tu-progress",
|
|
608
|
+
name: "bash",
|
|
609
|
+
input: { command: "ls" },
|
|
610
|
+
});
|
|
611
|
+
onEvent({
|
|
612
|
+
type: "tool_result",
|
|
613
|
+
toolUseId: "tu-progress",
|
|
614
|
+
content: "file1.ts\nfile2.ts",
|
|
615
|
+
isError: false,
|
|
616
|
+
});
|
|
617
|
+
onEvent({
|
|
618
|
+
type: "message_complete",
|
|
619
|
+
message: {
|
|
620
|
+
role: "assistant",
|
|
621
|
+
content: [
|
|
622
|
+
{ type: "text", text: "Let me check that." },
|
|
623
|
+
{
|
|
624
|
+
type: "tool_use",
|
|
625
|
+
id: "tu-progress",
|
|
626
|
+
name: "bash",
|
|
627
|
+
input: { command: "ls" },
|
|
628
|
+
},
|
|
629
|
+
],
|
|
630
|
+
},
|
|
631
|
+
});
|
|
632
|
+
onEvent({
|
|
633
|
+
type: "usage",
|
|
634
|
+
inputTokens: 100,
|
|
635
|
+
outputTokens: 50,
|
|
636
|
+
model: "test-model",
|
|
637
|
+
providerDurationMs: 100,
|
|
638
|
+
});
|
|
639
|
+
|
|
640
|
+
// Then context_too_large error occurs on the *next* LLM call
|
|
641
|
+
onEvent({
|
|
642
|
+
type: "error",
|
|
643
|
+
error: new Error(
|
|
644
|
+
"prompt is too long: 242201 tokens > 200000 maximum",
|
|
645
|
+
),
|
|
646
|
+
});
|
|
647
|
+
onEvent({
|
|
648
|
+
type: "usage",
|
|
649
|
+
inputTokens: 0,
|
|
650
|
+
outputTokens: 0,
|
|
651
|
+
model: "test-model",
|
|
652
|
+
providerDurationMs: 10,
|
|
653
|
+
});
|
|
654
|
+
|
|
655
|
+
// Return the history WITH progress (more messages than input)
|
|
656
|
+
return progressMessages;
|
|
657
|
+
}
|
|
658
|
+
|
|
659
|
+
// Second call (after compaction): succeed
|
|
660
|
+
onEvent({
|
|
661
|
+
type: "message_complete",
|
|
662
|
+
message: {
|
|
663
|
+
role: "assistant",
|
|
664
|
+
content: [{ type: "text", text: "recovered after compaction" }],
|
|
665
|
+
},
|
|
666
|
+
});
|
|
667
|
+
onEvent({
|
|
668
|
+
type: "usage",
|
|
669
|
+
inputTokens: 50,
|
|
670
|
+
outputTokens: 25,
|
|
671
|
+
model: "test-model",
|
|
672
|
+
providerDurationMs: 100,
|
|
673
|
+
});
|
|
674
|
+
return [
|
|
675
|
+
...messages,
|
|
676
|
+
{
|
|
677
|
+
role: "assistant" as const,
|
|
678
|
+
content: [
|
|
679
|
+
{ type: "text", text: "recovered after compaction" },
|
|
680
|
+
] as ContentBlock[],
|
|
681
|
+
},
|
|
682
|
+
];
|
|
683
|
+
};
|
|
684
|
+
|
|
685
|
+
const ctx = makeCtx({
|
|
686
|
+
agentLoopRun,
|
|
687
|
+
contextWindowManager: {
|
|
688
|
+
shouldCompact: () => ({ needed: false, estimatedTokens: 0 }),
|
|
689
|
+
maybeCompact: async () => ({ compacted: false }),
|
|
690
|
+
} as unknown as AgentLoopSessionContext["contextWindowManager"],
|
|
691
|
+
});
|
|
692
|
+
|
|
693
|
+
await runAgentLoopImpl(ctx, "hello", "msg-1", (msg) => events.push(msg));
|
|
694
|
+
|
|
695
|
+
// BUG: Currently the reducer is NOT called when progress was made before
|
|
696
|
+
// context_too_large. The error is surfaced immediately.
|
|
697
|
+
// After PR 2 fix, the reducer SHOULD be called to attempt compaction.
|
|
698
|
+
expect(reducerCalled).toBe(true);
|
|
699
|
+
|
|
700
|
+
// BUG: Currently a session_error IS emitted instead of retrying.
|
|
701
|
+
// After PR 2 fix, there should be no session_error.
|
|
702
|
+
const sessionError = events.find((e) => e.type === "session_error");
|
|
703
|
+
expect(sessionError).toBeUndefined();
|
|
704
|
+
});
|
|
705
|
+
|
|
706
|
+
// ── Test 2 ────────────────────────────────────────────────────────
|
|
707
|
+
// When estimation says we're within budget but the provider rejects,
|
|
708
|
+
// the post-run convergence loop should kick in and recover.
|
|
709
|
+
// This test should PASS against current code (when no progress is made).
|
|
710
|
+
test("overflow recovery compacts below limit even when estimation underestimates", async () => {
|
|
711
|
+
const events: ServerMessage[] = [];
|
|
712
|
+
let callCount = 0;
|
|
713
|
+
let reducerCalled = false;
|
|
714
|
+
|
|
715
|
+
// Estimator says 185k (below 190k budget = 200k * 0.95)
|
|
716
|
+
mockEstimateTokens = 185_000;
|
|
717
|
+
|
|
718
|
+
// Reducer successfully compacts
|
|
719
|
+
mockReducerStepFn = (msgs: Message[]) => {
|
|
720
|
+
reducerCalled = true;
|
|
721
|
+
return {
|
|
722
|
+
messages: msgs,
|
|
723
|
+
tier: "forced_compaction",
|
|
724
|
+
state: {
|
|
725
|
+
appliedTiers: ["forced_compaction"],
|
|
726
|
+
injectionMode: "full",
|
|
727
|
+
exhausted: false,
|
|
728
|
+
},
|
|
729
|
+
estimatedTokens: 100_000,
|
|
730
|
+
compactionResult: {
|
|
731
|
+
compacted: true,
|
|
732
|
+
messages: msgs,
|
|
733
|
+
compactedPersistedMessages: 10,
|
|
734
|
+
summaryText: "Summary",
|
|
735
|
+
previousEstimatedInputTokens: 185_000,
|
|
736
|
+
estimatedInputTokens: 100_000,
|
|
737
|
+
maxInputTokens: 200_000,
|
|
738
|
+
thresholdTokens: 160_000,
|
|
739
|
+
compactedMessages: 20,
|
|
740
|
+
summaryCalls: 1,
|
|
741
|
+
summaryInputTokens: 800,
|
|
742
|
+
summaryOutputTokens: 300,
|
|
743
|
+
summaryModel: "mock-model",
|
|
744
|
+
},
|
|
745
|
+
};
|
|
746
|
+
};
|
|
747
|
+
|
|
748
|
+
const agentLoopRun: AgentLoopRun = async (messages, onEvent) => {
|
|
749
|
+
callCount++;
|
|
750
|
+
if (callCount === 1) {
|
|
751
|
+
// Provider rejects with "prompt is too long: 242201 tokens > 200000"
|
|
752
|
+
// even though estimator said 185k
|
|
753
|
+
onEvent({
|
|
754
|
+
type: "error",
|
|
755
|
+
error: new Error(
|
|
756
|
+
"prompt is too long: 242201 tokens > 200000 maximum",
|
|
757
|
+
),
|
|
758
|
+
});
|
|
759
|
+
onEvent({
|
|
760
|
+
type: "usage",
|
|
761
|
+
inputTokens: 0,
|
|
762
|
+
outputTokens: 0,
|
|
763
|
+
model: "test-model",
|
|
764
|
+
providerDurationMs: 10,
|
|
765
|
+
});
|
|
766
|
+
// No progress — return same messages
|
|
767
|
+
return messages;
|
|
768
|
+
}
|
|
769
|
+
// Second call succeeds
|
|
770
|
+
onEvent({
|
|
771
|
+
type: "message_complete",
|
|
772
|
+
message: {
|
|
773
|
+
role: "assistant",
|
|
774
|
+
content: [{ type: "text", text: "recovered" }],
|
|
775
|
+
},
|
|
776
|
+
});
|
|
777
|
+
onEvent({
|
|
778
|
+
type: "usage",
|
|
779
|
+
inputTokens: 80_000,
|
|
780
|
+
outputTokens: 200,
|
|
781
|
+
model: "test-model",
|
|
782
|
+
providerDurationMs: 500,
|
|
783
|
+
});
|
|
784
|
+
return [
|
|
785
|
+
...messages,
|
|
786
|
+
{
|
|
787
|
+
role: "assistant" as const,
|
|
788
|
+
content: [{ type: "text", text: "recovered" }] as ContentBlock[],
|
|
789
|
+
},
|
|
790
|
+
];
|
|
791
|
+
};
|
|
792
|
+
|
|
793
|
+
const ctx = makeCtx({
|
|
794
|
+
agentLoopRun,
|
|
795
|
+
contextWindowManager: {
|
|
796
|
+
shouldCompact: () => ({ needed: false, estimatedTokens: 0 }),
|
|
797
|
+
maybeCompact: async () => ({ compacted: false }),
|
|
798
|
+
} as unknown as AgentLoopSessionContext["contextWindowManager"],
|
|
799
|
+
});
|
|
800
|
+
|
|
801
|
+
await runAgentLoopImpl(ctx, "hello", "msg-1", (msg) => events.push(msg));
|
|
802
|
+
|
|
803
|
+
// The reducer should be called in the convergence loop
|
|
804
|
+
expect(reducerCalled).toBe(true);
|
|
805
|
+
// Should recover without session_error
|
|
806
|
+
const sessionError = events.find((e) => e.type === "session_error");
|
|
807
|
+
expect(sessionError).toBeUndefined();
|
|
808
|
+
expect(callCount).toBe(2);
|
|
809
|
+
});
|
|
810
|
+
|
|
811
|
+
// ── Test 3 ────────────────────────────────────────────────────────
|
|
812
|
+
// BUG: When the provider rejection reveals actual token count (e.g.,
|
|
813
|
+
// "242201 tokens > 200000"), the reducer should target a budget below
|
|
814
|
+
// the actual limit (not below the estimator's inaccurate budget).
|
|
815
|
+
// Currently the reducer always uses `preflightBudget` (190k) as the
|
|
816
|
+
// target, but the actual tokens were 242k — so 190k is already too
|
|
817
|
+
// high relative to the real count. The target should be adjusted
|
|
818
|
+
// downward based on the observed mismatch.
|
|
819
|
+
//
|
|
820
|
+
// Expected behavior (PR 4 fix): `targetInputTokensOverride` should
|
|
821
|
+
// be adjusted based on the ratio between estimated and actual tokens.
|
|
822
|
+
// Worked example: the reducer currently receives preflightBudget = 190k.
|
|
823
|
+
// The actual token count is 242k (1.31x the estimate of 185k), so the
|
|
824
|
+
// target must shrink by the same factor to absorb the estimation
|
|
825
|
+
// inaccuracy: 190k / 1.31 ≈ 145k.
|
|
826
|
+
// The assertions below require the captured target to be below 190k and
|
|
827
|
+
// to equal floor(190_000 / (242_201 / 185_000)).
|
|
828
|
+
test("forced compaction targets a lower budget when estimation has been inaccurate", async () => {
|
|
829
|
+
const events: ServerMessage[] = [];
|
|
830
|
+
let callCount = 0;
|
|
831
|
+
let capturedTargetTokens: number | undefined;
|
|
832
|
+
|
|
833
|
+
// Estimator says 185k (below 190k budget = 200k * 0.95)
|
|
834
|
+
mockEstimateTokens = 185_000;
|
|
835
|
+
|
|
836
|
+
// Reducer captures the targetTokens from the config
|
|
837
|
+
mockReducerStepFn = (
|
|
838
|
+
msgs: Message[],
|
|
839
|
+
cfg: unknown,
|
|
840
|
+
) => {
|
|
841
|
+
capturedTargetTokens = (cfg as { targetTokens: number }).targetTokens;
|
|
842
|
+
return {
|
|
843
|
+
messages: msgs,
|
|
844
|
+
tier: "forced_compaction",
|
|
845
|
+
state: {
|
|
846
|
+
appliedTiers: ["forced_compaction"],
|
|
847
|
+
injectionMode: "full",
|
|
848
|
+
exhausted: false,
|
|
849
|
+
},
|
|
850
|
+
estimatedTokens: 100_000,
|
|
851
|
+
compactionResult: {
|
|
852
|
+
compacted: true,
|
|
853
|
+
messages: msgs,
|
|
854
|
+
compactedPersistedMessages: 10,
|
|
855
|
+
summaryText: "Summary",
|
|
856
|
+
previousEstimatedInputTokens: 185_000,
|
|
857
|
+
estimatedInputTokens: 100_000,
|
|
858
|
+
maxInputTokens: 200_000,
|
|
859
|
+
thresholdTokens: 160_000,
|
|
860
|
+
compactedMessages: 20,
|
|
861
|
+
summaryCalls: 1,
|
|
862
|
+
summaryInputTokens: 800,
|
|
863
|
+
summaryOutputTokens: 300,
|
|
864
|
+
summaryModel: "mock-model",
|
|
865
|
+
},
|
|
866
|
+
};
|
|
867
|
+
};
|
|
868
|
+
|
|
869
|
+
const agentLoopRun: AgentLoopRun = async (messages, onEvent) => {
|
|
870
|
+
callCount++;
|
|
871
|
+
if (callCount === 1) {
|
|
872
|
+
// Provider rejects: actual tokens 242201, way above estimate of 185k
|
|
873
|
+
onEvent({
|
|
874
|
+
type: "error",
|
|
875
|
+
error: new Error(
|
|
876
|
+
"prompt is too long: 242201 tokens > 200000 maximum",
|
|
877
|
+
),
|
|
878
|
+
});
|
|
879
|
+
onEvent({
|
|
880
|
+
type: "usage",
|
|
881
|
+
inputTokens: 0,
|
|
882
|
+
outputTokens: 0,
|
|
883
|
+
model: "test-model",
|
|
884
|
+
providerDurationMs: 10,
|
|
885
|
+
});
|
|
886
|
+
// No progress — return same messages
|
|
887
|
+
return messages;
|
|
888
|
+
}
|
|
889
|
+
// Second call succeeds after compaction
|
|
890
|
+
onEvent({
|
|
891
|
+
type: "message_complete",
|
|
892
|
+
message: {
|
|
893
|
+
role: "assistant",
|
|
894
|
+
content: [{ type: "text", text: "recovered" }],
|
|
895
|
+
},
|
|
896
|
+
});
|
|
897
|
+
onEvent({
|
|
898
|
+
type: "usage",
|
|
899
|
+
inputTokens: 80_000,
|
|
900
|
+
outputTokens: 200,
|
|
901
|
+
model: "test-model",
|
|
902
|
+
providerDurationMs: 500,
|
|
903
|
+
});
|
|
904
|
+
return [
|
|
905
|
+
...messages,
|
|
906
|
+
{
|
|
907
|
+
role: "assistant" as const,
|
|
908
|
+
content: [{ type: "text", text: "recovered" }] as ContentBlock[],
|
|
909
|
+
},
|
|
910
|
+
];
|
|
911
|
+
};
|
|
912
|
+
|
|
913
|
+
const ctx = makeCtx({
|
|
914
|
+
agentLoopRun,
|
|
915
|
+
contextWindowManager: {
|
|
916
|
+
shouldCompact: () => ({ needed: false, estimatedTokens: 0 }),
|
|
917
|
+
maybeCompact: async () => ({ compacted: false }),
|
|
918
|
+
} as unknown as AgentLoopSessionContext["contextWindowManager"],
|
|
919
|
+
});
|
|
920
|
+
|
|
921
|
+
await runAgentLoopImpl(ctx, "hello", "msg-1", (msg) => events.push(msg));
|
|
922
|
+
|
|
923
|
+
// The reducer should have been called with a corrected target
|
|
924
|
+
expect(capturedTargetTokens).toBeDefined();
|
|
925
|
+
|
|
926
|
+
// preflightBudget = 200_000 * 0.95 = 190_000
|
|
927
|
+
// estimationErrorRatio = 242201 / 185000 ≈ 1.309
|
|
928
|
+
// correctedTarget = floor(190000 / 1.309) ≈ 145_130
|
|
929
|
+
// The corrected target must be LESS than the uncorrected preflightBudget
|
|
930
|
+
const preflightBudget = 190_000;
|
|
931
|
+
expect(capturedTargetTokens!).toBeLessThan(preflightBudget);
|
|
932
|
+
|
|
933
|
+
// Verify the approximate corrected value (190000 / (242201/185000))
|
|
934
|
+
const expectedCorrectedTarget = Math.floor(
|
|
935
|
+
preflightBudget / (242201 / 185_000),
|
|
936
|
+
);
|
|
937
|
+
expect(capturedTargetTokens!).toBe(expectedCorrectedTarget);
|
|
938
|
+
|
|
939
|
+
// Should recover without session_error
|
|
940
|
+
const sessionError = events.find((e) => e.type === "session_error");
|
|
941
|
+
expect(sessionError).toBeUndefined();
|
|
942
|
+
expect(callCount).toBe(2);
|
|
943
|
+
});
|
|
944
|
+
|
|
945
|
+
// ── Test 4 ────────────────────────────────────────────────────────
|
|
946
|
+
// A realistic 75+ message conversation with many tool calls where the
|
|
947
|
+
// estimate (195k) is just above the preflight budget (190k). This test
|
|
948
|
+
// should PASS against current code because the preflight reducer compacts
|
|
949
|
+
// the history up front, so the agent loop runs once and succeeds.
|
|
950
|
+
test("overflow recovery succeeds for 75+ message conversation with many tool calls", async () => {
|
|
951
|
+
const events: ServerMessage[] = [];
|
|
952
|
+
const longHistory = buildLongConversation(75);
|
|
953
|
+
let callCount = 0;
|
|
954
|
+
let reducerCalled = false;
|
|
955
|
+
|
|
956
|
+
// Estimator says ~195k — just above budget so preflight reducer runs
|
|
957
|
+
mockEstimateTokens = 195_000;
|
|
958
|
+
|
|
959
|
+
// Reducer reduces to under budget
|
|
960
|
+
mockReducerStepFn = (msgs: Message[]) => {
|
|
961
|
+
reducerCalled = true;
|
|
962
|
+
return {
|
|
963
|
+
messages: msgs.slice(-10), // Keep only last 10 messages
|
|
964
|
+
tier: "forced_compaction",
|
|
965
|
+
state: {
|
|
966
|
+
appliedTiers: ["forced_compaction"],
|
|
967
|
+
injectionMode: "full",
|
|
968
|
+
exhausted: false,
|
|
969
|
+
},
|
|
970
|
+
estimatedTokens: 50_000,
|
|
971
|
+
compactionResult: {
|
|
972
|
+
compacted: true,
|
|
973
|
+
messages: msgs.slice(-10),
|
|
974
|
+
compactedPersistedMessages: msgs.length - 10,
|
|
975
|
+
summaryText: "Long conversation summary",
|
|
976
|
+
previousEstimatedInputTokens: 195_000,
|
|
977
|
+
estimatedInputTokens: 50_000,
|
|
978
|
+
maxInputTokens: 200_000,
|
|
979
|
+
thresholdTokens: 160_000,
|
|
980
|
+
compactedMessages: msgs.length - 10,
|
|
981
|
+
summaryCalls: 2,
|
|
982
|
+
summaryInputTokens: 2000,
|
|
983
|
+
summaryOutputTokens: 500,
|
|
984
|
+
summaryModel: "mock-model",
|
|
985
|
+
},
|
|
986
|
+
};
|
|
987
|
+
};
|
|
988
|
+
|
|
989
|
+
const agentLoopRun: AgentLoopRun = async (messages, onEvent) => {
|
|
990
|
+
callCount++;
|
|
991
|
+
onEvent({
|
|
992
|
+
type: "message_complete",
|
|
993
|
+
message: {
|
|
994
|
+
role: "assistant",
|
|
995
|
+
content: [{ type: "text", text: "Here's the analysis..." }],
|
|
996
|
+
},
|
|
997
|
+
});
|
|
998
|
+
onEvent({
|
|
999
|
+
type: "usage",
|
|
1000
|
+
inputTokens: 50_000,
|
|
1001
|
+
outputTokens: 300,
|
|
1002
|
+
model: "test-model",
|
|
1003
|
+
providerDurationMs: 800,
|
|
1004
|
+
});
|
|
1005
|
+
return [
|
|
1006
|
+
...messages,
|
|
1007
|
+
{
|
|
1008
|
+
role: "assistant" as const,
|
|
1009
|
+
content: [
|
|
1010
|
+
{ type: "text", text: "Here's the analysis..." },
|
|
1011
|
+
] as ContentBlock[],
|
|
1012
|
+
},
|
|
1013
|
+
];
|
|
1014
|
+
};
|
|
1015
|
+
|
|
1016
|
+
const ctx = makeCtx({
|
|
1017
|
+
agentLoopRun,
|
|
1018
|
+
messages: longHistory,
|
|
1019
|
+
contextWindowManager: {
|
|
1020
|
+
shouldCompact: () => ({ needed: false, estimatedTokens: 0 }),
|
|
1021
|
+
maybeCompact: async () => ({ compacted: false }),
|
|
1022
|
+
} as unknown as AgentLoopSessionContext["contextWindowManager"],
|
|
1023
|
+
});
|
|
1024
|
+
|
|
1025
|
+
await runAgentLoopImpl(ctx, "analyze this", "msg-1", (msg) =>
|
|
1026
|
+
events.push(msg),
|
|
1027
|
+
);
|
|
1028
|
+
|
|
1029
|
+
// Preflight should trigger the reducer since 195k > 190k budget
|
|
1030
|
+
expect(reducerCalled).toBe(true);
|
|
1031
|
+
// Should succeed
|
|
1032
|
+
expect(callCount).toBe(1);
|
|
1033
|
+
const sessionError = events.find((e) => e.type === "session_error");
|
|
1034
|
+
expect(sessionError).toBeUndefined();
|
|
1035
|
+
const complete = events.find((e) => e.type === "message_complete");
|
|
1036
|
+
expect(complete).toBeDefined();
|
|
1037
|
+
});
|
|
1038
|
+
|
|
1039
|
+
// ── Test 5 ────────────────────────────────────────────────────────
|
|
1040
|
+
// BUG: When all 4 reducer tiers have been applied, then the agent
|
|
1041
|
+
// makes progress and context_too_large fires again, no emergency
|
|
1042
|
+
// compaction is attempted. The `else if` at line 1163 just surfaces
|
|
1043
|
+
// the error.
|
|
1044
|
+
//
|
|
1045
|
+
// Expected behavior (PR 2 fix): Even after all tiers are exhausted,
|
|
1046
|
+
// if progress was made, attempt emergency compaction with
|
|
1047
|
+
// `minKeepRecentUserTurns: 0` as a last resort.
|
|
1048
|
+
test("exhausted reducer tiers with progress still attempts emergency compaction", async () => {
|
|
1049
|
+
const events: ServerMessage[] = [];
|
|
1050
|
+
let emergencyCompactCalled = false;
|
|
1051
|
+
|
|
1052
|
+
// Start with reducer already exhausted
|
|
1053
|
+
mockReducerStepFn = (msgs: Message[]) => {
|
|
1054
|
+
return {
|
|
1055
|
+
messages: msgs,
|
|
1056
|
+
tier: "injection_downgrade",
|
|
1057
|
+
state: {
|
|
1058
|
+
appliedTiers: [
|
|
1059
|
+
"forced_compaction",
|
|
1060
|
+
"tool_result_truncation",
|
|
1061
|
+
"media_stubbing",
|
|
1062
|
+
"injection_downgrade",
|
|
1063
|
+
],
|
|
1064
|
+
injectionMode: "minimal",
|
|
1065
|
+
exhausted: true,
|
|
1066
|
+
},
|
|
1067
|
+
estimatedTokens: 195_000,
|
|
1068
|
+
};
|
|
1069
|
+
};
|
|
1070
|
+
|
|
1071
|
+
let agentLoopCallCount = 0;
|
|
1072
|
+
const agentLoopRun: AgentLoopRun = async (messages, onEvent) => {
|
|
1073
|
+
agentLoopCallCount++;
|
|
1074
|
+
if (agentLoopCallCount === 1) {
|
|
1075
|
+
// Agent makes progress (tool calls succeed, messages grow)
|
|
1076
|
+
const progressMessages: Message[] = [
|
|
1077
|
+
...messages,
|
|
1078
|
+
{
|
|
1079
|
+
role: "assistant" as const,
|
|
1080
|
+
content: [
|
|
1081
|
+
{ type: "text", text: "Running analysis..." },
|
|
1082
|
+
{
|
|
1083
|
+
type: "tool_use",
|
|
1084
|
+
id: "tu-1",
|
|
1085
|
+
name: "bash",
|
|
1086
|
+
input: { command: "find . -name '*.ts'" },
|
|
1087
|
+
},
|
|
1088
|
+
] as ContentBlock[],
|
|
1089
|
+
},
|
|
1090
|
+
{
|
|
1091
|
+
role: "user" as const,
|
|
1092
|
+
content: [
|
|
1093
|
+
{
|
|
1094
|
+
type: "tool_result",
|
|
1095
|
+
tool_use_id: "tu-1",
|
|
1096
|
+
content: "file1.ts\nfile2.ts\nfile3.ts",
|
|
1097
|
+
is_error: false,
|
|
1098
|
+
},
|
|
1099
|
+
] as ContentBlock[],
|
|
1100
|
+
},
|
|
1101
|
+
];
|
|
1102
|
+
|
|
1103
|
+
onEvent({
|
|
1104
|
+
type: "tool_use",
|
|
1105
|
+
id: "tu-1",
|
|
1106
|
+
name: "bash",
|
|
1107
|
+
input: { command: "find . -name '*.ts'" },
|
|
1108
|
+
});
|
|
1109
|
+
onEvent({
|
|
1110
|
+
type: "tool_result",
|
|
1111
|
+
toolUseId: "tu-1",
|
|
1112
|
+
content: "file1.ts\nfile2.ts\nfile3.ts",
|
|
1113
|
+
isError: false,
|
|
1114
|
+
});
|
|
1115
|
+
onEvent({
|
|
1116
|
+
type: "message_complete",
|
|
1117
|
+
message: {
|
|
1118
|
+
role: "assistant",
|
|
1119
|
+
content: [
|
|
1120
|
+
{ type: "text", text: "Running analysis..." },
|
|
1121
|
+
{
|
|
1122
|
+
type: "tool_use",
|
|
1123
|
+
id: "tu-1",
|
|
1124
|
+
name: "bash",
|
|
1125
|
+
input: { command: "find . -name '*.ts'" },
|
|
1126
|
+
},
|
|
1127
|
+
],
|
|
1128
|
+
},
|
|
1129
|
+
});
|
|
1130
|
+
onEvent({
|
|
1131
|
+
type: "usage",
|
|
1132
|
+
inputTokens: 190_000,
|
|
1133
|
+
outputTokens: 100,
|
|
1134
|
+
model: "test-model",
|
|
1135
|
+
providerDurationMs: 200,
|
|
1136
|
+
});
|
|
1137
|
+
|
|
1138
|
+
// Then context_too_large on the next LLM call within the loop
|
|
1139
|
+
onEvent({
|
|
1140
|
+
type: "error",
|
|
1141
|
+
error: new Error("context_length_exceeded"),
|
|
1142
|
+
});
|
|
1143
|
+
onEvent({
|
|
1144
|
+
type: "usage",
|
|
1145
|
+
inputTokens: 0,
|
|
1146
|
+
outputTokens: 0,
|
|
1147
|
+
model: "test-model",
|
|
1148
|
+
providerDurationMs: 10,
|
|
1149
|
+
});
|
|
1150
|
+
|
|
1151
|
+
return progressMessages;
|
|
1152
|
+
}
|
|
1153
|
+
|
|
1154
|
+
// After emergency compaction, succeed
|
|
1155
|
+
onEvent({
|
|
1156
|
+
type: "message_complete",
|
|
1157
|
+
message: {
|
|
1158
|
+
role: "assistant",
|
|
1159
|
+
content: [{ type: "text", text: "recovered" }],
|
|
1160
|
+
},
|
|
1161
|
+
});
|
|
1162
|
+
onEvent({
|
|
1163
|
+
type: "usage",
|
|
1164
|
+
inputTokens: 50_000,
|
|
1165
|
+
outputTokens: 100,
|
|
1166
|
+
model: "test-model",
|
|
1167
|
+
providerDurationMs: 200,
|
|
1168
|
+
});
|
|
1169
|
+
return [
|
|
1170
|
+
...messages,
|
|
1171
|
+
{
|
|
1172
|
+
role: "assistant" as const,
|
|
1173
|
+
content: [{ type: "text", text: "recovered" }] as ContentBlock[],
|
|
1174
|
+
},
|
|
1175
|
+
];
|
|
1176
|
+
};
|
|
1177
|
+
|
|
1178
|
+
const ctx = makeCtx({
|
|
1179
|
+
agentLoopRun,
|
|
1180
|
+
contextWindowManager: {
|
|
1181
|
+
shouldCompact: () => ({ needed: false, estimatedTokens: 0 }),
|
|
1182
|
+
maybeCompact: async (
|
|
1183
|
+
_msgs: Message[],
|
|
1184
|
+
_signal: AbortSignal,
|
|
1185
|
+
opts?: Record<string, unknown>,
|
|
1186
|
+
) => {
|
|
1187
|
+
if (opts?.force && opts?.minKeepRecentUserTurns === 0) {
|
|
1188
|
+
emergencyCompactCalled = true;
|
|
1189
|
+
return {
|
|
1190
|
+
compacted: true,
|
|
1191
|
+
messages: [
|
|
1192
|
+
{
|
|
1193
|
+
role: "user",
|
|
1194
|
+
content: [{ type: "text", text: "Hello" }],
|
|
1195
|
+
},
|
|
1196
|
+
] as Message[],
|
|
1197
|
+
compactedPersistedMessages: 50,
|
|
1198
|
+
summaryText: "Emergency summary",
|
|
1199
|
+
previousEstimatedInputTokens: 195_000,
|
|
1200
|
+
estimatedInputTokens: 50_000,
|
|
1201
|
+
maxInputTokens: 200_000,
|
|
1202
|
+
thresholdTokens: 160_000,
|
|
1203
|
+
compactedMessages: 50,
|
|
1204
|
+
summaryCalls: 1,
|
|
1205
|
+
summaryInputTokens: 1000,
|
|
1206
|
+
summaryOutputTokens: 300,
|
|
1207
|
+
summaryModel: "mock-model",
|
|
1208
|
+
};
|
|
1209
|
+
}
|
|
1210
|
+
return { compacted: false };
|
|
1211
|
+
},
|
|
1212
|
+
} as unknown as AgentLoopSessionContext["contextWindowManager"],
|
|
1213
|
+
});
|
|
1214
|
+
|
|
1215
|
+
await runAgentLoopImpl(ctx, "hello", "msg-1", (msg) => events.push(msg));
|
|
1216
|
+
|
|
1217
|
+
// BUG: Currently when progress was made + all tiers exhausted,
|
|
1218
|
+
// emergency compaction is NOT attempted. The error is surfaced directly.
|
|
1219
|
+
// After PR 2 fix, emergency compaction should be attempted.
|
|
1220
|
+
expect(emergencyCompactCalled).toBe(true);
|
|
1221
|
+
|
|
1222
|
+
// BUG: Currently a session_error IS emitted.
|
|
1223
|
+
const sessionError = events.find((e) => e.type === "session_error");
|
|
1224
|
+
expect(sessionError).toBeUndefined();
|
|
1225
|
+
});
|
|
1226
|
+
|
|
1227
|
+
// ── Test 6 ────────────────────────────────────────────────────────
|
|
1228
|
+
// Tests mid-loop budget check via onCheckpoint.
|
|
1229
|
+
// The onCheckpoint callback estimates prompt tokens after each tool round.
|
|
1230
|
+
// When estimate exceeds the mid-loop threshold (85% of budget),
|
|
1231
|
+
// it returns "yield" to break the agent loop.
|
|
1232
|
+
// The session-agent-loop then runs compaction and re-enters the agent loop.
|
|
1233
|
+
test("onCheckpoint yields when token estimate exceeds mid-loop budget threshold", async () => {
|
|
1234
|
+
const events: ServerMessage[] = [];
|
|
1235
|
+
let compactionCalled = false;
|
|
1236
|
+
|
|
1237
|
+
// estimatePromptTokens is called:
|
|
1238
|
+
// 1. During preflight budget check (low value, below budget)
|
|
1239
|
+
// 2. During onCheckpoint mid-loop check (high value, above 85% threshold)
|
|
1240
|
+
// Budget = 200_000 * 0.95 = 190_000
|
|
1241
|
+
// Mid-loop threshold = 190_000 * 0.85 = 161_500
|
|
1242
|
+
let estimateCallCount = 0;
|
|
1243
|
+
mockEstimateTokens = () => {
|
|
1244
|
+
estimateCallCount++;
|
|
1245
|
+
// First call: preflight check — below budget
|
|
1246
|
+
if (estimateCallCount === 1) return 100_000;
|
|
1247
|
+
// Subsequent calls: mid-loop check — above 85% threshold
|
|
1248
|
+
return 170_000;
|
|
1249
|
+
};
|
|
1250
|
+
|
|
1251
|
+
let agentLoopCallCount = 0;
|
|
1252
|
+
const agentLoopRun: AgentLoopRun = async (
|
|
1253
|
+
messages,
|
|
1254
|
+
onEvent,
|
|
1255
|
+
_signal,
|
|
1256
|
+
_requestId,
|
|
1257
|
+
onCheckpoint,
|
|
1258
|
+
) => {
|
|
1259
|
+
agentLoopCallCount++;
|
|
1260
|
+
|
|
1261
|
+
if (agentLoopCallCount === 1) {
|
|
1262
|
+
// Simulate a tool round: assistant calls a tool, results come back
|
|
1263
|
+
const withProgress: Message[] = [
|
|
1264
|
+
...messages,
|
|
1265
|
+
{
|
|
1266
|
+
role: "assistant" as const,
|
|
1267
|
+
content: [
|
|
1268
|
+
{ type: "text", text: "Let me check." },
|
|
1269
|
+
{
|
|
1270
|
+
type: "tool_use",
|
|
1271
|
+
id: "tu-1",
|
|
1272
|
+
name: "bash",
|
|
1273
|
+
input: { command: "ls" },
|
|
1274
|
+
},
|
|
1275
|
+
] as ContentBlock[],
|
|
1276
|
+
},
|
|
1277
|
+
{
|
|
1278
|
+
role: "user" as const,
|
|
1279
|
+
content: [
|
|
1280
|
+
{
|
|
1281
|
+
type: "tool_result",
|
|
1282
|
+
tool_use_id: "tu-1",
|
|
1283
|
+
content: "file1.ts\nfile2.ts",
|
|
1284
|
+
is_error: false,
|
|
1285
|
+
},
|
|
1286
|
+
] as ContentBlock[],
|
|
1287
|
+
},
|
|
1288
|
+
];
|
|
1289
|
+
|
|
1290
|
+
onEvent({
|
|
1291
|
+
type: "message_complete",
|
|
1292
|
+
message: {
|
|
1293
|
+
role: "assistant",
|
|
1294
|
+
content: [
|
|
1295
|
+
{ type: "text", text: "Let me check." },
|
|
1296
|
+
{
|
|
1297
|
+
type: "tool_use",
|
|
1298
|
+
id: "tu-1",
|
|
1299
|
+
name: "bash",
|
|
1300
|
+
input: { command: "ls" },
|
|
1301
|
+
},
|
|
1302
|
+
],
|
|
1303
|
+
},
|
|
1304
|
+
});
|
|
1305
|
+
onEvent({
|
|
1306
|
+
type: "usage",
|
|
1307
|
+
inputTokens: 100,
|
|
1308
|
+
outputTokens: 50,
|
|
1309
|
+
model: "test-model",
|
|
1310
|
+
providerDurationMs: 100,
|
|
1311
|
+
});
|
|
1312
|
+
|
|
1313
|
+
// Call onCheckpoint — this should trigger the mid-loop budget check
|
|
1314
|
+
// which sees 170_000 > 161_500 and returns "yield"
|
|
1315
|
+
if (onCheckpoint) {
|
|
1316
|
+
const decision = onCheckpoint({
|
|
1317
|
+
turnIndex: 0,
|
|
1318
|
+
toolCount: 1,
|
|
1319
|
+
hasToolUse: true,
|
|
1320
|
+
history: withProgress,
|
|
1321
|
+
});
|
|
1322
|
+
if (decision === "yield") {
|
|
1323
|
+
// Agent loop stops when checkpoint yields
|
|
1324
|
+
return withProgress;
|
|
1325
|
+
}
|
|
1326
|
+
}
|
|
1327
|
+
|
|
1328
|
+
return withProgress;
|
|
1329
|
+
}
|
|
1330
|
+
|
|
1331
|
+
// Second call (after compaction): complete successfully
|
|
1332
|
+
onEvent({
|
|
1333
|
+
type: "message_complete",
|
|
1334
|
+
message: {
|
|
1335
|
+
role: "assistant",
|
|
1336
|
+
content: [{ type: "text", text: "done after compaction" }],
|
|
1337
|
+
},
|
|
1338
|
+
});
|
|
1339
|
+
onEvent({
|
|
1340
|
+
type: "usage",
|
|
1341
|
+
inputTokens: 50,
|
|
1342
|
+
outputTokens: 25,
|
|
1343
|
+
model: "test-model",
|
|
1344
|
+
providerDurationMs: 100,
|
|
1345
|
+
});
|
|
1346
|
+
return [
|
|
1347
|
+
...messages,
|
|
1348
|
+
{
|
|
1349
|
+
role: "assistant" as const,
|
|
1350
|
+
content: [
|
|
1351
|
+
{ type: "text", text: "done after compaction" },
|
|
1352
|
+
] as ContentBlock[],
|
|
1353
|
+
},
|
|
1354
|
+
];
|
|
1355
|
+
};
|
|
1356
|
+
|
|
1357
|
+
const ctx = makeCtx({
|
|
1358
|
+
agentLoopRun,
|
|
1359
|
+
contextWindowManager: {
|
|
1360
|
+
shouldCompact: () => ({ needed: false, estimatedTokens: 0 }),
|
|
1361
|
+
maybeCompact: async () => {
|
|
1362
|
+
compactionCalled = true;
|
|
1363
|
+
return {
|
|
1364
|
+
compacted: true,
|
|
1365
|
+
messages: [
|
|
1366
|
+
{
|
|
1367
|
+
role: "user" as const,
|
|
1368
|
+
content: [{ type: "text", text: "Hello" }],
|
|
1369
|
+
},
|
|
1370
|
+
] as Message[],
|
|
1371
|
+
compactedPersistedMessages: 5,
|
|
1372
|
+
summaryText: "Mid-loop compaction summary",
|
|
1373
|
+
previousEstimatedInputTokens: 170_000,
|
|
1374
|
+
estimatedInputTokens: 80_000,
|
|
1375
|
+
maxInputTokens: 200_000,
|
|
1376
|
+
thresholdTokens: 160_000,
|
|
1377
|
+
compactedMessages: 10,
|
|
1378
|
+
summaryCalls: 1,
|
|
1379
|
+
summaryInputTokens: 500,
|
|
1380
|
+
summaryOutputTokens: 200,
|
|
1381
|
+
summaryModel: "mock-model",
|
|
1382
|
+
};
|
|
1383
|
+
},
|
|
1384
|
+
} as unknown as AgentLoopSessionContext["contextWindowManager"],
|
|
1385
|
+
});
|
|
1386
|
+
|
|
1387
|
+
await runAgentLoopImpl(ctx, "hello", "msg-1", (msg) => events.push(msg));
|
|
1388
|
+
|
|
1389
|
+
// The mid-loop budget check should have triggered compaction
|
|
1390
|
+
expect(compactionCalled).toBe(true);
|
|
1391
|
+
|
|
1392
|
+
// Agent loop should have been called twice: once before yield, once after compaction
|
|
1393
|
+
expect(agentLoopCallCount).toBe(2);
|
|
1394
|
+
|
|
1395
|
+
// No session_error should be emitted
|
|
1396
|
+
const sessionError = events.find((e) => e.type === "session_error");
|
|
1397
|
+
expect(sessionError).toBeUndefined();
|
|
1398
|
+
|
|
1399
|
+
// A context_compacted event should have been emitted
|
|
1400
|
+
const compacted = events.find((e) => e.type === "context_compacted");
|
|
1401
|
+
expect(compacted).toBeDefined();
|
|
1402
|
+
});
|
|
1403
|
+
|
|
1404
|
+
// ── Test 7 ────────────────────────────────────────────────────────
|
|
1405
|
+
// Tests that mid-loop budget check prevents context_too_large entirely.
|
|
1406
|
+
// Agent loop runs tool calls with growing history. After the estimate
|
|
1407
|
+
// exceeds the mid-loop threshold, the loop yields, compaction runs,
|
|
1408
|
+
// and the loop resumes. The provider NEVER rejects with context_too_large.
|
|
1409
|
+
test("mid-loop budget check prevents context_too_large when tools produce large results", async () => {
  // Scenario: a fake agent loop performs tool rounds while the (mocked) token
  // estimate climbs. Once the estimate is over the mid-loop threshold the
  // checkpoint is expected to answer "yield", compaction runs, and the loop is
  // invoked a second time to finish — without any session_error being emitted.
  const emitted: ServerMessage[] = [];
  let compactionCalled = false;
  let contextTooLargeEmitted = false;
  let agentLoopCallCount = 0;

  // Budget math this fixture is written against (configured outside this
  // test — confirm against the suite's context setup):
  //   budget             = 200_000 * 0.95 = 190_000
  //   mid-loop threshold = 190_000 * 0.85 = 161_500
  // Scripted estimates, in call order: preflight, tool 1, tool 2. Every later
  // call (tool 3 onwards) reports a value above the threshold.
  const scriptedEstimates = [50_000, 100_000, 140_000];
  const overThresholdEstimate = 175_000;
  let estimateCalls = 0;
  mockEstimateTokens = () => {
    const estimate = scriptedEstimates[estimateCalls] ?? overThresholdEstimate;
    estimateCalls += 1;
    return estimate;
  };

  // Builds the assistant tool_use message and the matching (large) user
  // tool_result message for a single simulated tool round.
  const buildToolRound = (round: number): [Message, Message] => {
    const toolUseId = `tu-${round}`;
    const toolCall: Message = {
      role: "assistant" as const,
      content: [
        { type: "text", text: `Step ${round}` },
        {
          type: "tool_use",
          id: toolUseId,
          name: "bash",
          input: { command: `cmd-${round}` },
        },
      ] as ContentBlock[],
    };
    const toolOutput: Message = {
      role: "user" as const,
      content: [
        {
          type: "tool_result",
          tool_use_id: toolUseId,
          content: "x".repeat(10_000),
          is_error: false,
        },
      ] as ContentBlock[],
    };
    return [toolCall, toolOutput];
  };

  const agentLoopRun: AgentLoopRun = async (
    messages,
    onEvent,
    _signal,
    _requestId,
    onCheckpoint,
  ) => {
    agentLoopCallCount += 1;

    // Any invocation after the first is the post-compaction resume: emit a
    // final assistant message plus usage and return immediately.
    if (agentLoopCallCount !== 1) {
      onEvent({
        type: "message_complete",
        message: {
          role: "assistant",
          content: [
            { type: "text", text: "completed after mid-loop compaction" },
          ],
        },
      });
      onEvent({
        type: "usage",
        inputTokens: 60_000,
        outputTokens: 100,
        model: "test-model",
        providerDurationMs: 200,
      });
      return [
        ...messages,
        {
          role: "assistant" as const,
          content: [
            { type: "text", text: "completed after mid-loop compaction" },
          ] as ContentBlock[],
        },
      ];
    }

    // First invocation: up to 5 tool rounds are simulated, but the checkpoint
    // is expected to request a yield at round 3, cutting the loop short.
    const history = [...messages];
    for (let round = 0; round < 5; round += 1) {
      const [toolCall, toolOutput] = buildToolRound(round);
      history.push(toolCall, toolOutput);

      onEvent({ type: "message_complete", message: toolCall });
      onEvent({
        type: "usage",
        inputTokens: 50_000 + round * 20_000,
        outputTokens: 50,
        model: "test-model",
        providerDurationMs: 100,
      });

      // The checkpoint callback is optional; with none supplied the loop
      // simply runs every round.
      const decision = onCheckpoint?.({
        turnIndex: round,
        toolCount: 1,
        hasToolUse: true,
        history,
      });
      if (decision === "yield") {
        break;
      }
    }
    return history;
  };

  // Stub manager: never asks for compaction up front, and records when
  // maybeCompact is invoked while handing back a canned "compacted" result.
  const contextWindowManager = {
    shouldCompact: () => ({ needed: false, estimatedTokens: 0 }),
    maybeCompact: async () => {
      compactionCalled = true;
      return {
        compacted: true,
        messages: [
          {
            role: "user" as const,
            content: [{ type: "text", text: "Hello" }],
          },
        ] as Message[],
        compactedPersistedMessages: 8,
        summaryText: "Compacted large tool results",
        previousEstimatedInputTokens: 175_000,
        estimatedInputTokens: 60_000,
        maxInputTokens: 200_000,
        thresholdTokens: 160_000,
        compactedMessages: 15,
        summaryCalls: 1,
        summaryInputTokens: 800,
        summaryOutputTokens: 300,
        summaryModel: "mock-model",
      };
    },
  } as unknown as AgentLoopSessionContext["contextWindowManager"];

  const ctx = makeCtx({ agentLoopRun, contextWindowManager });

  await runAgentLoopImpl(ctx, "hello", "msg-1", (msg) => {
    emitted.push(msg);
    // Latch a flag if a session_error carrying the processing-failed code is
    // ever observed.
    const isProcessingFailure =
      msg.type === "session_error" &&
      "code" in msg &&
      msg.code === "SESSION_PROCESSING_FAILED";
    if (isProcessingFailure) {
      contextTooLargeEmitted = true;
    }
  });

  // The mid-loop budget check must have driven a compaction pass.
  expect(compactionCalled).toBe(true);

  // The provider must never have rejected with context_too_large.
  expect(contextTooLargeEmitted).toBe(false);

  // Two loop invocations: the one that yielded at tool 3, then the resume.
  expect(agentLoopCallCount).toBe(2);

  // No session_error of any kind was emitted.
  const firstSessionError = emitted.find((e) => e.type === "session_error");
  expect(firstSessionError).toBeUndefined();
});
|
|
1583
|
+
});
|