@gajae-code/coding-agent 0.5.2 → 0.5.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (99):
  1. package/CHANGELOG.md +23 -0
  2. package/dist/types/async/job-manager.d.ts +6 -0
  3. package/dist/types/config/model-profiles.d.ts +10 -0
  4. package/dist/types/dap/client.d.ts +2 -1
  5. package/dist/types/edit/read-file.d.ts +6 -0
  6. package/dist/types/eval/js/context-manager.d.ts +3 -0
  7. package/dist/types/eval/js/executor.d.ts +1 -0
  8. package/dist/types/exec/bash-executor.d.ts +2 -0
  9. package/dist/types/gjc-runtime/tmux-sessions.d.ts +7 -1
  10. package/dist/types/lsp/types.d.ts +2 -0
  11. package/dist/types/modes/bridge/bridge-mode.d.ts +1 -0
  12. package/dist/types/modes/components/model-selector.d.ts +2 -0
  13. package/dist/types/modes/components/oauth-selector.d.ts +1 -0
  14. package/dist/types/modes/components/runtime-mcp-add-wizard.d.ts +1 -0
  15. package/dist/types/modes/components/tool-execution.d.ts +1 -0
  16. package/dist/types/modes/interactive-mode.d.ts +1 -0
  17. package/dist/types/modes/types.d.ts +1 -0
  18. package/dist/types/runtime/process-lifecycle.d.ts +108 -0
  19. package/dist/types/runtime-mcp/transports/stdio.d.ts +1 -0
  20. package/dist/types/runtime-mcp/types.d.ts +2 -0
  21. package/dist/types/session/agent-session.d.ts +29 -1
  22. package/dist/types/session/artifacts.d.ts +4 -1
  23. package/dist/types/session/streaming-output.d.ts +12 -0
  24. package/dist/types/slash-commands/helpers/fast-status-report.d.ts +76 -0
  25. package/dist/types/tools/bash.d.ts +1 -0
  26. package/dist/types/tools/browser/tab-supervisor.d.ts +9 -0
  27. package/dist/types/tools/sqlite-reader.d.ts +2 -1
  28. package/dist/types/web/search/providers/codex.d.ts +4 -4
  29. package/package.json +7 -7
  30. package/src/async/job-manager.ts +181 -43
  31. package/src/config/file-lock.ts +9 -1
  32. package/src/config/model-profile-activation.ts +71 -3
  33. package/src/config/model-profiles.ts +39 -14
  34. package/src/dap/client.ts +105 -64
  35. package/src/dap/session.ts +44 -7
  36. package/src/defaults/gjc/skills/deep-interview/SKILL.md +11 -2
  37. package/src/defaults/gjc/skills/ralplan/SKILL.md +2 -2
  38. package/src/defaults/gjc/skills/ultragoal/SKILL.md +2 -2
  39. package/src/edit/read-file.ts +19 -1
  40. package/src/eval/js/context-manager.ts +228 -65
  41. package/src/eval/js/executor.ts +2 -0
  42. package/src/eval/js/index.ts +1 -0
  43. package/src/eval/js/worker-core.ts +10 -6
  44. package/src/eval/py/executor.ts +68 -19
  45. package/src/eval/py/kernel.ts +46 -22
  46. package/src/eval/py/runner.py +68 -14
  47. package/src/exec/bash-executor.ts +49 -13
  48. package/src/gjc-runtime/deep-interview-runtime.ts +14 -13
  49. package/src/gjc-runtime/ralplan-runtime.ts +10 -0
  50. package/src/gjc-runtime/state-runtime.ts +73 -0
  51. package/src/gjc-runtime/tmux-gc.ts +86 -37
  52. package/src/gjc-runtime/tmux-sessions.ts +44 -6
  53. package/src/gjc-runtime/ultragoal-runtime.ts +8 -4
  54. package/src/internal-urls/artifact-protocol.ts +10 -1
  55. package/src/internal-urls/docs-index.generated.ts +2 -2
  56. package/src/lsp/client.ts +64 -26
  57. package/src/lsp/index.ts +2 -1
  58. package/src/lsp/lspmux.ts +33 -9
  59. package/src/lsp/types.ts +2 -0
  60. package/src/modes/bridge/bridge-mode.ts +21 -0
  61. package/src/modes/components/assistant-message.ts +10 -2
  62. package/src/modes/components/bash-execution.ts +5 -1
  63. package/src/modes/components/eval-execution.ts +5 -1
  64. package/src/modes/components/model-selector.ts +34 -2
  65. package/src/modes/components/oauth-selector.ts +5 -0
  66. package/src/modes/components/runtime-mcp-add-wizard.ts +58 -7
  67. package/src/modes/components/skill-message.ts +24 -16
  68. package/src/modes/components/tool-execution.ts +6 -0
  69. package/src/modes/controllers/extension-ui-controller.ts +33 -6
  70. package/src/modes/controllers/input-controller.ts +19 -0
  71. package/src/modes/controllers/selector-controller.ts +6 -1
  72. package/src/modes/interactive-mode.ts +13 -0
  73. package/src/modes/types.ts +1 -0
  74. package/src/modes/utils/ui-helpers.ts +5 -2
  75. package/src/prompts/agents/executor.md +1 -1
  76. package/src/runtime/process-lifecycle.ts +400 -0
  77. package/src/runtime-mcp/manager.ts +164 -50
  78. package/src/runtime-mcp/transports/http.ts +12 -11
  79. package/src/runtime-mcp/transports/stdio.ts +64 -38
  80. package/src/runtime-mcp/types.ts +3 -0
  81. package/src/sdk.ts +27 -0
  82. package/src/session/agent-session.ts +271 -25
  83. package/src/session/artifacts.ts +17 -2
  84. package/src/session/blob-store.ts +36 -2
  85. package/src/session/session-manager.ts +29 -13
  86. package/src/session/streaming-output.ts +95 -3
  87. package/src/setup/model-onboarding-guidance.ts +10 -3
  88. package/src/skill-state/active-state.ts +79 -7
  89. package/src/slash-commands/builtin-registry.ts +30 -3
  90. package/src/slash-commands/helpers/fast-status-report.ts +111 -0
  91. package/src/tools/archive-reader.ts +10 -1
  92. package/src/tools/bash.ts +11 -4
  93. package/src/tools/browser/registry.ts +17 -1
  94. package/src/tools/browser/tab-supervisor.ts +22 -0
  95. package/src/tools/browser.ts +38 -4
  96. package/src/tools/cron.ts +2 -6
  97. package/src/tools/read.ts +11 -12
  98. package/src/tools/sqlite-reader.ts +19 -5
  99. package/src/web/search/providers/codex.ts +6 -5
@@ -41,6 +41,8 @@ import {
41
41
  calculatePromptTokens,
42
42
  collectEntriesForBranchSummary,
43
43
  compact,
44
+ type EmergencyCompactionSample,
45
+ emergencyCompactionReason,
44
46
  estimateMessageTokensHeuristic,
45
47
  estimateTokens,
46
48
  generateBranchSummary,
@@ -142,6 +144,7 @@ import { onAppendOnlyModeChanged } from "../config/settings";
142
144
  import { RawSseDebugBuffer } from "../debug/raw-sse-buffer";
143
145
  import { loadCapability } from "../discovery";
144
146
  import { expandApplyPatchToEntries, normalizeDiff, normalizeToLF, ParseError, previewPatch, stripBom } from "../edit";
147
+ import { disposeVmContextsByOwner } from "../eval/js/context-manager";
145
148
  import {
146
149
  disposeKernelSessionsByOwner,
147
150
  executePython as executePythonCommand,
@@ -234,6 +237,7 @@ import {
234
237
  import type { ToolSession } from "../tools";
235
238
  import { AskTool } from "../tools/ask";
236
239
  import { assertEditableFile } from "../tools/auto-generated-guard";
240
+ import { releaseTabsForOwner } from "../tools/browser/tab-supervisor";
237
241
  import type { CheckpointState } from "../tools/checkpoint";
238
242
  import { outputMeta, wrapToolWithMetaNotice } from "../tools/output-meta";
239
243
  import { normalizeLocalScheme, resolveToCwd } from "../tools/path-utils";
@@ -545,6 +549,13 @@ function formatRetryFallbackBaseSelector(selector: RetryFallbackSelector): strin
545
549
 
546
550
  const IRC_REPLY_MAX_BYTES = 4096;
547
551
 
552
+ /**
553
+ * Hard cap for {@link AgentSession.disposeChildSubprocesses}. A `SIGINT`/`SIGTERM` handler
554
+ * awaits this teardown before exiting, so it must never block longer than this even if a
555
+ * subprocess (wedged Chrome renderer, stuck Python cell) refuses to settle.
556
+ */
557
+ const SIGNAL_TEARDOWN_TIMEOUT_MS = 5_000;
558
+
548
559
  /**
549
560
  * Collapse degenerate IRC ephemeral replies before they hit the relay.
550
561
  * Models occasionally loop on a single line (~16 reports of N-times-repeated
@@ -907,6 +918,7 @@ export class AgentSession {
907
918
  // Compaction state
908
919
  #compactionAbortController: AbortController | undefined = undefined;
909
920
  #autoCompactionAbortController: AbortController | undefined = undefined;
921
+ #resourceSampler: () => EmergencyCompactionSample = () => this.#defaultResourceSample();
910
922
  #prePromptContextCheckPromise: Promise<void> | undefined = undefined;
911
923
 
912
924
  // Branch summarization state
@@ -3187,6 +3199,13 @@ export class AgentSession {
3187
3199
  }
3188
3200
  }
3189
3201
  await shutdownAllLspClients();
3202
+ // F13: release only THIS session's browser tabs on dispose (kill:false → remote
3203
+ // browsers disconnect, headless close gracefully). Scoped by the session id the
3204
+ // browser tool tagged tabs with, so other live sessions' tabs are untouched.
3205
+ // No-op when this session opened no tabs. Failure is logged, not thrown.
3206
+ await releaseTabsForOwner(this.sessionManager.getSessionId()).catch((error: unknown) =>
3207
+ logger.warn("session dispose: releaseTabsForOwner failed", { error }),
3208
+ );
3190
3209
  const pythonExecutionsSettled = await this.#prepareEvalExecutionsForDispose();
3191
3210
  if (!pythonExecutionsSettled) {
3192
3211
  logger.warn(
@@ -3194,6 +3213,7 @@ export class AgentSession {
3194
3213
  );
3195
3214
  }
3196
3215
  await disposeKernelSessionsByOwner(this.#evalKernelOwnerId);
3216
+ await disposeVmContextsByOwner(this.#evalKernelOwnerId);
3197
3217
  this.#releasePowerAssertion();
3198
3218
  await this.sessionManager.close();
3199
3219
  this.#closeAllProviderSessions("dispose");
@@ -3208,6 +3228,36 @@ export class AgentSession {
3208
3228
  this.#eventListeners = [];
3209
3229
  }
3210
3230
 
3231
+ /**
3232
+ * Bounded, best-effort teardown of the subprocess-spawning resources this session
3233
+ * owns: the browser tool's headless/spawned Chrome and the Python eval kernel + JS VM
3234
+ * contexts. Unlike {@link dispose}, this touches only child processes and is time-boxed,
3235
+ * so a top-level `SIGINT`/`SIGTERM`/`SIGHUP` handler can run it without hanging — without
3236
+ * it, an external kill bypasses `dispose()` and orphans Chrome/Python to PID 1 (#698).
3237
+ *
3238
+ * Idempotent: every step is a no-op once the graceful {@link dispose} path has released
3239
+ * the resources. Never throws; per-step failures are logged and the whole run is capped
3240
+ * at `timeoutMs` so a wedged subprocess can't stall process exit.
3241
+ */
3242
+ async disposeChildSubprocesses(timeoutMs = SIGNAL_TEARDOWN_TIMEOUT_MS): Promise<void> {
3243
+ const sessionId = this.sessionManager.getSessionId();
3244
+ const kernelOwnerId = this.#evalKernelOwnerId;
3245
+ const work = Promise.allSettled([
3246
+ // kill:true so a forced exit also reaps spawned-app Chrome we own (headless
3247
+ // always closes; connected/attached browsers only disconnect — never killed).
3248
+ releaseTabsForOwner(sessionId, { kill: true }).catch((error: unknown) =>
3249
+ logger.warn("signal teardown: releaseTabsForOwner failed", { error }),
3250
+ ),
3251
+ disposeKernelSessionsByOwner(kernelOwnerId).catch((error: unknown) =>
3252
+ logger.warn("signal teardown: disposeKernelSessionsByOwner failed", { error }),
3253
+ ),
3254
+ disposeVmContextsByOwner(kernelOwnerId).catch((error: unknown) =>
3255
+ logger.warn("signal teardown: disposeVmContextsByOwner failed", { error }),
3256
+ ),
3257
+ ]);
3258
+ await Promise.race([work, Bun.sleep(timeoutMs)]);
3259
+ }
3260
+
3211
3261
  #closeAllProviderSessions(reason: string): void {
3212
3262
  for (const [providerKey, state] of this.#providerSessionState) {
3213
3263
  try {
@@ -6016,6 +6066,44 @@ export class AgentSession {
6016
6066
  );
6017
6067
  }
6018
6068
 
6069
+ /**
6070
+ * True when the configured `serviceTier` resolves to `"priority"` for the
6071
+ * given model `provider`. Returns false for scoped tiers that don't match
6072
+ * (e.g. `"openai-only"` on an anthropic provider) and when `provider` is
6073
+ * undefined. This is the canonical provider-aware fast-mode predicate.
6074
+ */
6075
+ isFastForProvider(provider?: string): boolean {
6076
+ // Fast mode applies to a concrete model's provider. With no provider
6077
+ // (no model selected) it cannot apply, even under an unscoped `priority`
6078
+ // tier that `resolveServiceTier` would otherwise pass through.
6079
+ if (provider === undefined) return false;
6080
+ return resolveServiceTier(this.serviceTier, provider) === "priority";
6081
+ }
6082
+
6083
+ /**
6084
+ * Effective service tier applied to task-tool subagent sessions
6085
+ * (executor/architect/planner/critic). They run under `task.serviceTier`
6086
+ * unless it is `"inherit"`, in which case they inherit the main session
6087
+ * tier — mirroring `createSubagentSettings`.
6088
+ */
6089
+ #subagentServiceTier(): ServiceTier | undefined {
6090
+ const configured = this.settings.get("task.serviceTier");
6091
+ if (configured === "inherit") return this.serviceTier;
6092
+ if (configured === "none") return undefined;
6093
+ return configured;
6094
+ }
6095
+
6096
+ /**
6097
+ * Provider-aware fast-mode predicate for task-tool subagent roles, evaluated
6098
+ * against the effective subagent tier (`task.serviceTier`) rather than the
6099
+ * main session tier. Use this for `task.agentModelOverrides` role rows so the
6100
+ * ⚡ glyph reflects the tier the subagent actually runs under.
6101
+ */
6102
+ isFastForSubagentProvider(provider?: string): boolean {
6103
+ if (provider === undefined) return false;
6104
+ return resolveServiceTier(this.#subagentServiceTier(), provider) === "priority";
6105
+ }
6106
+
6019
6107
  /**
6020
6108
  * True when the configured `serviceTier` resolves to `"priority"` for the
6021
6109
  * *currently selected model's provider*. Returns false for scoped tiers
@@ -6023,7 +6111,7 @@ export class AgentSession {
6023
6111
  * no model is selected.
6024
6112
  */
6025
6113
  isFastModeActive(): boolean {
6026
- return resolveServiceTier(this.serviceTier, this.model?.provider) === "priority";
6114
+ return this.isFastForProvider(this.model?.provider);
6027
6115
  }
6028
6116
 
6029
6117
  setServiceTier(serviceTier: ServiceTier | undefined): void {
@@ -6394,6 +6482,7 @@ export class AgentSession {
6394
6482
  model,
6395
6483
  apiKey,
6396
6484
  {
6485
+ ...this.#maintenanceProviderTransport(),
6397
6486
  systemPrompt: this.#baseSystemPrompt,
6398
6487
  tools: this.agent.state.tools,
6399
6488
  customInstructions,
@@ -6587,11 +6676,55 @@ export class AgentSession {
6587
6676
  }
6588
6677
  }
6589
6678
 
6679
+ /** Test seam: override the emergency-compaction resource sampler so tests never read real RSS. */
6680
+ setResourceSampler(sampler: () => EmergencyCompactionSample): void {
6681
+ this.#resourceSampler = sampler;
6682
+ }
6683
+
6684
+ #defaultResourceSample(): EmergencyCompactionSample {
6685
+ let providerBytes = 0;
6686
+ let imageBytes = 0;
6687
+ for (const message of this.state.messages) {
6688
+ const content = (message as { content?: unknown }).content;
6689
+ if (typeof content === "string") {
6690
+ providerBytes += content.length;
6691
+ } else if (Array.isArray(content)) {
6692
+ for (const block of content) {
6693
+ if (!block || typeof block !== "object") continue;
6694
+ const typed = block as { text?: unknown; data?: unknown };
6695
+ if (typeof typed.text === "string") providerBytes += typed.text.length;
6696
+ if (typeof typed.data === "string") {
6697
+ imageBytes += typed.data.length;
6698
+ providerBytes += typed.data.length;
6699
+ }
6700
+ }
6701
+ }
6702
+ }
6703
+ return {
6704
+ heapUsedBytes: process.memoryUsage().heapUsed,
6705
+ providerBytes,
6706
+ messageCount: this.state.messages.length,
6707
+ imageBytes,
6708
+ };
6709
+ }
6710
+
6590
6711
  async #checkEstimatedContextBeforePromptOnce(pendingMessages: readonly AgentMessage[]): Promise<void> {
6591
6712
  const model = this.model;
6592
6713
  if (!model) return;
6593
6714
  const contextWindow = model.contextWindow ?? 0;
6594
6715
  if (contextWindow <= 0) return;
6716
+ // F6: non-disableable emergency floor — compact before OOM even when token-based
6717
+ // compaction is disabled or its threshold is set too high (weak-hardware protection).
6718
+ const emergencyReason = emergencyCompactionReason(this.#resourceSampler());
6719
+ if (emergencyReason) {
6720
+ logger.warn("Emergency compaction triggered (resource floor exceeded)", { reason: emergencyReason });
6721
+ await this.#runAutoCompaction("overflow", false, false, {
6722
+ continueAfterMaintenance: false,
6723
+ deferHandoffMaintenance: false,
6724
+ force: true,
6725
+ });
6726
+ return;
6727
+ }
6595
6728
  const compactionSettings = this.settings.getGroup("compaction");
6596
6729
  if (!compactionSettings.enabled || compactionSettings.strategy === "off") return;
6597
6730
 
@@ -7243,7 +7376,17 @@ export class AgentSession {
7243
7376
  addCandidate(this.#resolveRoleModelFull(role, availableModels, currentModel).model);
7244
7377
  }
7245
7378
 
7246
- const sortedByContext = [...availableModels].sort((a, b) => b.contextWindow - a.contextWindow);
7379
+ // Last-resort fallback: the largest-context model that shares the ACTIVE
7380
+ // model's provider. Scoping this to the current provider keeps auto-
7381
+ // compaction on the user's configured/custom route instead of silently
7382
+ // defaulting to an unrelated provider (e.g. a stray OpenAI credential
7383
+ // with no remaining credit) just because it happens to be in the bundled
7384
+ // catalog. Cross-provider compaction stays possible, but only when the
7385
+ // user opts in explicitly via modelRoles (handled by the loop above).
7386
+ const fallbackProvider = currentModel?.provider;
7387
+ const sortedByContext = [...availableModels]
7388
+ .filter(model => fallbackProvider === undefined || model.provider === fallbackProvider)
7389
+ .sort((a, b) => b.contextWindow - a.contextWindow);
7247
7390
  for (const model of sortedByContext) {
7248
7391
  if (!seen.has(this.#getModelKey(model))) {
7249
7392
  addCandidate(model);
@@ -7271,6 +7414,25 @@ export class AgentSession {
7271
7414
  );
7272
7415
  }
7273
7416
 
7417
+ /**
7418
+ * Transport-affinity fields forwarded into local maintenance one-shot LLM
7419
+ * calls (compaction, handoff, branch summary) so they reuse the live turn's
7420
+ * provider session state and configured WebSocket transport preference
7421
+ * instead of falling back to a fresh HTTP/SSE session. Mirrors the
7422
+ * `providerSessionId ?? sessionId` affinity the agent loop sends per turn.
7423
+ */
7424
+ #maintenanceProviderTransport(): {
7425
+ sessionId: string | undefined;
7426
+ providerSessionState: Map<string, ProviderSessionState>;
7427
+ preferWebsockets: boolean | undefined;
7428
+ } {
7429
+ return {
7430
+ sessionId: this.agent.providerSessionId ?? this.agent.sessionId,
7431
+ providerSessionState: this.#providerSessionState,
7432
+ preferWebsockets: this.agent.preferWebsockets,
7433
+ };
7434
+ }
7435
+
7274
7436
  async #compactWithFallbackModel(
7275
7437
  preparation: CompactionPreparation,
7276
7438
  customInstructions: string | undefined,
@@ -7287,6 +7449,7 @@ export class AgentSession {
7287
7449
  try {
7288
7450
  return await compact(preparation, candidate, apiKey, customInstructions, signal, {
7289
7451
  ...options,
7452
+ ...this.#maintenanceProviderTransport(),
7290
7453
  metadata: this.agent.metadataForProvider(candidate.provider),
7291
7454
  convertToLlm,
7292
7455
  telemetry,
@@ -7367,11 +7530,13 @@ export class AgentSession {
7367
7530
  reason: "overflow" | "threshold" | "idle",
7368
7531
  willRetry: boolean,
7369
7532
  deferred = false,
7370
- options?: { continueAfterMaintenance?: boolean; deferHandoffMaintenance?: boolean },
7533
+ options?: { continueAfterMaintenance?: boolean; deferHandoffMaintenance?: boolean; force?: boolean },
7371
7534
  ): Promise<void> {
7372
7535
  const compactionSettings = this.settings.getGroup("compaction");
7373
- if (compactionSettings.strategy === "off") return;
7374
- if (reason !== "idle" && !compactionSettings.enabled) return;
7536
+ // `force` is the non-disableable emergency floor (F6): it bypasses the user's
7537
+ // disabled/off settings so a resource-floor breach still compacts before OOM.
7538
+ if (!options?.force && compactionSettings.strategy === "off") return;
7539
+ if (!options?.force && reason !== "idle" && !compactionSettings.enabled) return;
7375
7540
  const generation = this.#promptGeneration;
7376
7541
  if (
7377
7542
  options?.deferHandoffMaintenance !== false &&
@@ -7574,6 +7739,7 @@ export class AgentSession {
7574
7739
  while (true) {
7575
7740
  try {
7576
7741
  compactResult = await compact(preparation, candidate, apiKey, undefined, autoCompactionSignal, {
7742
+ ...this.#maintenanceProviderTransport(),
7577
7743
  promptOverride: compactionPrep.hookPrompt,
7578
7744
  extraContext: compactionPrep.hookContext,
7579
7745
  remoteInstructions: this.#baseSystemPrompt.join("\n\n"),
@@ -7799,7 +7965,12 @@ export class AgentSession {
7799
7965
  */
7800
7966
  #isRetryableError(message: AssistantMessage): boolean {
7801
7967
  const classification = this.#classifyErrorForRetry(message);
7802
- return classification === "usage_limit" || classification === "transient" || classification === "unknown";
7968
+ return (
7969
+ classification === "usage_limit" ||
7970
+ classification === "transient" ||
7971
+ classification === "unknown" ||
7972
+ classification === "first_event_timeout"
7973
+ );
7803
7974
  }
7804
7975
 
7805
7976
  #isTransientErrorMessage(errorMessage: string): boolean {
@@ -7825,6 +7996,33 @@ export class AgentSession {
7825
7996
  );
7826
7997
  }
7827
7998
 
7999
+ #isFirstEventTimeoutErrorMessage(errorMessage: string): boolean {
8000
+ // First-event timeout: the stream watchdog aborted because no event
8001
+ // arrived within the first-event window. Matches the shared lazy-stream
8002
+ // message and the per-provider variants
8003
+ // ("<Provider> stream timed out while waiting for the first event").
8004
+ return /timed?\s*out while waiting for the first event|timeout waiting for first/i.test(errorMessage);
8005
+ }
8006
+
8007
+ /**
8008
+ * Whether a first-event timeout on the error's provider should fail closed —
8009
+ * i.e. retry a bounded number of times (capped at retry.maxRetries) and then
8010
+ * surface, instead of joining the unbounded transient-retry class.
8011
+ *
8012
+ * Targets the ollama-chat API, which is exclusively ollama-cloud (local
8013
+ * Ollama uses the openai-responses API). That remote, queued backend can
8014
+ * stall before its first token even for tiny prompts; an unbounded
8015
+ * continuation retry re-issues the full request on every attempt and can
8016
+ * silently spike upstream usage (#713). First-party providers keep their
8017
+ * existing unbounded first-event-timeout retry behavior.
8018
+ */
8019
+ #shouldFailClosedOnFirstEventTimeout(message: AssistantMessage): boolean {
8020
+ // Prefer the active model's API (the model that produced the error);
8021
+ // the errored message's API is a fallback for the rare case where the
8022
+ // session model has already moved on.
8023
+ return this.model?.api === "ollama-chat" || message.api === "ollama-chat";
8024
+ }
8025
+
7828
8026
  #isTerminalErrorMessage(errorMessage: string): boolean {
7829
8027
  // Errors that will never succeed on retry (auth/permission, malformed
7830
8028
  // request, unknown/unsupported model). These surface immediately rather
@@ -7846,11 +8044,12 @@ export class AgentSession {
7846
8044
 
7847
8045
  /**
7848
8046
  * Ordered retry classification: overflow (compaction) -> terminal (surface)
7849
- * -> usage_limit (rotation) -> transient (retry) -> unknown (retry).
8047
+ * -> usage_limit (rotation) -> first_event_timeout (bounded retry) ->
8048
+ * transient (retry) -> unknown (retry).
7850
8049
  */
7851
8050
  #classifyErrorForRetry(
7852
8051
  message: AssistantMessage,
7853
- ): "none" | "overflow" | "terminal" | "usage_limit" | "transient" | "unknown" {
8052
+ ): "none" | "overflow" | "terminal" | "usage_limit" | "first_event_timeout" | "transient" | "unknown" {
7854
8053
  if (message.stopReason !== "error" || !message.errorMessage) return "none";
7855
8054
  const contextWindow = this.model?.contextWindow ?? 0;
7856
8055
  if (isContextOverflow(message, contextWindow)) return "overflow";
@@ -7878,6 +8077,13 @@ export class AgentSession {
7878
8077
  if (isTerminalHttp4xx && (explicitStatus !== undefined || !/rate.?limit|too many requests/i.test(err))) {
7879
8078
  return "terminal";
7880
8079
  }
8080
+ // A first-event timeout on ollama-cloud (the ollama-chat API) must not
8081
+ // join the unbounded transient class: each continuation retry re-issues
8082
+ // the full request to a remote, billable backend, so an unbounded loop
8083
+ // can silently spike usage (#713). Bound it to retry.maxRetries instead.
8084
+ if (this.#isFirstEventTimeoutErrorMessage(err) && this.#shouldFailClosedOnFirstEventTimeout(message)) {
8085
+ return "first_event_timeout";
8086
+ }
7881
8087
  if (this.#isTransientErrorMessage(err)) return "transient";
7882
8088
  return "unknown";
7883
8089
  }
@@ -9381,6 +9587,7 @@ export class AgentSession {
9381
9587
  }
9382
9588
  const branchSummarySettings = this.settings.getGroup("branchSummary");
9383
9589
  const result = await generateBranchSummary(entriesToSummarize, {
9590
+ ...this.#maintenanceProviderTransport(),
9384
9591
  model,
9385
9592
  apiKey,
9386
9593
  signal: this.#branchSummaryAbortController.signal,
@@ -9508,17 +9715,15 @@ export class AgentSession {
9508
9715
  */
9509
9716
  getSessionStats(): SessionStats {
9510
9717
  const state = this.state;
9511
- const userMessages = state.messages.filter(m => m.role === "user").length;
9512
- const assistantMessages = state.messages.filter(m => m.role === "assistant").length;
9513
- const toolResults = state.messages.filter(m => m.role === "toolResult").length;
9514
-
9718
+ let userMessages = 0;
9719
+ let assistantMessages = 0;
9720
+ let toolResults = 0;
9515
9721
  let toolCalls = 0;
9516
9722
  let totalInput = 0;
9517
9723
  let totalOutput = 0;
9518
9724
  let totalCacheRead = 0;
9519
9725
  let totalCacheWrite = 0;
9520
9726
  let totalCost = 0;
9521
-
9522
9727
  let totalPremiumRequests = 0;
9523
9728
  const getTaskToolUsage = (details: unknown): Usage | undefined => {
9524
9729
  if (!details || typeof details !== "object") return undefined;
@@ -9528,8 +9733,13 @@ export class AgentSession {
9528
9733
  return usage as Usage;
9529
9734
  };
9530
9735
 
9736
+ // Single pass over messages (replaces three role filters plus a separate usage
9737
+ // loop) so per-turn stats stay O(messages + assistant content blocks), not O(4N).
9531
9738
  for (const message of state.messages) {
9532
- if (message.role === "assistant") {
9739
+ if (message.role === "user") {
9740
+ userMessages += 1;
9741
+ } else if (message.role === "assistant") {
9742
+ assistantMessages += 1;
9533
9743
  const assistantMsg = message as AssistantMessage;
9534
9744
  toolCalls += assistantMsg.content.filter(c => c.type === "toolCall").length;
9535
9745
  totalInput += assistantMsg.usage.input;
@@ -9538,17 +9748,18 @@ export class AgentSession {
9538
9748
  totalCacheWrite += assistantMsg.usage.cacheWrite;
9539
9749
  totalPremiumRequests += assistantMsg.usage.premiumRequests ?? 0;
9540
9750
  totalCost += assistantMsg.usage.cost.total;
9541
- }
9542
-
9543
- if (message.role === "toolResult" && message.toolName === "task") {
9544
- const usage = getTaskToolUsage(message.details);
9545
- if (usage) {
9546
- totalInput += usage.input;
9547
- totalOutput += usage.output;
9548
- totalCacheRead += usage.cacheRead;
9549
- totalCacheWrite += usage.cacheWrite;
9550
- totalPremiumRequests += usage.premiumRequests ?? 0;
9551
- totalCost += usage.cost.total;
9751
+ } else if (message.role === "toolResult") {
9752
+ toolResults += 1;
9753
+ if (message.toolName === "task") {
9754
+ const usage = getTaskToolUsage(message.details);
9755
+ if (usage) {
9756
+ totalInput += usage.input;
9757
+ totalOutput += usage.output;
9758
+ totalCacheRead += usage.cacheRead;
9759
+ totalCacheWrite += usage.cacheWrite;
9760
+ totalPremiumRequests += usage.premiumRequests ?? 0;
9761
+ totalCost += usage.cost.total;
9762
+ }
9552
9763
  }
9553
9764
  }
9554
9765
  }
@@ -9709,11 +9920,46 @@ export class AgentSession {
9709
9920
  return tokens;
9710
9921
  }
9711
9922
 
9923
+ #nativeTokenCache = new WeakMap<AgentMessage, { len: number; tokens: number }>();
9924
+
9925
+ /** Cheap content-size signal to invalidate the native token cache on mutation (growth). */
9926
+ /**
9927
+ * Cheap content-size signal to invalidate the native token cache on mutation. Recursively
9928
+ * sums string lengths across the whole message (depth-bounded), so it covers every
9929
+ * provider-visible shape (text/thinking/tool args, toolResult output, tool names, etc.)
9930
+ * without allocating a serialized copy. A size-preserving in-place edit yields only a
9931
+ * benign estimate drift.
9932
+ */
9933
+ #messageTokenSize(value: unknown, depth = 0): number {
9934
+ if (depth > 6) return 0;
9935
+ if (typeof value === "string") return value.length;
9936
+ if (typeof value === "number" || typeof value === "boolean") return 8;
9937
+ if (Array.isArray(value)) {
9938
+ let size = 0;
9939
+ for (const item of value) size += this.#messageTokenSize(item, depth + 1);
9940
+ return size;
9941
+ }
9942
+ if (value && typeof value === "object") {
9943
+ let size = 0;
9944
+ for (const item of Object.values(value)) size += this.#messageTokenSize(item, depth + 1);
9945
+ return size;
9946
+ }
9947
+ return 0;
9948
+ }
9949
+
9712
9950
  #estimateMessageNativeContextTokens(message: AgentMessage): number {
9951
+ // F10/F22: cache the expensive native token count per message object, invalidated by a
9952
+ // cheap content-size signal, so unchanged (stable-size) messages are not re-tokenized on
9953
+ // every pre-prompt estimate. A rare size-preserving in-place edit yields only a benign
9954
+ // token-estimate drift, never wrong output.
9955
+ const len = this.#messageTokenSize(message);
9956
+ const cached = this.#nativeTokenCache.get(message);
9957
+ if (cached && cached.len === len) return cached.tokens;
9713
9958
  let tokens = 0;
9714
9959
  for (const llmMessage of convertToLlm([message])) {
9715
9960
  tokens += estimateTokens(llmMessage);
9716
9961
  }
9962
+ this.#nativeTokenCache.set(message, { len, tokens });
9717
9963
  return tokens;
9718
9964
  }
9719
9965
 
@@ -7,6 +7,11 @@
7
7
  import * as fs from "node:fs/promises";
8
8
  import * as path from "node:path";
9
9
 
10
+ import { DEFAULT_ARTIFACT_MAX_BYTES, truncateHeadBytes } from "./streaming-output";
11
+ export interface ArtifactSaveOptions {
12
+ maxBytes?: number;
13
+ }
14
+
10
15
  /**
11
16
  * Manages artifact storage for a session.
12
17
  *
@@ -94,9 +99,19 @@ export class ArtifactManager {
94
99
  * @param toolType Tool name for file extension (e.g., "bash", "read")
95
100
  * @returns Artifact ID (numeric string)
96
101
  */
97
- async save(content: string, toolType: string): Promise<string> {
102
+ async save(content: string, toolType: string, options: ArtifactSaveOptions = {}): Promise<string> {
98
103
  const { id, path } = await this.allocatePath(toolType);
99
- await Bun.write(path, content);
104
+ const maxBytes = Math.max(0, options.maxBytes ?? DEFAULT_ARTIFACT_MAX_BYTES);
105
+ const contentBytes = Buffer.byteLength(content, "utf-8");
106
+ if (contentBytes > maxBytes) {
107
+ const truncated = truncateHeadBytes(content, maxBytes);
108
+ await Bun.write(
109
+ path,
110
+ `${truncated.text}\n[artifact truncated after ${truncated.bytes} bytes; omitted at least ${contentBytes - truncated.bytes} bytes]\n`,
111
+ );
112
+ } else {
113
+ await Bun.write(path, content);
114
+ }
100
115
  return id;
101
116
  }
102
117
 
@@ -167,19 +167,49 @@ export class EphemeralBlobStore extends BlobStore {
167
167
  }
168
168
 
169
169
  export class MemoryBlobStore extends BlobStore {
170
+ /**
171
+ * Generous byte/count LRU bound (F8). Content-addressed resident blobs are fail-closed
172
+ * on miss (callers raise/handle {@link ResidentBlobMissingError}), so evicting the
173
+ * least-recently-used entry on an extremely large session is preferable to unbounded
174
+ * RAM growth. The caps sit well above normal usage and only trip on pathological sizes.
175
+ */
176
+ static readonly #MAX_BYTES = 64 * 1024 * 1024;
177
+ static readonly #MAX_COUNT = 4096;
178
+
170
179
  #blobs = new Map<string, Buffer>();
180
+ #bytes = 0;
171
181
 
172
182
  constructor() {
173
183
  super(":memory:");
174
184
  }
175
185
 
186
/**
 * Insert (or re-insert) a blob as the most recently used entry, then evict from
 * the least recently used end until both the byte cap and the count cap hold.
 *
 * @param hash Key under which the blob is stored.
 * @param data Blob bytes; stored as given (no copy is made here).
 */
#store(hash: string, data: Buffer): void {
  // Remove any previous copy first so its bytes are not double-counted and the
  // fresh insert lands at the end of the Map's insertion order.
  const previous = this.#blobs.get(hash);
  if (previous) {
    this.#bytes -= previous.byteLength;
    this.#blobs.delete(hash);
  }
  this.#bytes += data.byteLength;
  this.#blobs.set(hash, data);

  const overBudget = (): boolean =>
    this.#bytes > MemoryBlobStore.#MAX_BYTES || this.#blobs.size > MemoryBlobStore.#MAX_COUNT;

  // A Map iterates in insertion order, so its first entry is the least recently
  // used one. At least one entry is always kept, so the blob just stored
  // survives even when it alone exceeds the byte cap.
  while (this.#blobs.size > 1 && overBudget()) {
    const head = this.#blobs.entries().next();
    if (head.done) break;
    const [lruHash, lruData] = head.value;
    this.#blobs.delete(lruHash);
    this.#bytes -= lruData.byteLength;
  }
}
205
+
176
206
  async put(data: Buffer): Promise<BlobPutResult> {
177
207
  return this.putSync(data);
178
208
  }
179
209
 
180
210
  putSync(data: Buffer): BlobPutResult {
181
211
  const hash = new Bun.SHA256().update(data).digest("hex");
182
- this.#blobs.set(hash, Buffer.from(data));
212
+ this.#store(hash, Buffer.from(data));
183
213
  return {
184
214
  hash,
185
215
  path: `memory:${hash}`,
@@ -195,7 +225,11 @@ export class MemoryBlobStore extends BlobStore {
195
225
 
196
226
  getSync(hash: string): Buffer | null {
197
227
  const data = this.#blobs.get(hash);
198
- return data ? Buffer.from(data) : null;
228
+ if (!data) return null;
229
+ // Refresh LRU recency on hit so hot blobs survive eviction.
230
+ this.#blobs.delete(hash);
231
+ this.#blobs.set(hash, data);
232
+ return Buffer.from(data);
199
233
  }
200
234
 
201
235
  async has(hash: string): Promise<boolean> {
@@ -889,8 +889,27 @@ async function resolvePersistedBlobRefs(value: unknown, blobStore: BlobStore, ke
889
889
  );
890
890
  }
891
891
 
892
/**
 * Maximum number of blob-resolution tasks allowed in flight at once, so an
 * image-heavy resume never materializes every blob's base64 simultaneously
 * (F8: avoids the transient OOM spike of an unbounded Promise.all over all
 * historical images).
 */
const BLOB_RESOLVE_CONCURRENCY = 8;

/**
 * Run async tasks with bounded concurrency.
 *
 * Tasks are started in array order by a fixed pool of workers; each worker pulls
 * the next unstarted task as soon as its current one settles.
 *
 * @param tasks Thunks to execute. Each is invoked at most once.
 * @param limit Maximum number of tasks in flight. Values below 1 and `NaN` are
 *   treated as 1 (serial execution); values above `tasks.length` are clamped.
 * @returns Resolves once every task has completed.
 * @throws Rejects with the first task error. Once a task has failed no further
 *   tasks are started; tasks already in flight are left to settle on their own.
 */
async function runWithConcurrency(tasks: Array<() => Promise<void>>, limit: number): Promise<void> {
  // NaN would otherwise flow through Math.min/Math.max into
  // `Array.from({ length: NaN })`, which yields zero workers and silently skips
  // every task while still resolving successfully.
  const requested = Number.isNaN(limit) ? 1 : limit;
  const workerCount = Math.max(1, Math.min(requested, tasks.length));

  let next = 0;
  // Shared across workers so a failure in one stops the others from dequeuing
  // more work after the caller has already observed the rejection.
  let failed = false;

  const worker = async (): Promise<void> => {
    while (!failed && next < tasks.length) {
      const task = tasks[next];
      next += 1;
      if (task === undefined) continue;
      try {
        await task();
      } catch (error) {
        failed = true;
        throw error;
      }
    }
  };

  await Promise.all(Array.from({ length: workerCount }, () => worker()));
}
910
+
892
911
  async function resolveBlobRefsInEntries(entries: FileEntry[], blobStore: BlobStore): Promise<void> {
893
- const promises: Promise<void>[] = [];
912
+ const tasks: Array<() => Promise<void>> = [];
894
913
 
895
914
  for (const entry of entries) {
896
915
  if (entry.type === "session") continue;
@@ -902,22 +921,19 @@ async function resolveBlobRefsInEntries(entries: FileEntry[], blobStore: BlobSto
902
921
  contentArray = entry.content;
903
922
  }
904
923
 
905
- if (contentArray) {
906
- for (const block of contentArray) {
907
- if (isImageBlock(block) && isBlobRef(block.data)) {
908
- promises.push(
909
- resolveImageData(blobStore, block.data).then(resolved => {
910
- block.data = resolved;
911
- }),
912
- );
924
+ tasks.push(async () => {
925
+ if (contentArray) {
926
+ for (const block of contentArray) {
927
+ if (isImageBlock(block) && isBlobRef(block.data)) {
928
+ block.data = await resolveImageData(blobStore, block.data);
929
+ }
913
930
  }
914
931
  }
915
- }
916
-
917
- promises.push(resolvePersistedBlobRefs(entry, blobStore));
932
+ await resolvePersistedBlobRefs(entry, blobStore);
933
+ });
918
934
  }
919
935
 
920
- await Promise.all(promises);
936
+ await runWithConcurrency(tasks, BLOB_RESOLVE_CONCURRENCY);
921
937
  }
922
938
 
923
939
  /**