npm - ummaya - Versions diffs - 0.2.4 → 0.2.6 - Mend

ummaya 0.2.4 → 0.2.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (482)

package/README.md +15 -2
package/bin/ummaya +10 -1
package/bun.lock +180 -244
package/npm-shrinkwrap.json +760 -1760
package/package.json +39 -22
package/prompts/manifest.yaml +1 -1
package/prompts/system_v1.md +1 -0
package/pyproject.toml +27 -2
package/specs/2803-document-production-hardening/contracts/document-tools.schema.json +1043 -0
package/src/ummaya/_canonical/__init__.py +2 -0
package/src/ummaya/_canonical/baselines.yaml +113 -0
package/src/ummaya/engine/engine.py +29 -132
package/src/ummaya/evidence/__init__.py +21 -2
package/src/ummaya/evidence/dataset_contract.py +193 -0
package/src/ummaya/evidence/document_authoring_cases.py +33 -0
package/src/ummaya/evidence/document_harness.py +313 -0
package/src/ummaya/evidence/document_viewer_ux.py +391 -0
package/src/ummaya/evidence/gates.py +70 -0
package/src/ummaya/evidence/json_types.py +20 -0
package/src/ummaya/evidence/models.py +88 -1
package/src/ummaya/evidence/output_payload.py +89 -0
package/src/ummaya/evidence/payload_documents.py +233 -0
package/src/ummaya/evidence/route_contracts.py +224 -0
package/src/ummaya/evidence/route_helpers.py +150 -0
package/src/ummaya/evidence/runner.py +81 -212
package/src/ummaya/evidence/source_provenance.py +246 -0
package/src/ummaya/evidence/source_provenance_redaction.py +176 -0
package/src/ummaya/evidence/tool_layer.py +39 -0
package/src/ummaya/evidence/tool_layer_models.py +151 -0
package/src/ummaya/ipc/adapter_manifest_emitter.py +26 -10
package/src/ummaya/ipc/document_intent_normalization.py +185 -0
package/src/ummaya/ipc/frame_schema.py +5 -5
package/src/ummaya/ipc/route_diagnostics.py +73 -0
package/src/ummaya/ipc/stdio.py +1109 -477
package/src/ummaya/llm/client.py +102 -3
package/src/ummaya/llm/config.py +8 -3
package/src/ummaya/primitives/__init__.py +6 -2
package/src/ummaya/primitives/delegation.py +1 -1
package/src/ummaya/primitives/document.py +28 -0
package/src/ummaya/settings.py +0 -3
package/src/ummaya/tools/discovery_bridge.py +17 -1
package/src/ummaya/tools/documents/__init__.py +297 -0
package/src/ummaya/tools/documents/adapter_registry.py +487 -0
package/src/ummaya/tools/documents/archive_container_probe.py +167 -0
package/src/ummaya/tools/documents/artifact_store.py +454 -0
package/src/ummaya/tools/documents/authoring.py +283 -0
package/src/ummaya/tools/documents/baselines.py +132 -0
package/src/ummaya/tools/documents/capability.py +331 -0
package/src/ummaya/tools/documents/contracts.py +112 -0
package/src/ummaya/tools/documents/conversion.py +521 -0
package/src/ummaya/tools/documents/diff.py +275 -0
package/src/ummaya/tools/documents/engines.py +163 -0
package/src/ummaya/tools/documents/evaluation.py +291 -0
package/src/ummaya/tools/documents/explicit_values.py +108 -0
package/src/ummaya/tools/documents/fixtures.py +174 -0
package/src/ummaya/tools/documents/format_completion_audit.py +471 -0
package/src/ummaya/tools/documents/formats/__init__.py +2 -0
package/src/ummaya/tools/documents/formats/archive.py +528 -0
package/src/ummaya/tools/documents/formats/base.py +41 -0
package/src/ummaya/tools/documents/formats/code_file.py +211 -0
package/src/ummaya/tools/documents/formats/data_file.py +272 -0
package/src/ummaya/tools/documents/formats/hwp.py +284 -0
package/src/ummaya/tools/documents/formats/hwpx.py +1837 -0
package/src/ummaya/tools/documents/formats/odf.py +435 -0
package/src/ummaya/tools/documents/formats/ooxml.py +1030 -0
package/src/ummaya/tools/documents/formats/passive.py +766 -0
package/src/ummaya/tools/documents/formats/pdf.py +702 -0
package/src/ummaya/tools/documents/formats/text_web.py +268 -0
package/src/ummaya/tools/documents/hwp_conversion_probe.py +178 -0
package/src/ummaya/tools/documents/hwp_direct_candidate.py +141 -0
package/src/ummaya/tools/documents/inspection.py +289 -0
package/src/ummaya/tools/documents/intake.py +1079 -0
package/src/ummaya/tools/documents/legacy_office_promotion_probe.py +366 -0
package/src/ummaya/tools/documents/models.py +1598 -0
package/src/ummaya/tools/documents/odf_promotion_probe.py +167 -0
package/src/ummaya/tools/documents/orchestrator.py +96 -0
package/src/ummaya/tools/documents/passive_capability_probe.py +251 -0
package/src/ummaya/tools/documents/patch.py +170 -0
package/src/ummaya/tools/documents/pdfa_conformance.py +284 -0
package/src/ummaya/tools/documents/pdfa_promotion_probe.py +198 -0
package/src/ummaya/tools/documents/permissions.py +110 -0
package/src/ummaya/tools/documents/planner.py +616 -0
package/src/ummaya/tools/documents/registry.py +2733 -0
package/src/ummaya/tools/documents/render.py +978 -0
package/src/ummaya/tools/documents/render_comparison.py +113 -0
package/src/ummaya/tools/documents/render_comparison_models.py +74 -0
package/src/ummaya/tools/documents/render_comparison_regions.py +73 -0
package/src/ummaya/tools/documents/render_comparison_style.py +161 -0
package/src/ummaya/tools/documents/reread.py +157 -0
package/src/ummaya/tools/documents/runtime_authoring.py +244 -0
package/src/ummaya/tools/documents/runtime_authoring_bundle.py +76 -0
package/src/ummaya/tools/documents/scorecard.py +184 -0
package/src/ummaya/tools/documents/socratic_planner.py +193 -0
package/src/ummaya/tools/documents/style.py +48 -0
package/src/ummaya/tools/documents/tool_defs.py +523 -0
package/src/ummaya/tools/documents/validate.py +347 -0
package/src/ummaya/tools/executor.py +29 -0
package/src/ummaya/tools/live_proxy.py +0 -3
package/src/ummaya/tools/models.py +5 -1
package/src/ummaya/tools/register_all.py +8 -0
package/src/ummaya/tools/registry.py +10 -1
package/src/ummaya/tools/routing/__init__.py +59 -0
package/src/ummaya/tools/routing/builder.py +105 -0
package/src/ummaya/tools/routing/cards.py +29 -0
package/src/ummaya/tools/routing/decision_service.py +534 -0
package/src/ummaya/tools/routing/decision_types.py +74 -0
package/src/ummaya/tools/routing/feasibility.py +122 -0
package/src/ummaya/tools/routing/intent.py +17 -0
package/src/ummaya/tools/routing/intent_extractor.py +207 -0
package/src/ummaya/tools/routing/intent_patterns.py +160 -0
package/src/ummaya/tools/routing/intent_public_data.py +150 -0
package/src/ummaya/tools/routing/intent_types.py +48 -0
package/src/ummaya/tools/routing/lint.py +78 -0
package/src/ummaya/tools/routing/metadata.py +174 -0
package/src/ummaya/tools/routing/projection.py +340 -0
package/src/ummaya/tools/routing/retrieval_policy.py +629 -0
package/src/ummaya/tools/routing/schema.py +81 -0
package/src/ummaya/tools/routing/types.py +96 -0
package/src/ummaya/tools/routing_index.py +2 -2
package/src/ummaya/tools/search.py +34 -746
package/tests/fixtures/documents/public_forms/baselines.yaml +113 -0
package/tui/bun.lock +126 -305
package/tui/package.json +35 -22
package/tui/src/.cc-byte-identical-whitelist.yaml +266 -0
package/tui/src/QueryEngine.ts +12 -8
package/tui/src/bridge/inboundAttachments.ts +3 -3
package/tui/src/cli/handlers/auth.ts +3 -12
package/tui/src/cli/handlers/mcp.tsx +0 -1
package/tui/src/cli/print.ts +8 -9
package/tui/src/commands/insights.ts +1 -1
package/tui/src/commands/install-github-app/types.ts +8 -30
package/tui/src/commands/plugin/types.ts +6 -28
package/tui/src/commands/plugin/unifiedTypes.ts +4 -26
package/tui/src/commands/rename/generateSessionName.ts +1 -1
package/tui/src/components/Feedback.tsx +1 -1
package/tui/src/components/LogoV2/EmergencyTip.tsx +11 -2
package/tui/src/components/LogoV2/WelcomeV2.tsx +1 -3
package/tui/src/components/ScrollKeybindingHandler.tsx +6 -6
package/tui/src/components/Spinner/types.ts +6 -28
package/tui/src/components/agents/generateAgent.ts +1 -1
package/tui/src/components/agents/new-agent-creation/types.ts +4 -26
package/tui/src/components/config/EnvSecretIsolatedEditor.tsx +1 -1
package/tui/src/components/mcp/types.ts +16 -38
package/tui/src/components/messages/AssistantToolUseMessage.tsx +3 -2
package/tui/src/components/messages/UserCrossSessionMessage.ts +16 -4
package/tui/src/components/messages/UserForkBoilerplateMessage.ts +16 -4
package/tui/src/components/messages/UserGitHubWebhookMessage.ts +16 -4
package/tui/src/components/messages/UserToolResultMessage/utils.tsx +3 -2
package/tui/src/components/permissions/MonitorPermissionRequest/MonitorPermissionRequest.ts +9 -4
package/tui/src/components/permissions/ReviewArtifactPermissionRequest/ReviewArtifactPermissionRequest.ts +9 -4
package/tui/src/components/primitive/DocumentSocraticReviewBlock.tsx +129 -0
package/tui/src/components/primitive/DocumentToolResultCard.tsx +224 -0
package/tui/src/components/primitive/documentSocraticReview.ts +215 -0
package/tui/src/components/primitive/index.tsx +43 -1
package/tui/src/components/primitive/types.ts +137 -0
package/tui/src/components/ui/option.ts +4 -26
package/tui/src/constants/common.ts +0 -2
package/tui/src/constants/prompts.ts +4 -3
package/tui/src/constants/querySource.ts +4 -26
package/tui/src/entrypoints/sdk/controlTypes.ts +26 -48
package/tui/src/entrypoints/sdk/coreTypes.generated.ts +3 -25
package/tui/src/entrypoints/sdk/runtimeTypes.ts +38 -60
package/tui/src/entrypoints/sdk/sdkUtilityTypes.ts +4 -26
package/tui/src/entrypoints/sdk/settingsTypes.generated.ts +3 -25
package/tui/src/entrypoints/sdk/toolTypes.ts +3 -25
package/tui/src/hooks/toolPermission/handlers/interactiveHandler.ts +10 -0
package/tui/src/hooks/useApiKeyVerification.ts +1 -1
package/tui/src/hooks/useVirtualScroll.ts +1 -1
package/tui/src/ink/ink.tsx +33 -14
package/tui/src/ink/reconciler.ts +2 -3
package/tui/src/ink/render-to-screen.ts +30 -10
package/tui/src/ipc/bridge.ts +62 -15
package/tui/src/ipc/bridgeSingleton.ts +5 -1
package/tui/src/ipc/codec.ts +3 -3
package/tui/src/ipc/frames.generated.ts +12 -12
package/tui/src/ipc/llmClient.ts +151 -27
package/tui/src/ipc/schema/frame.schema.json +1 -1
package/tui/src/keybindings/defaultBindings.ts +4 -0
package/tui/src/main.tsx +32 -15
package/tui/src/native-ts/file-index/index.ts +33 -3
package/tui/src/observability/surface.ts +2 -2
package/tui/src/probes/toolRegistryProbe.tsx +3 -1
package/tui/src/projectOnboardingState.ts +7 -6
package/tui/src/query/chatMessageTypes.ts +18 -0
package/tui/src/query/chatMessagesBuilder.ts +1 -1
package/tui/src/query/deps.ts +1 -1
package/tui/src/query/messageGuards.ts +106 -0
package/tui/src/query/publicDataTerminalRepair.ts +384 -0
package/tui/src/query/run.ts +1075 -0
package/tui/src/query/supportBoundary.ts +168 -0
package/tui/src/query/toolResultErrors.ts +103 -0
package/tui/src/query/toolRunner.ts +687 -0
package/tui/src/query/unavailableToolRepair.ts +118 -0
package/tui/src/query.ts +9 -2186
package/tui/src/screens/REPL.tsx +40 -29
package/tui/src/services/api/adapterManifest.ts +4 -0
package/tui/src/services/api/backendChat/events.ts +117 -0
package/tui/src/services/api/backendChat/finalMessage.ts +40 -0
package/tui/src/services/api/backendChat/frame.ts +9 -0
package/tui/src/services/api/backendChat/streaming.ts +430 -0
package/tui/src/services/api/backendChat/types.ts +62 -0
package/tui/src/services/api/backendChat.ts +1 -0
package/tui/src/services/api/client.ts +65 -2
package/tui/src/services/api/errorUtils.ts +5 -5
package/tui/src/services/api/errors.ts +1 -1
package/tui/src/services/api/logging.ts +1 -1
package/tui/src/services/api/ummaya/evidence.ts +194 -0
package/tui/src/services/api/ummaya/messages.ts +255 -0
package/tui/src/services/api/ummaya/nonStreaming.ts +66 -0
package/tui/src/services/api/ummaya/provider.ts +200 -0
package/tui/src/services/api/ummaya/reasoning.ts +24 -0
package/tui/src/services/api/ummaya/request.ts +200 -0
package/tui/src/services/api/ummaya/selectionContext.ts +240 -0
package/tui/src/services/api/ummaya/streaming.ts +365 -0
package/tui/src/services/api/ummaya/streamingPayload.ts +129 -0
package/tui/src/services/api/ummaya/streamingReader.ts +40 -0
package/tui/src/services/api/ummaya/toolSelection.ts +217 -0
package/tui/src/services/api/ummaya/types.ts +110 -0
package/tui/src/services/api/ummaya/usage.ts +30 -0
package/tui/src/services/api/ummaya.ts +26 -418
package/tui/src/services/api/withRetry.ts +1 -1
package/tui/src/services/awaySummary.ts +2 -2
package/tui/src/services/claudeAiLimits.ts +1 -1
package/tui/src/services/compact/autoCompact.ts +1 -1
package/tui/src/services/compact/compact.ts +1 -1
package/tui/src/services/lsp/types.ts +8 -30
package/tui/src/services/tips/types.ts +6 -28
package/tui/src/services/tokenEstimation.ts +1 -1
package/tui/src/services/toolRegistry/bootGuard.ts +5 -5
package/tui/src/services/toolUseSummary/toolUseSummaryGenerator.ts +1 -1
package/tui/src/services/tools/toolExecution.ts +94 -1
package/tui/src/store/pendingPermissionSlot.ts +1 -1
package/tui/src/store/session-store.ts +10 -36
package/tui/src/stubs/any-stub.ts +15 -10
package/tui/src/stubs/color-diff-napi.ts +37 -23
package/tui/src/stubs/globals.d.ts +3 -3
package/tui/src/stubs/macro-preload.ts +23 -12
package/tui/src/tools/AdapterTool/AdapterTool.ts +1207 -714
package/tui/src/tools/AdapterTool/routeDiagnostics.ts +75 -0
package/tui/src/tools/AgentTool/AgentTool.tsx +84 -1371
package/tui/src/tools/AgentTool/agentToolHandoff.ts +114 -0
package/tui/src/tools/AgentTool/agentToolPartialResult.ts +16 -0
package/tui/src/tools/AgentTool/agentToolProgress.ts +32 -0
package/tui/src/tools/AgentTool/agentToolResolver.ts +161 -0
package/tui/src/tools/AgentTool/agentToolResult.ts +163 -0
package/tui/src/tools/AgentTool/agentToolUtils.ts +14 -686
package/tui/src/tools/AgentTool/asyncAgentLifecycle.ts +208 -0
package/tui/src/tools/AgentTool/asyncLifecycle.ts +153 -0
package/tui/src/tools/AgentTool/backgroundedCompletion.ts +126 -0
package/tui/src/tools/AgentTool/backgroundedLifecycle.ts +174 -0
package/tui/src/tools/AgentTool/foregroundBackground.ts +83 -0
package/tui/src/tools/AgentTool/foregroundDrain.tsx +133 -0
package/tui/src/tools/AgentTool/foregroundFinalize.ts +98 -0
package/tui/src/tools/AgentTool/foregroundLifecycle.tsx +237 -0
package/tui/src/tools/AgentTool/foregroundProgress.tsx +169 -0
package/tui/src/tools/AgentTool/foregroundTask.ts +89 -0
package/tui/src/tools/AgentTool/forkSubagent.ts +1 -12
package/tui/src/tools/AgentTool/forkSubagentGate.ts +34 -0
package/tui/src/tools/AgentTool/launchRouting.ts +203 -0
package/tui/src/tools/AgentTool/lifecycle.ts +244 -0
package/tui/src/tools/AgentTool/mcpRouting.ts +73 -0
package/tui/src/tools/AgentTool/orchestrationSupport.ts +70 -0
package/tui/src/tools/AgentTool/permissions.ts +39 -0
package/tui/src/tools/AgentTool/promptSetup.ts +181 -0
package/tui/src/tools/AgentTool/remoteRouting.ts +62 -0
package/tui/src/tools/AgentTool/resultMapping.ts +116 -0
package/tui/src/tools/AgentTool/resumeAgent.ts +39 -107
package/tui/src/tools/AgentTool/resumeAgentHelpers.ts +140 -0
package/tui/src/tools/AgentTool/runAgent.ts +1 -1
package/tui/src/tools/AgentTool/runtimeConfig.ts +57 -0
package/tui/src/tools/AgentTool/schemas.ts +196 -0
package/tui/src/tools/AgentTool/sourceVerificationPropagation.ts +263 -0
package/tui/src/tools/AgentTool/worktreeLifecycle.ts +105 -0
package/tui/src/tools/AskUserQuestionTool/AskUserQuestionTool.tsx +174 -202
package/tui/src/tools/BashTool/BashTool.tsx +71 -1072
package/tui/src/tools/BashTool/bashCommandHelpers.ts +12 -12
package/tui/src/tools/BashTool/bashPermissions/astPreflight.ts +173 -0
package/tui/src/tools/BashTool/bashPermissions/classifierChecks.ts +199 -0
package/tui/src/tools/BashTool/bashPermissions/compoundGuards.ts +53 -0
package/tui/src/tools/BashTool/bashPermissions/constants.ts +99 -0
package/tui/src/tools/BashTool/bashPermissions/index.ts +38 -0
package/tui/src/tools/BashTool/bashPermissions/legacyMisparsing.ts +62 -0
package/tui/src/tools/BashTool/bashPermissions/main.ts +135 -0
package/tui/src/tools/BashTool/bashPermissions/normalizedCommands.ts +33 -0
package/tui/src/tools/BashTool/bashPermissions/operatorFlow.ts +98 -0
package/tui/src/tools/BashTool/bashPermissions/permissionChecks.ts +200 -0
package/tui/src/tools/BashTool/bashPermissions/prefixSuggestions.ts +88 -0
package/tui/src/tools/BashTool/bashPermissions/promptClassifierRules.ts +125 -0
package/tui/src/tools/BashTool/bashPermissions/ruleDelegates.ts +19 -0
package/tui/src/tools/BashTool/bashPermissions/ruleMatching.ts +145 -0
package/tui/src/tools/BashTool/bashPermissions/sandboxAutoAllow.ts +75 -0
package/tui/src/tools/BashTool/bashPermissions/subcommandFlow.ts +205 -0
package/tui/src/tools/BashTool/bashPermissions/subcommandGuards.ts +73 -0
package/tui/src/tools/BashTool/bashPermissions/subcommandResultHelpers.ts +116 -0
package/tui/src/tools/BashTool/bashPermissions/types.ts +26 -0
package/tui/src/tools/BashTool/bashPermissions/wrapperStripping.ts +139 -0
package/tui/src/tools/BashTool/bashPermissions.ts +26 -2621
package/tui/src/tools/BashTool/call.ts +202 -0
package/tui/src/tools/BashTool/callLoader.ts +35 -0
package/tui/src/tools/BashTool/commandClassification.ts +151 -0
package/tui/src/tools/BashTool/commandClassificationLoader.ts +40 -0
package/tui/src/tools/BashTool/cwdReset.ts +33 -0
package/tui/src/tools/BashTool/lineTruncation.ts +11 -0
package/tui/src/tools/BashTool/modeValidation.ts +13 -1
package/tui/src/tools/BashTool/outputPersistence.ts +42 -0
package/tui/src/tools/BashTool/permissionClassification.ts +66 -0
package/tui/src/tools/BashTool/permissionLoader.ts +44 -0
package/tui/src/tools/BashTool/resultLoader.ts +29 -0
package/tui/src/tools/BashTool/resultMapping.ts +83 -0
package/tui/src/tools/BashTool/sandboxPolicy.ts +79 -0
package/tui/src/tools/BashTool/schemas.ts +65 -0
package/tui/src/tools/BashTool/sedEditExecution.ts +59 -0
package/tui/src/tools/BashTool/shellExecution.tsx +245 -0
package/tui/src/tools/BashTool/shellOutputUtils.ts +85 -0
package/tui/src/tools/BashTool/shellPermissionGauntlet.ts +97 -0
package/tui/src/tools/BashTool/uiLoader.ts +37 -0
package/tui/src/tools/BriefTool/upload.ts +1 -1
package/tui/src/tools/CalculatorTool/parser.ts +2 -2
package/tui/src/tools/DocumentPrimitive/DocumentPrimitive.ts +262 -0
package/tui/src/tools/DocumentPrimitive/dispatchNormalization.ts +270 -0
package/tui/src/tools/DocumentPrimitive/documentDestinationPath.ts +18 -0
package/tui/src/tools/DocumentPrimitive/documentMutationGuard.ts +22 -0
package/tui/src/tools/DocumentPrimitive/documentPatchNormalization.ts +248 -0
package/tui/src/tools/DocumentPrimitive/documentSourceVerification.ts +245 -0
package/tui/src/tools/DocumentPrimitive/documentSourceVerificationFields.ts +103 -0
package/tui/src/tools/DocumentPrimitive/modelVisibleOutput.ts +40 -0
package/tui/src/tools/DocumentPrimitive/prompt.ts +35 -0
package/tui/src/tools/FileEditTool/FileEditTool.ts +9 -507
package/tui/src/tools/FileEditTool/call.ts +228 -0
package/tui/src/tools/FileEditTool/validateInput.ts +196 -0
package/tui/src/tools/FileReadTool/imageProcessor.ts +13 -0
package/tui/src/tools/FileWriteTool/FileWriteTool.ts +7 -300
package/tui/src/tools/FileWriteTool/call.ts +223 -0
package/tui/src/tools/FileWriteTool/validateInput.ts +80 -0
package/tui/src/tools/ListMcpResourcesTool/ListMcpResourcesTool.ts +19 -3
package/tui/src/tools/LookupPrimitive/LookupPrimitive.ts +25 -32
package/tui/src/tools/LookupPrimitive/prompt.ts +0 -2
package/tui/src/tools/MCPTool/trustPolicy.ts +118 -0
package/tui/src/tools/McpAuthTool/McpAuthTool.ts +21 -3
package/tui/src/tools/NotebookEditTool/NotebookEditTool.ts +7 -326
package/tui/src/tools/NotebookEditTool/call.ts +254 -0
package/tui/src/tools/NotebookEditTool/notebookModel.ts +51 -0
package/tui/src/tools/NotebookEditTool/validateInput.ts +142 -0
package/tui/src/tools/PowerShellTool/PowerShellTool.tsx +46 -937
package/tui/src/tools/PowerShellTool/acceptEditsCommandValidation.ts +162 -0
package/tui/src/tools/PowerShellTool/call.ts +179 -0
package/tui/src/tools/PowerShellTool/callLoader.ts +37 -0
package/tui/src/tools/PowerShellTool/commandClassification.ts +86 -0
package/tui/src/tools/PowerShellTool/modeValidation.ts +25 -332
package/tui/src/tools/PowerShellTool/outputPersistence.ts +42 -0
package/tui/src/tools/PowerShellTool/permissionClassification.ts +28 -0
package/tui/src/tools/PowerShellTool/resultLoader.ts +31 -0
package/tui/src/tools/PowerShellTool/resultMapping.ts +75 -0
package/tui/src/tools/PowerShellTool/schemas.ts +40 -0
package/tui/src/tools/PowerShellTool/shellExecution.tsx +258 -0
package/tui/src/tools/PowerShellTool/symlinkModeValidation.ts +44 -0
package/tui/src/tools/PowerShellTool/uiLoader.ts +37 -0
package/tui/src/tools/PowerShellTool/validation.ts +39 -0
package/tui/src/tools/ReadMcpResourceTool/ReadMcpResourceTool.ts +19 -3
package/tui/src/tools/ResolveLocationPrimitive/ResolveLocationPrimitive.ts +1 -11
package/tui/src/tools/ResolveLocationPrimitive/prompt.ts +2 -6
package/tui/src/tools/SkillTool/SkillTool.ts +2 -2
package/tui/src/tools/SubmitPrimitive/SubmitPrimitive.ts +27 -10
package/tui/src/tools/TaskCreateTool/TaskCreateTool.ts +16 -2
package/tui/src/tools/TaskGetTool/TaskGetTool.ts +23 -3
package/tui/src/tools/TaskListTool/TaskListTool.ts +22 -4
package/tui/src/tools/TaskOutputTool/TaskOutputTool.tsx +46 -547
package/tui/src/tools/TaskOutputTool/lookup.ts +216 -0
package/tui/src/tools/TaskOutputTool/render.tsx +257 -0
package/tui/src/tools/TaskOutputTool/schemas.ts +55 -0
package/tui/src/tools/TaskOutputTool/serialization.ts +36 -0
package/tui/src/tools/TaskStopTool/TaskStopTool.ts +10 -0
package/tui/src/tools/TaskUpdateTool/TaskUpdateTool.ts +14 -364
package/tui/src/tools/TaskUpdateTool/completion.ts +62 -0
package/tui/src/tools/TaskUpdateTool/schemas.ts +62 -0
package/tui/src/tools/TaskUpdateTool/serialization.ts +46 -0
package/tui/src/tools/TaskUpdateTool/statusUpdate.ts +247 -0
package/tui/src/tools/TodoWriteTool/TodoWriteTool.ts +21 -2
package/tui/src/tools/ToolSearchTool/ToolSearchTool.ts +21 -302
package/tui/src/tools/ToolSearchTool/ccSupportTools.ts +223 -0
package/tui/src/tools/ToolSearchTool/descriptionCache.ts +50 -0
package/tui/src/tools/ToolSearchTool/keywordSearch.ts +216 -0
package/tui/src/tools/ToolSearchTool/prompt.ts +10 -4
package/tui/src/tools/ToolSearchTool/resultMapping.ts +30 -0
package/tui/src/tools/ToolSearchTool/schemas.ts +30 -0
package/tui/src/tools/ToolSearchTool/searchPool.ts +47 -0
package/tui/src/tools/ToolSearchTool/supportIntentHints.ts +140 -0
package/tui/src/tools/TranslateTool/TranslateTool.ts +1 -1
package/tui/src/tools/VerifyPrimitive/VerifyPrimitive.ts +2 -1
package/tui/src/tools/WebFetchTool/WebFetchTool.ts +43 -138
package/tui/src/tools/WebFetchTool/call.ts +227 -0
package/tui/src/tools/WebFetchTool/resolvedAddressSafety.ts +78 -0
package/tui/src/tools/WebFetchTool/sourceVerification.ts +204 -0
package/tui/src/tools/WebFetchTool/types.ts +23 -0
package/tui/src/tools/WebFetchTool/urlSafety.ts +181 -0
package/tui/src/tools/WebFetchTool/utils.ts +1 -1
package/tui/src/tools/WebSearchTool/UI.tsx +0 -1
package/tui/src/tools/WebSearchTool/WebSearchTool.ts +9 -313
package/tui/src/tools/WebSearchTool/call.ts +33 -0
package/tui/src/tools/WebSearchTool/responseMapping.ts +190 -0
package/tui/src/tools/WebSearchTool/resultBlock.ts +47 -0
package/tui/src/tools/WebSearchTool/schemas.ts +47 -0
package/tui/src/tools/WebSearchTool/toolSchema.ts +12 -0
package/tui/src/tools/WorkspaceToolAdapter/WorkspaceToolAdapter.ts +79 -0
package/tui/src/tools/WorkspaceToolAdapter/allowedRootPolicy.ts +85 -0
package/tui/src/tools/WorkspaceToolAdapter/documentFormatGuards.ts +73 -0
package/tui/src/tools/WorkspaceToolAdapter/inputNormalization.ts +105 -0
package/tui/src/tools/WorkspaceToolAdapter/mcpExposurePolicy.ts +64 -0
package/tui/src/tools/WorkspaceToolAdapter/toolDefFactory.ts +215 -0
package/tui/src/tools/WorkspaceToolAdapter/toolNames.ts +6 -0
package/tui/src/tools/WorkspaceToolAdapter/workspacePolicy.ts +15 -0
package/tui/src/tools/_shared/dispatchPrimitive.ts +6 -6
package/tui/src/tools/_shared/documentChangeToPatch.ts +125 -0
package/tui/src/tools/_shared/documentDispatchArguments.ts +87 -0
package/tui/src/tools/_shared/documentPrimitiveTimeout.ts +13 -0
package/tui/src/tools/_shared/documentToolResultRender.ts +98 -0
package/tui/src/tools/_shared/pendingCallRegistry.ts +1 -6
package/tui/src/tools/_shared/rootPrimitiveInput.ts +1 -0
package/tui/src/tools/_shared/toolChoiceRepair/documentCompletionPatterns.ts +58 -0
package/tui/src/tools/_shared/toolChoiceRepair/documentCompletionPrompt.ts +271 -0
package/tui/src/tools/_shared/toolChoiceRepair/documentRepair.ts +452 -0
package/tui/src/tools/_shared/toolChoiceRepair/messageAccess.ts +80 -0
package/tui/src/tools/_shared/toolChoiceRepair/publicDataRepair.ts +92 -0
package/tui/src/tools/_shared/toolChoiceRepair/supportRepair.ts +135 -0
package/tui/src/tools/_shared/toolChoiceRepair.ts +55 -860
package/tui/src/tools/shared/mockDisclaimer.ts +1 -1
package/tui/src/tools.ts +39 -190
package/tui/src/types/fileSuggestion.ts +4 -26
package/tui/src/types/generated/events_mono/claude_code/v1/claude_code_internal_event.ts +186 -148
package/tui/src/types/generated/events_mono/common/v1/auth.ts +25 -11
package/tui/src/types/generated/events_mono/growthbook/v1/growthbook_experiment_event.ts +47 -30
package/tui/src/types/generated/google/protobuf/timestamp.ts +21 -7
package/tui/src/types/message.ts +80 -102
package/tui/src/types/messageQueueTypes.ts +6 -28
package/tui/src/types/notebook.ts +16 -38
package/tui/src/types/statusLine.ts +4 -26
package/tui/src/types/tools.ts +24 -46
package/tui/src/types/utils.ts +6 -28
package/tui/src/upstreamproxy/relay.ts +7 -3
package/tui/src/upstreamproxy/upstreamproxy.ts +1 -1
package/tui/src/utils/assistantMessageFactories.ts +9 -3
package/tui/src/utils/auth.ts +129 -139
package/tui/src/utils/bash/ast.ts +23 -23
package/tui/src/utils/bash/bashParser.ts +5 -5
package/tui/src/utils/billing.ts +1 -1
package/tui/src/utils/claudeDesktop.ts +4 -4
package/tui/src/utils/collapseReadSearch.ts +3 -3
package/tui/src/utils/cronTasks.ts +1 -1
package/tui/src/utils/execFileNoThrow.ts +1 -1
package/tui/src/utils/filePersistence/types.ts +16 -38
package/tui/src/utils/forkedAgent.ts +1 -1
package/tui/src/utils/gracefulShutdown.ts +4 -4
package/tui/src/utils/heapDumpService.ts +12 -8
package/tui/src/utils/hooks/apiQueryHookHelper.ts +1 -1
package/tui/src/utils/hooks/execPromptHook.ts +1 -1
package/tui/src/utils/hooks/skillImprovement.ts +1 -1
package/tui/src/utils/mcp/dateTimeParser.ts +1 -1
package/tui/src/utils/messages.ts +18 -0
package/tui/src/utils/migrateSessions.ts +3 -3
package/tui/src/utils/model/model.ts +6 -6
package/tui/src/utils/permissions/yoloClassifier.ts +1 -1
package/tui/src/utils/plugins/headlessPluginInstall.ts +1 -1
package/tui/src/utils/plugins/mcpPluginIntegration.ts +1 -1
package/tui/src/utils/plugins/mcpbHandler.ts +1 -1
package/tui/src/utils/plugins/pluginLoader.ts +8 -8
package/tui/src/utils/protectedNamespace.ts +5 -3
package/tui/src/utils/rawJsonToolCall.ts +242 -0
package/tui/src/utils/ripgrep.ts +16 -7
package/tui/src/utils/sessionTitle.ts +1 -1
package/tui/src/utils/settings/permissionValidation.ts +14 -2
package/tui/src/utils/shell/prefix.ts +1 -1
package/tui/src/utils/sideQuery.ts +1 -1
package/tui/src/utils/systemThemeWatcher.ts +13 -3
package/tui/src/utils/teleport.tsx +1 -1
package/uv.lock +426 -45
package/tui/src/services/api/claude.ts +0 -3540
package/tui/src/tools/_shared/directPublicDataGuard.ts +0 -362
package/tui/src/tools/_shared/kmaAnalysisGuard.ts +0 -197
package/tui/src/tools/_shared/kmaAviationGuard.ts +0 -70
package/tui/src/tools/_shared/nmcAedGuard.ts +0 -234
package/tui/src/tools/_shared/protectedCheckGuard.ts +0 -207
package/tui/src/tools/_shared/textToolCallGuard.ts +0 -91

package/tui/src/services/api/claude.ts (deleted)

@@ -1,3540 +0,0 @@
-// SPDX-License-Identifier: Apache-2.0
-// Spec 2521 — byte-copy(2521) baseline restored from
-//   .references/claude-code-sourcemap/restored-src/src/services/api/claude.ts
-//   (CC 2.1.88, SHA-256 6d3fd16e608120d502e70ec461ffb66bcbca12fa86862859606c9118f977a999).
-// Three labeled swap commits layer atop the byte-copy:
-//   • swap/llm-provider(2521)     — @anthropic-ai/sdk imports → sdk-compat.ts
-//   • swap/anti-anthropic-1p(2521) — claude.ai 1P call-graph deadened via
-//     UMMAYA-stubbed support modules (services/claudeAiLimits.ts + utils/auth.ts
-//     are inert no-ops since Epic #1633). The 1P functions in this file
-//     (getOauthAccountInfo, currentLimits, extractQuotaStatusFromHeaders,
-//     getCLISyspromptPrefix's claude.ai branches, account_uuid telemetry)
-//     remain in the byte-copy text but resolve to no-op returns at runtime.
-//     No UMMAYA callers reach this file (verified post-Spec-2293), so the
-//     1P graph is doubly dead — by callgraph (no callers) and by support-
-//     module inertness. Spec 2521 byte-copy philosophy (FR-002): keep the
-//     reference text intact; deactivate via supporting infrastructure
-//     instead of deleting in-file.
-//   • swap/identifier-rename(2521) — citizen-visible Claude/Anthropic brand
-//     tokens → UMMAYA/EXAONE/FriendliAI (T013).
-// This file has zero callers in tui/src after Spec 2293; it is retained as
-// the authoritative CC streaming-handler reference for future audit replays
-// (specs/2521-llm-swap-cc-rebuild/scripts/replay_rebuild.sh).
-import type {
-  BetaContentBlock,
-  BetaContentBlockParam,
-  BetaImageBlockParam,
-  BetaJSONOutputFormat,
-  BetaMessage,
-  BetaMessageDeltaUsage,
-  BetaMessageStreamParams,
-  BetaOutputConfig,
-  BetaRawMessageStreamEvent,
-  BetaRequestDocumentBlock,
-  BetaStopReason,
-  BetaToolChoiceAuto,
-  BetaToolChoiceTool,
-  BetaToolResultBlockParam,
-  BetaToolUnion,
-  BetaUsage,
-  BetaMessageParam as MessageParam,
-} from '../../sdk-compat.js'
-import type { TextBlockParam } from '../../sdk-compat.js'
-import type { Stream } from '../../sdk-compat.js'
-import { randomUUID } from 'crypto'
-import {
-  getAPIProvider,
-  isFirstPartyAnthropicBaseUrl,
-} from 'src/utils/model/providers.js'
-import {
-  getAttributionHeader,
-  getCLISyspromptPrefix,
-} from '../../constants/system.js'
-import {
-  getEmptyToolPermissionContext,
-  type QueryChainTracking,
-  type Tool,
-  type ToolPermissionContext,
-  type Tools,
-  toolMatchesName,
-} from '../../Tool.js'
-import type { AgentDefinition } from '../../tools/AgentTool/loadAgentsDir.js'
-import {
-  type ConnectorTextBlock,
-  type ConnectorTextDelta,
-  isConnectorTextBlock,
-} from '../../types/connectorText.js'
-import type {
-  AssistantMessage,
-  Message,
-  StreamEvent,
-  SystemAPIErrorMessage,
-  UserMessage,
-} from '../../types/message.js'
-import {
-  type CacheScope,
-  logAPIPrefix,
-  splitSysPromptPrefix,
-  toolToAPISchema,
-} from '../../utils/api.js'
-import { getOauthAccountInfo } from '../../utils/auth.js'
-import {
-  getBedrockExtraBodyParamsBetas,
-  getMergedBetas,
-  getModelBetas,
-} from '../../utils/betas.js'
-import { getOrCreateUserID } from '../../utils/config.js'
-import {
-  CAPPED_DEFAULT_MAX_TOKENS,
-  getModelMaxOutputTokens,
-  getSonnet1mExpTreatmentEnabled,
-} from '../../utils/context.js'
-import { resolveAppliedEffort } from '../../utils/effort.js'
-import { isEnvTruthy } from '../../utils/envUtils.js'
-import { errorMessage } from '../../utils/errors.js'
-import { computeFingerprintFromMessages } from '../../utils/fingerprint.js'
-import { captureAPIRequest, logError } from '../../utils/log.js'
-import { normalizeMessagesForAPI } from '../../utils/messageApiNormalize.js'
-import * as messageUtils from '../../utils/messages.js'
-import { createAssistantAPIErrorMessage } from '../../utils/assistantMessageFactories.js'
-import { createUserMessage } from '../../utils/userMessageFactories.js'
-import {
-  getDefaultOpusModel,
-  getDefaultSonnetModel,
-  getSmallFastModel,
-  isNonCustomOpusModel,
-} from '../../utils/model/model.js'
-import {
-  asSystemPrompt,
-  type SystemPrompt,
-} from '../../utils/systemPromptType.js'
-import { tokenCountFromLastAPIResponse } from '../../utils/tokens.js'
-import { getDynamicConfig_BLOCKS_ON_INIT } from '../analytics/growthbook.js'
-import {
-  currentLimits,
-  extractQuotaStatusFromError,
-  extractQuotaStatusFromHeaders,
-} from '../claudeAiLimits.js'
-import { getAPIContextManagement } from '../compact/apiMicrocompact.js'
-/* eslint-disable @typescript-eslint/no-require-imports */
-const autoModeStateModule = feature('TRANSCRIPT_CLASSIFIER')
-  ? (require('../../utils/permissions/autoModeState.js') as typeof import('../../utils/permissions/autoModeState.js'))
-  : null
-import { feature } from 'bun:bundle'
-// SWAP/llm-provider(2521): @anthropic-ai/sdk + /error → UMMAYA sdk-compat
-// aliases (ClientOptions, APIError, APIConnectionTimeoutError, APIUserAbortError
-// all re-exported by sdk-compat.ts as structural stubs).
-import type { ClientOptions } from '../../sdk-compat.js'
-import type { ReasoningMode } from '../../utils/kExaoneReasoning.js'
-import {
-  APIConnectionTimeoutError,
-  APIError,
-  APIUserAbortError,
-} from '../../sdk-compat.js'
-import {
-  getAfkModeHeaderLatched,
-  getCacheEditingHeaderLatched,
-  getFastModeHeaderLatched,
-  getLastApiCompletionTimestamp,
-  getPromptCache1hAllowlist,
-  getPromptCache1hEligible,
-  getSessionId,
-  getThinkingClearLatched,
-  setAfkModeHeaderLatched,
-  setCacheEditingHeaderLatched,
-  setFastModeHeaderLatched,
-  setLastMainRequestId,
-  setPromptCache1hAllowlist,
-  setPromptCache1hEligible,
-  setThinkingClearLatched,
-} from 'src/bootstrap/state.js'
-import {
-  AFK_MODE_BETA_HEADER,
-  CONTEXT_1M_BETA_HEADER,
-  CONTEXT_MANAGEMENT_BETA_HEADER,
-  EFFORT_BETA_HEADER,
-  FAST_MODE_BETA_HEADER,
-  PROMPT_CACHING_SCOPE_BETA_HEADER,
-  REDACT_THINKING_BETA_HEADER,
-  STRUCTURED_OUTPUTS_BETA_HEADER,
-  TASK_BUDGETS_BETA_HEADER,
-} from 'src/constants/betas.js'
-import type { QuerySource } from 'src/constants/querySource.js'
-import type { Notification } from 'src/context/notifications.js'
-import { addToTotalSessionCost } from 'src/cost-tracker.js'
-import { getFeatureValue_CACHED_MAY_BE_STALE } from 'src/services/analytics/growthbook.js'
-import type { AgentId } from 'src/types/ids.js'
-import {
-  ADVISOR_TOOL_INSTRUCTIONS,
-  getExperimentAdvisorModels,
-  isAdvisorEnabled,
-  isValidAdvisorModel,
-  modelSupportsAdvisor,
-} from 'src/utils/advisor.js'
-import { getAgentContext } from 'src/utils/agentContext.js'
-import { isClaudeAISubscriber } from 'src/utils/auth.js'
-import {
-  getToolSearchBetaHeader,
-  modelSupportsStructuredOutputs,
-  shouldIncludeFirstPartyOnlyBetas,
-  shouldUseGlobalCacheScope,
-} from 'src/utils/betas.js'
-import { CLAUDE_IN_CHROME_MCP_SERVER_NAME } from 'src/utils/claudeInChrome/common.js'
-import { CHROME_TOOL_SEARCH_INSTRUCTIONS } from 'src/utils/claudeInChrome/prompt.js'
-import { getMaxThinkingTokensForModel } from 'src/utils/context.js'
-import { logForDebugging } from 'src/utils/debug.js'
-import { logForDiagnosticsNoPII } from 'src/utils/diagLogs.js'
-import { type EffortValue, modelSupportsEffort } from 'src/utils/effort.js'
-import {
-  isFastModeAvailable,
-  isFastModeCooldown,
-  isFastModeEnabled,
-  isFastModeSupportedByModel,
-} from 'src/utils/fastMode.js'
-import { returnValue } from 'src/utils/generators.js'
-import { headlessProfilerCheckpoint } from 'src/utils/headlessProfiler.js'
-import { isMcpInstructionsDeltaEnabled } from 'src/utils/mcpInstructionsDelta.js'
-import { calculateUSDCost } from 'src/utils/modelCost.js'
-import { endQueryProfile, queryCheckpoint } from 'src/utils/queryProfiler.js'
-import {
-  modelSupportsAdaptiveThinking,
-  modelSupportsThinking,
-  type ThinkingConfig,
-} from 'src/utils/thinking.js'
-import {
-  extractDiscoveredToolNames,
-  isDeferredToolsDeltaEnabled,
-  isToolSearchEnabled,
-} from 'src/utils/toolSearch.js'
-import { API_MAX_MEDIA_PER_REQUEST } from '../../constants/apiLimits.js'
-import { ADVISOR_BETA_HEADER } from '../../constants/betas.js'
-import {
-  formatDeferredToolLine,
-  isDeferredTool,
-  TOOL_SEARCH_TOOL_NAME,
-} from '../../tools/ToolSearchTool/prompt.js'
-import {
-  getAdapterToolByName,
-  selectTopKAdapterToolNamesForQuery,
-} from '../../tools/AdapterTool/AdapterTool.js'
-import { isNonSyntheticUserText } from '../../tools/_shared/citizenUserText.js'
-import { shouldSuppressUmmayaToolCallsForAnswerSynthesis } from '../../tools/_shared/toolChoiceRepair.js'
-import { count } from '../../utils/array.js'
-import { insertBlockAfterToolResults } from '../../utils/contentArray.js'
-import { validateBoundedIntEnvVar } from '../../utils/envValidation.js'
-import { safeParseJSON } from '../../utils/json.js'
-import { getInferenceProfileBackingModel } from '../../utils/model/bedrock.js'
-import {
-  normalizeModelStringForAPI,
-  parseUserSpecifiedModel,
-} from '../../utils/model/model.js'
-import {
-  startSessionActivity,
-  stopSessionActivity,
-} from '../../utils/sessionActivity.js'
-import { jsonStringify } from '../../utils/slowOperations.js'
-import {
-  isBetaTracingEnabled,
-  type LLMRequestNewContext,
-  startLLMRequestSpan,
-} from '../../utils/telemetry/sessionTracing.js'
-/* eslint-enable @typescript-eslint/no-require-imports */
-import {
-  type AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
-  logEvent,
-} from '../analytics/index.js'
-import {
-  consumePendingCacheEdits,
-  getPinnedCacheEdits,
-  markToolsSentToAPIState,
-  pinCacheEdits,
-} from '../compact/microCompact.js'
-import { getInitializationStatus } from '../lsp/manager.js'
-import { isToolFromMcpServer } from '../mcp/utils.js'
-import { withStreamingVCR, withVCR } from '../vcr.js'
-import { CLIENT_REQUEST_ID_HEADER, getAnthropicClient } from './client.js'
-import {
-  API_ERROR_MESSAGE_PREFIX,
-  CUSTOM_OFF_SWITCH_MESSAGE,
-  getAssistantMessageFromError,
-  getErrorMessageIfRefusal,
-} from './errors.js'
-import {
-  EMPTY_USAGE,
-  type GlobalCacheStrategy,
-  logAPIError,
-  logAPIQuery,
-  logAPISuccessAndDuration,
-  type NonNullableUsage,
-} from './logging.js'
-import {
-  CACHE_TTL_1HOUR_MS,
-  checkResponseForCacheBreak,
-  recordPromptState,
-} from './promptCacheBreakDetection.js'
-import {
-  CannotRetryError,
-  FallbackTriggeredError,
-  is529Error,
-  type RetryContext,
-  withRetry,
-} from './withRetry.js'
-const {
-  ensureToolResultPairing,
-  normalizeContentFromAPI,
-  stripAdvisorBlocks,
-  stripCallerFieldFromAssistantMessage,
-  stripToolReferenceBlocksFromUserMessage,
-} = messageUtils
-// Recursive description of every value that can appear in a JSON document:
-// primitives, null, objects and arrays.
-type JsonValue = string | number | boolean | null | JsonObject | JsonArray
-// A JSON object: string keys mapped to JSON values.
-type JsonObject = { [key: string]: JsonValue }
-// A JSON array: an ordered list of JSON values.
-type JsonArray = JsonValue[]
-/**
- * Builds the extra request-body fields for an API call.
- *
- * The result starts from the JSON object found in the CLAUDE_CODE_EXTRA_BODY
- * environment variable (when it is present and parses to a plain object),
- * optionally gains the anti-distillation opt-in, and finally has the supplied
- * beta headers merged into `anthropic_beta` (primarily for Bedrock requests).
- *
- * @param betaHeaders - Beta header names to merge into `anthropic_beta`.
- * @returns The assembled extra body parameters; `{}` when nothing applies.
- */
-export function getExtraBodyParams(betaHeaders?: string[]): JsonObject {
-  let result: JsonObject = {}
-  const rawExtraBody = process.env.CLAUDE_CODE_EXTRA_BODY
-  if (rawExtraBody) {
-    try {
-      // Any JSON value may come back here; only a plain object is usable.
-      const parsed = safeParseJSON(rawExtraBody)
-      const isPlainObject =
-        Boolean(parsed) && typeof parsed === 'object' && !Array.isArray(parsed)
-      if (isPlainObject) {
-        // Copy before use: safeParseJSON is LRU-cached and hands back the same
-        // object for the same string, so writing into it directly would
-        // poison the cache for later callers.
-        result = { ...(parsed as JsonObject) }
-      } else {
-        logForDebugging(
-          `CLAUDE_CODE_EXTRA_BODY env var must be a JSON object, but was given ${rawExtraBody}`,
-          { level: 'error' },
-        )
-      }
-    } catch (error) {
-      logForDebugging(
-        `Error parsing CLAUDE_CODE_EXTRA_BODY: ${errorMessage(error)}`,
-        { level: 'error' },
-      )
-    }
-  }
-  // Anti-distillation: the fake_tools opt-in is sent for the 1P CLI only.
-  if (
-    feature('ANTI_DISTILLATION_CC')
-      ? process.env.CLAUDE_CODE_ENTRYPOINT === 'cli' &&
-        shouldIncludeFirstPartyOnlyBetas() &&
-        getFeatureValue_CACHED_MAY_BE_STALE(
-          'tengu_anti_distill_fake_tool_injection',
-          false,
-        )
-      : false
-  ) {
-    result.anti_distillation = ['fake_tools']
-  }
-  // Nothing to merge when no beta headers were supplied.
-  if (!betaHeaders || betaHeaders.length === 0) {
-    return result
-  }
-  const currentBetas = result.anthropic_beta
-  if (currentBetas && Array.isArray(currentBetas)) {
-    // Append only the headers that are not already listed.
-    const alreadyListed = currentBetas as string[]
-    const additions = betaHeaders.filter(
-      header => !alreadyListed.includes(header),
-    )
-    result.anthropic_beta = alreadyListed.concat(additions)
-  } else {
-    // No usable existing list: the supplied headers become the list.
-    result.anthropic_beta = betaHeaders
-  }
-  return result
-}
-/**
- * Reports whether prompt caching should be used for `model`.
- *
- * DISABLE_PROMPT_CACHING turns caching off for every model. The
- * DISABLE_PROMPT_CACHING_HAIKU / _SONNET / _OPUS variables turn it off only
- * when `model` equals the small/fast, default Sonnet, or default Opus model
- * respectively.
- *
- * @param model - Model identifier the request will be sent to.
- * @returns `false` when a matching kill switch is set, otherwise `true`.
- */
-export function getPromptCachingEnabled(model: string): boolean {
-  // The global switch wins over everything else.
-  if (isEnvTruthy(process.env.DISABLE_PROMPT_CACHING)) {
-    return false
-  }
-  // Each entry pairs a per-model switch with the resolver for the model it
-  // targets; the resolver only runs when its switch is actually set.
-  const perModelSwitches: Array<[string | undefined, () => string]> = [
-    [process.env.DISABLE_PROMPT_CACHING_HAIKU, getSmallFastModel],
-    [process.env.DISABLE_PROMPT_CACHING_SONNET, getDefaultSonnetModel],
-    [process.env.DISABLE_PROMPT_CACHING_OPUS, getDefaultOpusModel],
-  ]
-  const disabledForModel = perModelSwitches.some(
-    ([flag, resolveTargetModel]) =>
-      isEnvTruthy(flag) && model === resolveTargetModel(),
-  )
-  return !disabledForModel
-}
-/**
- * Produces the `cache_control` marker attached to cacheable content blocks.
- *
- * The marker is always `ephemeral`. A `ttl` of `'1h'` is added when
- * should1hCacheTTL accepts the query source, and `scope` is added only when
- * the requested scope is `'global'`.
- *
- * @param scope - Requested cache scope; only `'global'` is emitted.
- * @param querySource - Origin of the query, used for the 1h TTL decision.
- * @returns The cache-control object to place on a content block.
- */
-export function getCacheControl({
-  scope,
-  querySource,
-}: {
-  scope?: CacheScope
-  querySource?: QuerySource
-} = {}): {
-  type: 'ephemeral'
-  ttl?: '1h'
-  scope?: CacheScope
-} {
-  const cacheControl: {
-    type: 'ephemeral'
-    ttl?: '1h'
-    scope?: CacheScope
-  } = { type: 'ephemeral' }
-  if (should1hCacheTTL(querySource)) {
-    cacheControl.ttl = '1h'
-  }
-  if (scope === 'global') {
-    cacheControl.scope = scope
-  }
-  return cacheControl
-}
-/**
- * Decides whether prompt caching should use the 1-hour TTL.
- *
- * Bedrock users opt in directly with ENABLE_PROMPT_CACHING_1H_BEDROCK.
- * Everyone else needs both of the following:
- * 1. The user is eligible (USER_TYPE `ant`, or a subscriber who is not
- *    using overage).
- * 2. The query source matches an entry of the GrowthBook allowlist.
- *
- * GrowthBook config shape: { allowlist: string[] }
- * An entry ending in '*' is a prefix match; any other entry must match
- * exactly. Examples:
- * - { allowlist: ["repl_main_thread*", "sdk"] } — main thread + SDK only
- * - { allowlist: ["repl_main_thread*", "sdk", "agent:*"] } — also subagents
- * - { allowlist: ["*"] } — all sources
- *
- * Both the eligibility result and the allowlist are stored in bootstrap
- * state the first time they are computed, so the TTL stays stable for the
- * rest of the session.
- */
-function should1hCacheTTL(querySource?: QuerySource): boolean {
-  // 3P Bedrock users manage their own billing and have no GrowthBook
-  // configuration, so the env var alone is enough for them.
-  const bedrockOptedIn =
-    getAPIProvider() === 'bedrock' &&
-    isEnvTruthy(process.env.ENABLE_PROMPT_CACHING_1H_BEDROCK)
-  if (bedrockOptedIn) {
-    return true
-  }
-  // Latch eligibility once: a mid-session overage flip would otherwise change
-  // the cache_control TTL and bust the server-side prompt cache.
-  let eligible = getPromptCache1hEligible()
-  if (eligible === null) {
-    eligible =
-      process.env.USER_TYPE === 'ant' ||
-      (isClaudeAISubscriber() && !currentLimits.isUsingOverage)
-    setPromptCache1hEligible(eligible)
-  }
-  if (!eligible) {
-    return false
-  }
-  // Latch the allowlist too, so a GrowthBook disk-cache refresh cannot mix
-  // TTLs within one session.
-  let patterns = getPromptCache1hAllowlist()
-  if (patterns === null) {
-    const remoteConfig = getFeatureValue_CACHED_MAY_BE_STALE<{
-      allowlist?: string[]
-    }>('tengu_prompt_cache_1h_config', {})
-    patterns = remoteConfig.allowlist ?? []
-    setPromptCache1hAllowlist(patterns)
-  }
-  if (querySource === undefined) {
-    return false
-  }
-  for (const pattern of patterns) {
-    const matches = pattern.endsWith('*')
-      ? querySource.startsWith(pattern.slice(0, -1))
-      : querySource === pattern
-    if (matches) {
-      return true
-    }
-  }
-  return false
-}
-/**
- * Applies the effort setting to an outgoing API request.
- *
- * Does nothing when the model has no effort support or when `outputConfig`
- * already carries an `effort` key. Otherwise:
- * - undefined effort: only the effort beta header is added;
- * - string effort: the level is written to `outputConfig.effort` and the beta
- *   header is added;
- * - numeric effort: honoured only when USER_TYPE is `ant`, where it is written
- *   to `extraBodyParams.anthropic_internal.effort_override`.
- *
- * @param effortValue - Requested effort (level name, number, or undefined).
- * @param outputConfig - Output config of the request; mutated in place.
- * @param extraBodyParams - Extra body parameters; mutated in place.
- * @param betas - Beta header list; mutated in place.
- * @param model - Model the request targets.
- */
-function configureEffortParams(
-  effortValue: EffortValue | undefined,
-  outputConfig: BetaOutputConfig,
-  extraBodyParams: Record<string, unknown>,
-  betas: string[],
-  model: string,
-): void {
-  if (!modelSupportsEffort(model)) {
-    return
-  }
-  if ('effort' in outputConfig) {
-    return
-  }
-  if (effortValue === undefined || typeof effortValue === 'string') {
-    if (effortValue !== undefined) {
-      // String effort levels are forwarded unchanged.
-      outputConfig.effort = effortValue
-    }
-    betas.push(EFFORT_BETA_HEADER)
-    return
-  }
-  // Numeric override is ant-only and travels through anthropic_internal.
-  if (process.env.USER_TYPE !== 'ant') {
-    return
-  }
-  const previousInternal =
-    (extraBodyParams.anthropic_internal as Record<string, unknown>) || {}
-  extraBodyParams.anthropic_internal = {
-    ...previousInternal,
-    effort_override: effortValue,
-  }
-}
-// output_config.task_budget — API-side token budget awareness for the model.
-// Stainless SDK types don't yet include task_budget on BetaOutputConfig, so we
-// define the wire shape locally and cast. The API validates on receipt; see
-// api/api/schemas/messages/request/output_config.py:12-39 in the monorepo.
-// Beta: task-budgets-2026-03-13 (EAP, claude-strudel-eap only as of Mar 2026).
-type TaskBudgetParam = {
-  // Budget unit; 'tokens' is the only value this shape allows.
-  type: 'tokens'
-  // Total budget for the task.
-  total: number
-  // Budget still available; omitted when the caller does not supply it.
-  remaining?: number
-}
-/**
- * Adds the API-side task budget to an outgoing request.
- *
- * Nothing happens when no budget is supplied, when `outputConfig` already has
- * a `task_budget` key, or when first-party-only betas are not to be included.
- * Otherwise the budget is written to `outputConfig.task_budget` and the task
- * budgets beta header is appended to `betas` if it is not already there.
- *
- * @param taskBudget - Budget from the request options, if any.
- * @param outputConfig - Output config of the request; mutated in place.
- * @param betas - Beta header list; mutated in place.
- */
-export function configureTaskBudgetParams(
-  taskBudget: Options['taskBudget'],
-  outputConfig: BetaOutputConfig & { task_budget?: TaskBudgetParam },
-  betas: string[],
-): void {
-  if (!taskBudget) {
-    return
-  }
-  if ('task_budget' in outputConfig) {
-    return
-  }
-  if (!shouldIncludeFirstPartyOnlyBetas()) {
-    return
-  }
-  const wireBudget: TaskBudgetParam = {
-    type: 'tokens',
-    total: taskBudget.total,
-  }
-  // `remaining` is sent only when the caller actually tracked it.
-  if (taskBudget.remaining !== undefined) {
-    wireBudget.remaining = taskBudget.remaining
-  }
-  outputConfig.task_budget = wireBudget
-  if (!betas.includes(TASK_BUDGETS_BETA_HEADER)) {
-    betas.push(TASK_BUDGETS_BETA_HEADER)
-  }
-}
-/**
- * Builds the `metadata` object sent with API requests.
- *
- * `user_id` is a JSON string holding any extra fields from the
- * CLAUDE_CODE_EXTRA_METADATA environment variable (which must be a JSON
- * object) followed by the device id, the OAuth account UUID and the session
- * id. The three built-in fields are written last, so they override same-named
- * extra fields.
- *
- * @returns An object with a single `user_id` property.
- */
-export function getAPIMetadata() {
-  // https://docs.google.com/document/d/1dURO9ycXXQCBS0V4Vhl4poDBRgkelFc5t2BNPoEgH5Q/edit?tab=t.0#heading=h.5g7nec5b09w5
-  let extraFields: JsonObject = {}
-  const rawExtra = process.env.CLAUDE_CODE_EXTRA_METADATA
-  if (rawExtra) {
-    const candidate = safeParseJSON(rawExtra, false)
-    const isPlainObject =
-      Boolean(candidate) &&
-      typeof candidate === 'object' &&
-      !Array.isArray(candidate)
-    if (isPlainObject) {
-      extraFields = candidate as JsonObject
-    } else {
-      logForDebugging(
-        `CLAUDE_CODE_EXTRA_METADATA env var must be a JSON object, but was given ${rawExtra}`,
-        { level: 'error' },
-      )
-    }
-  }
-  const identity = {
-    ...extraFields,
-    device_id: getOrCreateUserID(),
-    // Empty string when no OAuth account info is available.
-    account_uuid: getOauthAccountInfo()?.accountUuid ?? '',
-    session_id: getSessionId(),
-  }
-  return { user_id: jsonStringify(identity) }
-}
-/**
- * Checks whether `apiKey` is accepted by the API by sending a minimal
- * one-token request with the small/fast model.
- *
- * @param apiKey - The key to verify.
- * @param isNonInteractiveSession - When true, verification is skipped and the
- *   key is reported as valid.
- * @returns `true` when the request succeeds (or verification is skipped),
- *   `false` when the failure message contains the "invalid x-api-key"
- *   authentication error payload.
- * @throws Any other error from the request, after it has been logged.
- */
-export async function verifyApiKey(
-  apiKey: string,
-  isNonInteractiveSession: boolean,
-): Promise<boolean> {
-  // Skip API verification if running in print mode (isNonInteractiveSession)
-  if (isNonInteractiveSession) {
-    return true
-  }
-  try {
-    // WARNING: if you change this to use a non-Haiku model, this request will fail in 1P unless it uses getCLISyspromptPrefix.
-    const model = getSmallFastModel()
-    const betas = getModelBetas(model)
-    // returnValue resolves to the final value produced by the withRetry generator.
-    return await returnValue(
-      withRetry(
-        () =>
-          getAnthropicClient({
-            apiKey,
-            maxRetries: 3,
-            model,
-            source: 'verify_api_key',
-          }),
-        async anthropic => {
-          // Smallest possible request: one user message, one output token.
-          const messages: MessageParam[] = [{ role: 'user', content: 'test' }]
-          // biome-ignore lint/plugin: API key verification is intentionally a minimal direct call
-          await anthropic.beta.messages.create({
-            model,
-            max_tokens: 1,
-            messages,
-            temperature: 1,
-            ...(betas.length > 0 && { betas }),
-            metadata: getAPIMetadata(),
-            ...getExtraBodyParams(),
-          })
-          return true
-        },
-        { maxRetries: 2, model, thinkingConfig: { type: 'disabled' } }, // Use fewer retries for API key verification
-      ),
-    )
-  } catch (errorFromRetry) {
-    // withRetry may wrap the underlying failure; unwrap it before inspecting.
-    let error = errorFromRetry
-    if (errorFromRetry instanceof CannotRetryError) {
-      error = errorFromRetry.originalError
-    }
-    logError(error)
-    // Check for authentication error
-    if (
-      error instanceof Error &&
-      error.message.includes(
-        '{"type":"error","error":{"type":"authentication_error","message":"invalid x-api-key"}}',
-      )
-    ) {
-      return false
-    }
-    // Anything other than an invalid key is propagated to the caller.
-    throw error
-  }
-}
-/**
- * Converts an internal user message into the API `MessageParam` shape.
- *
- * With `addCache` set, string content becomes a single text block and, when
- * prompt caching is enabled, the last content block receives a
- * `cache_control` marker. Without `addCache`, the content is passed through,
- * with array content shallow-copied.
- *
- * @param message - The user message to convert.
- * @param addCache - Whether this message is a cache breakpoint candidate.
- * @param enablePromptCaching - Whether `cache_control` may be attached.
- * @param querySource - Origin of the query, forwarded to getCacheControl.
- * @returns The message in API parameter form.
- */
-export function userMessageToMessageParam(
-  message: UserMessage,
-  addCache = false,
-  enablePromptCaching: boolean,
-  querySource?: QuerySource,
-): MessageParam {
-  const { content } = message.message
-  if (!addCache) {
-    // Copy array content so later in-place edits (e.g. insertCacheEditsBlock's
-    // splice) cannot leak into the original message; sharing the array would
-    // let repeated addCacheBreakpoints calls splice in duplicate cache_edits.
-    return {
-      role: 'user',
-      content: Array.isArray(content) ? [...content] : content,
-    }
-  }
-  if (typeof content === 'string') {
-    return {
-      role: 'user',
-      content: [
-        {
-          type: 'text',
-          text: content,
-          ...(enablePromptCaching && {
-            cache_control: getCacheControl({ querySource }),
-          }),
-        },
-      ],
-    }
-  }
-  // Only the final block carries the cache marker.
-  const lastIndex = content.length - 1
-  return {
-    role: 'user',
-    content: content.map((block, index) =>
-      index === lastIndex && enablePromptCaching
-        ? { ...block, cache_control: getCacheControl({ querySource }) }
-        : { ...block },
-    ),
-  }
-}
-/**
- * Converts an internal assistant message into the API `MessageParam` shape.
- *
- * With `addCache` set, string content becomes a single text block and, when
- * prompt caching is enabled, the last content block receives a
- * `cache_control` marker — unless that block is a `thinking` or
- * `redacted_thinking` block, or (with the CONNECTOR_TEXT feature on) a
- * connector text block. Without `addCache`, the content is passed through
- * as-is.
- *
- * @param message - The assistant message to convert.
- * @param addCache - Whether this message is a cache breakpoint candidate.
- * @param enablePromptCaching - Whether `cache_control` may be attached.
- * @param querySource - Origin of the query, forwarded to getCacheControl.
- * @returns The message in API parameter form.
- */
-export function assistantMessageToMessageParam(
-  message: AssistantMessage,
-  addCache = false,
-  enablePromptCaching: boolean,
-  querySource?: QuerySource,
-): MessageParam {
-  const { content } = message.message
-  if (!addCache) {
-    return {
-      role: 'assistant',
-      content,
-    }
-  }
-  if (typeof content === 'string') {
-    return {
-      role: 'assistant',
-      content: [
-        {
-          type: 'text',
-          text: content,
-          ...(enablePromptCaching && {
-            cache_control: getCacheControl({ querySource }),
-          }),
-        },
-      ],
-    }
-  }
-  const lastIndex = content.length - 1
-  return {
-    role: 'assistant',
-    content: content.map((block, index) => {
-      // Only the final block is a candidate, and only if it is a block kind
-      // that may carry a cache marker.
-      const isCacheTarget =
-        index === lastIndex &&
-        block.type !== 'thinking' &&
-        block.type !== 'redacted_thinking' &&
-        (feature('CONNECTOR_TEXT') ? !isConnectorTextBlock(block) : true)
-      return isCacheTarget && enablePromptCaching
-        ? { ...block, cache_control: getCacheControl({ querySource }) }
-        : { ...block }
-    }),
-  }
-}
-/**
- * Per-request options accepted by queryModelWithoutStreaming and
- * queryModelWithStreaming, which pass them through to queryModel.
- */
-export type Options = {
-  getToolPermissionContext: () => Promise<ToolPermissionContext>
-  model: string
-  toolChoice?: BetaToolChoiceTool | BetaToolChoiceAuto | undefined
-  isNonInteractiveSession: boolean
-  extraToolSchemas?: BetaToolUnion[]
-  maxOutputTokensOverride?: number
-  fallbackModel?: string
-  onStreamingFallback?: () => void
-  querySource: QuerySource
-  agents: AgentDefinition[]
-  allowedAgentTypes?: string[]
-  hasAppendSystemPrompt: boolean
-  fetchOverride?: ClientOptions['fetch']
-  enablePromptCaching?: boolean
-  skipCacheWrite?: boolean
-  temperatureOverride?: number
-  effortValue?: EffortValue
-  reasoningMode?: ReasoningMode
-  mcpTools: Tools
-  hasPendingMcpServers?: boolean
-  queryTracking?: QueryChainTracking
-  agentId?: AgentId // Only set for subagents
-  outputFormat?: BetaJSONOutputFormat
-  fastMode?: boolean
-  advisorModel?: string
-  addNotification?: (notif: Notification) => void
-  // API-side task budget (output_config.task_budget). Distinct from the
-  // tokenBudget.ts +500k auto-continue feature — this one is sent to the API
-  // so the model can pace itself. `remaining` is computed by the caller
-  // (query.ts decrements across the agentic loop).
-  taskBudget?: { total: number; remaining?: number }
-}
-/**
- * Runs a model query to completion and resolves with the last assistant
- * message it produced.
- *
- * @returns The final assistant message emitted by the query.
- * @throws APIUserAbortError when no assistant message was produced and the
- *   signal is aborted; a generic Error when none was produced otherwise.
- */
-export async function queryModelWithoutStreaming({
-  messages,
-  systemPrompt,
-  thinkingConfig,
-  tools,
-  signal,
-  options,
-}: {
-  messages: Message[]
-  systemPrompt: SystemPrompt
-  thinkingConfig: ThinkingConfig
-  tools: Tools
-  signal: AbortSignal
-  options: Options
-}): Promise<AssistantMessage> {
-  const stream = withStreamingVCR(messages, async function* () {
-    yield* queryModel(
-      messages,
-      systemPrompt,
-      thinkingConfig,
-      tools,
-      signal,
-      options,
-    )
-  })
-  // Drain the whole stream rather than stopping at the first assistant
-  // message: logAPISuccessAndDuration runs only after every yield is consumed.
-  let lastAssistant: AssistantMessage | undefined
-  for await (const event of stream) {
-    if (event.type === 'assistant') {
-      lastAssistant = event
-    }
-  }
-  if (lastAssistant) {
-    return lastAssistant
-  }
-  // Surface aborts as APIUserAbortError so callers can treat them gracefully
-  // instead of as a generic failure.
-  if (signal.aborted) {
-    throw new APIUserAbortError()
-  }
-  throw new Error('No assistant message found')
-}
-/**
- * Runs a model query and re-yields every event it produces, routed through
- * withStreamingVCR.
- *
- * @returns An async generator of stream events, assistant messages and
- *   system API error messages.
- */
-export async function* queryModelWithStreaming({
-  messages,
-  systemPrompt,
-  thinkingConfig,
-  tools,
-  signal,
-  options,
-}: {
-  messages: Message[]
-  systemPrompt: SystemPrompt
-  thinkingConfig: ThinkingConfig
-  tools: Tools
-  signal: AbortSignal
-  options: Options
-}): AsyncGenerator<
-  StreamEvent | AssistantMessage | SystemAPIErrorMessage,
-  void
-> {
-  // The query itself, deferred so that withStreamingVCR decides when to run it.
-  const runQuery = async function* () {
-    yield* queryModel(
-      messages,
-      systemPrompt,
-      thinkingConfig,
-      tools,
-      signal,
-      options,
-    )
-  }
-  return yield* withStreamingVCR(messages, runQuery)
-}
-/**
- * Tells whether an LSP tool must be sent as deferred (defer_loading: true)
- * because LSP initialization has not finished yet.
- *
- * @param tool - The tool to inspect.
- * @returns `true` only for LSP tools while initialization is pending or has
- *   not started.
- */
-function shouldDeferLspTool(tool: Tool): boolean {
-  const isLspTool = 'isLsp' in tool && Boolean(tool.isLsp)
-  if (!isLspTool) {
-    return false
-  }
-  const initialization = getInitializationStatus()
-  switch (initialization.status) {
-    case 'pending':
-    case 'not-started':
-      return true
-    default:
-      return false
-  }
-}
-/**
- * Finds the text of the most recent user message that isNonSyntheticUserText
- * accepts.
- *
- * Messages are scanned from newest to oldest. String content is used as-is;
- * array content is reduced to the concatenation of its text blocks.
- *
- * @param messages - Conversation history, oldest first.
- * @returns The matching text, or `''` when no user message qualifies.
- */
-function latestUserTextForToolRetrieval(messages: Message[]): string {
-  let cursor = messages.length
-  while (cursor-- > 0) {
-    const candidate = messages[cursor] as {
-      type?: string
-      message?: { content?: unknown }
-    }
-    if (candidate?.type !== 'user') continue
-    const content = candidate.message?.content
-    let text: string
-    if (typeof content === 'string') {
-      text = content
-    } else if (Array.isArray(content)) {
-      text = content
-        .filter(
-          (block): block is { type: string; text: string } =>
-            block?.type === 'text' && typeof block.text === 'string',
-        )
-        .map(block => block.text)
-        .join('')
-    } else {
-      // Neither a string nor a block list: nothing to read from this message.
-      continue
-    }
-    if (isNonSyntheticUserText(text)) return text
-  }
-  return ''
-}
-/**
- * Per-attempt timeout for non-streaming fallback requests, in milliseconds.
- * Reads API_TIMEOUT_MS when set so slow backends and the streaming path
- * share the same ceiling. Only a finite, strictly positive integer is
- * accepted as an override; anything else (unset, non-numeric, zero or
- * negative) falls back to the defaults below.
- *
- * Remote sessions default to 120s to stay under CCR's container idle-kill
- * (~5min) so a hung fallback to a wedged backend surfaces a clean
- * APIConnectionTimeoutError instead of stalling past SIGKILL.
- *
- * Otherwise defaults to 300s — long enough for slow backends without
- * approaching the API's 10-minute non-streaming boundary.
- *
- * @returns The timeout to apply to one non-streaming attempt, in ms.
- */
-function getNonstreamingFallbackTimeoutMs(): number {
-  const override = Number.parseInt(process.env.API_TIMEOUT_MS ?? '', 10)
-  // A plain truthiness check would let a negative value such as "-5" through
-  // and hand a negative timeout to the client, so require a positive number.
-  if (Number.isFinite(override) && override > 0) return override
-  return isEnvTruthy(process.env.CLAUDE_CODE_REMOTE) ? 120_000 : 300_000
-}
-/**
- * Helper generator for non-streaming API requests.
- * Encapsulates the common pattern of creating a withRetry generator,
- * iterating to yield system messages, and returning the final BetaMessage.
- *
- * @param clientOptions - Model, fetch override and source label used to
- *   create the API client for each attempt.
- * @param retryOptions - Settings forwarded to withRetry; `signal` is also
- *   passed to the request itself.
- * @param paramsFromContext - Builds the request parameters for an attempt
- *   from the retry context.
- * @param onAttempt - Called before each attempt with the attempt number, its
- *   start time and the `max_tokens` of the parameters before adjustment.
- * @param captureRequest - Called with the parameters of each attempt before
- *   they are adjusted for non-streaming use.
- * @returns The final BetaMessage; system messages produced while retrying
- *   are yielded along the way.
- */
-export async function* executeNonStreamingRequest(
-  clientOptions: {
-    model: string
-    fetchOverride?: Options['fetchOverride']
-    source: string
-  },
-  retryOptions: {
-    model: string
-    fallbackModel?: string
-    thinkingConfig: ThinkingConfig
-    fastMode?: boolean
-    signal: AbortSignal
-    initialConsecutive529Errors?: number
-    querySource?: QuerySource
-  },
-  paramsFromContext: (context: RetryContext) => BetaMessageStreamParams,
-  onAttempt: (attempt: number, start: number, maxOutputTokens: number) => void,
-  captureRequest: (params: BetaMessageStreamParams) => void,
-  /**
-   * Request ID of the failed streaming attempt this fallback is recovering
-   * from. Emitted in tengu_nonstreaming_fallback_error for funnel correlation.
-   */
-  originatingRequestId?: string | null,
-): AsyncGenerator<SystemAPIErrorMessage, BetaMessage> {
-  // Resolved once, so every attempt and the error event report the same value.
-  const fallbackTimeoutMs = getNonstreamingFallbackTimeoutMs()
-  const generator = withRetry(
-    // The client is created with maxRetries: 0 — presumably so that retrying
-    // is driven only by withRetry (confirm against getAnthropicClient).
-    () =>
-      getAnthropicClient({
-        maxRetries: 0,
-        model: clientOptions.model,
-        fetchOverride: clientOptions.fetchOverride,
-        source: clientOptions.source,
-      }),
-    async (anthropic, attempt, context) => {
-      const start = Date.now()
-      const retryParams = paramsFromContext(context)
-      // Both callbacks see the parameters as built, before the non-streaming
-      // adjustment below.
-      captureRequest(retryParams)
-      onAttempt(attempt, start, retryParams.max_tokens)
-      const adjustedParams = adjustParamsForNonStreaming(
-        retryParams,
-        MAX_NON_STREAMING_TOKENS,
-      )
-      try {
-        // biome-ignore lint/plugin: non-streaming API call
-        return await anthropic.beta.messages.create(
-          {
-            ...adjustedParams,
-            model: normalizeModelStringForAPI(adjustedParams.model),
-          },
-          {
-            signal: retryOptions.signal,
-            timeout: fallbackTimeoutMs,
-          },
-        )
-      } catch (err) {
-        // User aborts are not errors — re-throw immediately without logging
-        if (err instanceof APIUserAbortError) throw err
-        // Instrumentation: record when the non-streaming request errors (including
-        // timeouts). Lets us distinguish "fallback hung past container kill"
-        // (no event) from "fallback hit the bounded timeout" (this event).
-        logForDiagnosticsNoPII('error', 'cli_nonstreaming_fallback_error')
-        logEvent('tengu_nonstreaming_fallback_error', {
-          model:
-            clientOptions.model as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
-          error:
-            err instanceof Error
-              ? (err.name as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS)
-              : ('unknown' as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS),
-          attempt,
-          timeout_ms: fallbackTimeoutMs,
-          request_id: (originatingRequestId ??
-            'unknown') as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
-        })
-        // Re-throw after logging; the error goes back to withRetry.
-        throw err
-      }
-    },
-    {
-      model: retryOptions.model,
-      fallbackModel: retryOptions.fallbackModel,
-      thinkingConfig: retryOptions.thinkingConfig,
-      // fastMode is forwarded only while fast mode is enabled.
-      ...(isFastModeEnabled() && { fastMode: retryOptions.fastMode }),
-      signal: retryOptions.signal,
-      initialConsecutive529Errors: retryOptions.initialConsecutive529Errors,
-      querySource: retryOptions.querySource,
-    },
-  )
-  // Drive the retry generator by hand: pass along intermediate values of type
-  // 'system', and keep the value delivered on completion as the result.
-  let e
-  do {
-    e = await generator.next()
-    if (!e.done && e.value.type === 'system') {
-      yield e.value
-    }
-  } while (!e.done)
-  return e.value as BetaMessage
-}
-/**
- * Returns the request ID carried by the most recent assistant message in
- * `messages`, or `undefined` when no assistant message has one.
- *
- * The ID links consecutive API requests in analytics so they can be joined
- * for cache-hit-rate analysis and incremental token tracking. It is read from
- * the message array rather than global state, so each query chain (main
- * thread, subagent, teammate) follows its own request chain and rollback/undo
- * is reflected automatically.
- *
- * @param messages Conversation history; scanned from the last element backwards.
- * @returns The first non-empty assistant `requestId` found, if any.
- */
-function getPreviousRequestIdFromMessages(
-  messages: Message[],
-): string | undefined {
-  let cursor = messages.length - 1
-  while (cursor >= 0) {
-    const candidate = messages[cursor]!
-    cursor -= 1
-    // Only assistant messages carry the request ID we are looking for.
-    if (candidate.type !== 'assistant') continue
-    if (candidate.requestId) return candidate.requestId
-  }
-  return undefined
-}
-/**
- * Type guard: true when `block` is an image or document content block.
- */
-function isMedia(
-  block: BetaContentBlockParam,
-): block is BetaImageBlockParam | BetaRequestDocumentBlock {
-  switch (block.type) {
-    case 'image':
-    case 'document':
-      return true
-    default:
-      return false
-  }
-}
-/**
- * Type guard: true when `block` is a `tool_result` content block.
- */
-function isToolResult(
-  block: BetaContentBlockParam,
-): block is BetaToolResultBlockParam {
-  const { type } = block
-  return type === 'tool_result'
-}
-/**
- * Ensures messages contain at most `limit` media items (images + documents),
- * counting both top-level media blocks and media nested inside the content
- * of tool_result blocks.
- *
- * Media is dropped oldest-first in document order: earlier messages before
- * later ones, and earlier blocks before later blocks within a message, so
- * the most recent media is preserved.
- *
- * @param messages Conversation messages to inspect; the inputs are not mutated.
- * @param limit Maximum number of media items to keep across all messages.
- * @returns The same array instance when the media count is already within
- *   `limit`; otherwise a new array in which only the messages (and
- *   tool_result blocks) that lost media are replaced by shallow copies.
- */
-export function stripExcessMediaItems(
-  messages: (UserMessage | AssistantMessage)[],
-  limit: number,
-): (UserMessage | AssistantMessage)[] {
-  // Pass 1: count every media item, top-level or nested in a tool_result.
-  let mediaCount = 0
-  for (const msg of messages) {
-    if (!Array.isArray(msg.message.content)) continue
-    for (const block of msg.message.content) {
-      if (isMedia(block)) mediaCount++
-      if (isToolResult(block) && Array.isArray(block.content)) {
-        for (const nested of block.content) {
-          if (isMedia(nested)) mediaCount++
-        }
-      }
-    }
-  }
-  let toRemove = mediaCount - limit
-  if (toRemove <= 0) return messages
-  // Pass 2: walk messages front to back, dropping media until the excess is gone.
-  return messages.map(msg => {
-    if (toRemove <= 0) return msg
-    const content = msg.message.content
-    if (!Array.isArray(content)) return msg
-    const before = toRemove
-    // Single pass in block order, so a top-level media block that precedes a
-    // tool_result is dropped before the media nested inside that tool_result.
-    const stripped = content
-      .map(block => {
-        if (toRemove <= 0) return block
-        if (isMedia(block)) {
-          toRemove--
-          return null
-        }
-        if (!isToolResult(block) || !Array.isArray(block.content)) return block
-        const kept = block.content.filter(nested => {
-          if (toRemove > 0 && isMedia(nested)) {
-            toRemove--
-            return false
-          }
-          return true
-        })
-        // Keep the original block object when nothing was removed from it.
-        return kept.length === block.content.length
-          ? block
-          : { ...block, content: kept }
-      })
-      .filter((block): block is NonNullable<typeof block> => block !== null)
-    // Untouched messages keep their identity; only changed ones are copied.
-    return before === toRemove
-      ? msg
-      : {
-          ...msg,
-          message: { ...msg.message, content: stripped },
-        }
-  }) as (UserMessage | AssistantMessage)[]
-}
-async function* queryModel(
-  messages: Message[],
-  systemPrompt: SystemPrompt,
-  thinkingConfig: ThinkingConfig,
-  tools: Tools,
-  signal: AbortSignal,
-  options: Options,
-): AsyncGenerator<
-  StreamEvent | AssistantMessage | SystemAPIErrorMessage,
-  void
-> {
-  // Check cheap conditions first — the off-switch await blocks on GrowthBook
-  // init (~10ms). For non-Opus models (haiku, sonnet) this skips the await
-  // entirely. Subscribers don't hit this path at all.
-  if (
-    !isClaudeAISubscriber() &&
-    isNonCustomOpusModel(options.model) &&
-    (
-      await getDynamicConfig_BLOCKS_ON_INIT<{ activated: boolean }>(
-        'tengu-off-switch',
-        {
-          activated: false,
-        },
-      )
-    ).activated
-  ) {
-    logEvent('tengu_off_switch_query', {})
-    yield getAssistantMessageFromError(
-      new Error(CUSTOM_OFF_SWITCH_MESSAGE),
-      options.model,
-    )
-    return
-  }
-  // Derive previous request ID from the last assistant message in this query chain.
-  // This is scoped per message array (main thread, subagent, teammate each have their own),
-  // so concurrent agents don't clobber each other's request chain tracking.
-  // Also naturally handles rollback/undo since removed messages won't be in the array.
-  const previousRequestId = getPreviousRequestIdFromMessages(messages)
-  const resolvedModel =
-    getAPIProvider() === 'bedrock' &&
-    options.model.includes('application-inference-profile')
-      ? ((await getInferenceProfileBackingModel(options.model)) ??
-        options.model)
-      : options.model
-  queryCheckpoint('query_tool_schema_build_start')
-  const isAgenticQuery =
-    options.querySource.startsWith('repl_main_thread') ||
-    options.querySource.startsWith('agent:') ||
-    options.querySource === 'sdk' ||
-    options.querySource === 'hook_agent' ||
-    options.querySource === 'verification_agent'
-  const betas = getMergedBetas(options.model, { isAgenticQuery })
-  // Always send the advisor beta header when advisor is enabled, so
-  // non-agentic queries (compact, side_question, extract_memories, etc.)
-  // can parse advisor server_tool_use blocks already in the conversation history.
-  if (isAdvisorEnabled()) {
-    betas.push(ADVISOR_BETA_HEADER)
-  }
-  let advisorModel: string | undefined
-  if (isAgenticQuery && isAdvisorEnabled()) {
-    let advisorOption = options.advisorModel
-    const advisorExperiment = getExperimentAdvisorModels()
-    if (advisorExperiment !== undefined) {
-      if (
-        normalizeModelStringForAPI(advisorExperiment.baseModel) ===
-        normalizeModelStringForAPI(options.model)
-      ) {
-        // Override the advisor model if the base model matches. We
-        // should only have experiment models if the user cannot
-        // configure it themselves.
-        advisorOption = advisorExperiment.advisorModel
-      }
-    }
-    if (advisorOption) {
-      const normalizedAdvisorModel = normalizeModelStringForAPI(
-        parseUserSpecifiedModel(advisorOption),
-      )
-      if (!modelSupportsAdvisor(options.model)) {
-        logForDebugging(
-          `[AdvisorTool] Skipping advisor - base model ${options.model} does not support advisor`,
-        )
-      } else if (!isValidAdvisorModel(normalizedAdvisorModel)) {
-        logForDebugging(
-          `[AdvisorTool] Skipping advisor - ${normalizedAdvisorModel} is not a valid advisor model`,
-        )
-      } else {
-        advisorModel = normalizedAdvisorModel
-        logForDebugging(
-          `[AdvisorTool] Server-side tool enabled with ${advisorModel} as the advisor model`,
-        )
-      }
-    }
-  }
-  // Check if tool search is enabled (checks mode, model support, and threshold for auto mode)
-  // This is async because it may need to calculate MCP tool description sizes for TstAuto mode
-  let useToolSearch = await isToolSearchEnabled(
-    options.model,
-    tools,
-    options.getToolPermissionContext,
-    options.agents,
-    'query',
-  )
-  const turnLocalAdapterToolNames = new Set(
-    selectTopKAdapterToolNamesForQuery(
-      latestUserTextForToolRetrieval(messages),
-    ),
-  )
-  if (options.toolChoice?.type === 'tool') {
-    turnLocalAdapterToolNames.add(options.toolChoice.name)
-  }
-  if (turnLocalAdapterToolNames.size > 0) {
-    logForDebugging(
-      `UMMAYA turn-local adapter schemas: ${[...turnLocalAdapterToolNames].join(', ')}`,
-    )
-  }
-  const requestTools =
-    turnLocalAdapterToolNames.size === 0
-      ? tools
-      : [
-          ...tools,
-          ...[...turnLocalAdapterToolNames]
-            .filter(toolName => !tools.some(tool => tool.name === toolName))
-            .map(toolName => getAdapterToolByName(toolName))
-            .filter((tool): tool is NonNullable<typeof tool> => Boolean(tool)),
-        ]
-  // Precompute once — isDeferredTool does 2 GrowthBook lookups per call.
-  // Include turn-local synced adapters even if the long-lived TUI tool pool
-  // was assembled before the latest backend manifest frame arrived.
-  const deferredToolNames = new Set<string>()
-  if (useToolSearch) {
-    for (const t of requestTools) {
-      if (isDeferredTool(t)) deferredToolNames.add(t.name)
-    }
-  }
-  // Even if tool search mode is enabled, skip if there are no deferred tools
-  // AND no MCP servers are still connecting. When servers are pending, keep
-  // ToolSearch available so the model can discover tools after they connect.
-  if (
-    useToolSearch &&
-    deferredToolNames.size === 0 &&
-    !options.hasPendingMcpServers
-  ) {
-    logForDebugging(
-      'Tool search disabled: no deferred tools available to search',
-    )
-    useToolSearch = false
-  }
-  const suppressUmmayaToolCalls =
-    shouldSuppressUmmayaToolCallsForAnswerSynthesis({ messages, tools: requestTools })
-  if (suppressUmmayaToolCalls) {
-    logForDebugging('UMMAYA suppressing tool schemas for answer synthesis')
-  }
-  // Filter out ToolSearchTool if tool search is not enabled for this model
-  // ToolSearchTool returns tool_reference blocks which unsupported models can't handle
-  let filteredTools: Tools
-  if (suppressUmmayaToolCalls) {
-    filteredTools = []
-  } else if (useToolSearch) {
-    // Dynamic tool loading: Only include deferred tools that have been discovered
-    // via tool_reference blocks in the message history. This eliminates the need
-    // to predeclare all deferred tools upfront and removes limits on tool quantity.
-    const discoveredToolNames = extractDiscoveredToolNames(messages)
-    filteredTools = requestTools.filter(tool => {
-      // 0.2.1 exposed the lightweight root primitives together with concrete
-      // adapter schemas. Keep that surface so K-EXAONE preserves CC-style
-      // prose→tool→prose loop painting, while still limiting concrete adapter
-      // schemas to the turn-local top-k set.
-      if (turnLocalAdapterToolNames.has(tool.name)) return true
-      // Always include non-deferred tools
-      if (!deferredToolNames.has(tool.name)) return true
-      // Always include ToolSearchTool (so it can discover more tools)
-      if (toolMatchesName(tool, TOOL_SEARCH_TOOL_NAME)) return true
-      // Only include deferred tools that have been discovered
-      return discoveredToolNames.has(tool.name)
-    })
-  } else {
-    filteredTools = requestTools.filter(t => {
-      if (toolMatchesName(t, TOOL_SEARCH_TOOL_NAME)) return false
-      // Keep non-deferred root primitives even when concrete top-k adapter
-      // schemas are available; this matches the released 0.2.1 loop surface.
-      if (isDeferredTool(t)) return turnLocalAdapterToolNames.has(t.name)
-      return true
-    })
-  }
-  // Add tool search beta header if enabled - required for defer_loading to be accepted
-  // Header differs by provider: 1P/Foundry use advanced-tool-use, Vertex/Bedrock use tool-search-tool
-  // For Bedrock, this header must go in extraBodyParams, not the betas array
-  const toolSearchHeader = useToolSearch ? getToolSearchBetaHeader() : null
-  if (toolSearchHeader && getAPIProvider() !== 'bedrock') {
-    if (!betas.includes(toolSearchHeader)) {
-      betas.push(toolSearchHeader)
-    }
-  }
-  // Determine if cached microcompact is enabled for this model.
-  // Computed once here (in async context) and captured by paramsFromContext.
-  // The beta header is also captured here to avoid a top-level import of the
-  // ant-only CACHE_EDITING_BETA_HEADER constant.
-  let cachedMCEnabled = false
-  let cacheEditingBetaHeader = ''
-  if (feature('CACHED_MICROCOMPACT')) {
-    const {
-      isCachedMicrocompactEnabled,
-      isModelSupportedForCacheEditing,
-      getCachedMCConfig,
-    } = await import('../compact/cachedMicrocompact.js')
-    const betas = await import('src/constants/betas.js')
-    cacheEditingBetaHeader = betas.CACHE_EDITING_BETA_HEADER
-    const featureEnabled = isCachedMicrocompactEnabled()
-    const modelSupported = isModelSupportedForCacheEditing(options.model)
-    cachedMCEnabled = featureEnabled && modelSupported
-    const config = getCachedMCConfig()
-    logForDebugging(
-      `Cached MC gate: enabled=${featureEnabled} modelSupported=${modelSupported} model=${options.model} supportedModels=${jsonStringify(config.supportedModels)}`,
-    )
-  }
-  const useGlobalCacheFeature = shouldUseGlobalCacheScope()
-  const willDefer = (t: Tool) =>
-    useToolSearch && (deferredToolNames.has(t.name) || shouldDeferLspTool(t))
-  // MCP tools are per-user → dynamic tool section → can't globally cache.
-  // Only gate when an MCP tool will actually render (not defer_loading).
-  const needsToolBasedCacheMarker =
-    useGlobalCacheFeature &&
-    filteredTools.some(t => t.isMcp === true && !willDefer(t))
-  // Ensure prompt_caching_scope beta header is present when global cache is enabled.
-  if (
-    useGlobalCacheFeature &&
-    !betas.includes(PROMPT_CACHING_SCOPE_BETA_HEADER)
-  ) {
-    betas.push(PROMPT_CACHING_SCOPE_BETA_HEADER)
-  }
-  // Determine global cache strategy for logging
-  const globalCacheStrategy: GlobalCacheStrategy = useGlobalCacheFeature
-    ? needsToolBasedCacheMarker
-      ? 'none'
-      : 'system_prompt'
-    : 'none'
-  // Build tool schemas, adding defer_loading for MCP tools when tool search is enabled
-  // Note: We pass the full `tools` list (not filteredTools) to toolToAPISchema so that
-  // ToolSearchTool's prompt can list ALL available MCP tools. The filtering only affects
-  // which tools are actually sent to the API, not what the model sees in tool descriptions.
-  const toolSchemas = await Promise.all(
-    filteredTools.map(tool =>
-      toolToAPISchema(tool, {
-        getToolPermissionContext: options.getToolPermissionContext,
-        tools,
-        agents: options.agents,
-        allowedAgentTypes: options.allowedAgentTypes,
-        model: options.model,
-        deferLoading: willDefer(tool),
-      }),
-    ),
-  )
-  if (useToolSearch) {
-    const includedDeferredTools = count(filteredTools, t =>
-      deferredToolNames.has(t.name),
-    )
-    logForDebugging(
-      `Dynamic tool loading: ${includedDeferredTools}/${deferredToolNames.size} deferred tools included`,
-    )
-  }
-  queryCheckpoint('query_tool_schema_build_end')
-  // Normalize messages before building system prompt (needed for fingerprinting)
-  // Instrumentation: Track message count before normalization
-  logEvent('tengu_api_before_normalize', {
-    preNormalizedMessageCount: messages.length,
-  })
-  queryCheckpoint('query_message_normalization_start')
-  let messagesForAPI = normalizeMessagesForAPI(messages, filteredTools)
-  queryCheckpoint('query_message_normalization_end')
-  // Model-specific post-processing: strip tool-search-specific fields if the
-  // selected model doesn't support tool search.
-  //
-  // Why is this needed in addition to normalizeMessagesForAPI?
-  // - normalizeMessagesForAPI uses isToolSearchEnabledNoModelCheck() because it's
-  //   called from ~20 places (analytics, feedback, sharing, etc.), many of which
-  //   don't have model context. Adding model to its signature would be a large refactor.
-  // - This post-processing uses the model-aware isToolSearchEnabled() check
-  // - This handles mid-conversation model switching (e.g., Sonnet → Haiku) where
-  //   stale tool-search fields from the previous model would cause 400 errors
-  //
-  // Note: For assistant messages, normalizeMessagesForAPI already normalized the
-  // tool inputs, so stripCallerFieldFromAssistantMessage only needs to remove the
-  // 'caller' field (not re-normalize inputs).
-  if (!useToolSearch) {
-    messagesForAPI = messagesForAPI.map(msg => {
-      switch (msg.type) {
-        case 'user':
-          // Strip tool_reference blocks from tool_result content
-          return stripToolReferenceBlocksFromUserMessage(msg)
-        case 'assistant':
-          // Strip 'caller' field from tool_use blocks
-          return stripCallerFieldFromAssistantMessage(msg)
-        default:
-          return msg
-      }
-    })
-  }
-  // Repair tool_use/tool_result pairing mismatches that can occur when resuming
-  // remote/teleport sessions. Inserts synthetic error tool_results for orphaned
-  // tool_uses and strips orphaned tool_results referencing non-existent tool_uses.
-  messagesForAPI = ensureToolResultPairing(messagesForAPI)
-  // Strip advisor blocks — the API rejects them without the beta header.
-  if (!betas.includes(ADVISOR_BETA_HEADER)) {
-    messagesForAPI = stripAdvisorBlocks(messagesForAPI)
-  }
-  // Strip excess media items before making the API call.
-  // The API rejects requests with >100 media items but returns a confusing error.
-  // Rather than erroring (which is hard to recover from in Cowork/CCD), we
-  // silently drop the oldest media items to stay within the limit.
-  messagesForAPI = stripExcessMediaItems(
-    messagesForAPI,
-    API_MAX_MEDIA_PER_REQUEST,
-  )
-  // Instrumentation: Track message count after normalization
-  logEvent('tengu_api_after_normalize', {
-    postNormalizedMessageCount: messagesForAPI.length,
-  })
-  // Compute fingerprint from first user message for attribution.
-  // Must run BEFORE injecting synthetic messages (e.g. deferred tool names)
-  // so the fingerprint reflects the actual user input.
-  const fingerprint = computeFingerprintFromMessages(messagesForAPI)
-  // When the delta attachment is enabled, deferred tools are announced
-  // via persisted deferred_tools_delta attachments instead of this
-  // ephemeral prepend (which busts cache whenever the pool changes).
-  if (useToolSearch && !isDeferredToolsDeltaEnabled()) {
-    const deferredToolList = tools
-      .filter(t => deferredToolNames.has(t.name))
-      .map(formatDeferredToolLine)
-      .sort()
-      .join('\n')
-    if (deferredToolList) {
-      messagesForAPI = [
-        createUserMessage({
-          content: `<available-deferred-tools>\n${deferredToolList}\n</available-deferred-tools>`,
-          isMeta: true,
-        }),
-        ...messagesForAPI,
-      ]
-    }
-  }
-  // Chrome tool-search instructions: when the delta attachment is enabled,
-  // these are carried as a client-side block in mcp_instructions_delta
-  // (attachments.ts) instead of here. This per-request sys-prompt append
-  // busts the prompt cache when chrome connects late.
-  const hasChromeTools = filteredTools.some(t =>
-    isToolFromMcpServer(t.name, CLAUDE_IN_CHROME_MCP_SERVER_NAME),
-  )
-  const injectChromeHere =
-    useToolSearch && hasChromeTools && !isMcpInstructionsDeltaEnabled()
-  // filter(Boolean) works by converting each element to a boolean - empty strings become false and are filtered out.
-  systemPrompt = asSystemPrompt(
-    [
-      getAttributionHeader(fingerprint),
-      getCLISyspromptPrefix({
-        isNonInteractive: options.isNonInteractiveSession,
-        hasAppendSystemPrompt: options.hasAppendSystemPrompt,
-      }),
-      ...systemPrompt,
-      ...(advisorModel ? [ADVISOR_TOOL_INSTRUCTIONS] : []),
-      ...(injectChromeHere ? [CHROME_TOOL_SEARCH_INSTRUCTIONS] : []),
-    ].filter(Boolean),
-  )
-  // Prepend system prompt block for easy API identification
-  logAPIPrefix(systemPrompt)
-  const enablePromptCaching =
-    options.enablePromptCaching ?? getPromptCachingEnabled(options.model)
-  const system = buildSystemPromptBlocks(systemPrompt, enablePromptCaching, {
-    skipGlobalCacheForSystemPrompt: needsToolBasedCacheMarker,
-    querySource: options.querySource,
-  })
-  const useBetas = betas.length > 0
-  // Build minimal context for detailed tracing (when beta tracing is enabled)
-  // Note: The actual new_context message extraction is done in sessionTracing.ts using
-  // hash-based tracking per querySource (agent) from the messagesForAPI array
-  const extraToolSchemas = [...(options.extraToolSchemas ?? [])]
-  if (advisorModel) {
-    // Server tools must be in the tools array by API contract. Appended after
-    // toolSchemas (which carries the cache_control marker) so toggling /advisor
-    // only churns the small suffix, not the cached prefix.
-    extraToolSchemas.push({
-      type: 'advisor_20260301',
-      name: 'advisor',
-      model: advisorModel,
-    } as unknown as BetaToolUnion)
-  }
-  const allTools = [...toolSchemas, ...extraToolSchemas]
-  const isFastMode =
-    isFastModeEnabled() &&
-    isFastModeAvailable() &&
-    !isFastModeCooldown() &&
-    isFastModeSupportedByModel(options.model) &&
-    !!options.fastMode
-  // Sticky-on latches for dynamic beta headers. Each header, once first
-  // sent, keeps being sent for the rest of the session so mid-session
-  // toggles don't change the server-side cache key and bust ~50-70K tokens.
-  // Latches are cleared on /clear and /compact via clearBetaHeaderLatches().
-  // Per-call gates (isAgenticQuery, querySource===repl_main_thread) stay
-  // per-call so non-agentic queries keep their own stable header set.
-  let afkHeaderLatched = getAfkModeHeaderLatched() === true
-  if (feature('TRANSCRIPT_CLASSIFIER')) {
-    if (
-      !afkHeaderLatched &&
-      isAgenticQuery &&
-      shouldIncludeFirstPartyOnlyBetas() &&
-      (autoModeStateModule?.isAutoModeActive() ?? false)
-    ) {
-      afkHeaderLatched = true
-      setAfkModeHeaderLatched(true)
-    }
-  }
-  let fastModeHeaderLatched = getFastModeHeaderLatched() === true
-  if (!fastModeHeaderLatched && isFastMode) {
-    fastModeHeaderLatched = true
-    setFastModeHeaderLatched(true)
-  }
-  let cacheEditingHeaderLatched = getCacheEditingHeaderLatched() === true
-  if (feature('CACHED_MICROCOMPACT')) {
-    if (
-      !cacheEditingHeaderLatched &&
-      cachedMCEnabled &&
-      getAPIProvider() === 'firstParty' &&
-      options.querySource === 'repl_main_thread'
-    ) {
-      cacheEditingHeaderLatched = true
-      setCacheEditingHeaderLatched(true)
-    }
-  }
-  // Only latch from agentic queries so a classifier call doesn't flip the
-  // main thread's context_management mid-turn.
-  let thinkingClearLatched = getThinkingClearLatched() === true
-  if (!thinkingClearLatched && isAgenticQuery) {
-    const lastCompletion = getLastApiCompletionTimestamp()
-    if (
-      lastCompletion !== null &&
-      Date.now() - lastCompletion > CACHE_TTL_1HOUR_MS
-    ) {
-      thinkingClearLatched = true
-      setThinkingClearLatched(true)
-    }
-  }
-  const effort = resolveAppliedEffort(options.model, options.effortValue)
-  if (feature('PROMPT_CACHE_BREAK_DETECTION')) {
-    // Exclude defer_loading tools from the hash -- the API strips them from the
-    // prompt, so they never affect the actual cache key. Including them creates
-    // false-positive "tool schemas changed" breaks when tools are discovered or
-    // MCP servers reconnect.
-    const toolsForCacheDetection = allTools.filter(
-      t => !('defer_loading' in t && t.defer_loading),
-    )
-    // Capture everything that could affect the server-side cache key.
-    // Pass latched header values (not live state) so break detection
-    // reflects what we actually send, not what the user toggled.
-    recordPromptState({
-      system,
-      toolSchemas: toolsForCacheDetection,
-      querySource: options.querySource,
-      model: options.model,
-      agentId: options.agentId,
-      fastMode: fastModeHeaderLatched,
-      globalCacheStrategy,
-      betas,
-      autoModeActive: afkHeaderLatched,
-      isUsingOverage: currentLimits.isUsingOverage ?? false,
-      cachedMCEnabled: cacheEditingHeaderLatched,
-      effortValue: effort,
-      extraBodyParams: getExtraBodyParams(),
-    })
-  }
-  const newContext: LLMRequestNewContext | undefined = isBetaTracingEnabled()
-    ? {
-        systemPrompt: systemPrompt.join('\n\n'),
-        querySource: options.querySource,
-        tools: jsonStringify(allTools),
-      }
-    : undefined
-  // Capture the span so we can pass it to endLLMRequestSpan later
-  // This ensures responses are matched to the correct request when multiple requests run in parallel
-  const llmSpan = startLLMRequestSpan(
-    options.model,
-    newContext,
-    messagesForAPI,
-    isFastMode,
-  )
-  const startIncludingRetries = Date.now()
-  let start = Date.now()
-  let attemptNumber = 0
-  const attemptStartTimes: number[] = []
-  let stream: Stream<BetaRawMessageStreamEvent> | undefined = undefined
-  let streamRequestId: string | null | undefined = undefined
-  let clientRequestId: string | undefined = undefined
-  // eslint-disable-next-line eslint-plugin-n/no-unsupported-features/node-builtins -- Response is available in Node 18+ and is used by the SDK
-  let streamResponse: Response | undefined = undefined
-  // Release all stream resources to prevent native memory leaks.
-  // The Response object holds native TLS/socket buffers that live outside the
-  // V8 heap (observed on the Node.js/npm path; see GH #32920), so we must
-  // explicitly cancel and release it regardless of how the generator exits.
-  function releaseStreamResources(): void {
-    cleanupStream(stream)
-    stream = undefined
-    if (streamResponse) {
-      streamResponse.body?.cancel().catch(() => {})
-      streamResponse = undefined
-    }
-  }
-  // Consume pending cache edits ONCE before paramsFromContext is defined.
-  // paramsFromContext is called multiple times (logging, retries), so consuming
-  // inside it would cause the first call to steal edits from subsequent calls.
-  const consumedCacheEdits = cachedMCEnabled ? consumePendingCacheEdits() : null
-  const consumedPinnedEdits = cachedMCEnabled ? getPinnedCacheEdits() : []
-  // Capture the betas sent in the last API request, including the ones that
-  // were dynamically added, so we can log and send it to telemetry.
-  let lastRequestBetas: string[] | undefined
-  const paramsFromContext = (retryContext: RetryContext) => {
-    const betasParams = [...betas]
-    // Append 1M beta dynamically for the Sonnet 1M experiment.
-    if (
-      !betasParams.includes(CONTEXT_1M_BETA_HEADER) &&
-      getSonnet1mExpTreatmentEnabled(retryContext.model)
-    ) {
-      betasParams.push(CONTEXT_1M_BETA_HEADER)
-    }
-    // For Bedrock, include both model-based betas and dynamically-added tool search header
-    const bedrockBetas =
-      getAPIProvider() === 'bedrock'
-        ? [
-            ...getBedrockExtraBodyParamsBetas(retryContext.model),
-            ...(toolSearchHeader ? [toolSearchHeader] : []),
-          ]
-        : []
-    const extraBodyParams = getExtraBodyParams(bedrockBetas)
-    const outputConfig: BetaOutputConfig = {
-      ...((extraBodyParams.output_config as BetaOutputConfig) ?? {}),
-    }
-    configureEffortParams(
-      effort,
-      outputConfig,
-      extraBodyParams,
-      betasParams,
-      options.model,
-    )
-    configureTaskBudgetParams(
-      options.taskBudget,
-      outputConfig as BetaOutputConfig & { task_budget?: TaskBudgetParam },
-      betasParams,
-    )
-    // Merge outputFormat into extraBodyParams.output_config alongside effort
-    // Requires structured-outputs beta header per SDK (see parse() in messages.mjs)
-    if (options.outputFormat && !('format' in outputConfig)) {
-      outputConfig.format = options.outputFormat as BetaJSONOutputFormat
-      // Add beta header if not already present and provider supports it
-      if (
-        modelSupportsStructuredOutputs(options.model) &&
-        !betasParams.includes(STRUCTURED_OUTPUTS_BETA_HEADER)
-      ) {
-        betasParams.push(STRUCTURED_OUTPUTS_BETA_HEADER)
-      }
-    }
-    // Retry context gets preference because it tries to course correct if we exceed the context window limit
-    const maxOutputTokens =
-      retryContext?.maxTokensOverride ||
-      options.maxOutputTokensOverride ||
-      getMaxOutputTokensForModel(options.model)
-    const hasThinking =
-      thinkingConfig.type !== 'disabled' &&
-      !isEnvTruthy(process.env.CLAUDE_CODE_DISABLE_THINKING)
-    let thinking: BetaMessageStreamParams['thinking'] | undefined = undefined
-    // IMPORTANT: Do not change the adaptive-vs-budget thinking selection below
-    // without notifying the model launch DRI and research. This is a sensitive
-    // setting that can greatly affect model quality and bashing.
-    if (hasThinking && modelSupportsThinking(options.model)) {
-      if (
-        !isEnvTruthy(process.env.CLAUDE_CODE_DISABLE_ADAPTIVE_THINKING) &&
-        modelSupportsAdaptiveThinking(options.model)
-      ) {
-        // For models that support adaptive thinking, always use adaptive
-        // thinking without a budget.
-        thinking = {
-          type: 'adaptive',
-        } satisfies BetaMessageStreamParams['thinking']
-      } else {
-        // For models that do not support adaptive thinking, use the default
-        // thinking budget unless explicitly specified.
-        let thinkingBudget = getMaxThinkingTokensForModel(options.model)
-        if (
-          thinkingConfig.type === 'enabled' &&
-          thinkingConfig.budgetTokens !== undefined
-        ) {
-          thinkingBudget = thinkingConfig.budgetTokens
-        }
-        thinkingBudget = Math.min(maxOutputTokens - 1, thinkingBudget)
-        thinking = {
-          budget_tokens: thinkingBudget,
-          type: 'enabled',
-        } satisfies BetaMessageStreamParams['thinking']
-      }
-    }
-    // Get API context management strategies if enabled
-    const contextManagement = getAPIContextManagement({
-      hasThinking,
-      isRedactThinkingActive: betasParams.includes(REDACT_THINKING_BETA_HEADER),
-      clearAllThinking: thinkingClearLatched,
-    })
-    const enablePromptCaching =
-      options.enablePromptCaching ?? getPromptCachingEnabled(retryContext.model)
-    // Fast mode: header is latched session-stable (cache-safe), but
-    // `speed='fast'` stays dynamic so cooldown still suppresses the actual
-    // fast-mode request without changing the cache key.
-    let speed: BetaMessageStreamParams['speed']
-    const isFastModeForRetry =
-      isFastModeEnabled() &&
-      isFastModeAvailable() &&
-      !isFastModeCooldown() &&
-      isFastModeSupportedByModel(options.model) &&
-      !!retryContext.fastMode
-    if (isFastModeForRetry) {
-      speed = 'fast'
-    }
-    if (fastModeHeaderLatched && !betasParams.includes(FAST_MODE_BETA_HEADER)) {
-      betasParams.push(FAST_MODE_BETA_HEADER)
-    }
-    // AFK mode beta: latched once auto mode is first activated. Still gated
-    // by isAgenticQuery per-call so classifiers/compaction don't get it.
-    if (feature('TRANSCRIPT_CLASSIFIER')) {
-      if (
-        afkHeaderLatched &&
-        shouldIncludeFirstPartyOnlyBetas() &&
-        isAgenticQuery &&
-        !betasParams.includes(AFK_MODE_BETA_HEADER)
-      ) {
-        betasParams.push(AFK_MODE_BETA_HEADER)
-      }
-    }
-    // Cache editing beta: header is latched session-stable; useCachedMC
-    // (controls cache_edits body behavior) stays live so edits stop when
-    // the feature disables but the header doesn't flip.
-    const useCachedMC =
-      cachedMCEnabled &&
-      getAPIProvider() === 'firstParty' &&
-      options.querySource === 'repl_main_thread'
-    if (
-      cacheEditingHeaderLatched &&
-      getAPIProvider() === 'firstParty' &&
-      options.querySource === 'repl_main_thread' &&
-      !betasParams.includes(cacheEditingBetaHeader)
-    ) {
-      betasParams.push(cacheEditingBetaHeader)
-      logForDebugging(
-        'Cache editing beta header enabled for cached microcompact',
-      )
-    }
-    // Only send temperature when thinking is disabled — the API requires
-    // temperature: 1 when thinking is enabled, which is already the default.
-    const temperature = !hasThinking
-      ? (options.temperatureOverride ?? 1)
-      : undefined
-    lastRequestBetas = betasParams
-    return {
-      model: normalizeModelStringForAPI(options.model),
-      messages: addCacheBreakpoints(
-        messagesForAPI,
-        enablePromptCaching,
-        options.querySource,
-        useCachedMC,
-        consumedCacheEdits,
-        consumedPinnedEdits,
-        options.skipCacheWrite,
-      ),
-      system,
-      tools: allTools,
-      tool_choice: options.toolChoice,
-      ...(useBetas && { betas: betasParams }),
-      metadata: getAPIMetadata(),
-      max_tokens: maxOutputTokens,
-      thinking,
-      ...(temperature !== undefined && { temperature }),
-      ...(contextManagement &&
-        useBetas &&
-        betasParams.includes(CONTEXT_MANAGEMENT_BETA_HEADER) && {
-          context_management: contextManagement,
-        }),
-      ...extraBodyParams,
-      ...(Object.keys(outputConfig).length > 0 && {
-        output_config: outputConfig,
-      }),
-      ...(speed !== undefined && { speed }),
-      ...(options.reasoningMode !== undefined && {
-        reasoning_mode: options.reasoningMode,
-      }),
-    }
-  }
-  // Compute log scalars synchronously so the fire-and-forget .then() closure
-  // captures only primitives instead of paramsFromContext's full closure scope
-  // (messagesForAPI, system, allTools, betas — the entire request-building
-  // context), which would otherwise be pinned until the promise resolves.
-  {
-    const queryParams = paramsFromContext({
-      model: options.model,
-      thinkingConfig,
-    })
-    const logMessagesLength = queryParams.messages.length
-    const logBetas = useBetas ? (queryParams.betas ?? []) : []
-    const logThinkingType = queryParams.thinking?.type ?? 'disabled'
-    const logEffortValue = queryParams.output_config?.effort
-    void options.getToolPermissionContext().then(permissionContext => {
-      logAPIQuery({
-        model: options.model,
-        messagesLength: logMessagesLength,
-        temperature: options.temperatureOverride ?? 1,
-        betas: logBetas,
-        permissionMode: permissionContext.mode,
-        querySource: options.querySource,
-        queryTracking: options.queryTracking,
-        thinkingType: logThinkingType,
-        effortValue: logEffortValue,
-        fastMode: isFastMode,
-        previousRequestId,
-      })
-    })
-  }
-  const newMessages: AssistantMessage[] = []
-  let ttftMs = 0
-  let partialMessage: BetaMessage | undefined = undefined
-  const contentBlocks: (BetaContentBlock | ConnectorTextBlock)[] = []
-  let usage: NonNullableUsage = EMPTY_USAGE
-  let costUSD = 0
-  let stopReason: BetaStopReason | null = null
-  let didFallBackToNonStreaming = false
-  let fallbackMessage: AssistantMessage | undefined
-  let maxOutputTokens = 0
-  let responseHeaders: globalThis.Headers | undefined = undefined
-  let research: unknown = undefined
-  let isFastModeRequest = isFastMode // Keep separate state as it may change if falling back
-  let isAdvisorInProgress = false
-  try {
-    queryCheckpoint('query_client_creation_start')
-    const generator = withRetry(
-      () =>
-        getAnthropicClient({
-          maxRetries: 0, // Disabled auto-retry in favor of manual implementation
-          model: options.model,
-          fetchOverride: options.fetchOverride,
-          source: options.querySource,
-        }),
-      async (anthropic, attempt, context) => {
-        attemptNumber = attempt
-        isFastModeRequest = context.fastMode ?? false
-        start = Date.now()
-        attemptStartTimes.push(start)
-        // Client has been created by withRetry's getClient() call. This fires
-        // once per attempt; on retries the client is usually cached (withRetry
-        // only calls getClient() again after auth errors), so the delta from
-        // client_creation_start is meaningful on attempt 1.
-        queryCheckpoint('query_client_creation_end')
-        const params = paramsFromContext(context)
-        captureAPIRequest(params, options.querySource) // Capture for bug reports
-        maxOutputTokens = params.max_tokens
-        // Fire immediately before the fetch is dispatched. .withResponse() below
-        // awaits until response headers arrive, so this MUST be before the await
-        // or the "Network TTFB" phase measurement is wrong.
-        queryCheckpoint('query_api_request_sent')
-        if (!options.agentId) {
-          headlessProfilerCheckpoint('api_request_sent')
-        }
-        // Generate and track client request ID so timeouts (which return no
-        // server request ID) can still be correlated with server logs.
-        // First-party only — 3P providers don't log it (inc-4029 class).
-        clientRequestId =
-          getAPIProvider() === 'firstParty' && isFirstPartyAnthropicBaseUrl()
-            ? randomUUID()
-            : undefined
-        // Use raw stream instead of BetaMessageStream to avoid O(n²) partial JSON parsing
-        // BetaMessageStream calls partialParse() on every input_json_delta, which we don't need
-        // since we handle tool input accumulation ourselves
-        // biome-ignore lint/plugin: main conversation loop handles attribution separately
-        const result = await anthropic.beta.messages
-          .create(
-            { ...params, stream: true },
-            {
-              signal,
-              ...(clientRequestId && {
-                headers: { [CLIENT_REQUEST_ID_HEADER]: clientRequestId },
-              }),
-            },
-          )
-          .withResponse()
-        queryCheckpoint('query_response_headers_received')
-        streamRequestId = result.request_id
-        streamResponse = result.response
-        return result.data
-      },
-      {
-        model: options.model,
-        fallbackModel: options.fallbackModel,
-        thinkingConfig,
-        ...(isFastModeEnabled() ? { fastMode: isFastMode } : false),
-        signal,
-        querySource: options.querySource,
-      },
-    )
-    let e
-    do {
-      e = await generator.next()
-      // yield API error messages (the stream has a 'controller' property, error messages don't)
-      if (!('controller' in e.value)) {
-        yield e.value
-      }
-    } while (!e.done)
-    stream = e.value as Stream<BetaRawMessageStreamEvent>
-    // reset state
-    newMessages.length = 0
-    ttftMs = 0
-    partialMessage = undefined
-    contentBlocks.length = 0
-    usage = EMPTY_USAGE
-    stopReason = null
-    isAdvisorInProgress = false
-    // Streaming idle timeout watchdog: abort the stream if no chunks arrive
-    // for STREAM_IDLE_TIMEOUT_MS. Unlike the stall detection below (which only
-    // fires when the *next* chunk arrives), this uses setTimeout to actively
-    // kill hung streams. Without this, a silently dropped connection can hang
-    // the session indefinitely since the SDK's request timeout only covers the
-    // initial fetch(), not the streaming body.
-    const streamWatchdogEnabled = isEnvTruthy(
-      process.env.CLAUDE_ENABLE_STREAM_WATCHDOG,
-    )
-    const STREAM_IDLE_TIMEOUT_MS =
-      parseInt(process.env.CLAUDE_STREAM_IDLE_TIMEOUT_MS || '', 10) || 90_000
-    const STREAM_IDLE_WARNING_MS = STREAM_IDLE_TIMEOUT_MS / 2
-    let streamIdleAborted = false
-    // performance.now() snapshot when watchdog fires, for measuring abort propagation delay
-    let streamWatchdogFiredAt: number | null = null
-    let streamIdleWarningTimer: ReturnType<typeof setTimeout> | null = null
-    let streamIdleTimer: ReturnType<typeof setTimeout> | null = null
-    function clearStreamIdleTimers(): void { // Cancel both pending idle-watchdog timers (warning + abort) and null their handles.
-      if (streamIdleWarningTimer !== null) {
-        clearTimeout(streamIdleWarningTimer)
-        streamIdleWarningTimer = null // null marks "no warning timer pending"
-      }
-      if (streamIdleTimer !== null) {
-        clearTimeout(streamIdleTimer)
-        streamIdleTimer = null // null marks "no abort timer pending"
-      }
-    }
-    function resetStreamIdleTimer(): void { // (Re)arm the idle watchdog: drop any pending timers, then schedule a fresh warning timer and abort timer.
-      clearStreamIdleTimers() // always cancel first so at most one timer of each kind is pending
-      if (!streamWatchdogEnabled) {
-        return // watchdog is opt-in (CLAUDE_ENABLE_STREAM_WATCHDOG); when off, this call only clears timers
-      }
-      streamIdleWarningTimer = setTimeout(
-        warnMs => { // warnMs is the extra setTimeout argument passed below, forwarded to the callback
-          logForDebugging(
-            `Streaming idle warning: no chunks received for ${warnMs / 1000}s`,
-            { level: 'warn' },
-          )
-          logForDiagnosticsNoPII('warn', 'cli_streaming_idle_warning')
-        },
-        STREAM_IDLE_WARNING_MS, // delay: half of the abort timeout
-        STREAM_IDLE_WARNING_MS, // callback argument (warnMs)
-      )
-      streamIdleTimer = setTimeout(() => { // abort timer: fires after STREAM_IDLE_TIMEOUT_MS with no re-arm
-        streamIdleAborted = true // set before releasing so the stream-loop exit paths can tell a watchdog abort from a normal end
-        streamWatchdogFiredAt = performance.now() // snapshot later used to compute exit_delay_ms
-        logForDebugging(
-          `Streaming idle timeout: no chunks received for ${STREAM_IDLE_TIMEOUT_MS / 1000}s, aborting stream`,
-          { level: 'error' },
-        )
-        logForDiagnosticsNoPII('error', 'cli_streaming_idle_timeout')
-        logEvent('tengu_streaming_idle_timeout', {
-          model:
-            options.model as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
-          request_id: (streamRequestId ??
-            'unknown') as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
-          timeout_ms: STREAM_IDLE_TIMEOUT_MS,
-        })
-        releaseStreamResources() // NOTE(review): defined outside this view; presumably tears down the stream so the for-await loop exits — confirm
-      }, STREAM_IDLE_TIMEOUT_MS)
-    }
-    resetStreamIdleTimer()
-    startSessionActivity('api_call')
-    try {
-      // stream in and accumulate state
-      let isFirstChunk = true
-      let lastEventTime: number | null = null // Set after first chunk to avoid measuring TTFB as a stall
-      const STALL_THRESHOLD_MS = 30_000 // 30 seconds
-      let totalStallTime = 0
-      let stallCount = 0
-      for await (const part of stream) {
-        resetStreamIdleTimer()
-        const now = Date.now()
-        // Detect and log streaming stalls (only after first event to avoid counting TTFB)
-        if (lastEventTime !== null) {
-          const timeSinceLastEvent = now - lastEventTime
-          if (timeSinceLastEvent > STALL_THRESHOLD_MS) {
-            stallCount++
-            totalStallTime += timeSinceLastEvent
-            logForDebugging(
-              `Streaming stall detected: ${(timeSinceLastEvent / 1000).toFixed(1)}s gap between events (stall #${stallCount})`,
-              { level: 'warn' },
-            )
-            logEvent('tengu_streaming_stall', {
-              stall_duration_ms: timeSinceLastEvent,
-              stall_count: stallCount,
-              total_stall_time_ms: totalStallTime,
-              event_type:
-                part.type as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
-              model:
-                options.model as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
-              request_id: (streamRequestId ??
-                'unknown') as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
-            })
-          }
-        }
-        lastEventTime = now
-        if (isFirstChunk) {
-          logForDebugging('Stream started - received first chunk')
-          queryCheckpoint('query_first_chunk_received')
-          if (!options.agentId) {
-            headlessProfilerCheckpoint('first_chunk')
-          }
-          endQueryProfile()
-          isFirstChunk = false
-        }
-        switch (part.type) {
-          case 'message_start': {
-            partialMessage = part.message
-            ttftMs = Date.now() - start
-            usage = updateUsage(usage, part.message?.usage)
-            // Capture research from message_start if available (internal only).
-            // Always overwrite with the latest value.
-            if (
-              process.env.USER_TYPE === 'ant' &&
-              'research' in (part.message as unknown as Record<string, unknown>)
-            ) {
-              research = (part.message as unknown as Record<string, unknown>)
-                .research
-            }
-            break
-          }
-          case 'content_block_start':
-            switch (part.content_block.type) {
-              case 'tool_use':
-                contentBlocks[part.index] = {
-                  ...part.content_block,
-                  input: '',
-                }
-                break
-              case 'server_tool_use':
-                contentBlocks[part.index] = {
-                  ...part.content_block,
-                  input: '' as unknown as { [key: string]: unknown },
-                }
-                if ((part.content_block.name as string) === 'advisor') {
-                  isAdvisorInProgress = true
-                  logForDebugging(`[AdvisorTool] Advisor tool called`)
-                  logEvent('tengu_advisor_tool_call', {
-                    model:
-                      options.model as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
-                    advisor_model: (advisorModel ??
-                      'unknown') as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
-                  })
-                }
-                break
-              case 'text':
-                contentBlocks[part.index] = {
-                  ...part.content_block,
-                  // awkwardly, the sdk sometimes returns text as part of a
-                  // content_block_start message, then returns the same text
-                  // again in a content_block_delta message. we ignore it here
-                  // since there doesn't seem to be a way to detect when a
-                  // content_block_delta message duplicates the text.
-                  text: '',
-                }
-                break
-              case 'thinking':
-                contentBlocks[part.index] = {
-                  ...part.content_block,
-                  // also awkward
-                  thinking: '',
-                  // initialize signature to ensure field exists even if signature_delta never arrives
-                  signature: '',
-                }
-                break
-              default:
-                // even more awkwardly, the sdk mutates the contents of text blocks
-                // as it works. we want the blocks to be immutable, so that we can
-                // accumulate state ourselves.
-                contentBlocks[part.index] = { ...part.content_block }
-                if (
-                  (part.content_block.type as string) === 'advisor_tool_result'
-                ) {
-                  isAdvisorInProgress = false
-                  logForDebugging(`[AdvisorTool] Advisor tool result received`)
-                }
-                break
-            }
-            break
-          case 'content_block_delta': {
-            const contentBlock = contentBlocks[part.index]
-            const delta = part.delta as typeof part.delta | ConnectorTextDelta
-            if (!contentBlock) {
-              logEvent('tengu_streaming_error', {
-                error_type:
-                  'content_block_not_found_delta' as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
-                part_type:
-                  part.type as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
-                part_index: part.index,
-              })
-              throw new RangeError('Content block not found')
-            }
-            if (
-              feature('CONNECTOR_TEXT') &&
-              delta.type === 'connector_text_delta'
-            ) {
-              if (contentBlock.type !== 'connector_text') {
-                logEvent('tengu_streaming_error', {
-                  error_type:
-                    'content_block_type_mismatch_connector_text' as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
-                  expected_type:
-                    'connector_text' as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
-                  actual_type:
-                    contentBlock.type as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
-                })
-                throw new Error('Content block is not a connector_text block')
-              }
-              contentBlock.connector_text += delta.connector_text
-            } else {
-              switch (delta.type) {
-                case 'citations_delta':
-                  // TODO: handle citations
-                  break
-                case 'input_json_delta':
-                  if (
-                    contentBlock.type !== 'tool_use' &&
-                    contentBlock.type !== 'server_tool_use'
-                  ) {
-                    logEvent('tengu_streaming_error', {
-                      error_type:
-                        'content_block_type_mismatch_input_json' as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
-                      expected_type:
-                        'tool_use' as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
-                      actual_type:
-                        contentBlock.type as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
-                    })
-                    throw new Error('Content block is not a input_json block')
-                  }
-                  if (typeof contentBlock.input !== 'string') {
-                    logEvent('tengu_streaming_error', {
-                      error_type:
-                        'content_block_input_not_string' as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
-                      input_type:
-                        typeof contentBlock.input as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
-                    })
-                    throw new Error('Content block input is not a string')
-                  }
-                  contentBlock.input += delta.partial_json
-                  break
-                case 'text_delta':
-                  if (contentBlock.type !== 'text') {
-                    logEvent('tengu_streaming_error', {
-                      error_type:
-                        'content_block_type_mismatch_text' as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
-                      expected_type:
-                        'text' as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
-                      actual_type:
-                        contentBlock.type as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
-                    })
-                    throw new Error('Content block is not a text block')
-                  }
-                  contentBlock.text += delta.text
-                  break
-                case 'signature_delta':
-                  if (
-                    feature('CONNECTOR_TEXT') &&
-                    contentBlock.type === 'connector_text'
-                  ) {
-                    contentBlock.signature = delta.signature
-                    break
-                  }
-                  if (contentBlock.type !== 'thinking') {
-                    logEvent('tengu_streaming_error', {
-                      error_type:
-                        'content_block_type_mismatch_thinking_signature' as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
-                      expected_type:
-                        'thinking' as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
-                      actual_type:
-                        contentBlock.type as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
-                    })
-                    throw new Error('Content block is not a thinking block')
-                  }
-                  contentBlock.signature = delta.signature
-                  break
-                case 'thinking_delta':
-                  if (contentBlock.type !== 'thinking') {
-                    logEvent('tengu_streaming_error', {
-                      error_type:
-                        'content_block_type_mismatch_thinking_delta' as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
-                      expected_type:
-                        'thinking' as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
-                      actual_type:
-                        contentBlock.type as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
-                    })
-                    throw new Error('Content block is not a thinking block')
-                  }
-                  contentBlock.thinking += delta.thinking
-                  break
-              }
-            }
-            // Capture research from content_block_delta if available (internal only).
-            // Always overwrite with the latest value.
-            if (process.env.USER_TYPE === 'ant' && 'research' in part) {
-              research = (part as { research: unknown }).research
-            }
-            break
-          }
-          case 'content_block_stop': {
-            const contentBlock = contentBlocks[part.index]
-            if (!contentBlock) {
-              logEvent('tengu_streaming_error', {
-                error_type:
-                  'content_block_not_found_stop' as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
-                part_type:
-                  part.type as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
-                part_index: part.index,
-              })
-              throw new RangeError('Content block not found')
-            }
-            if (!partialMessage) {
-              logEvent('tengu_streaming_error', {
-                error_type:
-                  'partial_message_not_found' as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
-                part_type:
-                  part.type as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
-              })
-              throw new Error('Message not found')
-            }
-            const m: AssistantMessage = {
-              message: {
-                ...partialMessage,
-                content: normalizeContentFromAPI(
-                  [contentBlock] as BetaContentBlock[],
-                  tools,
-                  options.agentId,
-                ),
-              },
-              requestId: streamRequestId ?? undefined,
-              type: 'assistant',
-              uuid: randomUUID(),
-              timestamp: new Date().toISOString(),
-              ...(process.env.USER_TYPE === 'ant' &&
-                research !== undefined && { research }),
-              ...(advisorModel && { advisorModel }),
-            }
-            newMessages.push(m)
-            yield m
-            break
-          }
-          case 'message_delta': {
-            usage = updateUsage(usage, part.usage)
-            // Capture research from message_delta if available (internal only).
-            // Always overwrite with the latest value. Also write back to
-            // already-yielded messages since message_delta arrives after
-            // content_block_stop.
-            if (
-              process.env.USER_TYPE === 'ant' &&
-              'research' in (part as unknown as Record<string, unknown>)
-            ) {
-              research = (part as unknown as Record<string, unknown>).research
-              for (const msg of newMessages) {
-                msg.research = research
-              }
-            }
-            // Write final usage and stop_reason back to the last yielded
-            // message. Messages are created at content_block_stop from
-            // partialMessage, which was set at message_start before any tokens
-            // were generated (output_tokens: 0, stop_reason: null).
-            // message_delta arrives after content_block_stop with the real
-            // values.
-            //
-            // IMPORTANT: Use direct property mutation, not object replacement.
-            // The transcript write queue holds a reference to message.message
-            // and serializes it lazily (100ms flush interval). Object
-            // replacement ({ ...lastMsg.message, usage }) would disconnect
-            // the queued reference; direct mutation ensures the transcript
-            // captures the final values.
-            stopReason = part.delta.stop_reason
-            const lastMsg = newMessages.at(-1)
-            if (lastMsg) {
-              lastMsg.message.usage = usage
-              lastMsg.message.stop_reason = stopReason
-            }
-            // Update cost
-            const costUSDForPart = calculateUSDCost(resolvedModel, usage)
-            costUSD += addToTotalSessionCost(
-              costUSDForPart,
-              usage,
-              options.model,
-            )
-            const refusalMessage = getErrorMessageIfRefusal(
-              part.delta.stop_reason,
-              options.model,
-            )
-            if (refusalMessage) {
-              yield refusalMessage
-            }
-            if (stopReason === 'max_tokens') {
-              logEvent('tengu_max_tokens_reached', {
-                max_tokens: maxOutputTokens,
-              })
-              yield createAssistantAPIErrorMessage({
-                content: `${API_ERROR_MESSAGE_PREFIX}: Ummaya's response exceeded the ${
-                  maxOutputTokens
-                } output token maximum. To configure this behavior, set the CLAUDE_CODE_MAX_OUTPUT_TOKENS environment variable.`,
-                apiError: 'max_output_tokens',
-                error: 'max_output_tokens',
-              })
-            }
-            if (stopReason === 'model_context_window_exceeded') {
-              logEvent('tengu_context_window_exceeded', {
-                max_tokens: maxOutputTokens,
-                output_tokens: usage.output_tokens,
-              })
-              // Reuse the max_output_tokens recovery path — from the model's
-              // perspective, both mean "response was cut off, continue from
-              // where you left off."
-              yield createAssistantAPIErrorMessage({
-                content: `${API_ERROR_MESSAGE_PREFIX}: The model has reached its context window limit.`,
-                apiError: 'max_output_tokens',
-                error: 'max_output_tokens',
-              })
-            }
-            break
-          }
-          case 'message_stop':
-            break
-        }
-        yield {
-          type: 'stream_event',
-          event: part,
-          ...(part.type === 'message_start' ? { ttftMs } : undefined),
-        }
-      }
-      // Clear the idle timeout watchdog now that the stream loop has exited
-      clearStreamIdleTimers()
-      // If the stream was aborted by our idle timeout watchdog, fall back to
-      // non-streaming retry rather than treating it as a completed stream.
-      if (streamIdleAborted) {
-        // Instrumentation: proves the for-await exited after the watchdog fired
-        // (vs. hung forever). exit_delay_ms measures abort propagation latency:
-        // 0-10ms = abort worked; >>1000ms = something else woke the loop.
-        const exitDelayMs =
-          streamWatchdogFiredAt !== null
-            ? Math.round(performance.now() - streamWatchdogFiredAt)
-            : -1
-        logForDiagnosticsNoPII(
-          'info',
-          'cli_stream_loop_exited_after_watchdog_clean',
-        )
-        logEvent('tengu_stream_loop_exited_after_watchdog', {
-          request_id: (streamRequestId ??
-            'unknown') as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
-          exit_delay_ms: exitDelayMs,
-          exit_path:
-            'clean' as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
-          model:
-            options.model as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
-        })
-        // Prevent double-emit: this throw lands in the catch block below,
-        // whose exit_path='error' probe guards on streamWatchdogFiredAt.
-        streamWatchdogFiredAt = null
-        throw new Error('Stream idle timeout - no chunks received')
-      }
-      // Detect when the stream completed without producing any assistant messages.
-      // This covers two proxy failure modes:
-      // 1. No events at all (!partialMessage): proxy returned 200 with non-SSE body
-      // 2. Partial events (partialMessage set but no content blocks completed AND
-      //    no stop_reason received): proxy returned message_start but stream ended
-      //    before content_block_stop and before message_delta with stop_reason
-      // BetaMessageStream had the first check in _endRequest() but the raw Stream
-      // does not - without it the generator silently returns no assistant messages,
-      // causing "Execution error" in -p mode.
-      // Note: We must check stopReason to avoid false positives. For example, with
-      // structured output (--json-schema), the model calls a StructuredOutput tool
-      // on turn 1, then on turn 2 responds with end_turn and no content blocks.
-      // That's a legitimate empty response, not an incomplete stream.
-      if (!partialMessage || (newMessages.length === 0 && !stopReason)) {
-        logForDebugging(
-          !partialMessage
-            ? 'Stream completed without receiving message_start event - triggering non-streaming fallback'
-            : 'Stream completed with message_start but no content blocks completed - triggering non-streaming fallback',
-          { level: 'error' },
-        )
-        logEvent('tengu_stream_no_events', {
-          model:
-            options.model as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
-          request_id: (streamRequestId ??
-            'unknown') as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
-        })
-        throw new Error('Stream ended without receiving any events')
-      }
-      // Log summary if any stalls occurred during streaming
-      if (stallCount > 0) {
-        logForDebugging(
-          `Streaming completed with ${stallCount} stall(s), total stall time: ${(totalStallTime / 1000).toFixed(1)}s`,
-          { level: 'warn' },
-        )
-        logEvent('tengu_streaming_stall_summary', {
-          stall_count: stallCount,
-          total_stall_time_ms: totalStallTime,
-          model:
-            options.model as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
-          request_id: (streamRequestId ??
-            'unknown') as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
-        })
-      }
-      // Check if the cache actually broke based on response tokens
-      if (feature('PROMPT_CACHE_BREAK_DETECTION')) {
-        void checkResponseForCacheBreak(
-          options.querySource,
-          usage.cache_read_input_tokens,
-          usage.cache_creation_input_tokens,
-          messages,
-          options.agentId,
-          streamRequestId,
-        )
-      }
-      // Process fallback percentage header and quota status if available
-      // streamResponse is set when the stream is created in the withRetry callback above
-      // TypeScript's control flow analysis can't track that streamResponse is set in the callback
-      // eslint-disable-next-line eslint-plugin-n/no-unsupported-features/node-builtins
-      const resp = streamResponse as unknown as Response | undefined
-      if (resp) {
-        extractQuotaStatusFromHeaders(resp.headers)
-        // Store headers for gateway detection
-        responseHeaders = resp.headers
-      }
-    } catch (streamingError) {
-      // Clear the idle timeout watchdog on error path too
-      clearStreamIdleTimers()
-      // Instrumentation: if the watchdog had already fired and the for-await
-      // threw (rather than exiting cleanly), record that the loop DID exit and
-      // how long after the watchdog. Distinguishes true hangs from error exits.
-      if (streamIdleAborted && streamWatchdogFiredAt !== null) {
-        const exitDelayMs = Math.round(
-          performance.now() - streamWatchdogFiredAt,
-        )
-        logForDiagnosticsNoPII(
-          'info',
-          'cli_stream_loop_exited_after_watchdog_error',
-        )
-        logEvent('tengu_stream_loop_exited_after_watchdog', {
-          request_id: (streamRequestId ??
-            'unknown') as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
-          exit_delay_ms: exitDelayMs,
-          exit_path:
-            'error' as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
-          error_name:
-            streamingError instanceof Error
-              ? (streamingError.name as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS)
-              : ('unknown' as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS),
-          model:
-            options.model as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
-        })
-      }
-      if (streamingError instanceof APIUserAbortError) {
-        // Check if the abort signal was triggered by the user (ESC key)
-        // If the signal is aborted, it's a user-initiated abort
-        // If not, it's likely a timeout from the SDK
-        if (signal.aborted) {
-          // This is a real user abort (ESC key was pressed)
-          logForDebugging(
-            `Streaming aborted by user: ${errorMessage(streamingError)}`,
-          )
-          if (isAdvisorInProgress) {
-            logEvent('tengu_advisor_tool_interrupted', {
-              model:
-                options.model as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
-              advisor_model: (advisorModel ??
-                'unknown') as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
-            })
-          }
-          throw streamingError
-        } else {
-          // The SDK threw APIUserAbortError but our signal wasn't aborted
-          // This means it's a timeout from the SDK's internal timeout
-          logForDebugging(
-            `Streaming timeout (SDK abort): ${streamingError.message}`,
-            { level: 'error' },
-          )
-          // Throw a more specific error for timeout
-          throw new APIConnectionTimeoutError({ message: 'Request timed out' })
-        }
-      }
-      // When the flag is enabled, skip the non-streaming fallback and let the
-      // error propagate to withRetry. The mid-stream fallback causes double tool
-      // execution when streaming tool execution is active: the partial stream
-      // starts a tool, then the non-streaming retry produces the same tool_use
-      // and runs it again. See inc-4258.
-      const disableFallback =
-        isEnvTruthy(process.env.CLAUDE_CODE_DISABLE_NONSTREAMING_FALLBACK) ||
-        getFeatureValue_CACHED_MAY_BE_STALE(
-          'tengu_disable_streaming_to_non_streaming_fallback',
-          false,
-        )
-      if (disableFallback) {
-        logForDebugging(
-          `Error streaming (non-streaming fallback disabled): ${errorMessage(streamingError)}`,
-          { level: 'error' },
-        )
-        logEvent('tengu_streaming_fallback_to_non_streaming', {
-          model:
-            options.model as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
-          error:
-            streamingError instanceof Error
-              ? (streamingError.name as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS)
-              : (String(
-                  streamingError,
-                ) as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS),
-          attemptNumber,
-          maxOutputTokens,
-          thinkingType:
-            thinkingConfig.type as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
-          fallback_disabled: true,
-          request_id: (streamRequestId ??
-            'unknown') as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
-          fallback_cause: (streamIdleAborted
-            ? 'watchdog'
-            : 'other') as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
-        })
-        throw streamingError
-      }
-      logForDebugging(
-        `Error streaming, falling back to non-streaming mode: ${errorMessage(streamingError)}`,
-        { level: 'error' },
-      )
-      didFallBackToNonStreaming = true
-      if (options.onStreamingFallback) {
-        options.onStreamingFallback()
-      }
-      logEvent('tengu_streaming_fallback_to_non_streaming', {
-        model:
-          options.model as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
-        error:
-          streamingError instanceof Error
-            ? (streamingError.name as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS)
-            : (String(
-                streamingError,
-              ) as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS),
-        attemptNumber,
-        maxOutputTokens,
-        thinkingType:
-          thinkingConfig.type as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
-        fallback_disabled: false,
-        request_id: (streamRequestId ??
-          'unknown') as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
-        fallback_cause: (streamIdleAborted
-          ? 'watchdog'
-          : 'other') as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
-      })
-      // Fall back to non-streaming mode with retries.
-      // If the streaming failure was itself a 529, count it toward the
-      // consecutive-529 budget so total 529s-before-model-fallback is the
-      // same whether the overload was hit in streaming or non-streaming mode.
-      // This is a speculative fix for https://github.com/anthropics/claude-code/issues/1513
-      // Instrumentation: proves executeNonStreamingRequest was entered (vs. the
-      // fallback event firing but the call itself hanging at dispatch).
-      logForDiagnosticsNoPII('info', 'cli_nonstreaming_fallback_started')
-      logEvent('tengu_nonstreaming_fallback_started', {
-        request_id: (streamRequestId ??
-          'unknown') as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
-        model:
-          options.model as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
-        fallback_cause: (streamIdleAborted
-          ? 'watchdog'
-          : 'other') as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
-      })
-      const result = yield* executeNonStreamingRequest(
-        {
-          model: options.model,
-          fetchOverride: options.fetchOverride,
-          source: options.querySource,
-        },
-        {
-          model: options.model,
-          fallbackModel: options.fallbackModel,
-          thinkingConfig,
-          ...(isFastModeEnabled() && { fastMode: isFastMode }),
-          signal,
-          initialConsecutive529Errors: is529Error(streamingError) ? 1 : 0,
-          querySource: options.querySource,
-        },
-        paramsFromContext,
-        (attempt, _startTime, tokens) => {
-          attemptNumber = attempt
-          maxOutputTokens = tokens
-        },
-        params => captureAPIRequest(params, options.querySource),
-        streamRequestId,
-      )
-      const m: AssistantMessage = {
-        message: {
-          ...result,
-          content: normalizeContentFromAPI(
-            result.content,
-            tools,
-            options.agentId,
-          ),
-        },
-        requestId: streamRequestId ?? undefined,
-        type: 'assistant',
-        uuid: randomUUID(),
-        timestamp: new Date().toISOString(),
-        ...(process.env.USER_TYPE === 'ant' &&
-          research !== undefined && {
-            research,
-          }),
-        ...(advisorModel && {
-          advisorModel,
-        }),
-      }
-      newMessages.push(m)
-      fallbackMessage = m
-      yield m
-    } finally {
-      clearStreamIdleTimers()
-    }
-  } catch (errorFromRetry) {
-    // FallbackTriggeredError must propagate to query.ts, which performs the
-    // actual model switch. Swallowing it here would turn the fallback into a
-    // no-op — the user would just see "Model fallback triggered: X -> Y" as
-    // an error message with no actual retry on the fallback model.
-    if (errorFromRetry instanceof FallbackTriggeredError) {
-      throw errorFromRetry
-    }
-    // Check if this is a 404 error during stream creation that should trigger
-    // non-streaming fallback. This handles gateways that return 404 for streaming
-    // endpoints but work fine with non-streaming. Before v2.1.8, BetaMessageStream
-    // threw 404s during iteration (caught by inner catch with fallback), but now
-    // with raw streams, 404s are thrown during creation (caught here).
-    const is404StreamCreationError =
-      !didFallBackToNonStreaming &&
-      errorFromRetry instanceof CannotRetryError &&
-      errorFromRetry.originalError instanceof APIError &&
-      errorFromRetry.originalError.status === 404
-    if (is404StreamCreationError) {
-      // 404 is thrown at .withResponse() before streamRequestId is assigned,
-      // and CannotRetryError means every retry failed — so grab the failed
-      // request's ID from the error header instead.
-      const failedRequestId =
-        (errorFromRetry.originalError as APIError).requestID ?? 'unknown'
-      logForDebugging(
-        'Streaming endpoint returned 404, falling back to non-streaming mode',
-        { level: 'warn' },
-      )
-      didFallBackToNonStreaming = true
-      if (options.onStreamingFallback) {
-        options.onStreamingFallback()
-      }
-      logEvent('tengu_streaming_fallback_to_non_streaming', {
-        model:
-          options.model as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
-        error:
-          '404_stream_creation' as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
-        attemptNumber,
-        maxOutputTokens,
-        thinkingType:
-          thinkingConfig.type as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
-        request_id:
-          failedRequestId as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
-        fallback_cause:
-          '404_stream_creation' as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
-      })
-      try {
-        // Fall back to non-streaming mode
-        const result = yield* executeNonStreamingRequest(
-          {
-            model: options.model,
-            fetchOverride: options.fetchOverride,
-            source: options.querySource,
-          },
-          {
-            model: options.model,
-            fallbackModel: options.fallbackModel,
-            thinkingConfig,
-            ...(isFastModeEnabled() && { fastMode: isFastMode }),
-            signal,
-          },
-          paramsFromContext,
-          (attempt, _startTime, tokens) => {
-            attemptNumber = attempt
-            maxOutputTokens = tokens
-          },
-          params => captureAPIRequest(params, options.querySource),
-          failedRequestId,
-        )
-        const m: AssistantMessage = {
-          message: {
-            ...result,
-            content: normalizeContentFromAPI(
-              result.content,
-              tools,
-              options.agentId,
-            ),
-          },
-          requestId: streamRequestId ?? undefined,
-          type: 'assistant',
-          uuid: randomUUID(),
-          timestamp: new Date().toISOString(),
-          ...(process.env.USER_TYPE === 'ant' &&
-            research !== undefined && { research }),
-          ...(advisorModel && { advisorModel }),
-        }
-        newMessages.push(m)
-        fallbackMessage = m
-        yield m
-        // Continue to success logging below
-      } catch (fallbackError) {
-        // Propagate model-fallback signal to query.ts (see comment above).
-        if (fallbackError instanceof FallbackTriggeredError) {
-          throw fallbackError
-        }
-        // Fallback also failed, handle as normal error
-        logForDebugging(
-          `Non-streaming fallback also failed: ${errorMessage(fallbackError)}`,
-          { level: 'error' },
-        )
-        let error = fallbackError
-        let errorModel = options.model
-        if (fallbackError instanceof CannotRetryError) {
-          error = fallbackError.originalError
-          errorModel = fallbackError.retryContext.model
-        }
-        if (error instanceof APIError) {
-          extractQuotaStatusFromError(error)
-        }
-        const requestId =
-          streamRequestId ||
-          (error instanceof APIError ? error.requestID : undefined) ||
-          (error instanceof APIError
-            ? (error.error as { request_id?: string })?.request_id
-            : undefined)
-        logAPIError({
-          error,
-          model: errorModel,
-          messageCount: messagesForAPI.length,
-          messageTokens: tokenCountFromLastAPIResponse(messagesForAPI),
-          durationMs: Date.now() - start,
-          durationMsIncludingRetries: Date.now() - startIncludingRetries,
-          attempt: attemptNumber,
-          requestId,
-          clientRequestId,
-          didFallBackToNonStreaming,
-          queryTracking: options.queryTracking,
-          querySource: options.querySource,
-          llmSpan,
-          fastMode: isFastModeRequest,
-          previousRequestId,
-        })
-        if (error instanceof APIUserAbortError) {
-          releaseStreamResources()
-          return
-        }
-        yield getAssistantMessageFromError(error, errorModel, {
-          messages,
-          messagesForAPI,
-        })
-        releaseStreamResources()
-        return
-      }
-    } else {
-      // Original error handling for non-404 errors
-      logForDebugging(`Error in API request: ${errorMessage(errorFromRetry)}`, {
-        level: 'error',
-      })
-      let error = errorFromRetry
-      let errorModel = options.model
-      if (errorFromRetry instanceof CannotRetryError) {
-        error = errorFromRetry.originalError
-        errorModel = errorFromRetry.retryContext.model
-      }
-      // Extract quota status from error headers if it's a rate limit error
-      if (error instanceof APIError) {
-        extractQuotaStatusFromError(error)
-      }
-      // Extract requestId from stream, error header, or error body
-      const requestId =
-        streamRequestId ||
-        (error instanceof APIError ? error.requestID : undefined) ||
-        (error instanceof APIError
-          ? (error.error as { request_id?: string })?.request_id
-          : undefined)
-      logAPIError({
-        error,
-        model: errorModel,
-        messageCount: messagesForAPI.length,
-        messageTokens: tokenCountFromLastAPIResponse(messagesForAPI),
-        durationMs: Date.now() - start,
-        durationMsIncludingRetries: Date.now() - startIncludingRetries,
-        attempt: attemptNumber,
-        requestId,
-        clientRequestId,
-        didFallBackToNonStreaming,
-        queryTracking: options.queryTracking,
-        querySource: options.querySource,
-        llmSpan,
-        fastMode: isFastModeRequest,
-        previousRequestId,
-      })
-      // Don't yield an assistant error message for user aborts
-      // The interruption message is handled in query.ts
-      if (error instanceof APIUserAbortError) {
-        releaseStreamResources()
-        return
-      }
-      yield getAssistantMessageFromError(error, errorModel, {
-        messages,
-        messagesForAPI,
-      })
-      releaseStreamResources()
-      return
-    }
-  } finally {
-    stopSessionActivity('api_call')
-    // Must be in the finally block: if the generator is terminated early
-    // via .return() (e.g. consumer breaks out of for-await-of, or query.ts
-    // encounters an abort), code after the try/finally never executes.
-    // Without this, the Response object's native TLS/socket buffers leak
-    // until the generator itself is GC'd (see GH #32920).
-    releaseStreamResources()
-    // Non-streaming fallback cost: the streaming path tracks cost in the
-    // message_delta handler before any yield. Fallback pushes to newMessages
-    // then yields, so tracking must be here to survive .return() at the yield.
-    if (fallbackMessage) {
-      const fallbackUsage = fallbackMessage.message.usage
-      usage = updateUsage(EMPTY_USAGE, fallbackUsage)
-      stopReason = fallbackMessage.message.stop_reason
-      const fallbackCost = calculateUSDCost(resolvedModel, fallbackUsage)
-      costUSD += addToTotalSessionCost(
-        fallbackCost,
-        fallbackUsage,
-        options.model,
-      )
-    }
-  }
-  // Mark all registered tools as sent to API so they become eligible for deletion
-  if (feature('CACHED_MICROCOMPACT') && cachedMCEnabled) {
-    markToolsSentToAPIState()
-  }
-  // Track the last requestId for the main conversation chain so shutdown
-  // can send a cache eviction hint to inference. Exclude backgrounded
-  // sessions (Ctrl+B) which share the repl_main_thread querySource but
-  // run inside an agent context — they are independent conversation chains
-  // whose cache should not be evicted when the foreground session clears.
-  if (
-    streamRequestId &&
-    !getAgentContext() &&
-    (options.querySource.startsWith('repl_main_thread') ||
-      options.querySource === 'sdk')
-  ) {
-    setLastMainRequestId(streamRequestId)
-  }
-  // Precompute scalars so the fire-and-forget .then() closure doesn't pin the
-  // full messagesForAPI array (the entire conversation up to the context window
-  // limit) until getToolPermissionContext() resolves.
-  const logMessageCount = messagesForAPI.length
-  const logMessageTokens = tokenCountFromLastAPIResponse(messagesForAPI)
-  void options.getToolPermissionContext().then(permissionContext => {
-    logAPISuccessAndDuration({
-      model:
-        newMessages[0]?.message.model ?? partialMessage?.model ?? options.model,
-      preNormalizedModel: options.model,
-      usage,
-      start,
-      startIncludingRetries,
-      attempt: attemptNumber,
-      messageCount: logMessageCount,
-      messageTokens: logMessageTokens,
-      requestId: streamRequestId ?? null,
-      stopReason,
-      ttftMs,
-      didFallBackToNonStreaming,
-      querySource: options.querySource,
-      headers: responseHeaders,
-      costUSD,
-      queryTracking: options.queryTracking,
-      permissionMode: permissionContext.mode,
-      // Pass newMessages for beta tracing - extraction happens in logging.ts
-      // only when beta tracing is enabled
-      newMessages,
-      llmSpan,
-      globalCacheStrategy,
-      requestSetupMs: start - startIncludingRetries,
-      attemptStartTimes,
-      fastMode: isFastModeRequest,
-      previousRequestId,
-      betas: lastRequestBetas,
-    })
-  })
-  // Defensive: also release on normal completion (no-op if finally already ran).
-  releaseStreamResources()
-}
-/**
- * Cleans up stream resources to prevent memory leaks.
- * @internal Exported for testing
- */
-export function cleanupStream(
-  stream: Stream<BetaRawMessageStreamEvent> | undefined,
-): void {
-  if (!stream) {
-    return
-  }
-  try {
-    // Abort the stream via its controller if not already aborted
-    if (!stream.controller.signal.aborted) {
-      stream.controller.abort()
-    }
-  } catch {
-    // Ignore - stream may already be closed
-  }
-}
-/**
- * Updates usage statistics with new values from streaming API events.
- * Note: the upstream streaming API provides cumulative usage totals, not incremental deltas.
- * (UMMAYA: byte-copied from CC where this read "Anthropic's streaming API"; FriendliAI's
- * OpenAI-compatible streaming surface follows the same cumulative semantics, so the
- * algorithm is unchanged — only the brand token is renamed.)
- * Each event contains the complete usage up to that point in the stream.
- *
- * Input-related tokens (input_tokens, cache_creation_input_tokens, cache_read_input_tokens)
- * are typically set in message_start and remain constant. message_delta events may send
- * explicit 0 values for these fields, which should not overwrite the values from message_start.
- * We only update these fields if they have a non-null, non-zero value.
- */
-export function updateUsage(
-  usage: Readonly<NonNullableUsage>,
-  partUsage: BetaMessageDeltaUsage | undefined,
-): NonNullableUsage {
-  if (!partUsage) {
-    return { ...usage }
-  }
-  return {
-    input_tokens:
-      partUsage.input_tokens !== null && partUsage.input_tokens > 0
-        ? partUsage.input_tokens
-        : usage.input_tokens,
-    cache_creation_input_tokens:
-      partUsage.cache_creation_input_tokens !== null &&
-      partUsage.cache_creation_input_tokens > 0
-        ? partUsage.cache_creation_input_tokens
-        : usage.cache_creation_input_tokens,
-    cache_read_input_tokens:
-      partUsage.cache_read_input_tokens !== null &&
-      partUsage.cache_read_input_tokens > 0
-        ? partUsage.cache_read_input_tokens
-        : usage.cache_read_input_tokens,
-    output_tokens: partUsage.output_tokens ?? usage.output_tokens,
-    server_tool_use: {
-      web_search_requests:
-        partUsage.server_tool_use?.web_search_requests ??
-        usage.server_tool_use.web_search_requests,
-      web_fetch_requests:
-        partUsage.server_tool_use?.web_fetch_requests ??
-        usage.server_tool_use.web_fetch_requests,
-    },
-    service_tier: usage.service_tier,
-    cache_creation: {
-      // SDK type BetaMessageDeltaUsage is missing cache_creation, but it's real!
-      ephemeral_1h_input_tokens:
-        (partUsage as BetaUsage).cache_creation?.ephemeral_1h_input_tokens ??
-        usage.cache_creation.ephemeral_1h_input_tokens,
-      ephemeral_5m_input_tokens:
-        (partUsage as BetaUsage).cache_creation?.ephemeral_5m_input_tokens ??
-        usage.cache_creation.ephemeral_5m_input_tokens,
-    },
-    // cache_deleted_input_tokens: returned by the API when cache editing
-    // deletes KV cache content, but not in SDK types. Kept off NonNullableUsage
-    // so the string is eliminated from external builds by dead code elimination.
-    // Uses the same > 0 guard as other token fields to prevent message_delta
-    // from overwriting the real value with 0.
-    ...(feature('CACHED_MICROCOMPACT')
-      ? {
-          cache_deleted_input_tokens:
-            (partUsage as unknown as { cache_deleted_input_tokens?: number })
-              .cache_deleted_input_tokens != null &&
-            (partUsage as unknown as { cache_deleted_input_tokens: number })
-              .cache_deleted_input_tokens > 0
-              ? (partUsage as unknown as { cache_deleted_input_tokens: number })
-                  .cache_deleted_input_tokens
-              : ((usage as unknown as { cache_deleted_input_tokens?: number })
-                  .cache_deleted_input_tokens ?? 0),
-        }
-      : {}),
-    inference_geo: usage.inference_geo,
-    iterations: partUsage.iterations ?? usage.iterations,
-    speed: (partUsage as BetaUsage).speed ?? usage.speed,
-  }
-}
-/**
- * Accumulates usage from one message into a total usage object.
- * Used to track cumulative usage across multiple assistant turns.
- */
-export function accumulateUsage(
-  totalUsage: Readonly<NonNullableUsage>,
-  messageUsage: Readonly<NonNullableUsage>,
-): NonNullableUsage {
-  return {
-    input_tokens: totalUsage.input_tokens + messageUsage.input_tokens,
-    cache_creation_input_tokens:
-      totalUsage.cache_creation_input_tokens +
-      messageUsage.cache_creation_input_tokens,
-    cache_read_input_tokens:
-      totalUsage.cache_read_input_tokens + messageUsage.cache_read_input_tokens,
-    output_tokens: totalUsage.output_tokens + messageUsage.output_tokens,
-    server_tool_use: {
-      web_search_requests:
-        totalUsage.server_tool_use.web_search_requests +
-        messageUsage.server_tool_use.web_search_requests,
-      web_fetch_requests:
-        totalUsage.server_tool_use.web_fetch_requests +
-        messageUsage.server_tool_use.web_fetch_requests,
-    },
-    service_tier: messageUsage.service_tier, // Use the most recent service tier
-    cache_creation: {
-      ephemeral_1h_input_tokens:
-        totalUsage.cache_creation.ephemeral_1h_input_tokens +
-        messageUsage.cache_creation.ephemeral_1h_input_tokens,
-      ephemeral_5m_input_tokens:
-        totalUsage.cache_creation.ephemeral_5m_input_tokens +
-        messageUsage.cache_creation.ephemeral_5m_input_tokens,
-    },
-    // See comment in updateUsage — field is not on NonNullableUsage to keep
-    // the string out of external builds.
-    ...(feature('CACHED_MICROCOMPACT')
-      ? {
-          cache_deleted_input_tokens:
-            ((totalUsage as unknown as { cache_deleted_input_tokens?: number })
-              .cache_deleted_input_tokens ?? 0) +
-            ((
-              messageUsage as unknown as { cache_deleted_input_tokens?: number }
-            ).cache_deleted_input_tokens ?? 0),
-        }
-      : {}),
-    inference_geo: messageUsage.inference_geo, // Use the most recent
-    iterations: messageUsage.iterations, // Use the most recent
-    speed: messageUsage.speed, // Use the most recent
-  }
-}
-function isToolResultBlock(
-  block: unknown,
-): block is { type: 'tool_result'; tool_use_id: string } {
-  return (
-    block !== null &&
-    typeof block === 'object' &&
-    'type' in block &&
-    (block as { type: string }).type === 'tool_result' &&
-    'tool_use_id' in block
-  )
-}
-type CachedMCEditsBlock = {
-  type: 'cache_edits'
-  edits: { type: 'delete'; cache_reference: string }[]
-}
-type CachedMCPinnedEdits = {
-  userMessageIndex: number
-  block: CachedMCEditsBlock
-}
-// Exported for testing cache_reference placement constraints
-export function addCacheBreakpoints(
-  messages: (UserMessage | AssistantMessage)[],
-  enablePromptCaching: boolean,
-  querySource?: QuerySource,
-  useCachedMC = false,
-  newCacheEdits?: CachedMCEditsBlock | null,
-  pinnedEdits?: CachedMCPinnedEdits[],
-  skipCacheWrite = false,
-): MessageParam[] {
-  logEvent('tengu_api_cache_breakpoints', {
-    totalMessageCount: messages.length,
-    cachingEnabled: enablePromptCaching,
-    skipCacheWrite,
-  })
-  // Exactly one message-level cache_control marker per request. Mycro's
-  // turn-to-turn eviction (page_manager/index.rs: Index::insert) frees
-  // local-attention KV pages at any cached prefix position NOT in
-  // cache_store_int_token_boundaries. With two markers the second-to-last
-  // position is protected and its locals survive an extra turn even though
-  // nothing will ever resume from there — with one marker they're freed
-  // immediately. For fire-and-forget forks (skipCacheWrite) we shift the
-  // marker to the second-to-last message: that's the last shared-prefix
-  // point, so the write is a no-op merge on mycro (entry already exists)
-  // and the fork doesn't leave its own tail in the KVCC. Dense pages are
-  // refcounted and survive via the new hash either way.
-  const markerIndex = skipCacheWrite ? messages.length - 2 : messages.length - 1
-  const result = messages.map((msg, index) => {
-    const addCache = index === markerIndex
-    if (msg.type === 'user') {
-      return userMessageToMessageParam(
-        msg,
-        addCache,
-        enablePromptCaching,
-        querySource,
-      )
-    }
-    return assistantMessageToMessageParam(
-      msg,
-      addCache,
-      enablePromptCaching,
-      querySource,
-    )
-  })
-  if (!useCachedMC) {
-    return result
-  }
-  // Track all cache_references being deleted to prevent duplicates across blocks.
-  const seenDeleteRefs = new Set<string>()
-  // Helper to deduplicate a cache_edits block against already-seen deletions
-  const deduplicateEdits = (block: CachedMCEditsBlock): CachedMCEditsBlock => {
-    const uniqueEdits = block.edits.filter(edit => {
-      if (seenDeleteRefs.has(edit.cache_reference)) {
-        return false
-      }
-      seenDeleteRefs.add(edit.cache_reference)
-      return true
-    })
-    return { ...block, edits: uniqueEdits }
-  }
-  // Re-insert all previously-pinned cache_edits at their original positions
-  for (const pinned of pinnedEdits ?? []) {
-    const msg = result[pinned.userMessageIndex]
-    if (msg && msg.role === 'user') {
-      if (!Array.isArray(msg.content)) {
-        msg.content = [{ type: 'text', text: msg.content as string }]
-      }
-      const dedupedBlock = deduplicateEdits(pinned.block)
-      if (dedupedBlock.edits.length > 0) {
-        insertBlockAfterToolResults(msg.content, dedupedBlock)
-      }
-    }
-  }
-  // Insert new cache_edits into the last user message and pin them
-  if (newCacheEdits && result.length > 0) {
-    const dedupedNewEdits = deduplicateEdits(newCacheEdits)
-    if (dedupedNewEdits.edits.length > 0) {
-      for (let i = result.length - 1; i >= 0; i--) {
-        const msg = result[i]
-        if (msg && msg.role === 'user') {
-          if (!Array.isArray(msg.content)) {
-            msg.content = [{ type: 'text', text: msg.content as string }]
-          }
-          insertBlockAfterToolResults(msg.content, dedupedNewEdits)
-          // Pin so this block is re-sent at the same position in future calls
-          pinCacheEdits(i, newCacheEdits)
-          logForDebugging(
-            `Added cache_edits block with ${dedupedNewEdits.edits.length} deletion(s) to message[${i}]: ${dedupedNewEdits.edits.map(e => e.cache_reference).join(', ')}`,
-          )
-          break
-        }
-      }
-    }
-  }
-  // Add cache_reference to tool_result blocks that are within the cached prefix.
-  // Must be done AFTER cache_edits insertion since that modifies content arrays.
-  if (enablePromptCaching) {
-    // Find the last message containing a cache_control marker
-    let lastCCMsg = -1
-    for (let i = 0; i < result.length; i++) {
-      const msg = result[i]!
-      if (Array.isArray(msg.content)) {
-        for (const block of msg.content) {
-          if (block && typeof block === 'object' && 'cache_control' in block) {
-            lastCCMsg = i
-          }
-        }
-      }
-    }
-    // Add cache_reference to tool_result blocks that are strictly before
-    // the last cache_control marker. The API requires cache_reference to
-    // appear "before or on" the last cache_control — we use strict "before"
-    // to avoid edge cases where cache_edits splicing shifts block indices.
-    //
-    // Create new objects instead of mutating in-place to avoid contaminating
-    // blocks reused by secondary queries that use models without cache_editing support.
-    if (lastCCMsg >= 0) {
-      for (let i = 0; i < lastCCMsg; i++) {
-        const msg = result[i]!
-        if (msg.role !== 'user' || !Array.isArray(msg.content)) {
-          continue
-        }
-        let cloned = false
-        for (let j = 0; j < msg.content.length; j++) {
-          const block = msg.content[j]
-          if (block && isToolResultBlock(block)) {
-            if (!cloned) {
-              msg.content = [...msg.content]
-              cloned = true
-            }
-            msg.content[j] = Object.assign({}, block, {
-              cache_reference: block.tool_use_id,
-            })
-          }
-        }
-      }
-    }
-  }
-  return result
-}
-export function buildSystemPromptBlocks(
-  systemPrompt: SystemPrompt,
-  enablePromptCaching: boolean,
-  options?: {
-    skipGlobalCacheForSystemPrompt?: boolean
-    querySource?: QuerySource
-  },
-): TextBlockParam[] {
-  // IMPORTANT: Do not add any more blocks for caching or you will get a 400
-  return splitSysPromptPrefix(systemPrompt, {
-    skipGlobalCacheForSystemPrompt: options?.skipGlobalCacheForSystemPrompt,
-  }).map(block => {
-    return {
-      type: 'text' as const,
-      text: block.text,
-      ...(enablePromptCaching &&
-        block.cacheScope !== null && {
-          cache_control: getCacheControl({
-            scope: block.cacheScope,
-            querySource: options?.querySource,
-          }),
-        }),
-    }
-  })
-}
-type HaikuOptions = Omit<Options, 'model' | 'getToolPermissionContext'>
-export async function queryHaiku({
-  systemPrompt = asSystemPrompt([]),
-  userPrompt,
-  outputFormat,
-  signal,
-  options,
-}: {
-  systemPrompt: SystemPrompt
-  userPrompt: string
-  outputFormat?: BetaJSONOutputFormat
-  signal: AbortSignal
-  options: HaikuOptions
-}): Promise<AssistantMessage> {
-  const result = await withVCR(
-    [
-      createUserMessage({
-        content: systemPrompt.map(text => ({ type: 'text', text })),
-      }),
-      createUserMessage({
-        content: userPrompt,
-      }),
-    ],
-    async () => {
-      const messages = [
-        createUserMessage({
-          content: userPrompt,
-        }),
-      ]
-      const result = await queryModelWithoutStreaming({
-        messages,
-        systemPrompt,
-        thinkingConfig: { type: 'disabled' },
-        tools: [],
-        signal,
-        options: {
-          ...options,
-          model: getSmallFastModel(),
-          enablePromptCaching: options.enablePromptCaching ?? false,
-          outputFormat,
-          async getToolPermissionContext() {
-            return getEmptyToolPermissionContext()
-          },
-        },
-      })
-      return [result]
-    },
-  )
-  // We don't use streaming for Haiku so this is safe
-  return result[0]! as AssistantMessage
-}
-type QueryWithModelOptions = Omit<Options, 'getToolPermissionContext'>
-/**
- * Query a specific model through the UMMAYA infrastructure.
- * (Originally "Claude Code infrastructure" in the CC byte-copy; UMMAYA renames
- * the citizen-visible doc string but preserves the function shape so future
- * audit replays diff cleanly against CC. swap/identifier-rename(2521).)
- * This goes through the full query pipeline including proper authentication,
- * betas, and headers - unlike direct API calls.
- */
-export async function queryWithModel({
-  systemPrompt = asSystemPrompt([]),
-  userPrompt,
-  outputFormat,
-  signal,
-  options,
-}: {
-  systemPrompt: SystemPrompt
-  userPrompt: string
-  outputFormat?: BetaJSONOutputFormat
-  signal: AbortSignal
-  options: QueryWithModelOptions
-}): Promise<AssistantMessage> {
-  const result = await withVCR(
-    [
-      createUserMessage({
-        content: systemPrompt.map(text => ({ type: 'text', text })),
-      }),
-      createUserMessage({
-        content: userPrompt,
-      }),
-    ],
-    async () => {
-      const messages = [
-        createUserMessage({
-          content: userPrompt,
-        }),
-      ]
-      const result = await queryModelWithoutStreaming({
-        messages,
-        systemPrompt,
-        thinkingConfig: { type: 'disabled' },
-        tools: [],
-        signal,
-        options: {
-          ...options,
-          enablePromptCaching: options.enablePromptCaching ?? false,
-          outputFormat,
-          async getToolPermissionContext() {
-            return getEmptyToolPermissionContext()
-          },
-        },
-      })
-      return [result]
-    },
-  )
-  return result[0]! as AssistantMessage
-}
-// Non-streaming requests have a 10min max per the docs:
-// https://platform.claude.com/docs/en/api/errors#long-requests
-// The SDK's 21333-token cap is derived from 10min × 128k tokens/hour, but we
-// bypass it by setting a client-level timeout, so we can cap higher.
-export const MAX_NON_STREAMING_TOKENS = 64_000
-/**
- * Adjusts thinking budget when max_tokens is capped for non-streaming fallback.
- * Ensures the API constraint: max_tokens > thinking.budget_tokens
- *
- * @param params - The parameters that will be sent to the API
- * @param maxTokensCap - The maximum allowed tokens (MAX_NON_STREAMING_TOKENS)
- * @returns Adjusted parameters with thinking budget capped if needed
- */
-export function adjustParamsForNonStreaming<
-  T extends {
-    max_tokens: number
-    thinking?: BetaMessageStreamParams['thinking']
-  },
->(params: T, maxTokensCap: number): T {
-  const cappedMaxTokens = Math.min(params.max_tokens, maxTokensCap)
-  // Adjust thinking budget if it would exceed capped max_tokens
-  // to maintain the constraint: max_tokens > thinking.budget_tokens
-  const adjustedParams = { ...params }
-  if (
-    adjustedParams.thinking?.type === 'enabled' &&
-    adjustedParams.thinking.budget_tokens
-  ) {
-    adjustedParams.thinking = {
-      ...adjustedParams.thinking,
-      budget_tokens: Math.min(
-        adjustedParams.thinking.budget_tokens,
-        cappedMaxTokens - 1, // Must be at least 1 less than max_tokens
-      ),
-    }
-  }
-  return {
-    ...adjustedParams,
-    max_tokens: cappedMaxTokens,
-  }
-}
-function isMaxTokensCapEnabled(): boolean {
-  // 3P default: false (not validated on Bedrock/Vertex)
-  return getFeatureValue_CACHED_MAY_BE_STALE('tengu_otk_slot_v1', false)
-}
-export function getMaxOutputTokensForModel(model: string): number {
-  const maxOutputTokens = getModelMaxOutputTokens(model)
-  // Slot-reservation cap: drop default to 8k for all models. BQ p99 output
-  // = 4,911 tokens; 32k/64k defaults over-reserve 8-16× slot capacity.
-  // Requests hitting the cap get one clean retry at 64k (query.ts
-  // max_output_tokens_escalate). Math.min keeps models with lower native
-  // defaults (e.g. claude-3-opus at 4k) at their native value. Applied
-  // before the env-var override so CLAUDE_CODE_MAX_OUTPUT_TOKENS still wins.
-  const defaultTokens = isMaxTokensCapEnabled()
-    ? Math.min(maxOutputTokens.default, CAPPED_DEFAULT_MAX_TOKENS)
-    : maxOutputTokens.default
-  const result = validateBoundedIntEnvVar(
-    'CLAUDE_CODE_MAX_OUTPUT_TOKENS',
-    process.env.CLAUDE_CODE_MAX_OUTPUT_TOKENS,
-    defaultTokens,
-    maxOutputTokens.upperLimit,
-  )
-  return result.effective
-}