multi-forge 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (311) hide show
  1. forge/__init__.py +3 -0
  2. forge/_extensions/agents/.gitkeep +0 -0
  3. forge/_extensions/commands/.gitkeep +0 -0
  4. forge/_extensions/skills/analyze/SKILL.md +87 -0
  5. forge/_extensions/skills/challenge/SKILL.md +91 -0
  6. forge/_extensions/skills/consensus/SKILL.md +120 -0
  7. forge/_extensions/skills/consensus/resources/code_consensus_evaluation.md +94 -0
  8. forge/_extensions/skills/consensus/resources/consensus_evaluation.md +70 -0
  9. forge/_extensions/skills/consensus/resources/synthesis.md +101 -0
  10. forge/_extensions/skills/debate/SKILL.md +116 -0
  11. forge/_extensions/skills/debate/resources/code_debate_evaluation.md +101 -0
  12. forge/_extensions/skills/debate/resources/debate_evaluation.md +90 -0
  13. forge/_extensions/skills/panel/SKILL.md +141 -0
  14. forge/_extensions/skills/panel/resources/synthesis.md +103 -0
  15. forge/_extensions/skills/qa/SKILL.md +704 -0
  16. forge/_extensions/skills/qa/resources/checklist/0-enable.md +78 -0
  17. forge/_extensions/skills/qa/resources/checklist/1-preflight.md +24 -0
  18. forge/_extensions/skills/qa/resources/checklist/10-resume.md +143 -0
  19. forge/_extensions/skills/qa/resources/checklist/11-config.md +150 -0
  20. forge/_extensions/skills/qa/resources/checklist/12-search.md +58 -0
  21. forge/_extensions/skills/qa/resources/checklist/13-guard.md +237 -0
  22. forge/_extensions/skills/qa/resources/checklist/14-workflow.md +305 -0
  23. forge/_extensions/skills/qa/resources/checklist/15-skills.md +155 -0
  24. forge/_extensions/skills/qa/resources/checklist/16-handoff.md +224 -0
  25. forge/_extensions/skills/qa/resources/checklist/17-info.md +50 -0
  26. forge/_extensions/skills/qa/resources/checklist/18-disable.md +84 -0
  27. forge/_extensions/skills/qa/resources/checklist/19-uninstall.md +146 -0
  28. forge/_extensions/skills/qa/resources/checklist/2-extensions.md +188 -0
  29. forge/_extensions/skills/qa/resources/checklist/20-cleanup.md +36 -0
  30. forge/_extensions/skills/qa/resources/checklist/3-auth.md +234 -0
  31. forge/_extensions/skills/qa/resources/checklist/4-proxy.md +481 -0
  32. forge/_extensions/skills/qa/resources/checklist/5-session.md +541 -0
  33. forge/_extensions/skills/qa/resources/checklist/6-hooks.md +275 -0
  34. forge/_extensions/skills/qa/resources/checklist/7-costs.md +309 -0
  35. forge/_extensions/skills/qa/resources/checklist/8-status-line.md +174 -0
  36. forge/_extensions/skills/qa/resources/checklist/9-direct-commands.md +146 -0
  37. forge/_extensions/skills/qa/resources/checklist.md +103 -0
  38. forge/_extensions/skills/qa/resources/report-template.md +62 -0
  39. forge/_extensions/skills/qa/scripts/start-container.sh +529 -0
  40. forge/_extensions/skills/qa/scripts/walkthrough-state.py +1137 -0
  41. forge/_extensions/skills/review/SKILL.md +125 -0
  42. forge/_extensions/skills/review/references/claude-4.6.md +474 -0
  43. forge/_extensions/skills/review/references/claude-4.7.md +710 -0
  44. forge/_extensions/skills/review/references/gemini-3.1.md +546 -0
  45. forge/_extensions/skills/review/references/gpt-5.5.md +490 -0
  46. forge/_extensions/skills/review/references/skills-writing-guide.md +1588 -0
  47. forge/_extensions/skills/review/resources/code-anthropic.md +160 -0
  48. forge/_extensions/skills/review/resources/code-gemini.md +184 -0
  49. forge/_extensions/skills/review/resources/code-openai.md +203 -0
  50. forge/_extensions/skills/review/resources/code.md +160 -0
  51. forge/_extensions/skills/review-docs/SKILL.md +121 -0
  52. forge/_extensions/skills/review-docs/resources/docs-anthropic.md +170 -0
  53. forge/_extensions/skills/review-docs/resources/docs-gemini.md +204 -0
  54. forge/_extensions/skills/review-docs/resources/docs-openai.md +231 -0
  55. forge/_extensions/skills/review-docs/resources/docs.md +170 -0
  56. forge/_extensions/skills/smoke-test/SKILL.md +27 -0
  57. forge/_extensions/skills/smoke-test/scripts/smoke-test.sh +118 -0
  58. forge/_extensions/skills/understand/SKILL.md +148 -0
  59. forge/_extensions/skills/understand/resources/code-anthropic.md +163 -0
  60. forge/_extensions/skills/understand/resources/code-gemini.md +194 -0
  61. forge/_extensions/skills/understand/resources/code-openai.md +181 -0
  62. forge/_extensions/skills/understand/resources/code.md +163 -0
  63. forge/_extensions/skills/understand/resources/docs-anthropic.md +177 -0
  64. forge/_extensions/skills/understand/resources/docs-gemini.md +202 -0
  65. forge/_extensions/skills/understand/resources/docs-openai.md +191 -0
  66. forge/_extensions/skills/understand/resources/docs.md +177 -0
  67. forge/_extensions/skills/walkthrough/SKILL.md +599 -0
  68. forge/_extensions/skills/walkthrough/resources/checklist.md +765 -0
  69. forge/_extensions/skills/walkthrough/scripts/run-in-repo.sh +118 -0
  70. forge/_extensions/skills/walkthrough/scripts/setup-test-repo.sh +198 -0
  71. forge/_extensions/skills/walkthrough/scripts/walkthrough-state.py +1137 -0
  72. forge/backend/__init__.py +174 -0
  73. forge/backend/adapters/__init__.py +38 -0
  74. forge/backend/adapters/litellm.py +158 -0
  75. forge/backend/creation.py +89 -0
  76. forge/backend/registry.py +178 -0
  77. forge/cli/__init__.py +16 -0
  78. forge/cli/auth.py +483 -0
  79. forge/cli/backend.py +298 -0
  80. forge/cli/claude.py +411 -0
  81. forge/cli/config_cmd.py +303 -0
  82. forge/cli/extensions.py +1001 -0
  83. forge/cli/gc.py +165 -0
  84. forge/cli/guard.py +1018 -0
  85. forge/cli/guards.py +106 -0
  86. forge/cli/handoff.py +110 -0
  87. forge/cli/hooks/__init__.py +36 -0
  88. forge/cli/hooks/_group.py +20 -0
  89. forge/cli/hooks/_helpers.py +149 -0
  90. forge/cli/hooks/commands.py +1677 -0
  91. forge/cli/hooks/direct_commands.py +1304 -0
  92. forge/cli/hooks/install.py +232 -0
  93. forge/cli/hooks/policy.py +151 -0
  94. forge/cli/hooks/read_hygiene.py +74 -0
  95. forge/cli/hooks/verification.py +370 -0
  96. forge/cli/logs.py +406 -0
  97. forge/cli/main.py +292 -0
  98. forge/cli/proxy.py +1821 -0
  99. forge/cli/proxy_costs.py +313 -0
  100. forge/cli/search.py +416 -0
  101. forge/cli/session.py +892 -0
  102. forge/cli/session_addendum.py +81 -0
  103. forge/cli/session_fork.py +750 -0
  104. forge/cli/session_handoff.py +141 -0
  105. forge/cli/session_lifecycle.py +2053 -0
  106. forge/cli/session_manage.py +1336 -0
  107. forge/cli/session_memory.py +201 -0
  108. forge/cli/status_line.py +1398 -0
  109. forge/cli/workflow.py +1964 -0
  110. forge/config/__init__.py +110 -0
  111. forge/config/dataclass_utils.py +88 -0
  112. forge/config/defaults/__init__.py +0 -0
  113. forge/config/defaults/backends/__init__.py +0 -0
  114. forge/config/defaults/backends/litellm.yaml +196 -0
  115. forge/config/defaults/templates/__init__.py +0 -0
  116. forge/config/defaults/templates/litellm-anthropic-local.yaml +33 -0
  117. forge/config/defaults/templates/litellm-anthropic.yaml +24 -0
  118. forge/config/defaults/templates/litellm-gemini-flash-local.yaml +37 -0
  119. forge/config/defaults/templates/litellm-gemini-local.yaml +32 -0
  120. forge/config/defaults/templates/litellm-gemini-test.yaml +34 -0
  121. forge/config/defaults/templates/litellm-gemini.yaml +21 -0
  122. forge/config/defaults/templates/litellm-openai-codex-local.yaml +36 -0
  123. forge/config/defaults/templates/litellm-openai-local.yaml +38 -0
  124. forge/config/defaults/templates/litellm-openai.yaml +28 -0
  125. forge/config/defaults/templates/openrouter-anthropic.yaml +23 -0
  126. forge/config/defaults/templates/openrouter-deepseek.yaml +26 -0
  127. forge/config/defaults/templates/openrouter-gemini-flash.yaml +26 -0
  128. forge/config/defaults/templates/openrouter-gemini.yaml +23 -0
  129. forge/config/defaults/templates/openrouter-glm.yaml +23 -0
  130. forge/config/defaults/templates/openrouter-kimi.yaml +30 -0
  131. forge/config/defaults/templates/openrouter-minimax.yaml +26 -0
  132. forge/config/defaults/templates/openrouter-openai-codex.yaml +23 -0
  133. forge/config/defaults/templates/openrouter-openai.yaml +28 -0
  134. forge/config/defaults/templates/openrouter-qwen.yaml +25 -0
  135. forge/config/loader.py +675 -0
  136. forge/config/schema.py +448 -0
  137. forge/core/__init__.py +5 -0
  138. forge/core/auth/__init__.py +67 -0
  139. forge/core/auth/capabilities.py +219 -0
  140. forge/core/auth/credentials_file.py +244 -0
  141. forge/core/auth/protocols.py +18 -0
  142. forge/core/auth/secrets.py +243 -0
  143. forge/core/auth/template_secrets.py +112 -0
  144. forge/core/data/__init__.py +5 -0
  145. forge/core/data/model_catalog.yaml +1522 -0
  146. forge/core/data/pricing.yaml +140 -0
  147. forge/core/data/system_prompt_addendums/__init__.py +0 -0
  148. forge/core/data/system_prompt_addendums/gemini.md +330 -0
  149. forge/core/data/system_prompt_addendums/openai.md +328 -0
  150. forge/core/llm/__init__.py +231 -0
  151. forge/core/llm/clients/__init__.py +14 -0
  152. forge/core/llm/clients/base.py +115 -0
  153. forge/core/llm/clients/litellm.py +619 -0
  154. forge/core/llm/clients/openai_compat.py +244 -0
  155. forge/core/llm/clients/openrouter.py +234 -0
  156. forge/core/llm/credentials.py +439 -0
  157. forge/core/llm/detection.py +86 -0
  158. forge/core/llm/errors.py +44 -0
  159. forge/core/llm/protocols.py +80 -0
  160. forge/core/llm/types.py +176 -0
  161. forge/core/logging.py +146 -0
  162. forge/core/models/__init__.py +91 -0
  163. forge/core/models/catalog.py +467 -0
  164. forge/core/models/pricing.py +165 -0
  165. forge/core/models/types.py +167 -0
  166. forge/core/naming.py +212 -0
  167. forge/core/ops/__init__.py +73 -0
  168. forge/core/ops/context.py +141 -0
  169. forge/core/ops/gc.py +802 -0
  170. forge/core/ops/proxy.py +146 -0
  171. forge/core/ops/resolution.py +135 -0
  172. forge/core/ops/session.py +344 -0
  173. forge/core/ops/session_context.py +548 -0
  174. forge/core/paths.py +38 -0
  175. forge/core/process.py +54 -0
  176. forge/core/reactive/__init__.py +38 -0
  177. forge/core/reactive/cost_tracking.py +300 -0
  178. forge/core/reactive/env.py +180 -0
  179. forge/core/reactive/proxy.py +78 -0
  180. forge/core/reactive/routing.py +622 -0
  181. forge/core/reactive/session_runner.py +185 -0
  182. forge/core/reactive/structured_output.py +62 -0
  183. forge/core/reactive/tagger.py +94 -0
  184. forge/core/reactive/throttle.py +132 -0
  185. forge/core/state/__init__.py +59 -0
  186. forge/core/state/exceptions.py +59 -0
  187. forge/core/state/io.py +140 -0
  188. forge/core/state/lock.py +99 -0
  189. forge/core/state/timestamps.py +60 -0
  190. forge/core/transcript.py +78 -0
  191. forge/core/typing_helpers.py +24 -0
  192. forge/core/workqueue/__init__.py +67 -0
  193. forge/core/workqueue/queue.py +552 -0
  194. forge/core/workqueue/types.py +63 -0
  195. forge/guard/__init__.py +26 -0
  196. forge/guard/deterministic/__init__.py +26 -0
  197. forge/guard/deterministic/base.py +158 -0
  198. forge/guard/deterministic/coding_standards.py +256 -0
  199. forge/guard/deterministic/registry.py +148 -0
  200. forge/guard/deterministic/tdd.py +171 -0
  201. forge/guard/engine.py +216 -0
  202. forge/guard/protocols.py +91 -0
  203. forge/guard/queries.py +96 -0
  204. forge/guard/semantic/__init__.py +34 -0
  205. forge/guard/semantic/promotion.py +18 -0
  206. forge/guard/semantic/supervisor.py +813 -0
  207. forge/guard/semantic/verdict.py +183 -0
  208. forge/guard/store.py +124 -0
  209. forge/guard/team/__init__.py +6 -0
  210. forge/guard/team/config.py +24 -0
  211. forge/guard/team/handlers.py +209 -0
  212. forge/guard/team/prompts.py +41 -0
  213. forge/guard/types.py +125 -0
  214. forge/guard/workflow/__init__.py +17 -0
  215. forge/guard/workflow/branches.py +67 -0
  216. forge/guard/workflow/config.py +63 -0
  217. forge/guard/workflow/divergence.py +113 -0
  218. forge/guard/workflow/policy.py +87 -0
  219. forge/guard/workflow/stages.py +205 -0
  220. forge/install/__init__.py +55 -0
  221. forge/install/cli.py +281 -0
  222. forge/install/exceptions.py +163 -0
  223. forge/install/hooks.py +109 -0
  224. forge/install/installer.py +1037 -0
  225. forge/install/models.py +321 -0
  226. forge/install/preset.py +272 -0
  227. forge/install/settings_merge.py +831 -0
  228. forge/install/tracking.py +238 -0
  229. forge/install/version.py +141 -0
  230. forge/proxy/__init__.py +0 -0
  231. forge/proxy/base_client.py +181 -0
  232. forge/proxy/client_adapter.py +476 -0
  233. forge/proxy/client_factory.py +531 -0
  234. forge/proxy/converters.py +1206 -0
  235. forge/proxy/cost_logger.py +132 -0
  236. forge/proxy/cost_tracker.py +242 -0
  237. forge/proxy/data_models.py +338 -0
  238. forge/proxy/error_hints.py +92 -0
  239. forge/proxy/metrics.py +222 -0
  240. forge/proxy/model_spec.py +158 -0
  241. forge/proxy/proxies.py +333 -0
  242. forge/proxy/proxy_identity.py +134 -0
  243. forge/proxy/proxy_orchestrator.py +1018 -0
  244. forge/proxy/proxy_startup.py +54 -0
  245. forge/proxy/server.py +1561 -0
  246. forge/proxy/utils.py +537 -0
  247. forge/review/__init__.py +6 -0
  248. forge/review/adversarial.py +111 -0
  249. forge/review/consensus.py +236 -0
  250. forge/review/engine.py +356 -0
  251. forge/review/models.py +437 -0
  252. forge/review/resources/__init__.py +5 -0
  253. forge/review/resources/codereview-performance.md +85 -0
  254. forge/review/resources/codereview-quick.md +75 -0
  255. forge/review/resources/codereview-security.md +92 -0
  256. forge/review/resources/codereview.md +85 -0
  257. forge/review/resources/docreview-quick.md +75 -0
  258. forge/review/resources/docreview.md +86 -0
  259. forge/review/resources/thinkdeep.md +89 -0
  260. forge/review/routing.py +368 -0
  261. forge/review/synthesis.py +73 -0
  262. forge/runtime_config.py +438 -0
  263. forge/search/__init__.py +55 -0
  264. forge/search/bm25_store.py +264 -0
  265. forge/search/content_store.py +197 -0
  266. forge/search/engine.py +352 -0
  267. forge/search/exceptions.py +51 -0
  268. forge/search/extractor.py +234 -0
  269. forge/search/index_state.py +295 -0
  270. forge/search/store.py +215 -0
  271. forge/search/tokenizer.py +24 -0
  272. forge/session/__init__.py +130 -0
  273. forge/session/active.py +339 -0
  274. forge/session/artifacts.py +202 -0
  275. forge/session/claude/__init__.py +50 -0
  276. forge/session/claude/cleanup.py +105 -0
  277. forge/session/claude/invoke.py +236 -0
  278. forge/session/claude/paths.py +200 -0
  279. forge/session/cleanup.py +216 -0
  280. forge/session/config.py +34 -0
  281. forge/session/direct_model.py +107 -0
  282. forge/session/effective.py +169 -0
  283. forge/session/exceptions.py +255 -0
  284. forge/session/handoff.py +881 -0
  285. forge/session/handoff_agent.py +544 -0
  286. forge/session/hooks/__init__.py +35 -0
  287. forge/session/hooks/models.py +73 -0
  288. forge/session/hooks/session_start.py +507 -0
  289. forge/session/identity.py +84 -0
  290. forge/session/index.py +553 -0
  291. forge/session/manager.py +1506 -0
  292. forge/session/models.py +572 -0
  293. forge/session/overrides.py +344 -0
  294. forge/session/plan_resolution.py +286 -0
  295. forge/session/prev_sessions.py +128 -0
  296. forge/session/store.py +431 -0
  297. forge/session/validation.py +47 -0
  298. forge/session/worktree/__init__.py +65 -0
  299. forge/session/worktree/cleanup.py +262 -0
  300. forge/session/worktree/config_copy.py +203 -0
  301. forge/session/worktree/create.py +332 -0
  302. forge/sidecar/__init__.py +29 -0
  303. forge/sidecar/container.py +161 -0
  304. forge/sidecar/docker.py +86 -0
  305. forge/sidecar/secrets.py +19 -0
  306. multi_forge-0.2.0.dist-info/METADATA +242 -0
  307. multi_forge-0.2.0.dist-info/RECORD +311 -0
  308. multi_forge-0.2.0.dist-info/WHEEL +4 -0
  309. multi_forge-0.2.0.dist-info/entry_points.txt +2 -0
  310. multi_forge-0.2.0.dist-info/licenses/LICENSE +203 -0
  311. multi_forge-0.2.0.dist-info/licenses/NOTICE +14 -0
forge/proxy/server.py ADDED
@@ -0,0 +1,1561 @@
1
+ """
2
+ Unified LLM Proxy Server - Anthropic-compatible API for multiple providers.
3
+
4
+ This FastAPI server provides an Anthropic Messages API-compatible interface for
5
+ LLM providers via LiteLLM.
6
+
7
+ The server uses a unified client architecture where provider-specific logic is
8
+ encapsulated in client implementations that inherit from AbstractLLMClient.
9
+ This design ensures consistent behavior across providers while keeping the
10
+ server code clean and maintainable.
11
+
12
+ Key endpoints:
13
+ - POST /v1/messages - Main chat completion endpoint (streaming/non-streaming)
14
+ - POST /v1/messages/count_tokens - Token counting endpoint
15
+ - GET / - Health check and service information
16
+
17
+ For detailed API documentation, architecture overview, and configuration options,
18
+ see README.md in the project root.
19
+ """
20
+
21
+ import asyncio
22
+ import logging
23
+ import os
24
+ import socket
25
+ import sys
26
+ import time
27
+ import uuid
28
+ from contextlib import asynccontextmanager
29
+ from typing import Any
30
+
31
+ import click
32
+ import uvicorn
33
+ from fastapi import FastAPI, HTTPException, Request
34
+ from fastapi.responses import JSONResponse, StreamingResponse
35
+
36
+ from forge.config import TierOverride, config, init_config, reload
37
+ from forge.core.llm.errors import AuthenticationError
38
+ from forge.core.logging import (
39
+ configure_console_logging,
40
+ configure_debug_logging,
41
+ get_effective_log_level,
42
+ )
43
+ from forge.proxy.base_client import ProxyStreamError, ToolCallError
44
+ from forge.proxy.client_factory import TierClientFactory
45
+ from forge.proxy.converters import (
46
+ convert_anthropic_to_openai,
47
+ convert_openai_to_anthropic,
48
+ convert_openai_to_anthropic_sse,
49
+ )
50
+ from forge.proxy.cost_logger import log_request_cost
51
+ from forge.proxy.cost_tracker import CostTracker
52
+ from forge.proxy.data_models import (
53
+ MessagesRequest,
54
+ TokenCountRequest,
55
+ TokenCountResponse,
56
+ map_model_name,
57
+ )
58
+ from forge.proxy.error_hints import enrich_error_content
59
+ from forge.proxy.metrics import proxy_metrics
60
+ from forge.proxy.utils import (
61
+ log_request_beautifully,
62
+ log_request_response,
63
+ log_tool_event,
64
+ log_tool_failure,
65
+ )
66
+
67
# Module-level logger for the proxy server.
logger = logging.getLogger(__name__)

# Keep the server/HTTP libraries quiet: only WARNING and above get through.
for _noisy_logger_name in ("uvicorn", "uvicorn.access", "uvicorn.error", "httpx", "httpcore"):
    logging.getLogger(_noisy_logger_name).setLevel(logging.WARNING)
del _noisy_logger_name  # do not leave the loop variable behind as a module global

# Tier client factory, created once at import time.
client_factory = TierClientFactory()

PREFERRED_PROVIDER = None

# A proxy launched under a proxy id keeps a stable config for the whole
# lifetime of the process (no hot reload).
PROXY_ID: str | None = os.environ.get("FORGE_PROXY_ID")

# Set on first use by _initialize_cost_tracker_from_config().
cost_tracker: CostTracker | None = None
84
+
85
+
86
def _initialize_cost_tracker_from_config() -> CostTracker:
    """Create the module-wide ``CostTracker`` on first use and return it.

    ``python -m forge.proxy.server`` runs this file as ``__main__``, whereas
    uvicorn imports ``forge.proxy.server:app`` to serve requests, so the
    imported app module needs its own initialized globals.

    Returns:
        The already-set tracker when there is one; otherwise a new tracker built
        from ``config.proxy.costs`` (a default ``CostConfig`` is used when that
        attribute is missing or falsy).
    """
    global cost_tracker
    if cost_tracker is not None:
        return cost_tracker

    from forge.config.schema import CostConfig

    settings = getattr(config.proxy, "costs", None) or CostConfig()

    # No daily or monthly cap configured: a plain tracker with no log bootstrap.
    if settings.caps.per_day is None and settings.caps.per_month is None:
        cost_tracker = CostTracker()
        return cost_tracker

    from forge.core.paths import get_forge_home

    # The global is assigned before bootstrapping, so it stays set even if
    # reading the request logs raises.
    cost_tracker = CostTracker(
        daily_cap_usd=settings.caps.per_day,
        monthly_cap_usd=settings.caps.per_month,
        cap_mode=settings.cap_mode,
        on_cap_hit=settings.on_cap_hit,
    )
    request_log_dir = get_forge_home() / "costs" / "requests"
    cost_tracker.bootstrap_from_logs(request_log_dir, proxy_id=PROXY_ID)
    return cost_tracker
113
+
114
+
115
def _ensure_runtime_state() -> None:
    """Make sure proxy config and the cost tracker exist in the imported app module.

    Without a proxy id the config is reloaded on every call. With a proxy id it
    is loaded only while ``config.proxy.active_template`` is still falsy.
    """
    if PROXY_ID is not None:
        if not config.proxy.active_template:
            reload(proxy_id=PROXY_ID)
    else:
        reload()

    _initialize_cost_tracker_from_config()
123
+
124
+
125
def _calc_and_log_cost(
    *,
    model: str,
    tier: str,
    input_tokens: int,
    output_tokens: int,
    cached_tokens: int,
    latency_ms: float,
    failed: bool,
    request_id: str,
) -> int:
    """Price one request, append it to the persistent cost log, and update the tracker.

    Best-effort by design: any exception raised while pricing, logging, or
    recording is logged as a warning and turned into a return value of 0, so
    cost tracking can never break the proxy request path.

    Returns:
        The request cost in microdollars, or 0 when any step failed.
    """
    try:
        from forge.core.models.pricing import calculate_cost, get_pricing

        micros = calculate_cost(model, input_tokens, output_tokens, cached_tokens)
        price_info = get_pricing(model)

        log_fields = {
            "proxy_id": PROXY_ID or "unknown",
            "model": model,
            "tier": tier,
            "input_tokens": input_tokens,
            "output_tokens": output_tokens,
            "cached_tokens": cached_tokens,
            "cost_micros": micros,
            "latency_ms": latency_ms,
            "failed": failed,
            "request_id": request_id,
            "pricing_source": price_info.source,
        }
        log_request_cost(**log_fields)

        # The tracker is only updated once the log write has succeeded.
        if cost_tracker is not None:
            cost_tracker.record(micros)
    except Exception as exc:
        logger.warning("Cost calculation failed for model=%s (non-fatal): %s", model, exc)
        return 0
    return micros
168
+
169
+
170
# Cap type reported on a cap result -> key name under ``costs.caps``.
_CAP_CONFIG_KEY = {"daily": "per_day", "monthly": "per_month"}


def _cap_result_message(cap_result) -> str:
    """Render a spend-cap result as one line for HTTP headers and error bodies.

    Args:
        cap_result: Object exposing ``cap_type``, ``projected``,
            ``current_micros`` and ``limit_micros``.

    Returns:
        A message with current and limit spend in dollars (two decimals) and a
        hint naming the config key to adjust. A falsy ``cap_type`` is shown as
        ``"configured"``; cap types without a known key use ``per_<cap_type>``.
    """
    kind = cap_result.cap_type or "configured"
    setting = _CAP_CONFIG_KEY[kind] if kind in _CAP_CONFIG_KEY else f"per_{kind}"
    prefix = "Projected " if cap_result.projected else ""
    spent_usd = cap_result.current_micros / 1_000_000
    limit_usd = cap_result.limit_micros / 1_000_000
    return (
        f"{prefix}{kind} spend cap reached: "
        f"${spent_usd:.2f} / ${limit_usd:.2f}. "
        f"Adjust with: forge proxy set <id> costs.caps.{setting}=<amount>"
    )
184
+
185
+
186
+ def _with_spend_warning(headers: dict[str, str], warning: str | None) -> dict[str, str]:
187
+ """Attach the optional spend warning header to a response header dict."""
188
+ if warning:
189
+ headers["X-Spend-Warning"] = warning
190
+ return headers
191
+
192
+
193
def _textish_chars(value: object) -> int:
    """Count characters in the text-bearing parts of a request payload.

    Used as a rough size measure for the strict spend-cap preflight.

    Strings count their length. Lists and tuples sum their items. Dicts and
    arbitrary objects contribute only the values found under the keys or
    attributes ``content``, ``text``, ``thinking``, ``input``, ``name`` and
    ``description``, each counted recursively. Everything else, including
    ``None``, counts as 0.
    """
    text_fields = ("content", "text", "thinking", "input", "name", "description")

    if value is None:
        return 0
    if isinstance(value, str):
        return len(value)
    if isinstance(value, (list, tuple)):
        return sum(_textish_chars(element) for element in value)
    if isinstance(value, dict):
        return sum(_textish_chars(value[field]) for field in text_fields if field in value)

    # Any other object: look for the same field names as attributes.
    return sum(_textish_chars(getattr(value, field)) for field in text_fields if hasattr(value, field))
213
+
214
+
215
def _estimate_input_tokens(request_data: MessagesRequest) -> int:
    """Roughly estimate the input token count of a request for strict cap preflight.

    Adds up the text-bearing characters of the request's ``system``,
    ``messages`` and ``tools`` attributes (a missing attribute counts as
    nothing) and divides by four, rounding down.
    """
    total_chars = sum(
        _textish_chars(getattr(request_data, field, None))
        for field in ("system", "messages", "tools")
    )
    return total_chars // 4
221
+
222
+
223
def _get_tier_override(tier: str) -> TierOverride | None:
    """Get the tier override from the active provider config.

    Tier overrides allow per-tier hyperparameter customization (e.g. a
    different reasoning_effort for opus vs sonnet when both map to the same
    model).

    Args:
        tier: Tier name to look up in the provider's ``tier_overrides``.

    Returns:
        The ``TierOverride`` for ``tier``, or ``None`` when none is configured
        or the provider config cannot be read.
    """
    try:
        provider_cfg = config.proxy.get_provider()
        return provider_cfg.tier_overrides.get(tier)
    except Exception:
        # Best-effort: a broken or missing provider config must not fail the
        # request, but leave a debug trace instead of swallowing it silently
        # (same handling as the model_alternatives lookup).
        logger.debug("tier_overrides lookup failed for tier=%s, using no override", tier, exc_info=True)
        return None
235
+
236
+
237
def _resolve_model_with_alternatives(tier: str, original_model_name: str | None, fallback_model: str) -> str:
    """Pick the backend model for a tier, preferring a configured per-tier alternative.

    Shared by message routing and token counting so both paths resolve models
    the same way. A trailing ``[1m]`` on the requested name is dropped before
    the lookup: it is a Claude Code context-window hint, not a routing decision.

    Args:
        tier: Tier whose ``model_alternatives`` table is consulted.
        original_model_name: Model name as sent by the client, if any.
        fallback_model: Model to use when no alternative matches.

    Returns:
        The matching alternative, or ``fallback_model`` when there is no match
        or the provider config cannot be read.
    """
    try:
        alternatives = config.proxy.get_provider().model_alternatives.get(tier, {})
        if not (original_model_name and alternatives):
            return fallback_model
        candidate = original_model_name.removesuffix("[1m]")
        if candidate in alternatives:
            return alternatives[candidate]
    except Exception:
        # Best-effort: degrade to fallback_model if provider config is unavailable
        logger.debug("model_alternatives lookup failed, using tier default", exc_info=True)
    return fallback_model
255
+
256
+
257
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Log server start-up, hand control to the application, then log shutdown.

    Args:
        app: The FastAPI application this handler is attached to (not used
            inside the body).
    """
    # Everything before ``yield`` runs when the context is entered.
    logger.info("Server started...")
    yield
    # Reached only when the context exits normally.
    logger.info("Server is shutting down... Cleaning up resources")


# Application object; ``lifespan`` above is passed as its lifespan handler.
app = FastAPI(title="Unified LLM Proxy", lifespan=lifespan)
266
+
267
+
268
+ # --- Thinking → reasoning_effort translation ---
269
+ # Claude Code sends Anthropic-specific `thinking` config; litellm uses
270
+ # `reasoning_effort` which it translates per provider (Gemini 3: thinking_level,
271
+ # Gemini 2.5: thinkingBudget). These helpers map between the two.
272
+
273
+ # Ordered from lowest to highest so we can compare with max().
274
+ _EFFORT_RANK: dict[str | None, int] = {
275
+ None: -1,
276
+ "none": 0,
277
+ "disable": 0,
278
+ "minimal": 1,
279
+ "low": 2,
280
+ "medium": 3,
281
+ "high": 4,
282
+ "xhigh": 5,
283
+ }
284
+
285
+ # Budget thresholds for ceil-to-tier mapping (never downgrade).
286
+ # Checked top-down; first match wins. LiteLLM internal budgets for
287
+ # reference: low ~ 1,024, medium ~ 8,192, high ~ 24,576.
288
+ _BUDGET_THRESHOLDS: list[tuple[int, str]] = [
289
+ (25_000, "xhigh"), # >=25k tokens -> xhigh (above litellm high)
290
+ (10_000, "high"), # >=10k tokens -> high
291
+ (2_000, "medium"), # >=2k tokens -> medium
292
+ (500, "low"), # >=500 tokens -> low
293
+ (1, "minimal"), # >=1 token -> minimal
294
+ ]
295
+
296
+ # Type-based fallback when budget_tokens is absent.
297
+ _TYPE_TO_EFFORT: dict[str, str] = {
298
+ "enabled": "high",
299
+ "adaptive": "medium",
300
+ "disabled": "none",
301
+ }
302
+
303
+
304
+ def _derive_reasoning_effort(thinking: dict[str, object] | object | None) -> str | None:
305
+ """Derive reasoning_effort from Claude Code's thinking config.
306
+
307
+ Priority: budget_tokens (numeric, precise) > type (semantic label).
308
+ Unknown types default to "medium" (safe — never results in no reasoning).
309
+ """
310
+ if not isinstance(thinking, dict):
311
+ return None
312
+
313
+ # 1) Use budget_tokens if present — data-driven, not label-driven.
314
+ budget = thinking.get("budget_tokens")
315
+ if isinstance(budget, (int, float)) and budget > 0:
316
+ for threshold, effort in _BUDGET_THRESHOLDS:
317
+ if budget >= threshold:
318
+ return effort
319
+ return "minimal" # budget_tokens in (0, 1) — fractional edge case
320
+
321
+ # 2) Fall back to type-based mapping.
322
+ thinking_type = thinking.get("type")
323
+ if isinstance(thinking_type, str):
324
+ mapped: str | None = _TYPE_TO_EFFORT.get(thinking_type)
325
+ if mapped is not None:
326
+ return mapped
327
+ # Unknown type — default to medium (safe), log warning.
328
+ logger.warning(
329
+ "Unknown thinking type '%s', defaulting to reasoning_effort='medium'",
330
+ thinking_type,
331
+ )
332
+ return "medium"
333
+
334
+ return None
335
+
336
+
337
+ def _max_effort(a: str | None, b: str | None) -> str | None:
338
+ """Return the higher of two reasoning_effort levels, treating None as unset."""
339
+ if a is None:
340
+ return b
341
+ if b is None:
342
+ return a
343
+ return a if _EFFORT_RANK.get(a, 3) >= _EFFORT_RANK.get(b, 3) else b
344
+
345
+
346
+ @app.post("/v1/messages", response_model=None)
347
+ async def create_message(request_data: MessagesRequest, raw_request: Request):
348
+ """
349
+ Process chat completion requests using unified client architecture.
350
+
351
+ This endpoint handles both streaming and non-streaming responses,
352
+ automatically routing to the appropriate provider based on model name.
353
+ """
354
+ request_id = raw_request.state.request_id
355
+ start_time = time.time()
356
+
357
+ _ensure_runtime_state()
358
+
359
+ spend_warning: str | None = None
360
+
361
+ # Resolve effective tier (routing invariants):
362
+ # Precedence: request explicit tier > config.proxy.default_tier
363
+ # If neither is available, fail fast (misconfiguration).
364
+ if request_data.has_explicit_tier and request_data.tier:
365
+ # Request explicitly specified a tier (haiku/sonnet/opus in model name)
366
+ resolved_tier: str = request_data.tier
367
+ resolved_tier_source = "request"
368
+ elif config.proxy.default_tier:
369
+ resolved_tier = config.proxy.default_tier
370
+ resolved_tier_source = "proxy.default_tier"
371
+ else:
372
+ raise HTTPException(
373
+ status_code=500,
374
+ detail={
375
+ "type": "configuration_error",
376
+ "message": "config.proxy.default_tier is required for ambiguous requests under proxy-only routing",
377
+ },
378
+ )
379
+
380
+ logger.debug(f"[{request_id}] Resolved tier: {resolved_tier} (source={resolved_tier_source})")
381
+
382
+ request_data.tier = resolved_tier
383
+
384
+ # Determine if this is an explicit backend model or needs tier-based resolution
385
+ # Only re-resolve model based on tier if:
386
+ # 1. Model was mapped from Anthropic-style (contains haiku/sonnet/opus), OR
387
+ # 2. Model is truly ambiguous (no provider prefix and not a known backend model)
388
+ # Do NOT override explicit backend models like "openai/gpt-5.5" or "vertex_ai/gemini-3.1-pro"
389
+ original_model_name = request_data.original_model_name
390
+ mapped_model = map_model_name(request_data.model) # Map AFTER reload() for fresh config
391
+
392
+ # Check if original model is an explicit backend model (has provider prefix)
393
+ # These should be passed through, not tier-resolved
394
+ if config.proxy.preferred_provider == "openrouter":
395
+ # OpenRouter: any provider/model format is explicit (google/, meta-llama/, etc.)
396
+ is_explicit_backend = original_model_name is not None and "/" in original_model_name
397
+ else:
398
+ is_explicit_backend = (
399
+ original_model_name is not None
400
+ and "/" in original_model_name
401
+ and any(
402
+ original_model_name.startswith(prefix)
403
+ for prefix in [
404
+ "openai/",
405
+ "anthropic/",
406
+ "vertex_ai/",
407
+ "bedrock/",
408
+ "gemini/",
409
+ "together_ai/",
410
+ "replicate/",
411
+ ]
412
+ )
413
+ )
414
+
415
+ # Only use tier-resolved model for Anthropic-style or ambiguous requests
416
+ # For explicit backend models, use what map_model_name() returned (usually pass-through)
417
+ if is_explicit_backend:
418
+ # Explicit backend model - preserve it (map_model_name already handled it)
419
+ actual_model_id = mapped_model
420
+ logger.debug(
421
+ f"[{request_id}] Explicit backend model '{original_model_name}' - preserving as '{actual_model_id}'"
422
+ )
423
+ else:
424
+ # Anthropic-style or ambiguous — check alternatives, then fall back to tier default
425
+ tier_default = config.proxy.get_model_for_tier(resolved_tier)
426
+ actual_model_id = _resolve_model_with_alternatives(resolved_tier, original_model_name, tier_default)
427
+ logger.debug(f"[{request_id}] Tier-resolved model: tier={resolved_tier} -> '{actual_model_id}'")
428
+
429
+ # Spend cap check (after model resolution so strict preflight prices the actual model)
430
+ if cost_tracker is not None and cost_tracker.has_caps:
431
+ projected = 0
432
+ if cost_tracker.cap_mode == "strict":
433
+ from forge.core.models.pricing import calculate_cost as _est_cost
434
+
435
+ _est_max_output = request_data.max_tokens or 4096
436
+ _est_input = _estimate_input_tokens(request_data)
437
+ try:
438
+ projected = _est_cost(actual_model_id, _est_input, _est_max_output, 0)
439
+ except Exception:
440
+ projected = 0
441
+
442
+ cap_result = cost_tracker.check_cap(projected_cost_micros=projected)
443
+ if cap_result.exceeded:
444
+ spend_warning = _cap_result_message(cap_result)
445
+ if cost_tracker.on_cap_hit == "reject":
446
+ return JSONResponse(
447
+ status_code=429,
448
+ content={
449
+ "type": "error",
450
+ "error": {
451
+ "type": "spend_cap_exceeded",
452
+ "message": spend_warning,
453
+ },
454
+ },
455
+ headers={"X-Request-ID": request_id},
456
+ )
457
+ logger.warning("[%s] %s", request_id, spend_warning)
458
+
459
+ try:
460
+ num_messages = len(request_data.messages) if request_data.messages else 0
461
+ num_tools = len(request_data.tools) if request_data.tools else 0
462
+ tool_names = [tool.name for tool in request_data.tools] if request_data.tools else []
463
+ has_system = bool(request_data.system)
464
+
465
+ await _check_client_tool_failures(request_data, request_id, actual_model_id)
466
+
467
+ # Detect provider BEFORE conversion to enable provider-specific schema handling
468
+ detected_provider = client_factory.detect_provider_for_model(actual_model_id)
469
+ provider_name = detected_provider.value # Convert enum to string
470
+
471
+ logger.debug(
472
+ f"[{request_id}] Processing '/v1/messages': "
473
+ f"original='{original_model_name}', target='{actual_model_id}', provider='{provider_name}', "
474
+ f"messages={num_messages}, tools={num_tools}, stream={request_data.stream}"
475
+ )
476
+
477
+ openai_request_dict = convert_anthropic_to_openai(request_data, provider=provider_name)
478
+
479
+ openai_request_dict["model"] = actual_model_id
480
+
481
+ # Forward User-Agent from incoming request (Claude Code identity).
482
+ # Upstream LLM gateways may filter traffic by User-Agent; without this,
483
+ # the proxy's OpenAI SDK default header could cause requests to be blocked.
484
+ # Only inject for LiteLLM providers (other clients don't need it).
485
+ if provider_name in ("litellm_remote", "litellm_local", "openrouter"):
486
+ incoming_user_agent = raw_request.headers.get("user-agent")
487
+ if incoming_user_agent:
488
+ openai_request_dict["_user_agent"] = incoming_user_agent
489
+ logger.debug(f"[{request_id}] Forwarding User-Agent: {incoming_user_agent[:120]!r}")
490
+
491
+ # Priority: request explicit > tier_override > model default (in catalog)
492
+ tier_override = _get_tier_override(resolved_tier)
493
+ if tier_override:
494
+ logger.debug(f"[{request_id}] Tier override for '{resolved_tier}': {tier_override}")
495
+
496
+ if request_data.temperature is not None:
497
+ openai_request_dict["temperature"] = request_data.temperature
498
+ elif tier_override and tier_override.temperature is not None:
499
+ openai_request_dict["temperature"] = tier_override.temperature
500
+
501
+ if request_data.max_tokens is not None:
502
+ openai_request_dict["max_tokens"] = request_data.max_tokens
503
+ if request_data.top_p is not None:
504
+ openai_request_dict["top_p"] = request_data.top_p
505
+
506
+ # Optional reasoning/thinking overrides.
507
+ # Priority: request explicit > thinking-derived > tier_override > model default
508
+ # tier_override acts as a FLOOR (never go below the user's tier config).
509
+ # Use getattr() for test stubs that don't include new fields.
510
+ reasoning_effort = getattr(request_data, "reasoning_effort", None)
511
+ if reasoning_effort is not None:
512
+ openai_request_dict["reasoning_effort"] = reasoning_effort
513
+ else:
514
+ # Claude Code sends `thinking` (Anthropic-specific) instead of
515
+ # `reasoning_effort`. Translate to reasoning_effort so litellm can
516
+ # map it to each provider's native parameter.
517
+ thinking = getattr(request_data, "thinking", None)
518
+ derived = _derive_reasoning_effort(thinking)
519
+
520
+ # Apply tier_override as a floor: max(derived, tier_override).
521
+ tier_effort = tier_override.reasoning_effort if tier_override else None
522
+ openai_request_dict["reasoning_effort"] = _max_effort(derived, tier_effort)
523
+
524
+ # Note: the raw `thinking` dict is NOT forwarded — it's Anthropic-specific.
525
+ # Litellm controls thinking via reasoning_effort (mapped above).
526
+
527
+ verbosity = getattr(request_data, "verbosity", None)
528
+ if verbosity is not None:
529
+ openai_request_dict["verbosity"] = verbosity
530
+ elif tier_override and tier_override.verbosity is not None:
531
+ openai_request_dict["verbosity"] = tier_override.verbosity
532
+
533
+ if request_data.stop_sequences:
534
+ openai_request_dict["stop"] = request_data.stop_sequences
535
+
536
+ # Get unified client for this model (pass tier for tier-specific hyperparameters)
537
+ try:
538
+ client = await client_factory.get_client(actual_model_id, tier=request_data.tier)
539
+ logger.debug(f"[{request_id}] Got client for {actual_model_id} (tier={request_data.tier})")
540
+ except AuthenticationError as e:
541
+ logger.error(f"[{request_id}] Authentication failed: {e}")
542
+ raise HTTPException(
543
+ status_code=401,
544
+ detail={
545
+ "type": "authentication_error",
546
+ "message": f"Authentication failed [{request_id}]",
547
+ },
548
+ )
549
+
550
+ if request_data.stream:
551
+ # Streaming response
552
+ async def stream_generator():
553
+ try:
554
+ async for chunk in client.create_streaming_completion(openai_request_dict, request_id):
555
+ yield chunk
556
+ except ToolCallError as e:
557
+ logger.error(f"[{request_id}] ToolCallError: {e}")
558
+ yield {
559
+ "error": {
560
+ "type": e.error_type,
561
+ "message": f"Tool call error [{request_id}]",
562
+ }
563
+ }
564
+ except ProxyStreamError as e:
565
+ logger.error(f"[{request_id}] ProxyStreamError ({e.error_type}): {e}")
566
+ yield {
567
+ "error": {
568
+ "type": e.error_type,
569
+ "message": f"Streaming request failed [{request_id}]",
570
+ "status_code": e.status_code,
571
+ }
572
+ }
573
+
574
+ headers = {
575
+ "X-Request-ID": request_id,
576
+ "X-Resolved-Tier": resolved_tier,
577
+ "X-Resolved-Model": actual_model_id,
578
+ "X-Cumulative-Cost": f"{proxy_metrics.total_cost_micros / 1_000_000:.6f}",
579
+ "Cache-Control": "no-cache",
580
+ "Connection": "keep-alive",
581
+ }
582
+ headers = _with_spend_warning(headers, spend_warning)
583
+
584
+ # Log streaming request (no response body available)
585
+ duration_ms = (time.time() - start_time) * 1000
586
+ asyncio.create_task(
587
+ log_request_response(
588
+ request_id=request_id,
589
+ original_model=original_model_name or "",
590
+ mapped_model=actual_model_id,
591
+ request_body=request_data.model_dump(),
592
+ response_body=None, # Streaming has no response body
593
+ status_code=200,
594
+ duration_ms=duration_ms,
595
+ num_messages=num_messages,
596
+ num_tools=num_tools,
597
+ tool_names=tool_names,
598
+ has_system=has_system,
599
+ temperature=request_data.temperature,
600
+ max_tokens=request_data.max_tokens,
601
+ streaming=True,
602
+ )
603
+ )
604
+
605
+ log_request_beautifully(
606
+ method="POST",
607
+ path="/v1/messages (streaming)",
608
+ original_model=original_model_name or "",
609
+ mapped_model=actual_model_id,
610
+ num_messages=num_messages,
611
+ num_tools=num_tools,
612
+ status_code=200,
613
+ )
614
+
615
+ def _on_stream_complete(usage: dict[str, int], failed: bool, error_type: str | None) -> None:
616
+ elapsed = (time.time() - start_time) * 1000
617
+ in_tok = usage.get("input_tokens", 0)
618
+ out_tok = usage.get("output_tokens", 0)
619
+ cache_tok = usage.get("cached_tokens", 0)
620
+ cost = _calc_and_log_cost(
621
+ model=actual_model_id,
622
+ tier=resolved_tier,
623
+ input_tokens=in_tok,
624
+ output_tokens=out_tok,
625
+ cached_tokens=cache_tok,
626
+ latency_ms=elapsed,
627
+ failed=failed,
628
+ request_id=request_id,
629
+ )
630
+ proxy_metrics.record_request(
631
+ tier=resolved_tier,
632
+ model=actual_model_id,
633
+ input_tokens=in_tok,
634
+ output_tokens=out_tok,
635
+ cached_tokens=cache_tok,
636
+ latency_ms=elapsed,
637
+ streaming=True,
638
+ failed=failed,
639
+ error_type=error_type,
640
+ cost_micros=cost,
641
+ )
642
+
643
+ return StreamingResponse(
644
+ convert_openai_to_anthropic_sse(
645
+ stream_generator(),
646
+ request_data,
647
+ request_id,
648
+ on_complete=_on_stream_complete,
649
+ ),
650
+ media_type="text/event-stream",
651
+ headers=headers,
652
+ )
653
+ else:
654
+ try:
655
+ openai_response = await client.create_completion(openai_request_dict, request_id)
656
+ anthropic_response = convert_openai_to_anthropic(openai_response, original_model_name)
657
+
658
+ if not anthropic_response:
659
+ raise HTTPException(
660
+ status_code=500,
661
+ detail={
662
+ "type": "api_error",
663
+ "message": "Failed to convert response",
664
+ },
665
+ )
666
+
667
+ response_dict = anthropic_response.model_dump()
668
+ response_dict["_request_id"] = request_id
669
+
670
+ duration_ms = (time.time() - start_time) * 1000
671
+
672
+ _usage = openai_response.get("usage", {})
673
+ _in = _usage.get("prompt_tokens", 0)
674
+ _out = _usage.get("completion_tokens", 0)
675
+ _cached = _usage.get("cached_tokens", 0)
676
+ _cost = _calc_and_log_cost(
677
+ model=actual_model_id,
678
+ tier=resolved_tier,
679
+ input_tokens=_in,
680
+ output_tokens=_out,
681
+ cached_tokens=_cached,
682
+ latency_ms=duration_ms,
683
+ failed=False,
684
+ request_id=request_id,
685
+ )
686
+ proxy_metrics.record_request(
687
+ tier=resolved_tier,
688
+ model=actual_model_id,
689
+ input_tokens=_in,
690
+ output_tokens=_out,
691
+ cached_tokens=_cached,
692
+ latency_ms=duration_ms,
693
+ streaming=False,
694
+ failed=False,
695
+ error_type=None,
696
+ cost_micros=_cost,
697
+ )
698
+
699
+ asyncio.create_task(
700
+ log_request_response(
701
+ request_id=request_id,
702
+ original_model=original_model_name or "",
703
+ mapped_model=actual_model_id,
704
+ request_body=request_data.model_dump(),
705
+ response_body=response_dict,
706
+ status_code=200,
707
+ duration_ms=duration_ms,
708
+ num_messages=num_messages,
709
+ num_tools=num_tools,
710
+ tool_names=tool_names,
711
+ has_system=has_system,
712
+ temperature=request_data.temperature,
713
+ max_tokens=request_data.max_tokens,
714
+ streaming=False,
715
+ )
716
+ )
717
+
718
+ log_request_beautifully(
719
+ method="POST",
720
+ path="/v1/messages",
721
+ original_model=original_model_name or "",
722
+ mapped_model=actual_model_id,
723
+ num_messages=num_messages,
724
+ num_tools=num_tools,
725
+ status_code=200,
726
+ )
727
+ return JSONResponse(
728
+ content=response_dict,
729
+ headers=_with_spend_warning(
730
+ {
731
+ "X-Request-ID": request_id,
732
+ "X-Resolved-Tier": resolved_tier,
733
+ "X-Resolved-Model": actual_model_id,
734
+ "X-Request-Cost": f"{_cost / 1_000_000:.6f}",
735
+ "X-Cumulative-Cost": f"{proxy_metrics.total_cost_micros / 1_000_000:.6f}",
736
+ },
737
+ spend_warning,
738
+ ),
739
+ )
740
+
741
+ except ToolCallError as e:
742
+ duration_ms = (time.time() - start_time) * 1000
743
+ error_msg = str(e)
744
+
745
+ _tc_cost = _calc_and_log_cost(
746
+ model=actual_model_id,
747
+ tier=resolved_tier,
748
+ input_tokens=0,
749
+ output_tokens=0,
750
+ cached_tokens=0,
751
+ latency_ms=duration_ms,
752
+ failed=True,
753
+ request_id=request_id,
754
+ )
755
+ proxy_metrics.record_request(
756
+ tier=resolved_tier,
757
+ model=actual_model_id,
758
+ input_tokens=0,
759
+ output_tokens=0,
760
+ cached_tokens=0,
761
+ latency_ms=duration_ms,
762
+ streaming=False,
763
+ failed=True,
764
+ error_type="tool_call_error",
765
+ cost_micros=_tc_cost,
766
+ )
767
+
768
+ asyncio.create_task(
769
+ log_request_response(
770
+ request_id=request_id,
771
+ original_model=original_model_name or "",
772
+ mapped_model=actual_model_id,
773
+ request_body=request_data.model_dump(),
774
+ response_body=None,
775
+ status_code=400,
776
+ duration_ms=duration_ms,
777
+ error=error_msg,
778
+ num_messages=num_messages,
779
+ num_tools=num_tools,
780
+ tool_names=tool_names,
781
+ has_system=has_system,
782
+ temperature=request_data.temperature,
783
+ max_tokens=request_data.max_tokens,
784
+ streaming=False,
785
+ )
786
+ )
787
+
788
+ log_request_beautifully(
789
+ method="POST",
790
+ path="/v1/messages",
791
+ original_model=original_model_name or "",
792
+ mapped_model=actual_model_id,
793
+ num_messages=num_messages,
794
+ num_tools=num_tools,
795
+ status_code=400,
796
+ )
797
+
798
+ logger.error(f"[{request_id}] Tool call error: {e}")
799
+ raise HTTPException(
800
+ status_code=400,
801
+ detail={"type": "invalid_request_error", "message": error_msg},
802
+ )
803
+ except AuthenticationError:
804
+ # Try refreshing credentials once
805
+ logger.warning(f"[{request_id}] Auth failed, refreshing credentials")
806
+ client = await client_factory.invalidate_and_retry(actual_model_id)
807
+ openai_response = await client.create_completion(openai_request_dict, request_id)
808
+ anthropic_response = convert_openai_to_anthropic(openai_response, original_model_name)
809
+
810
+ if not anthropic_response:
811
+ raise HTTPException(
812
+ status_code=500,
813
+ detail={
814
+ "type": "api_error",
815
+ "message": "Failed to convert response after retry",
816
+ },
817
+ )
818
+
819
+ retry_duration_ms = (time.time() - start_time) * 1000
820
+ _retry_usage = openai_response.get("usage", {})
821
+ _ri = _retry_usage.get("prompt_tokens", 0)
822
+ _ro = _retry_usage.get("completion_tokens", 0)
823
+ _rc = _retry_usage.get("cached_tokens", 0)
824
+ _rcost = _calc_and_log_cost(
825
+ model=actual_model_id,
826
+ tier=resolved_tier,
827
+ input_tokens=_ri,
828
+ output_tokens=_ro,
829
+ cached_tokens=_rc,
830
+ latency_ms=retry_duration_ms,
831
+ failed=False,
832
+ request_id=request_id,
833
+ )
834
+ proxy_metrics.record_request(
835
+ tier=resolved_tier,
836
+ model=actual_model_id,
837
+ input_tokens=_ri,
838
+ output_tokens=_ro,
839
+ cached_tokens=_rc,
840
+ latency_ms=retry_duration_ms,
841
+ streaming=False,
842
+ failed=False,
843
+ error_type=None,
844
+ cost_micros=_rcost,
845
+ )
846
+
847
+ response_dict = anthropic_response.model_dump()
848
+ response_dict["_request_id"] = request_id
849
+ return JSONResponse(
850
+ content=response_dict,
851
+ headers=_with_spend_warning(
852
+ {
853
+ "X-Request-ID": request_id,
854
+ "X-Resolved-Tier": resolved_tier,
855
+ "X-Resolved-Model": actual_model_id,
856
+ "X-Request-Cost": f"{_rcost / 1_000_000:.6f}",
857
+ "X-Cumulative-Cost": f"{proxy_metrics.total_cost_micros / 1_000_000:.6f}",
858
+ },
859
+ spend_warning,
860
+ ),
861
+ )
862
+
863
+ except HTTPException:
864
+ raise
865
+ except Exception as e:
866
+ duration_ms = (time.time() - start_time) * 1000
867
+ error_msg = f"Internal error [{request_id}]"
868
+
869
+ _err_cost = _calc_and_log_cost(
870
+ model=actual_model_id,
871
+ tier=resolved_tier,
872
+ input_tokens=0,
873
+ output_tokens=0,
874
+ cached_tokens=0,
875
+ latency_ms=duration_ms,
876
+ failed=True,
877
+ request_id=request_id,
878
+ )
879
+ proxy_metrics.record_request(
880
+ tier=resolved_tier,
881
+ model=actual_model_id,
882
+ input_tokens=0,
883
+ output_tokens=0,
884
+ cached_tokens=0,
885
+ latency_ms=duration_ms,
886
+ streaming=request_data.stream or False,
887
+ failed=True,
888
+ error_type="api_error",
889
+ cost_micros=_err_cost,
890
+ )
891
+
892
+ asyncio.create_task(
893
+ log_request_response(
894
+ request_id=request_id,
895
+ original_model=original_model_name or "",
896
+ mapped_model=actual_model_id,
897
+ request_body=request_data.model_dump(),
898
+ response_body=None,
899
+ status_code=500,
900
+ duration_ms=duration_ms,
901
+ error=error_msg,
902
+ num_messages=num_messages,
903
+ num_tools=num_tools,
904
+ tool_names=tool_names,
905
+ has_system=has_system,
906
+ temperature=request_data.temperature,
907
+ max_tokens=request_data.max_tokens,
908
+ streaming=request_data.stream or False,
909
+ )
910
+ )
911
+
912
+ log_request_beautifully(
913
+ method="POST",
914
+ path="/v1/messages",
915
+ original_model=original_model_name or "",
916
+ mapped_model=actual_model_id,
917
+ num_messages=num_messages,
918
+ num_tools=num_tools,
919
+ status_code=500,
920
+ )
921
+
922
+ logger.error(f"[{request_id}] Unexpected error: {e}", exc_info=True)
923
+ raise HTTPException(status_code=500, detail={"type": "api_error", "message": error_msg})
924
+
925
+
926
@app.post("/v1/messages/count_tokens", response_model=TokenCountResponse)
async def count_tokens(request_data: TokenCountRequest, raw_request: Request):
    """Count input tokens for a request using the resolved model's client.

    The tier is resolved first (explicit request tier, then
    ``config.proxy.default_tier``). Explicit backend models are preserved as
    mapped by ``map_model_name``; every other model name is resolved through
    the tier default and its alternatives. The messages are then converted to
    the OpenAI shape and handed to the client's ``count_tokens``.

    Args:
        request_data: Parsed token-count request body.
        raw_request: Incoming request; ``state.request_id`` is read from it.

    Returns:
        A JSON response carrying ``input_tokens`` and an ``X-Request-ID`` header.

    Raises:
        HTTPException: 500 ``configuration_error`` when no tier can be
            resolved, or 500 ``api_error`` for any other failure.
    """
    request_id = raw_request.state.request_id

    _ensure_runtime_state()

    try:
        original_model_name = request_data.original_model_name

        # Resolve tier FIRST (same precedence as message routing)
        if request_data.has_explicit_tier and request_data.tier:
            resolved_tier: str = request_data.tier
            resolved_tier_source = "request"
        elif config.proxy.default_tier:
            resolved_tier = config.proxy.default_tier
            resolved_tier_source = "proxy.default_tier"
        else:
            raise HTTPException(
                status_code=500,
                detail={
                    "type": "configuration_error",
                    "message": "config.proxy.default_tier is required for ambiguous requests under proxy-only routing",
                },
            )

        request_data.tier = resolved_tier

        # Match the /v1/messages model resolution: explicit backend models are
        # preserved; Anthropic-style or ambiguous models go through tier + alternatives.
        mapped_model = map_model_name(request_data.model)

        if config.proxy.preferred_provider == "openrouter":
            # OpenRouter: any provider/model format counts as explicit.
            is_explicit_backend = original_model_name is not None and "/" in original_model_name
        else:
            # Every known prefix already contains the "/", so a single
            # startswith() over the tuple covers the separator check as well.
            is_explicit_backend = original_model_name is not None and original_model_name.startswith(
                (
                    "openai/",
                    "anthropic/",
                    "vertex_ai/",
                    "bedrock/",
                    "gemini/",
                    "together_ai/",
                    "replicate/",
                )
            )

        if is_explicit_backend:
            actual_model_id = mapped_model
        else:
            tier_default = config.proxy.get_model_for_tier(resolved_tier)
            actual_model_id = _resolve_model_with_alternatives(resolved_tier, original_model_name, tier_default)

        logger.info(f"[{request_id}] Token counting: original='{original_model_name}', target='{actual_model_id}'")
        logger.debug(f"[{request_id}] Token count resolved tier: {resolved_tier} (source={resolved_tier_source})")

        detected_provider = client_factory.detect_provider_for_model(actual_model_id)
        provider_name = detected_provider.value

        # Build a minimal messages request so the regular Anthropic -> OpenAI
        # conversion can be reused; max_tokens=1 is only a placeholder value.
        simulated_request = MessagesRequest(
            model=actual_model_id,
            messages=request_data.messages,
            system=request_data.system,
            max_tokens=1,
        )
        openai_dict = convert_anthropic_to_openai(simulated_request, provider=provider_name)
        messages = openai_dict.get("messages", [])

        client = await client_factory.get_client(actual_model_id, tier=resolved_tier)
        token_count = await client.count_tokens(messages)

        response = TokenCountResponse(input_tokens=token_count)
        return JSONResponse(content=response.model_dump(), headers={"X-Request-ID": request_id})

    except HTTPException:
        # Deliberate HTTP errors (such as the configuration_error raised above)
        # must reach the client unchanged instead of being rewritten into the
        # generic "Token counting failed" error below.
        raise
    except Exception as e:
        logger.error(f"[{request_id}] Token counting failed: {e}", exc_info=True)
        raise HTTPException(
            status_code=500,
            detail={"type": "api_error", "message": f"Token counting failed [{request_id}]"},
        ) from e
1011
+
1012
+
1013
# Context window (in tokens) reported for models that are not in the catalog.
DEFAULT_CONTEXT_WINDOW = 200_000


def get_context_window(model_name: str) -> int:
    """Return the context window size, in tokens, for ``model_name``.

    The size comes from the central model catalog. Models the catalog does
    not know (e.g., OpenRouter models outside Forge's known set) get
    ``DEFAULT_CONTEXT_WINDOW`` instead.

    Args:
        model_name: Model ID (canonical or alias like 'openai/gpt-5.5')

    Returns:
        Context window size in tokens.
    """
    from forge.core.models import get_context_window_tokens, model_exists

    if model_exists(model_name):
        return get_context_window_tokens(model_name)

    logger.debug(f"Model {model_name!r} not in catalog, using default context window")
    return DEFAULT_CONTEXT_WINDOW
1035
+
1036
+
1037
@app.get("/", include_in_schema=False)
async def root(request: Request):
    """Service health and runtime truth for status line scripts.

    Returns proxy runtime status including:
    - is_proxy: True (indicates this is a proxy, not direct Anthropic API)
    - template: Active configuration template name
    - provider: Underlying provider (litellm, openai, gemini)
    - tiers: Mapping of Claude tiers to actual models with context windows
    - routing: Default tier and where it came from
    - proxy: First-class proxy identity (proxy_id, template, port, base_url)
    - runtime: Actual resolved tier → model mappings, context windows, llm defaults
    - metrics: Snapshot of the per-proxy metrics

    Note: Session state is no longer returned by proxy. Consumers should read
    session state locally via FORGE_SESSION env var or CWD manifest.

    This endpoint reflects what the proxy is **actually doing**, not just
    echoed configuration. It serves as the source of runtime truth.
    """
    import os

    from forge.proxy.proxy_identity import get_proxy_identity

    active_template = os.environ.get("ACTIVE_TEMPLATE", "unknown")
    preferred_provider = os.environ.get("PREFERRED_PROVIDER", "unknown")

    # Extract request host/port for proxy identity (accurate even with --auto-port)
    request_host = request.url.hostname or "localhost"
    request_port = request.url.port

    # Fallback to env var if request port unavailable. A malformed value must
    # not take the status endpoint down, so it is ignored rather than raised.
    env_port_str = os.environ.get("ACTIVE_PORT")
    try:
        env_port = int(env_port_str) if env_port_str else None
    except ValueError:
        logger.warning("Ignoring non-numeric ACTIVE_PORT value %r", env_port_str)
        env_port = None

    # Discover proxy identity (2-tier: registry > derived)
    proxy_identity = get_proxy_identity(
        active_template=active_template,
        request_host=request_host,
        request_port=request_port,
        env_port=env_port,
        process_proxy_id=os.environ.get("FORGE_PROXY_ID"),
    )

    # Tier mappings exposed via GET / for status line and session context
    provider_config = config.proxy.get_provider(preferred_provider)
    tier_models = {
        "haiku": provider_config.tiers.haiku,
        "sonnet": provider_config.tiers.sonnet,
        "opus": provider_config.tiers.opus,
    }

    # Look each context window up once and reuse it for both the legacy
    # "tiers" section and the "runtime" section.
    context_windows = {tier: get_context_window(model) for tier, model in tier_models.items()}
    tiers = {
        tier: {"model": model, "context_window": context_windows[tier]}
        for tier, model in tier_models.items()
    }

    # Compute runtime LLM defaults (post-merge) from the credential manager.
    # This reflects the actual baseline hyperparameters used by proxy clients,
    # including env/tier overrides and caps.
    llm_defaults_by_tier: dict[str, dict[str, object]] = {}
    for tier in ("haiku", "sonnet", "opus"):
        try:
            model_name = tier_models.get(tier)
            if not model_name:
                raise ValueError(f"No model configured for tier {tier!r}")
            hp = client_factory.get_default_hyperparams_for_tier(
                provider=preferred_provider, tier=tier, model_name=model_name
            )
            llm_defaults_by_tier[tier] = hp.model_dump(exclude_unset=True)
        except Exception as e:
            # Best effort: one tier failing must not hide the others, so the
            # failure is reported inline in the response instead of raised.
            llm_defaults_by_tier[tier] = {"error": f"failed to compute defaults: {e}"}

    if config.proxy.default_tier:
        default_tier = config.proxy.default_tier
        default_tier_source = "proxy.default_tier"
    else:
        default_tier = None
        default_tier_source = "missing"

    # With no usable default tier the sonnet mapping stands in as the active model.
    runtime_active_model = tier_models.get(default_tier or "sonnet") or tier_models.get("sonnet")

    routing_section = {
        "default_tier": default_tier,
        "default_tier_source": default_tier_source,
        "note": "Routing defaults are proxy-owned. Session state is not authoritative for routing defaults.",
    }

    if default_tier is None:
        routing_section["note"] = (
            "Proxy is missing config.proxy.default_tier; ambiguous requests will fail until configured."
        )

    runtime_section = {
        "template": active_template,
        "provider": preferred_provider,
        "tier_mappings": tier_models,
        "context_windows": context_windows,
        "active_tier": default_tier,
        "active_context_window": get_context_window(runtime_active_model) if runtime_active_model else None,
        # Proxy-owned hyperparameter defaults actually used by proxy clients (post-merge)
        "llm_defaults_by_tier": llm_defaults_by_tier,
    }

    # Build proxy identity section (B2.1.5)
    proxy_section = {
        "proxy_id": proxy_identity.proxy_id,
        "template": proxy_identity.template,
        "port": proxy_identity.port,
        "base_url": proxy_identity.base_url,
        "source": proxy_identity.source,
        "status": proxy_identity.status,
    }

    response = {
        "is_proxy": True,
        "template": active_template,
        "provider": preferred_provider,
        "tiers": tiers,
        "status": "running",
        "routing": routing_section,
        # Proxy identity (B2.1.5): first-class proxy identity
        "proxy": proxy_section,
        # Runtime truth: tier mappings, context windows, hyperparameter defaults
        "runtime": runtime_section,
        # Per-proxy metrics (request counts, token usage, latency)
        "metrics": proxy_metrics.snapshot(),
    }

    return response
1167
+
1168
+
1169
@app.middleware("http")
async def log_requests_middleware(request: Request, call_next):
    """Attach a request ID to every request and log its completion.

    The ID is taken from the incoming ``X-Request-ID`` header when present;
    otherwise one is generated from a path-dependent prefix (``tok_`` for
    token counting, ``inf_`` for the root path, ``req_`` for everything else)
    plus 12 hex characters. It is stored on ``request.state.request_id`` and
    echoed in the ``X-Request-ID`` response header if the handler did not set
    it. An exception escaping the handler is logged and answered with a
    500 JSON error.
    """
    started_at = time.time()
    path = request.url.path

    if "/count_tokens" in path:
        prefix = "tok_"
    elif path == "/":
        prefix = "inf_"
    else:
        prefix = "req_"

    request_id = request.headers.get("X-Request-ID") or f"{prefix}{uuid.uuid4().hex[:12]}"
    request.state.request_id = request_id

    # Endpoints that have their own detailed logging only get a debug line here
    has_own_logging = "/messages" in path or "/event_logging" in path

    logger.debug(f"{path} [{request_id}] {request.method}")

    try:
        response = await call_next(request)
        elapsed = time.time() - started_at

        if not has_own_logging:
            status = response.status_code
            logger.info(f"{path} [{request_id}] Completed in {elapsed:.3f}s ({status})")
        else:
            logger.debug(f"{path} [{request_id}] Middleware: {elapsed:.3f}s")

        if "X-Request-ID" not in response.headers:
            response.headers["X-Request-ID"] = request_id

        return response
    except Exception as e:
        logger.error(f"[{request_id}] Middleware error: {e}", exc_info=True)
        error_body = {
            "error": {
                "type": "api_error",
                "message": f"Internal error [{request_id}]",
            }
        }
        return JSONResponse(
            status_code=500,
            content=error_body,
            headers={"X-Request-ID": request_id},
        )
1216
+
1217
+
1218
# Prefixes that mark a plain-string tool result as a failure. They are matched
# only at the very start of the content: scanning the whole text causes false
# positives on documentation that merely mentions errors.
_TOOL_ERROR_PREFIXES = (
    "Error:",
    "ERROR:",
    "Exception:",
    "EXCEPTION:",
    "Failed:",
    "FAILED:",
    "Tool execution failed",
    "Command failed",
    "File not found",
    "Permission denied",
    "Invalid tool",  # More specific than just "Invalid"
    "Invalid arguments",
    "Invalid input",
    "Traceback (most recent call last)",
)

# Text found in stale tool results whose content was cleared (not actionable).
_CLEARED_TOOL_RESULT_MARKER = "Old tool result content cleared"


def _classify_tool_result(block) -> tuple[bool, Any]:
    """Decide whether a ``tool_result`` block reports a failure.

    Checks, in order of reliability: an explicit truthy ``is_error`` field,
    dict content carrying an ``error`` or ``exception`` key, then string
    content starting with one of ``_TOOL_ERROR_PREFIXES``.

    Returns:
        ``(is_error, error_content)``; ``error_content`` is the block's
        content for failures and ``None`` otherwise.
    """
    content = getattr(block, "content", None)

    # 1. Most reliable: explicit is_error field
    if getattr(block, "is_error", False):
        return True, content

    # 2. Structured errors: dict with error keys
    if isinstance(content, dict):
        if "error" in content or "exception" in content:
            return True, content
        return False, None

    # 3. String content: explicit error patterns at the start only
    if isinstance(content, str) and content.startswith(_TOOL_ERROR_PREFIXES):
        return True, content

    return False, None


async def _check_client_tool_failures(request_data: MessagesRequest, request_id: str, mapped_model: str):
    """Check for client-side tool execution failures in the request.

    Only scans the most recent user message. Older tool_result blocks were
    already inspected on prior requests; re-scanning them produces duplicate
    log entries and skews telemetry.

    For each failed tool result that carries a ``tool_use_id`` this logs the
    failure, optionally rewrites the block's content with an enriched error
    hint (when the provider config enables ``error_hints``), and schedules the
    failure/event log writes as background tasks.

    Args:
        request_data: The incoming messages request; failing blocks may have
            their ``content`` replaced in place with the enriched hint.
        request_id: Request identifier used in log lines.
        mapped_model: Resolved model ID recorded with the failure.
    """
    latest_user_msg = next(
        (m for m in reversed(request_data.messages or ()) if m.role == "user" and isinstance(m.content, list)),
        None,
    )
    if latest_user_msg is None:
        return

    for block in latest_user_msg.content:
        if getattr(block, "type", None) != "tool_result":
            continue

        tool_use_id = getattr(block, "tool_use_id", None)
        is_error, error_content = _classify_tool_result(block)
        if not (is_error and tool_use_id):
            continue

        tool_name, tool_input = _find_tool_use_info(request_data.messages, latest_user_msg, tool_use_id)

        # Check if this is a stale cleared tool result (not actionable)
        is_cleared_content = isinstance(error_content, str) and _CLEARED_TOOL_RESULT_MARKER in error_content
        # Only real error content (present and not cleared) is reported as a failure
        is_actionable = bool(error_content) and not is_cleared_content

        if is_actionable:
            logger.warning(
                f"[{request_id}] Client tool failure: "
                f"tool='{tool_name or 'unknown'}', id='{tool_use_id}', "
                f"error={str(error_content)[:100]}"
            )
        elif is_cleared_content:
            logger.debug(
                f"[{request_id}] Stale tool failure (content cleared): "
                f"tool='{tool_name or 'unknown'}', id='{tool_use_id}'"
            )
        else:
            # Debug log for investigation when is_error but no content
            logger.debug(
                f"[{request_id}] Tool marked as error but no error content: "
                f"tool='{tool_name or 'unknown'}', id='{tool_use_id}', "
                f"is_error={getattr(block, 'is_error', None)}"
            )

        enriched_content = error_content
        if is_actionable and isinstance(error_content, str):
            provider_cfg = config.proxy.get_provider()
            if provider_cfg.error_hints:
                enriched_content = enrich_error_content(tool_name, error_content)
                if enriched_content != error_content:
                    # Rewrite the request in place so the enriched hint is
                    # what gets forwarded with the rest of the message.
                    block.content = enriched_content
                    logger.debug(f"[{request_id}] Enriched error hint for tool '{tool_name}'")

        if not is_actionable:
            continue

        asyncio.create_task(
            log_tool_failure(
                request_id=request_id,
                mapped_model=mapped_model,
                tool_name=tool_name,
                tool_use_id=tool_use_id,
                tool_input=tool_input,
                error_content=error_content,
            )
        )
        asyncio.create_task(
            log_tool_event(
                request_id=request_id,
                tool_name=tool_name,
                status="failure",
                stage="client_execution_report",
                details={
                    "tool_use_id": tool_use_id,
                    "error_content": enriched_content,
                    "tool_name_found": bool(tool_name),
                },
            )
        )
1342
+
1343
+
1344
+ def _find_tool_use_info(messages, current_msg, tool_use_id) -> tuple[str | None, dict[str, Any] | None]:
1345
+ """Find tool name and input parameters from message history."""
1346
+ current_idx = messages.index(current_msg)
1347
+
1348
+ for i in range(current_idx - 1, -1, -1):
1349
+ prev_msg = messages[i]
1350
+ if prev_msg.role == "assistant" and isinstance(prev_msg.content, list):
1351
+ for block in prev_msg.content:
1352
+ if (
1353
+ hasattr(block, "type")
1354
+ and block.type == "tool_use"
1355
+ and hasattr(block, "id")
1356
+ and block.id == tool_use_id
1357
+ ):
1358
+ return (
1359
+ getattr(block, "name", None),
1360
+ getattr(block, "input", None),
1361
+ )
1362
+ return None, None
1363
+
1364
+
1365
def find_available_port(start_port: int, max_attempts: int = 10, host: str = "") -> int:
    """Find an available TCP port starting from ``start_port``.

    Tries to bind ``start_port``, ``start_port + 1``, ... for at most
    ``max_attempts`` candidates and returns the first one that binds.

    Args:
        start_port: First port to try.
        max_attempts: Number of consecutive ports to try.
        host: Address to bind while probing. The default ``""`` binds the
            wildcard address, matching the previous behaviour.

    Returns:
        The first port in the range that could be bound.

    Raises:
        RuntimeError: If none of the candidate ports could be bound.
    """
    last_port = start_port + max_attempts - 1
    for port in range(start_port, last_port + 1):
        # The context manager closes the probe socket; no explicit close() needed.
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
            try:
                sock.bind((host, port))
            except (OSError, OverflowError):
                # OSError: port in use / not permitted.
                # OverflowError: candidate is outside 0-65535.
                continue
            return port
    # Report the inclusive range that was actually tried.
    raise RuntimeError(f"Could not find available port in range {start_port}-{last_port}")
1376
+
1377
+
1378
@click.command()
@click.option(
    "--template",
    type=str,
    required=True,
    help="Configuration template to use (e.g., openrouter-gemini, openrouter-openai, openrouter-anthropic)",
)
@click.option("--port", type=int, default=8082, help="Port to run the server on (default: 8082)")
@click.option("--host", default="127.0.0.1", help="Host to bind the server to (default: 127.0.0.1)")
@click.option("--reload", is_flag=True, help="Enable auto-reload on code changes")
@click.option(
    "--auto-port",
    is_flag=True,
    help="Automatically find an available port if the specified port is in use",
)
@click.option(
    "--proxy-id",
    type=str,
    required=False,
    help="Explicit proxy id (enables proxy-scoped overrides + strict startup validation).",
)
def main(
    template: str,
    port: int,
    host: str,
    reload: bool,
    auto_port: bool,
    proxy_id: str | None,
):
    """Start the Unified LLM Proxy server with template-based configuration.

    Template configurations are defined in YAML files under config/defaults/templates/.
    Each template specifies:
    - Provider (gemini, openai, litellm)
    - Model tier mappings (haiku, sonnet, opus)
    - Provider-specific settings (reasoning effort, cache TTL, etc.)
    """
    # NOTE: the docstring above doubles as the command's --help text (click
    # renders it), so it is user-facing output and is kept verbatim.
    #
    # Flow: validate template -> configure logging -> load config -> pick/verify
    # port -> optional strict proxy validation -> export runtime env vars ->
    # print banner -> hand over to uvicorn. Every failure path prints a message
    # and exits with status 1.
    import os

    from forge.config.loader import template_exists

    if not template_exists(template):
        click.echo(f"Unknown template '{template}'")
        click.echo("Run 'forge proxy template list' to see available templates.")
        sys.exit(1)

    level = get_effective_log_level()
    if level != "off":
        configure_debug_logging(component="proxy", subdirectory="proxy")
        # NOTE(review): nesting was reconstructed from a flattened listing;
        # confirm console logging is meant to be skipped when level is "off".
        configure_console_logging()

    effective_proxy_id = proxy_id

    try:
        cfg = init_config(template=template, proxy_id=effective_proxy_id)
        provider = cfg.proxy.preferred_provider
        default_port = cfg.proxy.default_port

        if not provider:
            # sys.exit raises SystemExit, which the ``except Exception`` below
            # does not intercept.
            click.echo(f"✘ Template '{template}' missing 'preferred_provider' field")
            sys.exit(1)

    except Exception as e:
        click.echo(f"✘ Failed to load template '{template}': {e}")
        sys.exit(1)

    if default_port and default_port != port:
        click.echo(
            f"⚠︎ Warning: Template '{template}' typically uses port {default_port}, but starting on port {port}"
        )
        click.echo(f" Recommended: python -m forge.proxy.server --template {template} --port {default_port}")

    actual_port = port
    if auto_port:
        if effective_proxy_id is not None:
            click.echo("✘ --auto-port cannot be used when starting under a proxy id")
            sys.exit(1)

        # NOTE(review): the probe is called with the port only, so it does not
        # check availability on ``host`` specifically.
        try:
            actual_port = find_available_port(port)
        except RuntimeError as e:
            # Exhausted the candidate range: fail like every other startup error
            # instead of leaking a traceback.
            click.echo(f"✘ {e}")
            sys.exit(1)
        if actual_port != port:
            click.echo(f"⚠︎ Port {port} is in use, using port {actual_port} instead")
    else:
        # Fail fast if the requested address is taken; the ``with`` block
        # releases the probe socket before uvicorn binds.
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
            try:
                sock.bind((host, port))
            except OSError:
                click.echo(f"✘ Port {port} is already in use!")
                click.echo(" Use --auto-port to automatically find an available port")
                sys.exit(1)

    # Strict proxy startup validation (B2.1.3)
    if effective_proxy_id is not None:
        # Import in its own ``try``: if it lived in the validation ``try`` and
        # failed, evaluating ``except ProxyStartupValidationError`` would raise
        # UnboundLocalError and hide the real import error.
        try:
            from forge.proxy.proxy_startup import (
                ProxyStartupContext,
                ProxyStartupValidationError,
                validate_proxy_startup,
            )
        except Exception as e:
            click.echo(f"✘ Failed to validate proxy startup: {e}")
            sys.exit(1)

        try:
            validate_proxy_startup(
                ctx=ProxyStartupContext(proxy_id=effective_proxy_id, template=template, port=actual_port)
            )

        except ProxyStartupValidationError as e:
            click.echo(f"✘ {e}")
            sys.exit(1)
        except Exception as e:
            click.echo(f"✘ Failed to validate proxy startup: {e}")
            sys.exit(1)

    # Track which template is active (for runtime introspection)
    # Set ACTIVE_PORT to actual_port (not port) to handle --auto-port correctly
    os.environ["ACTIVE_TEMPLATE"] = template
    os.environ["ACTIVE_PORT"] = str(actual_port)
    os.environ["PREFERRED_PROVIDER"] = provider

    # Freeze proxy id for request handlers. Set in env so the uvicorn worker
    # (which reimports the module when app is passed as a string) picks it up.
    global PROXY_ID
    PROXY_ID = effective_proxy_id
    if effective_proxy_id is not None:
        os.environ["FORGE_PROXY_ID"] = effective_proxy_id

    # Initialize in this module for direct/app-object runs; the imported
    # uvicorn app module initializes itself lazily via _ensure_runtime_state().
    _initialize_cost_tracker_from_config()

    provider_cfg = cfg.proxy.get_provider(provider)
    tier_models = {
        "haiku": provider_cfg.tiers.haiku,
        "sonnet": provider_cfg.tiers.sonnet,
        "opus": provider_cfg.tiers.opus,
    }

    # Startup banner.
    click.echo("")
    click.echo("╔══════════════════════════════════════╗")
    click.echo("║ Unified LLM Proxy Server ║")
    click.echo("╚══════════════════════════════════════╝")
    click.echo("")
    click.echo(f"🌐 Server: http://{host}:{actual_port}")
    click.echo(f" Template: {template}")
    click.echo(f"📡 Provider: {provider}")
    click.echo(f" Log Level: {level}")
    click.echo(f"🔄 Reload: {'enabled' if reload else 'disabled'}")
    click.echo("")
    click.echo(" Model Tier Mappings:")
    for tier, model in tier_models.items():
        # Tiers without a configured model are omitted from the listing.
        if model:
            click.echo(f" {tier.capitalize():6} → {model}")
    click.echo("")

    click.echo(" Provider Settings:")
    click.echo(f" cache_ttl: {provider_cfg.cache_ttl}")
    if provider_cfg.base_url:
        click.echo(f" base_url: {provider_cfg.base_url}")
    click.echo("")

    if effective_proxy_id is not None:
        click.echo(f" Proxy: ~/.forge/proxies/{effective_proxy_id}/proxy.yaml")
    else:
        click.echo(f" Template: defaults/templates/{template}.yaml")
    click.echo("")
    click.echo("Press CTRL+C to stop the server")
    click.echo("")

    # Map the effective log level onto a uvicorn level; "off" and any
    # unrecognised value fall back to "warning".
    uvicorn_level = {
        "off": "warning",
        "debug": "debug",
        "info": "info",
        "warning": "warning",
    }.get(level, "warning")

    # The app is passed as an import string rather than an object.
    uvicorn.run(
        "forge.proxy.server:app",
        host=host,
        port=actual_port,
        log_level=uvicorn_level,
        reload=reload,
    )
1558
+
1559
+
1560
# Script entry point: run the click command when this module is executed directly.
if __name__ == "__main__":
    main()