multi-forge 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- forge/__init__.py +3 -0
- forge/_extensions/agents/.gitkeep +0 -0
- forge/_extensions/commands/.gitkeep +0 -0
- forge/_extensions/skills/analyze/SKILL.md +87 -0
- forge/_extensions/skills/challenge/SKILL.md +91 -0
- forge/_extensions/skills/consensus/SKILL.md +120 -0
- forge/_extensions/skills/consensus/resources/code_consensus_evaluation.md +94 -0
- forge/_extensions/skills/consensus/resources/consensus_evaluation.md +70 -0
- forge/_extensions/skills/consensus/resources/synthesis.md +101 -0
- forge/_extensions/skills/debate/SKILL.md +116 -0
- forge/_extensions/skills/debate/resources/code_debate_evaluation.md +101 -0
- forge/_extensions/skills/debate/resources/debate_evaluation.md +90 -0
- forge/_extensions/skills/panel/SKILL.md +141 -0
- forge/_extensions/skills/panel/resources/synthesis.md +103 -0
- forge/_extensions/skills/qa/SKILL.md +704 -0
- forge/_extensions/skills/qa/resources/checklist/0-enable.md +78 -0
- forge/_extensions/skills/qa/resources/checklist/1-preflight.md +24 -0
- forge/_extensions/skills/qa/resources/checklist/10-resume.md +143 -0
- forge/_extensions/skills/qa/resources/checklist/11-config.md +150 -0
- forge/_extensions/skills/qa/resources/checklist/12-search.md +58 -0
- forge/_extensions/skills/qa/resources/checklist/13-guard.md +237 -0
- forge/_extensions/skills/qa/resources/checklist/14-workflow.md +305 -0
- forge/_extensions/skills/qa/resources/checklist/15-skills.md +155 -0
- forge/_extensions/skills/qa/resources/checklist/16-handoff.md +224 -0
- forge/_extensions/skills/qa/resources/checklist/17-info.md +50 -0
- forge/_extensions/skills/qa/resources/checklist/18-disable.md +84 -0
- forge/_extensions/skills/qa/resources/checklist/19-uninstall.md +146 -0
- forge/_extensions/skills/qa/resources/checklist/2-extensions.md +188 -0
- forge/_extensions/skills/qa/resources/checklist/20-cleanup.md +36 -0
- forge/_extensions/skills/qa/resources/checklist/3-auth.md +234 -0
- forge/_extensions/skills/qa/resources/checklist/4-proxy.md +481 -0
- forge/_extensions/skills/qa/resources/checklist/5-session.md +541 -0
- forge/_extensions/skills/qa/resources/checklist/6-hooks.md +275 -0
- forge/_extensions/skills/qa/resources/checklist/7-costs.md +309 -0
- forge/_extensions/skills/qa/resources/checklist/8-status-line.md +174 -0
- forge/_extensions/skills/qa/resources/checklist/9-direct-commands.md +146 -0
- forge/_extensions/skills/qa/resources/checklist.md +103 -0
- forge/_extensions/skills/qa/resources/report-template.md +62 -0
- forge/_extensions/skills/qa/scripts/start-container.sh +529 -0
- forge/_extensions/skills/qa/scripts/walkthrough-state.py +1137 -0
- forge/_extensions/skills/review/SKILL.md +125 -0
- forge/_extensions/skills/review/references/claude-4.6.md +474 -0
- forge/_extensions/skills/review/references/claude-4.7.md +710 -0
- forge/_extensions/skills/review/references/gemini-3.1.md +546 -0
- forge/_extensions/skills/review/references/gpt-5.5.md +490 -0
- forge/_extensions/skills/review/references/skills-writing-guide.md +1588 -0
- forge/_extensions/skills/review/resources/code-anthropic.md +160 -0
- forge/_extensions/skills/review/resources/code-gemini.md +184 -0
- forge/_extensions/skills/review/resources/code-openai.md +203 -0
- forge/_extensions/skills/review/resources/code.md +160 -0
- forge/_extensions/skills/review-docs/SKILL.md +121 -0
- forge/_extensions/skills/review-docs/resources/docs-anthropic.md +170 -0
- forge/_extensions/skills/review-docs/resources/docs-gemini.md +204 -0
- forge/_extensions/skills/review-docs/resources/docs-openai.md +231 -0
- forge/_extensions/skills/review-docs/resources/docs.md +170 -0
- forge/_extensions/skills/smoke-test/SKILL.md +27 -0
- forge/_extensions/skills/smoke-test/scripts/smoke-test.sh +118 -0
- forge/_extensions/skills/understand/SKILL.md +148 -0
- forge/_extensions/skills/understand/resources/code-anthropic.md +163 -0
- forge/_extensions/skills/understand/resources/code-gemini.md +194 -0
- forge/_extensions/skills/understand/resources/code-openai.md +181 -0
- forge/_extensions/skills/understand/resources/code.md +163 -0
- forge/_extensions/skills/understand/resources/docs-anthropic.md +177 -0
- forge/_extensions/skills/understand/resources/docs-gemini.md +202 -0
- forge/_extensions/skills/understand/resources/docs-openai.md +191 -0
- forge/_extensions/skills/understand/resources/docs.md +177 -0
- forge/_extensions/skills/walkthrough/SKILL.md +599 -0
- forge/_extensions/skills/walkthrough/resources/checklist.md +765 -0
- forge/_extensions/skills/walkthrough/scripts/run-in-repo.sh +118 -0
- forge/_extensions/skills/walkthrough/scripts/setup-test-repo.sh +198 -0
- forge/_extensions/skills/walkthrough/scripts/walkthrough-state.py +1137 -0
- forge/backend/__init__.py +174 -0
- forge/backend/adapters/__init__.py +38 -0
- forge/backend/adapters/litellm.py +158 -0
- forge/backend/creation.py +89 -0
- forge/backend/registry.py +178 -0
- forge/cli/__init__.py +16 -0
- forge/cli/auth.py +483 -0
- forge/cli/backend.py +298 -0
- forge/cli/claude.py +411 -0
- forge/cli/config_cmd.py +303 -0
- forge/cli/extensions.py +1001 -0
- forge/cli/gc.py +165 -0
- forge/cli/guard.py +1018 -0
- forge/cli/guards.py +106 -0
- forge/cli/handoff.py +110 -0
- forge/cli/hooks/__init__.py +36 -0
- forge/cli/hooks/_group.py +20 -0
- forge/cli/hooks/_helpers.py +149 -0
- forge/cli/hooks/commands.py +1677 -0
- forge/cli/hooks/direct_commands.py +1304 -0
- forge/cli/hooks/install.py +232 -0
- forge/cli/hooks/policy.py +151 -0
- forge/cli/hooks/read_hygiene.py +74 -0
- forge/cli/hooks/verification.py +370 -0
- forge/cli/logs.py +406 -0
- forge/cli/main.py +292 -0
- forge/cli/proxy.py +1821 -0
- forge/cli/proxy_costs.py +313 -0
- forge/cli/search.py +416 -0
- forge/cli/session.py +892 -0
- forge/cli/session_addendum.py +81 -0
- forge/cli/session_fork.py +750 -0
- forge/cli/session_handoff.py +141 -0
- forge/cli/session_lifecycle.py +2053 -0
- forge/cli/session_manage.py +1336 -0
- forge/cli/session_memory.py +201 -0
- forge/cli/status_line.py +1398 -0
- forge/cli/workflow.py +1964 -0
- forge/config/__init__.py +110 -0
- forge/config/dataclass_utils.py +88 -0
- forge/config/defaults/__init__.py +0 -0
- forge/config/defaults/backends/__init__.py +0 -0
- forge/config/defaults/backends/litellm.yaml +196 -0
- forge/config/defaults/templates/__init__.py +0 -0
- forge/config/defaults/templates/litellm-anthropic-local.yaml +33 -0
- forge/config/defaults/templates/litellm-anthropic.yaml +24 -0
- forge/config/defaults/templates/litellm-gemini-flash-local.yaml +37 -0
- forge/config/defaults/templates/litellm-gemini-local.yaml +32 -0
- forge/config/defaults/templates/litellm-gemini-test.yaml +34 -0
- forge/config/defaults/templates/litellm-gemini.yaml +21 -0
- forge/config/defaults/templates/litellm-openai-codex-local.yaml +36 -0
- forge/config/defaults/templates/litellm-openai-local.yaml +38 -0
- forge/config/defaults/templates/litellm-openai.yaml +28 -0
- forge/config/defaults/templates/openrouter-anthropic.yaml +23 -0
- forge/config/defaults/templates/openrouter-deepseek.yaml +26 -0
- forge/config/defaults/templates/openrouter-gemini-flash.yaml +26 -0
- forge/config/defaults/templates/openrouter-gemini.yaml +23 -0
- forge/config/defaults/templates/openrouter-glm.yaml +23 -0
- forge/config/defaults/templates/openrouter-kimi.yaml +30 -0
- forge/config/defaults/templates/openrouter-minimax.yaml +26 -0
- forge/config/defaults/templates/openrouter-openai-codex.yaml +23 -0
- forge/config/defaults/templates/openrouter-openai.yaml +28 -0
- forge/config/defaults/templates/openrouter-qwen.yaml +25 -0
- forge/config/loader.py +675 -0
- forge/config/schema.py +448 -0
- forge/core/__init__.py +5 -0
- forge/core/auth/__init__.py +67 -0
- forge/core/auth/capabilities.py +219 -0
- forge/core/auth/credentials_file.py +244 -0
- forge/core/auth/protocols.py +18 -0
- forge/core/auth/secrets.py +243 -0
- forge/core/auth/template_secrets.py +112 -0
- forge/core/data/__init__.py +5 -0
- forge/core/data/model_catalog.yaml +1522 -0
- forge/core/data/pricing.yaml +140 -0
- forge/core/data/system_prompt_addendums/__init__.py +0 -0
- forge/core/data/system_prompt_addendums/gemini.md +330 -0
- forge/core/data/system_prompt_addendums/openai.md +328 -0
- forge/core/llm/__init__.py +231 -0
- forge/core/llm/clients/__init__.py +14 -0
- forge/core/llm/clients/base.py +115 -0
- forge/core/llm/clients/litellm.py +619 -0
- forge/core/llm/clients/openai_compat.py +244 -0
- forge/core/llm/clients/openrouter.py +234 -0
- forge/core/llm/credentials.py +439 -0
- forge/core/llm/detection.py +86 -0
- forge/core/llm/errors.py +44 -0
- forge/core/llm/protocols.py +80 -0
- forge/core/llm/types.py +176 -0
- forge/core/logging.py +146 -0
- forge/core/models/__init__.py +91 -0
- forge/core/models/catalog.py +467 -0
- forge/core/models/pricing.py +165 -0
- forge/core/models/types.py +167 -0
- forge/core/naming.py +212 -0
- forge/core/ops/__init__.py +73 -0
- forge/core/ops/context.py +141 -0
- forge/core/ops/gc.py +802 -0
- forge/core/ops/proxy.py +146 -0
- forge/core/ops/resolution.py +135 -0
- forge/core/ops/session.py +344 -0
- forge/core/ops/session_context.py +548 -0
- forge/core/paths.py +38 -0
- forge/core/process.py +54 -0
- forge/core/reactive/__init__.py +38 -0
- forge/core/reactive/cost_tracking.py +300 -0
- forge/core/reactive/env.py +180 -0
- forge/core/reactive/proxy.py +78 -0
- forge/core/reactive/routing.py +622 -0
- forge/core/reactive/session_runner.py +185 -0
- forge/core/reactive/structured_output.py +62 -0
- forge/core/reactive/tagger.py +94 -0
- forge/core/reactive/throttle.py +132 -0
- forge/core/state/__init__.py +59 -0
- forge/core/state/exceptions.py +59 -0
- forge/core/state/io.py +140 -0
- forge/core/state/lock.py +99 -0
- forge/core/state/timestamps.py +60 -0
- forge/core/transcript.py +78 -0
- forge/core/typing_helpers.py +24 -0
- forge/core/workqueue/__init__.py +67 -0
- forge/core/workqueue/queue.py +552 -0
- forge/core/workqueue/types.py +63 -0
- forge/guard/__init__.py +26 -0
- forge/guard/deterministic/__init__.py +26 -0
- forge/guard/deterministic/base.py +158 -0
- forge/guard/deterministic/coding_standards.py +256 -0
- forge/guard/deterministic/registry.py +148 -0
- forge/guard/deterministic/tdd.py +171 -0
- forge/guard/engine.py +216 -0
- forge/guard/protocols.py +91 -0
- forge/guard/queries.py +96 -0
- forge/guard/semantic/__init__.py +34 -0
- forge/guard/semantic/promotion.py +18 -0
- forge/guard/semantic/supervisor.py +813 -0
- forge/guard/semantic/verdict.py +183 -0
- forge/guard/store.py +124 -0
- forge/guard/team/__init__.py +6 -0
- forge/guard/team/config.py +24 -0
- forge/guard/team/handlers.py +209 -0
- forge/guard/team/prompts.py +41 -0
- forge/guard/types.py +125 -0
- forge/guard/workflow/__init__.py +17 -0
- forge/guard/workflow/branches.py +67 -0
- forge/guard/workflow/config.py +63 -0
- forge/guard/workflow/divergence.py +113 -0
- forge/guard/workflow/policy.py +87 -0
- forge/guard/workflow/stages.py +205 -0
- forge/install/__init__.py +55 -0
- forge/install/cli.py +281 -0
- forge/install/exceptions.py +163 -0
- forge/install/hooks.py +109 -0
- forge/install/installer.py +1037 -0
- forge/install/models.py +321 -0
- forge/install/preset.py +272 -0
- forge/install/settings_merge.py +831 -0
- forge/install/tracking.py +238 -0
- forge/install/version.py +141 -0
- forge/proxy/__init__.py +0 -0
- forge/proxy/base_client.py +181 -0
- forge/proxy/client_adapter.py +476 -0
- forge/proxy/client_factory.py +531 -0
- forge/proxy/converters.py +1206 -0
- forge/proxy/cost_logger.py +132 -0
- forge/proxy/cost_tracker.py +242 -0
- forge/proxy/data_models.py +338 -0
- forge/proxy/error_hints.py +92 -0
- forge/proxy/metrics.py +222 -0
- forge/proxy/model_spec.py +158 -0
- forge/proxy/proxies.py +333 -0
- forge/proxy/proxy_identity.py +134 -0
- forge/proxy/proxy_orchestrator.py +1018 -0
- forge/proxy/proxy_startup.py +54 -0
- forge/proxy/server.py +1561 -0
- forge/proxy/utils.py +537 -0
- forge/review/__init__.py +6 -0
- forge/review/adversarial.py +111 -0
- forge/review/consensus.py +236 -0
- forge/review/engine.py +356 -0
- forge/review/models.py +437 -0
- forge/review/resources/__init__.py +5 -0
- forge/review/resources/codereview-performance.md +85 -0
- forge/review/resources/codereview-quick.md +75 -0
- forge/review/resources/codereview-security.md +92 -0
- forge/review/resources/codereview.md +85 -0
- forge/review/resources/docreview-quick.md +75 -0
- forge/review/resources/docreview.md +86 -0
- forge/review/resources/thinkdeep.md +89 -0
- forge/review/routing.py +368 -0
- forge/review/synthesis.py +73 -0
- forge/runtime_config.py +438 -0
- forge/search/__init__.py +55 -0
- forge/search/bm25_store.py +264 -0
- forge/search/content_store.py +197 -0
- forge/search/engine.py +352 -0
- forge/search/exceptions.py +51 -0
- forge/search/extractor.py +234 -0
- forge/search/index_state.py +295 -0
- forge/search/store.py +215 -0
- forge/search/tokenizer.py +24 -0
- forge/session/__init__.py +130 -0
- forge/session/active.py +339 -0
- forge/session/artifacts.py +202 -0
- forge/session/claude/__init__.py +50 -0
- forge/session/claude/cleanup.py +105 -0
- forge/session/claude/invoke.py +236 -0
- forge/session/claude/paths.py +200 -0
- forge/session/cleanup.py +216 -0
- forge/session/config.py +34 -0
- forge/session/direct_model.py +107 -0
- forge/session/effective.py +169 -0
- forge/session/exceptions.py +255 -0
- forge/session/handoff.py +881 -0
- forge/session/handoff_agent.py +544 -0
- forge/session/hooks/__init__.py +35 -0
- forge/session/hooks/models.py +73 -0
- forge/session/hooks/session_start.py +507 -0
- forge/session/identity.py +84 -0
- forge/session/index.py +553 -0
- forge/session/manager.py +1506 -0
- forge/session/models.py +572 -0
- forge/session/overrides.py +344 -0
- forge/session/plan_resolution.py +286 -0
- forge/session/prev_sessions.py +128 -0
- forge/session/store.py +431 -0
- forge/session/validation.py +47 -0
- forge/session/worktree/__init__.py +65 -0
- forge/session/worktree/cleanup.py +262 -0
- forge/session/worktree/config_copy.py +203 -0
- forge/session/worktree/create.py +332 -0
- forge/sidecar/__init__.py +29 -0
- forge/sidecar/container.py +161 -0
- forge/sidecar/docker.py +86 -0
- forge/sidecar/secrets.py +19 -0
- multi_forge-0.2.0.dist-info/METADATA +242 -0
- multi_forge-0.2.0.dist-info/RECORD +311 -0
- multi_forge-0.2.0.dist-info/WHEEL +4 -0
- multi_forge-0.2.0.dist-info/entry_points.txt +2 -0
- multi_forge-0.2.0.dist-info/licenses/LICENSE +203 -0
- multi_forge-0.2.0.dist-info/licenses/NOTICE +14 -0
forge/proxy/server.py
ADDED
|
@@ -0,0 +1,1561 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Unified LLM Proxy Server - Anthropic-compatible API for multiple providers.
|
|
3
|
+
|
|
4
|
+
This FastAPI server provides an Anthropic Messages API-compatible interface for
|
|
5
|
+
LLM providers via LiteLLM.
|
|
6
|
+
|
|
7
|
+
The server uses a unified client architecture where provider-specific logic is
|
|
8
|
+
encapsulated in client implementations that inherit from AbstractLLMClient.
|
|
9
|
+
This design ensures consistent behavior across providers while keeping the
|
|
10
|
+
server code clean and maintainable.
|
|
11
|
+
|
|
12
|
+
Key endpoints:
|
|
13
|
+
- POST /v1/messages - Main chat completion endpoint (streaming/non-streaming)
|
|
14
|
+
- POST /v1/messages/count_tokens - Token counting endpoint
|
|
15
|
+
- GET / - Health check and service information
|
|
16
|
+
|
|
17
|
+
For detailed API documentation, architecture overview, and configuration options,
|
|
18
|
+
see README.md in the project root.
|
|
19
|
+
"""
|
|
20
|
+
|
|
21
|
+
import asyncio
|
|
22
|
+
import logging
|
|
23
|
+
import os
|
|
24
|
+
import socket
|
|
25
|
+
import sys
|
|
26
|
+
import time
|
|
27
|
+
import uuid
|
|
28
|
+
from contextlib import asynccontextmanager
|
|
29
|
+
from typing import Any
|
|
30
|
+
|
|
31
|
+
import click
|
|
32
|
+
import uvicorn
|
|
33
|
+
from fastapi import FastAPI, HTTPException, Request
|
|
34
|
+
from fastapi.responses import JSONResponse, StreamingResponse
|
|
35
|
+
|
|
36
|
+
from forge.config import TierOverride, config, init_config, reload
|
|
37
|
+
from forge.core.llm.errors import AuthenticationError
|
|
38
|
+
from forge.core.logging import (
|
|
39
|
+
configure_console_logging,
|
|
40
|
+
configure_debug_logging,
|
|
41
|
+
get_effective_log_level,
|
|
42
|
+
)
|
|
43
|
+
from forge.proxy.base_client import ProxyStreamError, ToolCallError
|
|
44
|
+
from forge.proxy.client_factory import TierClientFactory
|
|
45
|
+
from forge.proxy.converters import (
|
|
46
|
+
convert_anthropic_to_openai,
|
|
47
|
+
convert_openai_to_anthropic,
|
|
48
|
+
convert_openai_to_anthropic_sse,
|
|
49
|
+
)
|
|
50
|
+
from forge.proxy.cost_logger import log_request_cost
|
|
51
|
+
from forge.proxy.cost_tracker import CostTracker
|
|
52
|
+
from forge.proxy.data_models import (
|
|
53
|
+
MessagesRequest,
|
|
54
|
+
TokenCountRequest,
|
|
55
|
+
TokenCountResponse,
|
|
56
|
+
map_model_name,
|
|
57
|
+
)
|
|
58
|
+
from forge.proxy.error_hints import enrich_error_content
|
|
59
|
+
from forge.proxy.metrics import proxy_metrics
|
|
60
|
+
from forge.proxy.utils import (
|
|
61
|
+
log_request_beautifully,
|
|
62
|
+
log_request_response,
|
|
63
|
+
log_tool_event,
|
|
64
|
+
log_tool_failure,
|
|
65
|
+
)
|
|
66
|
+
|
|
67
|
+
logger = logging.getLogger(__name__)

# Raise the threshold of the server/HTTP library loggers to WARNING.
for _quiet_logger_name in ("uvicorn", "uvicorn.access", "uvicorn.error", "httpx", "httpcore"):
    logging.getLogger(_quiet_logger_name).setLevel(logging.WARNING)
del _quiet_logger_name  # loop variable only; keep it out of the module namespace

# Shared factory used by the request handlers to obtain provider clients.
client_factory = TierClientFactory()

PREFERRED_PROVIDER = None

# When a proxy is started under a proxy id, its config should be stable for the
# lifetime of the process (no hot reload).
PROXY_ID: str | None = os.getenv("FORGE_PROXY_ID")

# Created on first use by _initialize_cost_tracker_from_config().
cost_tracker: CostTracker | None = None
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def _initialize_cost_tracker_from_config() -> CostTracker:
    """Create the module-level cost tracker on first call and return it.

    ``python -m forge.proxy.server`` executes this file as ``__main__``, while
    uvicorn imports ``forge.proxy.server:app`` for request handling, so the
    module globals must also be initialized in the imported app module.

    Subsequent calls return the already-created tracker unchanged.

    Returns:
        The module-level ``CostTracker``. It is built with the configured caps
        and bootstrapped from the request cost logs when a daily or monthly
        cap is set; otherwise it is constructed with no arguments.
    """
    global cost_tracker
    if cost_tracker is None:
        from forge.config.schema import CostConfig

        # Fall back to a default CostConfig when the proxy config has no
        # (or a falsy) ``costs`` section.
        settings = getattr(config.proxy, "costs", None) or CostConfig()
        caps = settings.caps
        uncapped = caps.per_day is None and caps.per_month is None

        if uncapped:
            cost_tracker = CostTracker()
        else:
            from forge.core.paths import get_forge_home

            cost_tracker = CostTracker(
                daily_cap_usd=caps.per_day,
                monthly_cap_usd=caps.per_month,
                cap_mode=settings.cap_mode,
                on_cap_hit=settings.on_cap_hit,
            )
            request_log_dir = get_forge_home() / "costs" / "requests"
            cost_tracker.bootstrap_from_logs(request_log_dir, proxy_id=PROXY_ID)

    return cost_tracker
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
def _ensure_runtime_state() -> None:
    """Make sure proxy config is loaded and the cost tracker exists.

    Without a proxy id the config is reloaded on every call. With a proxy id
    the config is loaded for that id only while no active template is set.
    """
    if PROXY_ID is not None:
        if not config.proxy.active_template:
            reload(proxy_id=PROXY_ID)
    else:
        reload()

    _initialize_cost_tracker_from_config()
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
def _calc_and_log_cost(
    *,
    model: str,
    tier: str,
    input_tokens: int,
    output_tokens: int,
    cached_tokens: int,
    latency_ms: float,
    failed: bool,
    request_id: str,
) -> int:
    """Price one request, append it to the persistent cost log, and return the cost.

    Best-effort: any failure while pricing, logging, or recording is logged as
    a warning and reported as a cost of 0. This function never raises, because
    cost tracking must not break the proxy request path.

    Returns:
        The request cost in microdollars, or 0 when any step failed.
    """
    try:
        from forge.core.models.pricing import calculate_cost, get_pricing

        micros = calculate_cost(model, input_tokens, output_tokens, cached_tokens)
        price_info = get_pricing(model)

        log_entry = {
            "proxy_id": PROXY_ID or "unknown",
            "model": model,
            "tier": tier,
            "input_tokens": input_tokens,
            "output_tokens": output_tokens,
            "cached_tokens": cached_tokens,
            "cost_micros": micros,
            "latency_ms": latency_ms,
            "failed": failed,
            "request_id": request_id,
            "pricing_source": price_info.source,
        }
        log_request_cost(**log_entry)

        # Feed the in-memory tracker only when one has been initialized.
        tracker = cost_tracker
        if tracker is not None:
            tracker.record(micros)
    except Exception as exc:
        logger.warning("Cost calculation failed for model=%s (non-fatal): %s", model, exc)
        return 0
    return micros
|
|
168
|
+
|
|
169
|
+
|
|
170
|
+
# Maps a cap type reported by the cost tracker to its key under ``costs.caps``.
_CAP_CONFIG_KEY = {"daily": "per_day", "monthly": "per_month"}


def _cap_result_message(cap_result) -> str:
    """Format a spend cap result for HTTP headers and errors.

    Args:
        cap_result: Object exposing ``cap_type``, ``projected``,
            ``current_micros`` and ``limit_micros`` (amounts in microdollars).

    Returns:
        A one-line message with the current and limit amounts in dollars and a
        hint naming the config key to adjust.
    """
    cap_type = cap_result.cap_type or "configured"
    if cap_result.cap_type:
        config_key = _CAP_CONFIG_KEY.get(cap_type, f"per_{cap_type}")
    else:
        # No cap type reported: list the known keys rather than inventing
        # the non-existent "per_configured".
        config_key = "<per_day|per_month>"

    prefix = "Projected " if cap_result.projected else ""
    current_usd = cap_result.current_micros / 1_000_000
    limit_usd = cap_result.limit_micros / 1_000_000
    return (
        f"{prefix}"
        f"{cap_type} spend cap reached: "
        f"${current_usd:.2f} / "
        f"${limit_usd:.2f}. "
        f"Adjust with: forge proxy set <id> costs.caps.{config_key}=<amount>"
    )
|
|
184
|
+
|
|
185
|
+
|
|
186
|
+
def _with_spend_warning(headers: dict[str, str], warning: str | None) -> dict[str, str]:
|
|
187
|
+
"""Attach the optional spend warning header to a response header dict."""
|
|
188
|
+
if warning:
|
|
189
|
+
headers["X-Spend-Warning"] = warning
|
|
190
|
+
return headers
|
|
191
|
+
|
|
192
|
+
|
|
193
|
+
def _textish_chars(value: object) -> int:
    """Approximate text-bearing request payload size for strict cap preflight.

    Counts characters recursively: strings contribute their length, lists and
    tuples the sum of their items, and dicts or other objects the sum of a
    fixed set of text-carrying keys/attributes. Anything else counts as 0.
    """
    text_fields = ("content", "text", "thinking", "input", "name", "description")

    if value is None:
        return 0
    if isinstance(value, str):
        return len(value)
    if isinstance(value, (list, tuple)):
        return sum(_textish_chars(element) for element in value)
    if isinstance(value, dict):
        return sum(_textish_chars(value[field]) for field in text_fields if field in value)

    # Arbitrary object: look for the same fields as attributes.
    return sum(_textish_chars(getattr(value, field)) for field in text_fields if hasattr(value, field))
|
|
213
|
+
|
|
214
|
+
|
|
215
|
+
def _estimate_input_tokens(request_data: MessagesRequest) -> int:
    """Approximate request input tokens for strict cap preflight.

    Sums the text-like character count of the request's ``system``,
    ``messages`` and ``tools`` fields (missing fields count as nothing) and
    divides by four, i.e. a rough four-characters-per-token estimate.
    """
    total_chars = sum(
        _textish_chars(getattr(request_data, part, None))
        for part in ("system", "messages", "tools")
    )
    return total_chars // 4
|
|
221
|
+
|
|
222
|
+
|
|
223
|
+
def _get_tier_override(tier: str) -> TierOverride | None:
    """Get tier override from the active provider config.

    Tier overrides allow per-tier hyperparameter customization (e.g., different
    reasoning_effort for opus vs sonnet when both map to the same model).

    Args:
        tier: Tier name to look up in the provider's ``tier_overrides``.

    Returns:
        The TierOverride for the specified tier, or None if none is configured
        or the provider config could not be read.
    """
    try:
        provider_cfg = config.proxy.get_provider()
        return provider_cfg.tier_overrides.get(tier)
    except Exception:
        # Best-effort: a broken or missing provider config means "no override",
        # but leave a trace so the failure can be diagnosed at debug level.
        logger.debug("tier_overrides lookup failed for tier=%s, using no override", tier, exc_info=True)
        return None
|
|
235
|
+
|
|
236
|
+
|
|
237
|
+
def _resolve_model_with_alternatives(tier: str, original_model_name: str | None, fallback_model: str) -> str:
    """Pick the backend model for a tier, preferring a configured alternative.

    Shared by message routing and token counting so both paths resolve models
    the same way. The ``[1m]`` context-window suffix is stripped from the
    requested name before lookup since it is a Claude Code hint, not a routing
    decision.

    Returns:
        The alternative mapped to the requested model name for ``tier`` when
        one exists; otherwise ``fallback_model`` (also used when the provider
        config cannot be read).
    """
    resolved = fallback_model
    try:
        alternatives = config.proxy.get_provider().model_alternatives.get(tier, {})
        if original_model_name and alternatives:
            candidate = original_model_name.removesuffix("[1m]")
            if candidate in alternatives:
                resolved = alternatives[candidate]
    except Exception:
        # Best-effort: degrade to fallback_model if provider config is unavailable
        logger.debug("model_alternatives lookup failed, using tier default", exc_info=True)
    return resolved
|
|
255
|
+
|
|
256
|
+
|
|
257
|
+
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Lifespan hook for the FastAPI app: log on startup and on shutdown.

    Code before ``yield`` runs when the context is entered, code after it
    when the context exits. No resources are acquired or released here.
    """
    logger.info("Server started...")
    yield
    logger.info("Server is shutting down... Cleaning up resources")


# The ASGI application; route handlers below are registered on this instance.
app = FastAPI(title="Unified LLM Proxy", lifespan=lifespan)
|
|
266
|
+
|
|
267
|
+
|
|
268
|
+
# --- Thinking → reasoning_effort translation ---
|
|
269
|
+
# Claude Code sends Anthropic-specific `thinking` config; litellm uses
|
|
270
|
+
# `reasoning_effort` which it translates per provider (Gemini 3: thinking_level,
|
|
271
|
+
# Gemini 2.5: thinkingBudget). These helpers map between the two.
|
|
272
|
+
|
|
273
|
+
# Ordered from lowest to highest so we can compare with max().
|
|
274
|
+
_EFFORT_RANK: dict[str | None, int] = {
|
|
275
|
+
None: -1,
|
|
276
|
+
"none": 0,
|
|
277
|
+
"disable": 0,
|
|
278
|
+
"minimal": 1,
|
|
279
|
+
"low": 2,
|
|
280
|
+
"medium": 3,
|
|
281
|
+
"high": 4,
|
|
282
|
+
"xhigh": 5,
|
|
283
|
+
}
|
|
284
|
+
|
|
285
|
+
# Budget thresholds for ceil-to-tier mapping (never downgrade).
|
|
286
|
+
# Checked top-down; first match wins. LiteLLM internal budgets for
|
|
287
|
+
# reference: low ~ 1,024, medium ~ 8,192, high ~ 24,576.
|
|
288
|
+
_BUDGET_THRESHOLDS: list[tuple[int, str]] = [
|
|
289
|
+
(25_000, "xhigh"), # >=25k tokens -> xhigh (above litellm high)
|
|
290
|
+
(10_000, "high"), # >=10k tokens -> high
|
|
291
|
+
(2_000, "medium"), # >=2k tokens -> medium
|
|
292
|
+
(500, "low"), # >=500 tokens -> low
|
|
293
|
+
(1, "minimal"), # >=1 token -> minimal
|
|
294
|
+
]
|
|
295
|
+
|
|
296
|
+
# Type-based fallback when budget_tokens is absent.
|
|
297
|
+
_TYPE_TO_EFFORT: dict[str, str] = {
|
|
298
|
+
"enabled": "high",
|
|
299
|
+
"adaptive": "medium",
|
|
300
|
+
"disabled": "none",
|
|
301
|
+
}
|
|
302
|
+
|
|
303
|
+
|
|
304
|
+
def _derive_reasoning_effort(thinking: dict[str, object] | object | None) -> str | None:
|
|
305
|
+
"""Derive reasoning_effort from Claude Code's thinking config.
|
|
306
|
+
|
|
307
|
+
Priority: budget_tokens (numeric, precise) > type (semantic label).
|
|
308
|
+
Unknown types default to "medium" (safe — never results in no reasoning).
|
|
309
|
+
"""
|
|
310
|
+
if not isinstance(thinking, dict):
|
|
311
|
+
return None
|
|
312
|
+
|
|
313
|
+
# 1) Use budget_tokens if present — data-driven, not label-driven.
|
|
314
|
+
budget = thinking.get("budget_tokens")
|
|
315
|
+
if isinstance(budget, (int, float)) and budget > 0:
|
|
316
|
+
for threshold, effort in _BUDGET_THRESHOLDS:
|
|
317
|
+
if budget >= threshold:
|
|
318
|
+
return effort
|
|
319
|
+
return "minimal" # budget_tokens in (0, 1) — fractional edge case
|
|
320
|
+
|
|
321
|
+
# 2) Fall back to type-based mapping.
|
|
322
|
+
thinking_type = thinking.get("type")
|
|
323
|
+
if isinstance(thinking_type, str):
|
|
324
|
+
mapped: str | None = _TYPE_TO_EFFORT.get(thinking_type)
|
|
325
|
+
if mapped is not None:
|
|
326
|
+
return mapped
|
|
327
|
+
# Unknown type — default to medium (safe), log warning.
|
|
328
|
+
logger.warning(
|
|
329
|
+
"Unknown thinking type '%s', defaulting to reasoning_effort='medium'",
|
|
330
|
+
thinking_type,
|
|
331
|
+
)
|
|
332
|
+
return "medium"
|
|
333
|
+
|
|
334
|
+
return None
|
|
335
|
+
|
|
336
|
+
|
|
337
|
+
def _max_effort(a: str | None, b: str | None) -> str | None:
|
|
338
|
+
"""Return the higher of two reasoning_effort levels, treating None as unset."""
|
|
339
|
+
if a is None:
|
|
340
|
+
return b
|
|
341
|
+
if b is None:
|
|
342
|
+
return a
|
|
343
|
+
return a if _EFFORT_RANK.get(a, 3) >= _EFFORT_RANK.get(b, 3) else b
|
|
344
|
+
|
|
345
|
+
|
|
346
|
+
@app.post("/v1/messages", response_model=None)
|
|
347
|
+
async def create_message(request_data: MessagesRequest, raw_request: Request):
|
|
348
|
+
"""
|
|
349
|
+
Process chat completion requests using unified client architecture.
|
|
350
|
+
|
|
351
|
+
This endpoint handles both streaming and non-streaming responses,
|
|
352
|
+
automatically routing to the appropriate provider based on model name.
|
|
353
|
+
"""
|
|
354
|
+
request_id = raw_request.state.request_id
|
|
355
|
+
start_time = time.time()
|
|
356
|
+
|
|
357
|
+
_ensure_runtime_state()
|
|
358
|
+
|
|
359
|
+
spend_warning: str | None = None
|
|
360
|
+
|
|
361
|
+
# Resolve effective tier (routing invariants):
|
|
362
|
+
# Precedence: request explicit tier > config.proxy.default_tier
|
|
363
|
+
# If neither is available, fail fast (misconfiguration).
|
|
364
|
+
if request_data.has_explicit_tier and request_data.tier:
|
|
365
|
+
# Request explicitly specified a tier (haiku/sonnet/opus in model name)
|
|
366
|
+
resolved_tier: str = request_data.tier
|
|
367
|
+
resolved_tier_source = "request"
|
|
368
|
+
elif config.proxy.default_tier:
|
|
369
|
+
resolved_tier = config.proxy.default_tier
|
|
370
|
+
resolved_tier_source = "proxy.default_tier"
|
|
371
|
+
else:
|
|
372
|
+
raise HTTPException(
|
|
373
|
+
status_code=500,
|
|
374
|
+
detail={
|
|
375
|
+
"type": "configuration_error",
|
|
376
|
+
"message": "config.proxy.default_tier is required for ambiguous requests under proxy-only routing",
|
|
377
|
+
},
|
|
378
|
+
)
|
|
379
|
+
|
|
380
|
+
logger.debug(f"[{request_id}] Resolved tier: {resolved_tier} (source={resolved_tier_source})")
|
|
381
|
+
|
|
382
|
+
request_data.tier = resolved_tier
|
|
383
|
+
|
|
384
|
+
# Determine if this is an explicit backend model or needs tier-based resolution
|
|
385
|
+
# Only re-resolve model based on tier if:
|
|
386
|
+
# 1. Model was mapped from Anthropic-style (contains haiku/sonnet/opus), OR
|
|
387
|
+
# 2. Model is truly ambiguous (no provider prefix and not a known backend model)
|
|
388
|
+
# Do NOT override explicit backend models like "openai/gpt-5.5" or "vertex_ai/gemini-3.1-pro"
|
|
389
|
+
original_model_name = request_data.original_model_name
|
|
390
|
+
mapped_model = map_model_name(request_data.model) # Map AFTER reload() for fresh config
|
|
391
|
+
|
|
392
|
+
# Check if original model is an explicit backend model (has provider prefix)
|
|
393
|
+
# These should be passed through, not tier-resolved
|
|
394
|
+
if config.proxy.preferred_provider == "openrouter":
|
|
395
|
+
# OpenRouter: any provider/model format is explicit (google/, meta-llama/, etc.)
|
|
396
|
+
is_explicit_backend = original_model_name is not None and "/" in original_model_name
|
|
397
|
+
else:
|
|
398
|
+
is_explicit_backend = (
|
|
399
|
+
original_model_name is not None
|
|
400
|
+
and "/" in original_model_name
|
|
401
|
+
and any(
|
|
402
|
+
original_model_name.startswith(prefix)
|
|
403
|
+
for prefix in [
|
|
404
|
+
"openai/",
|
|
405
|
+
"anthropic/",
|
|
406
|
+
"vertex_ai/",
|
|
407
|
+
"bedrock/",
|
|
408
|
+
"gemini/",
|
|
409
|
+
"together_ai/",
|
|
410
|
+
"replicate/",
|
|
411
|
+
]
|
|
412
|
+
)
|
|
413
|
+
)
|
|
414
|
+
|
|
415
|
+
# Only use tier-resolved model for Anthropic-style or ambiguous requests
|
|
416
|
+
# For explicit backend models, use what map_model_name() returned (usually pass-through)
|
|
417
|
+
if is_explicit_backend:
|
|
418
|
+
# Explicit backend model - preserve it (map_model_name already handled it)
|
|
419
|
+
actual_model_id = mapped_model
|
|
420
|
+
logger.debug(
|
|
421
|
+
f"[{request_id}] Explicit backend model '{original_model_name}' - preserving as '{actual_model_id}'"
|
|
422
|
+
)
|
|
423
|
+
else:
|
|
424
|
+
# Anthropic-style or ambiguous — check alternatives, then fall back to tier default
|
|
425
|
+
tier_default = config.proxy.get_model_for_tier(resolved_tier)
|
|
426
|
+
actual_model_id = _resolve_model_with_alternatives(resolved_tier, original_model_name, tier_default)
|
|
427
|
+
logger.debug(f"[{request_id}] Tier-resolved model: tier={resolved_tier} -> '{actual_model_id}'")
|
|
428
|
+
|
|
429
|
+
# Spend cap check (after model resolution so strict preflight prices the actual model)
|
|
430
|
+
if cost_tracker is not None and cost_tracker.has_caps:
|
|
431
|
+
projected = 0
|
|
432
|
+
if cost_tracker.cap_mode == "strict":
|
|
433
|
+
from forge.core.models.pricing import calculate_cost as _est_cost
|
|
434
|
+
|
|
435
|
+
_est_max_output = request_data.max_tokens or 4096
|
|
436
|
+
_est_input = _estimate_input_tokens(request_data)
|
|
437
|
+
try:
|
|
438
|
+
projected = _est_cost(actual_model_id, _est_input, _est_max_output, 0)
|
|
439
|
+
except Exception:
|
|
440
|
+
projected = 0
|
|
441
|
+
|
|
442
|
+
cap_result = cost_tracker.check_cap(projected_cost_micros=projected)
|
|
443
|
+
if cap_result.exceeded:
|
|
444
|
+
spend_warning = _cap_result_message(cap_result)
|
|
445
|
+
if cost_tracker.on_cap_hit == "reject":
|
|
446
|
+
return JSONResponse(
|
|
447
|
+
status_code=429,
|
|
448
|
+
content={
|
|
449
|
+
"type": "error",
|
|
450
|
+
"error": {
|
|
451
|
+
"type": "spend_cap_exceeded",
|
|
452
|
+
"message": spend_warning,
|
|
453
|
+
},
|
|
454
|
+
},
|
|
455
|
+
headers={"X-Request-ID": request_id},
|
|
456
|
+
)
|
|
457
|
+
logger.warning("[%s] %s", request_id, spend_warning)
|
|
458
|
+
|
|
459
|
+
try:
|
|
460
|
+
num_messages = len(request_data.messages) if request_data.messages else 0
|
|
461
|
+
num_tools = len(request_data.tools) if request_data.tools else 0
|
|
462
|
+
tool_names = [tool.name for tool in request_data.tools] if request_data.tools else []
|
|
463
|
+
has_system = bool(request_data.system)
|
|
464
|
+
|
|
465
|
+
await _check_client_tool_failures(request_data, request_id, actual_model_id)
|
|
466
|
+
|
|
467
|
+
# Detect provider BEFORE conversion to enable provider-specific schema handling
|
|
468
|
+
detected_provider = client_factory.detect_provider_for_model(actual_model_id)
|
|
469
|
+
provider_name = detected_provider.value # Convert enum to string
|
|
470
|
+
|
|
471
|
+
logger.debug(
|
|
472
|
+
f"[{request_id}] Processing '/v1/messages': "
|
|
473
|
+
f"original='{original_model_name}', target='{actual_model_id}', provider='{provider_name}', "
|
|
474
|
+
f"messages={num_messages}, tools={num_tools}, stream={request_data.stream}"
|
|
475
|
+
)
|
|
476
|
+
|
|
477
|
+
openai_request_dict = convert_anthropic_to_openai(request_data, provider=provider_name)
|
|
478
|
+
|
|
479
|
+
openai_request_dict["model"] = actual_model_id
|
|
480
|
+
|
|
481
|
+
# Forward User-Agent from incoming request (Claude Code identity).
|
|
482
|
+
# Upstream LLM gateways may filter traffic by User-Agent; without this,
|
|
483
|
+
# the proxy's OpenAI SDK default header could cause requests to be blocked.
|
|
484
|
+
# Only inject for LiteLLM providers (other clients don't need it).
|
|
485
|
+
if provider_name in ("litellm_remote", "litellm_local", "openrouter"):
|
|
486
|
+
incoming_user_agent = raw_request.headers.get("user-agent")
|
|
487
|
+
if incoming_user_agent:
|
|
488
|
+
openai_request_dict["_user_agent"] = incoming_user_agent
|
|
489
|
+
logger.debug(f"[{request_id}] Forwarding User-Agent: {incoming_user_agent[:120]!r}")
|
|
490
|
+
|
|
491
|
+
# Priority: request explicit > tier_override > model default (in catalog)
|
|
492
|
+
tier_override = _get_tier_override(resolved_tier)
|
|
493
|
+
if tier_override:
|
|
494
|
+
logger.debug(f"[{request_id}] Tier override for '{resolved_tier}': {tier_override}")
|
|
495
|
+
|
|
496
|
+
if request_data.temperature is not None:
|
|
497
|
+
openai_request_dict["temperature"] = request_data.temperature
|
|
498
|
+
elif tier_override and tier_override.temperature is not None:
|
|
499
|
+
openai_request_dict["temperature"] = tier_override.temperature
|
|
500
|
+
|
|
501
|
+
if request_data.max_tokens is not None:
|
|
502
|
+
openai_request_dict["max_tokens"] = request_data.max_tokens
|
|
503
|
+
if request_data.top_p is not None:
|
|
504
|
+
openai_request_dict["top_p"] = request_data.top_p
|
|
505
|
+
|
|
506
|
+
# Optional reasoning/thinking overrides.
|
|
507
|
+
# Priority: request explicit > thinking-derived > tier_override > model default
|
|
508
|
+
# tier_override acts as a FLOOR (never go below the user's tier config).
|
|
509
|
+
# Use getattr() for test stubs that don't include new fields.
|
|
510
|
+
reasoning_effort = getattr(request_data, "reasoning_effort", None)
|
|
511
|
+
if reasoning_effort is not None:
|
|
512
|
+
openai_request_dict["reasoning_effort"] = reasoning_effort
|
|
513
|
+
else:
|
|
514
|
+
# Claude Code sends `thinking` (Anthropic-specific) instead of
|
|
515
|
+
# `reasoning_effort`. Translate to reasoning_effort so litellm can
|
|
516
|
+
# map it to each provider's native parameter.
|
|
517
|
+
thinking = getattr(request_data, "thinking", None)
|
|
518
|
+
derived = _derive_reasoning_effort(thinking)
|
|
519
|
+
|
|
520
|
+
# Apply tier_override as a floor: max(derived, tier_override).
|
|
521
|
+
tier_effort = tier_override.reasoning_effort if tier_override else None
|
|
522
|
+
openai_request_dict["reasoning_effort"] = _max_effort(derived, tier_effort)
|
|
523
|
+
|
|
524
|
+
# Note: the raw `thinking` dict is NOT forwarded — it's Anthropic-specific.
|
|
525
|
+
# Litellm controls thinking via reasoning_effort (mapped above).
|
|
526
|
+
|
|
527
|
+
verbosity = getattr(request_data, "verbosity", None)
|
|
528
|
+
if verbosity is not None:
|
|
529
|
+
openai_request_dict["verbosity"] = verbosity
|
|
530
|
+
elif tier_override and tier_override.verbosity is not None:
|
|
531
|
+
openai_request_dict["verbosity"] = tier_override.verbosity
|
|
532
|
+
|
|
533
|
+
if request_data.stop_sequences:
|
|
534
|
+
openai_request_dict["stop"] = request_data.stop_sequences
|
|
535
|
+
|
|
536
|
+
# Get unified client for this model (pass tier for tier-specific hyperparameters)
|
|
537
|
+
try:
|
|
538
|
+
client = await client_factory.get_client(actual_model_id, tier=request_data.tier)
|
|
539
|
+
logger.debug(f"[{request_id}] Got client for {actual_model_id} (tier={request_data.tier})")
|
|
540
|
+
except AuthenticationError as e:
|
|
541
|
+
logger.error(f"[{request_id}] Authentication failed: {e}")
|
|
542
|
+
raise HTTPException(
|
|
543
|
+
status_code=401,
|
|
544
|
+
detail={
|
|
545
|
+
"type": "authentication_error",
|
|
546
|
+
"message": f"Authentication failed [{request_id}]",
|
|
547
|
+
},
|
|
548
|
+
)
|
|
549
|
+
|
|
550
|
+
if request_data.stream:
|
|
551
|
+
# Streaming response
|
|
552
|
+
async def stream_generator():
|
|
553
|
+
try:
|
|
554
|
+
async for chunk in client.create_streaming_completion(openai_request_dict, request_id):
|
|
555
|
+
yield chunk
|
|
556
|
+
except ToolCallError as e:
|
|
557
|
+
logger.error(f"[{request_id}] ToolCallError: {e}")
|
|
558
|
+
yield {
|
|
559
|
+
"error": {
|
|
560
|
+
"type": e.error_type,
|
|
561
|
+
"message": f"Tool call error [{request_id}]",
|
|
562
|
+
}
|
|
563
|
+
}
|
|
564
|
+
except ProxyStreamError as e:
|
|
565
|
+
logger.error(f"[{request_id}] ProxyStreamError ({e.error_type}): {e}")
|
|
566
|
+
yield {
|
|
567
|
+
"error": {
|
|
568
|
+
"type": e.error_type,
|
|
569
|
+
"message": f"Streaming request failed [{request_id}]",
|
|
570
|
+
"status_code": e.status_code,
|
|
571
|
+
}
|
|
572
|
+
}
|
|
573
|
+
|
|
574
|
+
headers = {
|
|
575
|
+
"X-Request-ID": request_id,
|
|
576
|
+
"X-Resolved-Tier": resolved_tier,
|
|
577
|
+
"X-Resolved-Model": actual_model_id,
|
|
578
|
+
"X-Cumulative-Cost": f"{proxy_metrics.total_cost_micros / 1_000_000:.6f}",
|
|
579
|
+
"Cache-Control": "no-cache",
|
|
580
|
+
"Connection": "keep-alive",
|
|
581
|
+
}
|
|
582
|
+
headers = _with_spend_warning(headers, spend_warning)
|
|
583
|
+
|
|
584
|
+
# Log streaming request (no response body available)
|
|
585
|
+
duration_ms = (time.time() - start_time) * 1000
|
|
586
|
+
asyncio.create_task(
|
|
587
|
+
log_request_response(
|
|
588
|
+
request_id=request_id,
|
|
589
|
+
original_model=original_model_name or "",
|
|
590
|
+
mapped_model=actual_model_id,
|
|
591
|
+
request_body=request_data.model_dump(),
|
|
592
|
+
response_body=None, # Streaming has no response body
|
|
593
|
+
status_code=200,
|
|
594
|
+
duration_ms=duration_ms,
|
|
595
|
+
num_messages=num_messages,
|
|
596
|
+
num_tools=num_tools,
|
|
597
|
+
tool_names=tool_names,
|
|
598
|
+
has_system=has_system,
|
|
599
|
+
temperature=request_data.temperature,
|
|
600
|
+
max_tokens=request_data.max_tokens,
|
|
601
|
+
streaming=True,
|
|
602
|
+
)
|
|
603
|
+
)
|
|
604
|
+
|
|
605
|
+
log_request_beautifully(
|
|
606
|
+
method="POST",
|
|
607
|
+
path="/v1/messages (streaming)",
|
|
608
|
+
original_model=original_model_name or "",
|
|
609
|
+
mapped_model=actual_model_id,
|
|
610
|
+
num_messages=num_messages,
|
|
611
|
+
num_tools=num_tools,
|
|
612
|
+
status_code=200,
|
|
613
|
+
)
|
|
614
|
+
|
|
615
|
+
def _on_stream_complete(usage: dict[str, int], failed: bool, error_type: str | None) -> None:
|
|
616
|
+
elapsed = (time.time() - start_time) * 1000
|
|
617
|
+
in_tok = usage.get("input_tokens", 0)
|
|
618
|
+
out_tok = usage.get("output_tokens", 0)
|
|
619
|
+
cache_tok = usage.get("cached_tokens", 0)
|
|
620
|
+
cost = _calc_and_log_cost(
|
|
621
|
+
model=actual_model_id,
|
|
622
|
+
tier=resolved_tier,
|
|
623
|
+
input_tokens=in_tok,
|
|
624
|
+
output_tokens=out_tok,
|
|
625
|
+
cached_tokens=cache_tok,
|
|
626
|
+
latency_ms=elapsed,
|
|
627
|
+
failed=failed,
|
|
628
|
+
request_id=request_id,
|
|
629
|
+
)
|
|
630
|
+
proxy_metrics.record_request(
|
|
631
|
+
tier=resolved_tier,
|
|
632
|
+
model=actual_model_id,
|
|
633
|
+
input_tokens=in_tok,
|
|
634
|
+
output_tokens=out_tok,
|
|
635
|
+
cached_tokens=cache_tok,
|
|
636
|
+
latency_ms=elapsed,
|
|
637
|
+
streaming=True,
|
|
638
|
+
failed=failed,
|
|
639
|
+
error_type=error_type,
|
|
640
|
+
cost_micros=cost,
|
|
641
|
+
)
|
|
642
|
+
|
|
643
|
+
return StreamingResponse(
|
|
644
|
+
convert_openai_to_anthropic_sse(
|
|
645
|
+
stream_generator(),
|
|
646
|
+
request_data,
|
|
647
|
+
request_id,
|
|
648
|
+
on_complete=_on_stream_complete,
|
|
649
|
+
),
|
|
650
|
+
media_type="text/event-stream",
|
|
651
|
+
headers=headers,
|
|
652
|
+
)
|
|
653
|
+
else:
|
|
654
|
+
try:
|
|
655
|
+
openai_response = await client.create_completion(openai_request_dict, request_id)
|
|
656
|
+
anthropic_response = convert_openai_to_anthropic(openai_response, original_model_name)
|
|
657
|
+
|
|
658
|
+
if not anthropic_response:
|
|
659
|
+
raise HTTPException(
|
|
660
|
+
status_code=500,
|
|
661
|
+
detail={
|
|
662
|
+
"type": "api_error",
|
|
663
|
+
"message": "Failed to convert response",
|
|
664
|
+
},
|
|
665
|
+
)
|
|
666
|
+
|
|
667
|
+
response_dict = anthropic_response.model_dump()
|
|
668
|
+
response_dict["_request_id"] = request_id
|
|
669
|
+
|
|
670
|
+
duration_ms = (time.time() - start_time) * 1000
|
|
671
|
+
|
|
672
|
+
_usage = openai_response.get("usage", {})
|
|
673
|
+
_in = _usage.get("prompt_tokens", 0)
|
|
674
|
+
_out = _usage.get("completion_tokens", 0)
|
|
675
|
+
_cached = _usage.get("cached_tokens", 0)
|
|
676
|
+
_cost = _calc_and_log_cost(
|
|
677
|
+
model=actual_model_id,
|
|
678
|
+
tier=resolved_tier,
|
|
679
|
+
input_tokens=_in,
|
|
680
|
+
output_tokens=_out,
|
|
681
|
+
cached_tokens=_cached,
|
|
682
|
+
latency_ms=duration_ms,
|
|
683
|
+
failed=False,
|
|
684
|
+
request_id=request_id,
|
|
685
|
+
)
|
|
686
|
+
proxy_metrics.record_request(
|
|
687
|
+
tier=resolved_tier,
|
|
688
|
+
model=actual_model_id,
|
|
689
|
+
input_tokens=_in,
|
|
690
|
+
output_tokens=_out,
|
|
691
|
+
cached_tokens=_cached,
|
|
692
|
+
latency_ms=duration_ms,
|
|
693
|
+
streaming=False,
|
|
694
|
+
failed=False,
|
|
695
|
+
error_type=None,
|
|
696
|
+
cost_micros=_cost,
|
|
697
|
+
)
|
|
698
|
+
|
|
699
|
+
asyncio.create_task(
|
|
700
|
+
log_request_response(
|
|
701
|
+
request_id=request_id,
|
|
702
|
+
original_model=original_model_name or "",
|
|
703
|
+
mapped_model=actual_model_id,
|
|
704
|
+
request_body=request_data.model_dump(),
|
|
705
|
+
response_body=response_dict,
|
|
706
|
+
status_code=200,
|
|
707
|
+
duration_ms=duration_ms,
|
|
708
|
+
num_messages=num_messages,
|
|
709
|
+
num_tools=num_tools,
|
|
710
|
+
tool_names=tool_names,
|
|
711
|
+
has_system=has_system,
|
|
712
|
+
temperature=request_data.temperature,
|
|
713
|
+
max_tokens=request_data.max_tokens,
|
|
714
|
+
streaming=False,
|
|
715
|
+
)
|
|
716
|
+
)
|
|
717
|
+
|
|
718
|
+
log_request_beautifully(
|
|
719
|
+
method="POST",
|
|
720
|
+
path="/v1/messages",
|
|
721
|
+
original_model=original_model_name or "",
|
|
722
|
+
mapped_model=actual_model_id,
|
|
723
|
+
num_messages=num_messages,
|
|
724
|
+
num_tools=num_tools,
|
|
725
|
+
status_code=200,
|
|
726
|
+
)
|
|
727
|
+
return JSONResponse(
|
|
728
|
+
content=response_dict,
|
|
729
|
+
headers=_with_spend_warning(
|
|
730
|
+
{
|
|
731
|
+
"X-Request-ID": request_id,
|
|
732
|
+
"X-Resolved-Tier": resolved_tier,
|
|
733
|
+
"X-Resolved-Model": actual_model_id,
|
|
734
|
+
"X-Request-Cost": f"{_cost / 1_000_000:.6f}",
|
|
735
|
+
"X-Cumulative-Cost": f"{proxy_metrics.total_cost_micros / 1_000_000:.6f}",
|
|
736
|
+
},
|
|
737
|
+
spend_warning,
|
|
738
|
+
),
|
|
739
|
+
)
|
|
740
|
+
|
|
741
|
+
except ToolCallError as e:
|
|
742
|
+
duration_ms = (time.time() - start_time) * 1000
|
|
743
|
+
error_msg = str(e)
|
|
744
|
+
|
|
745
|
+
_tc_cost = _calc_and_log_cost(
|
|
746
|
+
model=actual_model_id,
|
|
747
|
+
tier=resolved_tier,
|
|
748
|
+
input_tokens=0,
|
|
749
|
+
output_tokens=0,
|
|
750
|
+
cached_tokens=0,
|
|
751
|
+
latency_ms=duration_ms,
|
|
752
|
+
failed=True,
|
|
753
|
+
request_id=request_id,
|
|
754
|
+
)
|
|
755
|
+
proxy_metrics.record_request(
|
|
756
|
+
tier=resolved_tier,
|
|
757
|
+
model=actual_model_id,
|
|
758
|
+
input_tokens=0,
|
|
759
|
+
output_tokens=0,
|
|
760
|
+
cached_tokens=0,
|
|
761
|
+
latency_ms=duration_ms,
|
|
762
|
+
streaming=False,
|
|
763
|
+
failed=True,
|
|
764
|
+
error_type="tool_call_error",
|
|
765
|
+
cost_micros=_tc_cost,
|
|
766
|
+
)
|
|
767
|
+
|
|
768
|
+
asyncio.create_task(
|
|
769
|
+
log_request_response(
|
|
770
|
+
request_id=request_id,
|
|
771
|
+
original_model=original_model_name or "",
|
|
772
|
+
mapped_model=actual_model_id,
|
|
773
|
+
request_body=request_data.model_dump(),
|
|
774
|
+
response_body=None,
|
|
775
|
+
status_code=400,
|
|
776
|
+
duration_ms=duration_ms,
|
|
777
|
+
error=error_msg,
|
|
778
|
+
num_messages=num_messages,
|
|
779
|
+
num_tools=num_tools,
|
|
780
|
+
tool_names=tool_names,
|
|
781
|
+
has_system=has_system,
|
|
782
|
+
temperature=request_data.temperature,
|
|
783
|
+
max_tokens=request_data.max_tokens,
|
|
784
|
+
streaming=False,
|
|
785
|
+
)
|
|
786
|
+
)
|
|
787
|
+
|
|
788
|
+
log_request_beautifully(
|
|
789
|
+
method="POST",
|
|
790
|
+
path="/v1/messages",
|
|
791
|
+
original_model=original_model_name or "",
|
|
792
|
+
mapped_model=actual_model_id,
|
|
793
|
+
num_messages=num_messages,
|
|
794
|
+
num_tools=num_tools,
|
|
795
|
+
status_code=400,
|
|
796
|
+
)
|
|
797
|
+
|
|
798
|
+
logger.error(f"[{request_id}] Tool call error: {e}")
|
|
799
|
+
raise HTTPException(
|
|
800
|
+
status_code=400,
|
|
801
|
+
detail={"type": "invalid_request_error", "message": error_msg},
|
|
802
|
+
)
|
|
803
|
+
except AuthenticationError:
|
|
804
|
+
# Try refreshing credentials once
|
|
805
|
+
logger.warning(f"[{request_id}] Auth failed, refreshing credentials")
|
|
806
|
+
client = await client_factory.invalidate_and_retry(actual_model_id)
|
|
807
|
+
openai_response = await client.create_completion(openai_request_dict, request_id)
|
|
808
|
+
anthropic_response = convert_openai_to_anthropic(openai_response, original_model_name)
|
|
809
|
+
|
|
810
|
+
if not anthropic_response:
|
|
811
|
+
raise HTTPException(
|
|
812
|
+
status_code=500,
|
|
813
|
+
detail={
|
|
814
|
+
"type": "api_error",
|
|
815
|
+
"message": "Failed to convert response after retry",
|
|
816
|
+
},
|
|
817
|
+
)
|
|
818
|
+
|
|
819
|
+
retry_duration_ms = (time.time() - start_time) * 1000
|
|
820
|
+
_retry_usage = openai_response.get("usage", {})
|
|
821
|
+
_ri = _retry_usage.get("prompt_tokens", 0)
|
|
822
|
+
_ro = _retry_usage.get("completion_tokens", 0)
|
|
823
|
+
_rc = _retry_usage.get("cached_tokens", 0)
|
|
824
|
+
_rcost = _calc_and_log_cost(
|
|
825
|
+
model=actual_model_id,
|
|
826
|
+
tier=resolved_tier,
|
|
827
|
+
input_tokens=_ri,
|
|
828
|
+
output_tokens=_ro,
|
|
829
|
+
cached_tokens=_rc,
|
|
830
|
+
latency_ms=retry_duration_ms,
|
|
831
|
+
failed=False,
|
|
832
|
+
request_id=request_id,
|
|
833
|
+
)
|
|
834
|
+
proxy_metrics.record_request(
|
|
835
|
+
tier=resolved_tier,
|
|
836
|
+
model=actual_model_id,
|
|
837
|
+
input_tokens=_ri,
|
|
838
|
+
output_tokens=_ro,
|
|
839
|
+
cached_tokens=_rc,
|
|
840
|
+
latency_ms=retry_duration_ms,
|
|
841
|
+
streaming=False,
|
|
842
|
+
failed=False,
|
|
843
|
+
error_type=None,
|
|
844
|
+
cost_micros=_rcost,
|
|
845
|
+
)
|
|
846
|
+
|
|
847
|
+
response_dict = anthropic_response.model_dump()
|
|
848
|
+
response_dict["_request_id"] = request_id
|
|
849
|
+
return JSONResponse(
|
|
850
|
+
content=response_dict,
|
|
851
|
+
headers=_with_spend_warning(
|
|
852
|
+
{
|
|
853
|
+
"X-Request-ID": request_id,
|
|
854
|
+
"X-Resolved-Tier": resolved_tier,
|
|
855
|
+
"X-Resolved-Model": actual_model_id,
|
|
856
|
+
"X-Request-Cost": f"{_rcost / 1_000_000:.6f}",
|
|
857
|
+
"X-Cumulative-Cost": f"{proxy_metrics.total_cost_micros / 1_000_000:.6f}",
|
|
858
|
+
},
|
|
859
|
+
spend_warning,
|
|
860
|
+
),
|
|
861
|
+
)
|
|
862
|
+
|
|
863
|
+
except HTTPException:
|
|
864
|
+
raise
|
|
865
|
+
except Exception as e:
|
|
866
|
+
duration_ms = (time.time() - start_time) * 1000
|
|
867
|
+
error_msg = f"Internal error [{request_id}]"
|
|
868
|
+
|
|
869
|
+
_err_cost = _calc_and_log_cost(
|
|
870
|
+
model=actual_model_id,
|
|
871
|
+
tier=resolved_tier,
|
|
872
|
+
input_tokens=0,
|
|
873
|
+
output_tokens=0,
|
|
874
|
+
cached_tokens=0,
|
|
875
|
+
latency_ms=duration_ms,
|
|
876
|
+
failed=True,
|
|
877
|
+
request_id=request_id,
|
|
878
|
+
)
|
|
879
|
+
proxy_metrics.record_request(
|
|
880
|
+
tier=resolved_tier,
|
|
881
|
+
model=actual_model_id,
|
|
882
|
+
input_tokens=0,
|
|
883
|
+
output_tokens=0,
|
|
884
|
+
cached_tokens=0,
|
|
885
|
+
latency_ms=duration_ms,
|
|
886
|
+
streaming=request_data.stream or False,
|
|
887
|
+
failed=True,
|
|
888
|
+
error_type="api_error",
|
|
889
|
+
cost_micros=_err_cost,
|
|
890
|
+
)
|
|
891
|
+
|
|
892
|
+
asyncio.create_task(
|
|
893
|
+
log_request_response(
|
|
894
|
+
request_id=request_id,
|
|
895
|
+
original_model=original_model_name or "",
|
|
896
|
+
mapped_model=actual_model_id,
|
|
897
|
+
request_body=request_data.model_dump(),
|
|
898
|
+
response_body=None,
|
|
899
|
+
status_code=500,
|
|
900
|
+
duration_ms=duration_ms,
|
|
901
|
+
error=error_msg,
|
|
902
|
+
num_messages=num_messages,
|
|
903
|
+
num_tools=num_tools,
|
|
904
|
+
tool_names=tool_names,
|
|
905
|
+
has_system=has_system,
|
|
906
|
+
temperature=request_data.temperature,
|
|
907
|
+
max_tokens=request_data.max_tokens,
|
|
908
|
+
streaming=request_data.stream or False,
|
|
909
|
+
)
|
|
910
|
+
)
|
|
911
|
+
|
|
912
|
+
log_request_beautifully(
|
|
913
|
+
method="POST",
|
|
914
|
+
path="/v1/messages",
|
|
915
|
+
original_model=original_model_name or "",
|
|
916
|
+
mapped_model=actual_model_id,
|
|
917
|
+
num_messages=num_messages,
|
|
918
|
+
num_tools=num_tools,
|
|
919
|
+
status_code=500,
|
|
920
|
+
)
|
|
921
|
+
|
|
922
|
+
logger.error(f"[{request_id}] Unexpected error: {e}", exc_info=True)
|
|
923
|
+
raise HTTPException(status_code=500, detail={"type": "api_error", "message": error_msg})
|
|
924
|
+
|
|
925
|
+
|
|
926
|
+
@app.post("/v1/messages/count_tokens", response_model=TokenCountResponse)
async def count_tokens(request_data: TokenCountRequest, raw_request: Request):
    """Count input tokens using the client of the model the request resolves to.

    The tier and target model are resolved the same way the ``/v1/messages``
    handler does, the messages are converted to OpenAI format for the detected
    provider, and the resolved client is asked to count them.

    Args:
        request_data: Token-count request. Reads ``original_model_name``,
            ``has_explicit_tier``, ``tier``, ``model``, ``messages`` and
            ``system``; ``tier`` is overwritten with the resolved tier.
        raw_request: Incoming request; ``state.request_id`` is used for log
            correlation and echoed in the ``X-Request-ID`` response header.

    Returns:
        A ``JSONResponse`` carrying the dumped ``TokenCountResponse``.

    Raises:
        HTTPException: 500 ``configuration_error`` when the request has no
            explicit tier and ``config.proxy.default_tier`` is unset; 500
            ``api_error`` for any other failure.
    """
    request_id = raw_request.state.request_id

    _ensure_runtime_state()

    try:
        original_model_name = request_data.original_model_name

        # Resolve tier FIRST (same precedence as message routing)
        if request_data.has_explicit_tier and request_data.tier:
            resolved_tier: str = request_data.tier
            resolved_tier_source = "request"
        elif config.proxy.default_tier:
            resolved_tier = config.proxy.default_tier
            resolved_tier_source = "proxy.default_tier"
        else:
            raise HTTPException(
                status_code=500,
                detail={
                    "type": "configuration_error",
                    "message": "config.proxy.default_tier is required for ambiguous requests under proxy-only routing",
                },
            )

        request_data.tier = resolved_tier

        # Match the /v1/messages model resolution: explicit backend models are
        # preserved; Anthropic-style or ambiguous models go through tier + alternatives.
        mapped_model = map_model_name(request_data.model)

        if config.proxy.preferred_provider == "openrouter":
            # OpenRouter: any provider/model format counts as explicit.
            is_explicit_backend = original_model_name is not None and "/" in original_model_name
        else:
            # Every prefix ends in "/", so a prefix match implies "/" is present.
            explicit_prefixes = (
                "openai/",
                "anthropic/",
                "vertex_ai/",
                "bedrock/",
                "gemini/",
                "together_ai/",
                "replicate/",
            )
            is_explicit_backend = original_model_name is not None and original_model_name.startswith(
                explicit_prefixes
            )

        if is_explicit_backend:
            actual_model_id = mapped_model
        else:
            tier_default = config.proxy.get_model_for_tier(resolved_tier)
            actual_model_id = _resolve_model_with_alternatives(resolved_tier, original_model_name, tier_default)

        logger.info(f"[{request_id}] Token counting: original='{original_model_name}', target='{actual_model_id}'")
        logger.debug(f"[{request_id}] Token count resolved tier: {resolved_tier} (source={resolved_tier_source})")

        detected_provider = client_factory.detect_provider_for_model(actual_model_id)
        provider_name = detected_provider.value

        # Build a minimal messages request purely to reuse the Anthropic -> OpenAI
        # conversion; max_tokens=1 is a placeholder since nothing is generated here.
        simulated_request = MessagesRequest(
            model=actual_model_id,
            messages=request_data.messages,
            system=request_data.system,
            max_tokens=1,
        )
        openai_dict = convert_anthropic_to_openai(simulated_request, provider=provider_name)
        messages = openai_dict.get("messages", [])

        client = await client_factory.get_client(actual_model_id, tier=resolved_tier)
        token_count = await client.count_tokens(messages)

        response = TokenCountResponse(input_tokens=token_count)
        return JSONResponse(content=response.model_dump(), headers={"X-Request-ID": request_id})

    except HTTPException:
        # Deliberate HTTP errors (e.g. the configuration_error above) must reach
        # the caller unchanged instead of being masked as a generic api_error.
        raise
    except Exception as e:
        logger.error(f"[{request_id}] Token counting failed: {e}")
        raise HTTPException(
            status_code=500,
            detail={"type": "api_error", "message": f"Token counting failed [{request_id}]"},
        ) from e
|
|
1011
|
+
|
|
1012
|
+
|
|
1013
|
+
# Fallback context window size, in tokens, returned by get_context_window()
# for models that are not present in the model catalog.
DEFAULT_CONTEXT_WINDOW = 200000
|
|
1014
|
+
|
|
1015
|
+
|
|
1016
|
+
def get_context_window(model_name: str) -> int:
    """Look up a model's context window size in the central catalog.

    Models the catalog does not know about (e.g., OpenRouter models outside
    Forge's known set) get ``DEFAULT_CONTEXT_WINDOW`` as a safe fallback.

    Args:
        model_name: Model ID (canonical or alias like 'openai/gpt-5.5')

    Returns:
        Context window size in tokens.
    """
    from forge.core.models import get_context_window_tokens, model_exists

    # Catalog hit: return the catalogued size directly.
    if model_exists(model_name):
        return get_context_window_tokens(model_name)

    logger.debug(f"Model {model_name!r} not in catalog, using default context window")
    return DEFAULT_CONTEXT_WINDOW
|
|
1035
|
+
|
|
1036
|
+
|
|
1037
|
+
@app.get("/", include_in_schema=False)
async def root(request: Request):
    """Service health and runtime truth for status line scripts.

    Returns proxy runtime status including:
    - is_proxy: True (indicates this is a proxy, not direct Anthropic API)
    - template: Active configuration template name
    - provider: Underlying provider (litellm, openai, gemini)
    - tiers: Mapping of Claude tiers to actual models with context windows
    - proxy: First-class proxy identity (proxy_id, template, port, base_url)
    - runtime: Actual resolved tier → model mappings, context windows, llm defaults
    - routing: Default tier and where it came from
    - metrics: Snapshot of the per-proxy metrics

    Note: Session state is no longer returned by proxy. Consumers should read
    session state locally via FORGE_SESSION env var or CWD manifest.

    This endpoint reflects what the proxy is **actually doing**, not just
    echoed configuration. It serves as the source of runtime truth.

    Args:
        request: Incoming request; its URL host and port feed proxy identity
            discovery.

    Returns:
        A dict with the sections listed above.
    """
    import os

    from forge.proxy.proxy_identity import get_proxy_identity

    active_template = os.environ.get("ACTIVE_TEMPLATE", "unknown")
    preferred_provider = os.environ.get("PREFERRED_PROVIDER", "unknown")

    # Extract request host/port for proxy identity (accurate even with --auto-port)
    request_host = request.url.hostname or "localhost"
    request_port = request.url.port

    # Fallback to env var if request port unavailable. A malformed value must
    # not take down the status endpoint, so it is treated like an unset one.
    env_port_str = os.environ.get("ACTIVE_PORT")
    try:
        env_port = int(env_port_str) if env_port_str else None
    except ValueError:
        logger.warning("Ignoring non-integer ACTIVE_PORT value %r", env_port_str)
        env_port = None

    # Discover proxy identity (2-tier: registry > derived)
    proxy_identity = get_proxy_identity(
        active_template=active_template,
        request_host=request_host,
        request_port=request_port,
        env_port=env_port,
        process_proxy_id=os.environ.get("FORGE_PROXY_ID"),
    )

    # Tier mappings exposed via GET / for status line and session context
    provider_config = config.proxy.get_provider(preferred_provider)
    tier_models = {
        "haiku": provider_config.tiers.haiku,
        "sonnet": provider_config.tiers.sonnet,
        "opus": provider_config.tiers.opus,
    }

    # Resolve each tier's context window once; both the legacy "tiers" section
    # and the "runtime" section report the same values.
    context_windows = {tier: get_context_window(model) for tier, model in tier_models.items()}
    tiers = {
        tier: {"model": model, "context_window": context_windows[tier]}
        for tier, model in tier_models.items()
    }

    # Compute runtime LLM defaults (post-merge) from the credential manager.
    # This reflects the actual baseline hyperparameters used by proxy clients,
    # including env/tier overrides and caps.
    llm_defaults_by_tier: dict[str, dict[str, object]] = {}
    for tier in ("haiku", "sonnet", "opus"):
        try:
            model_name = tier_models.get(tier)
            if not model_name:
                raise ValueError(f"No model configured for tier {tier!r}")
            hp = client_factory.get_default_hyperparams_for_tier(
                provider=preferred_provider, tier=tier, model_name=model_name
            )
            llm_defaults_by_tier[tier] = hp.model_dump(exclude_unset=True)
        except Exception as e:
            # Best effort: report the failure per tier instead of failing the endpoint.
            llm_defaults_by_tier[tier] = {"error": f"failed to compute defaults: {e}"}

    if config.proxy.default_tier:
        default_tier = config.proxy.default_tier
        default_tier_source = "proxy.default_tier"
    else:
        default_tier = None
        default_tier_source = "missing"

    # With no (or an unknown) default tier, report the sonnet model's window.
    runtime_active_model = tier_models.get(default_tier or "sonnet") or tier_models.get("sonnet")

    routing_section = {
        "default_tier": default_tier,
        "default_tier_source": default_tier_source,
        "note": "Routing defaults are proxy-owned. Session state is not authoritative for routing defaults.",
    }

    if default_tier is None:
        routing_section["note"] = (
            "Proxy is missing config.proxy.default_tier; ambiguous requests will fail until configured."
        )

    runtime_section = {
        "template": active_template,
        "provider": preferred_provider,
        "tier_mappings": tier_models,
        "context_windows": context_windows,
        "active_tier": default_tier,
        "active_context_window": get_context_window(runtime_active_model) if runtime_active_model else None,
        # Proxy-owned hyperparameter defaults actually used by proxy clients (post-merge)
        "llm_defaults_by_tier": llm_defaults_by_tier,
    }

    # Build proxy identity section (B2.1.5)
    proxy_section = {
        "proxy_id": proxy_identity.proxy_id,
        "template": proxy_identity.template,
        "port": proxy_identity.port,
        "base_url": proxy_identity.base_url,
        "source": proxy_identity.source,
        "status": proxy_identity.status,
    }

    response = {
        "is_proxy": True,
        "template": active_template,
        "provider": preferred_provider,
        "tiers": tiers,
        "status": "running",
        "routing": routing_section,
        # Proxy identity (B2.1.5): first-class proxy identity
        "proxy": proxy_section,
        # Runtime truth: tier mappings, context windows, hyperparameter defaults
        "runtime": runtime_section,
        # Per-proxy metrics (request counts, token usage, latency)
        "metrics": proxy_metrics.snapshot(),
    }

    return response
|
|
1167
|
+
|
|
1168
|
+
|
|
1169
|
+
@app.middleware("http")
async def log_requests_middleware(request: Request, call_next):
    """Tag each request with an ID, time it, and log the outcome.

    The ID is taken from the incoming ``X-Request-ID`` header when present;
    otherwise one is generated with a prefix derived from the path. It is
    stored on ``request.state.request_id`` and added to the response headers
    if the handler did not already set it. Any exception escaping the
    downstream handler is logged and turned into a 500 JSON error response.
    """
    started_at = time.time()
    path = request.url.path

    # Generated IDs carry a prefix that hints at the kind of request.
    if "/count_tokens" in path:
        id_prefix = "tok_"
    elif path == "/":
        id_prefix = "inf_"
    else:
        id_prefix = "req_"

    request_id = request.headers.get("X-Request-ID") or f"{id_prefix}{uuid.uuid4().hex[:12]}"
    request.state.request_id = request_id

    # Endpoints that have their own detailed logging
    has_own_logging = "/messages" in path or "/event_logging" in path

    logger.debug(f"{path} [{request_id}] {request.method}")

    try:
        response = await call_next(request)
        elapsed = time.time() - started_at

        if not has_own_logging:
            status = response.status_code
            logger.info(f"{path} [{request_id}] Completed in {elapsed:.3f}s ({status})")
        else:
            # The endpoint logs its own details; only record middleware timing.
            logger.debug(f"{path} [{request_id}] Middleware: {elapsed:.3f}s")

        if "X-Request-ID" not in response.headers:
            response.headers["X-Request-ID"] = request_id

        return response
    except Exception as e:
        logger.error(f"[{request_id}] Middleware error: {e}", exc_info=True)
        error_body = {
            "error": {
                "type": "api_error",
                "message": f"Internal error [{request_id}]",
            }
        }
        return JSONResponse(
            status_code=500,
            content=error_body,
            headers={"X-Request-ID": request_id},
        )
|
|
1216
|
+
|
|
1217
|
+
|
|
1218
|
+
# Prefixes that mark a string tool result as a failure. They are matched only
# at the very start of the content: scanning the whole body causes false
# positives on documentation that merely mentions errors.
_TOOL_ERROR_PREFIXES = (
    "Error:",
    "ERROR:",
    "Exception:",
    "EXCEPTION:",
    "Failed:",
    "FAILED:",
    "Tool execution failed",
    "Command failed",
    "File not found",
    "Permission denied",
    "Invalid tool",  # More specific than just "Invalid"
    "Invalid arguments",
    "Invalid input",
    "Traceback (most recent call last)",
)

# Strong references to in-flight telemetry tasks. The event loop keeps only
# weak references to tasks, so a fire-and-forget task with no other owner may
# be garbage-collected before it finishes.
_TOOL_FAILURE_LOG_TASKS: set = set()


def _spawn_tool_failure_log_task(coro):
    """Schedule *coro* as a background task and keep it referenced until done."""
    task = asyncio.create_task(coro)
    _TOOL_FAILURE_LOG_TASKS.add(task)
    task.add_done_callback(_TOOL_FAILURE_LOG_TASKS.discard)
    return task


def _classify_tool_result(block) -> tuple[bool, Any]:
    """Decide whether a ``tool_result`` block reports a failure.

    Returns:
        ``(is_error, error_content)``. ``error_content`` is the block's content
        when a failure was detected (``None`` if the block has no ``content``
        attribute) and ``None`` otherwise.
    """
    # 1. Most reliable: explicit is_error flag.
    if getattr(block, "is_error", False):
        return True, getattr(block, "content", None)

    if not hasattr(block, "content"):
        return False, None

    content = block.content
    # 2. Structured errors: a dict carrying an error/exception key.
    if isinstance(content, dict):
        if any(key in content for key in ("error", "exception")):
            return True, content
        return False, None

    # 3. String content: explicit error prefix at the start only.
    if isinstance(content, str) and content.startswith(_TOOL_ERROR_PREFIXES):
        return True, content

    return False, None


async def _check_client_tool_failures(request_data: MessagesRequest, request_id: str, mapped_model: str):
    """Check for client-side tool execution failures in the request.

    Only scans the most recent user message. Older tool_result blocks were
    already inspected on prior requests; re-scanning them produces duplicate
    log entries and skews telemetry. If the most recent user message does not
    carry a list of content blocks, nothing is scanned.

    For each failed ``tool_result`` block that has a ``tool_use_id``:

    * the failure is logged (warning for actionable content, debug for
      cleared or missing content);
    * when the active provider has ``error_hints`` enabled and the content is
      a string, the block's ``content`` is replaced in place with the enriched
      text returned by ``enrich_error_content``;
    * ``log_tool_failure`` and ``log_tool_event`` are scheduled as background
      tasks.

    Args:
        request_data: The incoming messages request; its blocks may be mutated.
        request_id: Request identifier used in log lines and telemetry.
        mapped_model: Model name forwarded to ``log_tool_failure``.
    """
    latest_user_msg = next(
        (m for m in reversed(request_data.messages) if m.role == "user"),
        None,
    )
    # Do not fall back to an older user message when the newest one is plain
    # text: its tool results were already reported on an earlier request.
    if latest_user_msg is None or not isinstance(latest_user_msg.content, list):
        return

    for block in latest_user_msg.content:
        if not (hasattr(block, "type") and block.type == "tool_result"):
            continue

        tool_use_id = getattr(block, "tool_use_id", None)
        is_error, error_content = _classify_tool_result(block)
        if not (is_error and tool_use_id):
            continue

        tool_name, tool_input = _find_tool_use_info(request_data.messages, latest_user_msg, tool_use_id)

        # Check if this is a stale cleared tool result (not actionable)
        is_cleared_content = isinstance(error_content, str) and "Old tool result content cleared" in error_content
        is_actionable = bool(error_content) and not is_cleared_content

        # Only log as warning if we have actual error content (not cleared)
        if is_actionable:
            logger.warning(
                f"[{request_id}] Client tool failure: "
                f"tool='{tool_name or 'unknown'}', id='{tool_use_id}', "
                f"error={str(error_content)[:100]}"
            )
        elif is_cleared_content:
            logger.debug(
                f"[{request_id}] Stale tool failure (content cleared): "
                f"tool='{tool_name or 'unknown'}', id='{tool_use_id}'"
            )
        else:
            # Debug log for investigation when is_error but no content
            logger.debug(
                f"[{request_id}] Tool marked as error but no error content: "
                f"tool='{tool_name or 'unknown'}', id='{tool_use_id}', "
                f"is_error={getattr(block, 'is_error', None)}"
            )

        if not is_actionable:
            continue

        enriched_content = error_content
        if isinstance(error_content, str):
            provider_cfg = config.proxy.get_provider()
            if provider_cfg.error_hints:
                enriched_content = enrich_error_content(tool_name, error_content)
                if enriched_content != error_content:
                    # Rewrite the request in place so the hint travels upstream.
                    block.content = enriched_content
                    logger.debug(f"[{request_id}] Enriched error hint for tool '{tool_name}'")

        _spawn_tool_failure_log_task(
            log_tool_failure(
                request_id=request_id,
                mapped_model=mapped_model,
                tool_name=tool_name,
                tool_use_id=tool_use_id,
                tool_input=tool_input,
                error_content=error_content,
            )
        )
        _spawn_tool_failure_log_task(
            log_tool_event(
                request_id=request_id,
                tool_name=tool_name,
                status="failure",
                stage="client_execution_report",
                details={
                    "tool_use_id": tool_use_id,
                    "error_content": enriched_content,
                    "tool_name_found": bool(tool_name),
                },
            )
        )
|
|
1342
|
+
|
|
1343
|
+
|
|
1344
|
+
def _find_tool_use_info(messages, current_msg, tool_use_id) -> tuple[str | None, dict[str, Any] | None]:
|
|
1345
|
+
"""Find tool name and input parameters from message history."""
|
|
1346
|
+
current_idx = messages.index(current_msg)
|
|
1347
|
+
|
|
1348
|
+
for i in range(current_idx - 1, -1, -1):
|
|
1349
|
+
prev_msg = messages[i]
|
|
1350
|
+
if prev_msg.role == "assistant" and isinstance(prev_msg.content, list):
|
|
1351
|
+
for block in prev_msg.content:
|
|
1352
|
+
if (
|
|
1353
|
+
hasattr(block, "type")
|
|
1354
|
+
and block.type == "tool_use"
|
|
1355
|
+
and hasattr(block, "id")
|
|
1356
|
+
and block.id == tool_use_id
|
|
1357
|
+
):
|
|
1358
|
+
return (
|
|
1359
|
+
getattr(block, "name", None),
|
|
1360
|
+
getattr(block, "input", None),
|
|
1361
|
+
)
|
|
1362
|
+
return None, None
|
|
1363
|
+
|
|
1364
|
+
|
|
1365
|
+
def find_available_port(start_port: int, max_attempts: int = 10, host: str = "") -> int:
    """Find an available port starting from start_port.

    Tries ``start_port``, ``start_port + 1``, ... for at most ``max_attempts``
    candidates by binding a throwaway TCP socket to each one. The socket is
    closed before returning, so the port is only known to have been free at
    the moment of the probe.

    Args:
        start_port: First port to try.
        max_attempts: Maximum number of consecutive ports to try.
        host: Address to probe on. The default ``""`` binds the IPv4 wildcard
            address, matching the previous behavior.

    Returns:
        The first port that could be bound.

    Raises:
        RuntimeError: If none of the candidate ports could be bound.
    """
    # Inclusive upper bound, capped at the highest valid TCP port so that an
    # out-of-range candidate cannot escape as OverflowError from bind().
    last_port = min(start_port + max_attempts - 1, 65535)

    for port in range(start_port, last_port + 1):
        # The context manager closes the probe socket on every path.
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
            try:
                sock.bind((host, port))
            except OSError:
                continue
            return port
    raise RuntimeError(f"Could not find available port in range {start_port}-{last_port}")
|
|
1376
|
+
|
|
1377
|
+
|
|
1378
|
+
@click.command()
@click.option(
    "--template",
    type=str,
    required=True,
    help="Configuration template to use (e.g., openrouter-gemini, openrouter-openai, openrouter-anthropic)",
)
@click.option(
    "--port",
    type=int,
    default=8082,
    help="Port to run the server on (default: 8082)",
)
@click.option(
    "--host",
    default="127.0.0.1",
    help="Host to bind the server to (default: 127.0.0.1)",
)
@click.option(
    "--reload",
    is_flag=True,
    help="Enable auto-reload on code changes",
)
@click.option(
    "--auto-port",
    is_flag=True,
    help="Automatically find an available port if the specified port is in use",
)
@click.option(
    "--proxy-id",
    type=str,
    required=False,
    help="Explicit proxy id (enables proxy-scoped overrides + strict startup validation).",
)
def main(
    template: str,
    port: int,
    host: str,
    reload: bool,
    auto_port: bool,
    proxy_id: str | None,
):
    """Start the Unified LLM Proxy server with template-based configuration.

    Template configurations are defined in YAML files under config/defaults/templates/.
    Each template specifies:
    - Provider (gemini, openai, litellm)
    - Model tier mappings (haiku, sonnet, opus)
    - Provider-specific settings (reasoning effort, cache TTL, etc.)
    """
    # NOTE: the docstring above is rendered by click as the command's --help
    # text, so it is user-facing output rather than internal documentation.
    global PROXY_ID

    import os

    from forge.config.loader import template_exists

    echo = click.echo

    def abort(*lines: str) -> None:
        """Print each line, then terminate the process with exit status 1."""
        for line in lines:
            echo(line)
        sys.exit(1)

    # --- Validate the template name before touching any configuration ---
    if not template_exists(template):
        abort(
            f"Unknown template '{template}'",
            "Run 'forge proxy template list' to see available templates.",
        )

    # --- Logging ---
    level = get_effective_log_level()
    if level != "off":
        configure_debug_logging(component="proxy", subdirectory="proxy")
        configure_console_logging()

    effective_proxy_id = proxy_id

    # --- Load configuration; any loader failure is reported and is fatal ---
    try:
        cfg = init_config(template=template, proxy_id=effective_proxy_id)
        provider = cfg.proxy.preferred_provider
        default_port = cfg.proxy.default_port

        if not provider:
            abort(f"✘ Template '{template}' missing 'preferred_provider' field")

    except Exception as e:
        abort(f"✘ Failed to load template '{template}': {e}")

    if default_port and default_port != port:
        echo(f"⚠︎ Warning: Template '{template}' typically uses port {default_port}, but starting on port {port}")
        echo(f" Recommended: python -m forge.proxy.server --template {template} --port {default_port}")

    # --- Resolve the port the server will actually listen on ---
    actual_port = port
    if not auto_port:
        # Fail fast when the requested port cannot be bound on this host.
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as probe:
            try:
                probe.bind((host, port))
            except OSError:
                abort(
                    f"✘ Port {port} is already in use!",
                    " Use --auto-port to automatically find an available port",
                )
    elif effective_proxy_id is not None:
        abort("✘ --auto-port cannot be used when starting under a proxy id")
    else:
        actual_port = find_available_port(port)
        if actual_port != port:
            echo(f"⚠︎ Port {port} is in use, using port {actual_port} instead")

    # Strict proxy startup validation (B2.1.3)
    if effective_proxy_id is not None:
        try:
            from forge.proxy.proxy_startup import (
                ProxyStartupContext,
                ProxyStartupValidationError,
                validate_proxy_startup,
            )

            startup_ctx = ProxyStartupContext(proxy_id=effective_proxy_id, template=template, port=actual_port)
            validate_proxy_startup(ctx=startup_ctx)

        except ProxyStartupValidationError as e:
            abort(f"✘ {e}")
        except Exception as e:
            abort(f"✘ Failed to validate proxy startup: {e}")

    # Track which template is active (for runtime introspection).
    # ACTIVE_PORT carries actual_port (not port) so --auto-port is reflected.
    os.environ["ACTIVE_TEMPLATE"] = template
    os.environ["ACTIVE_PORT"] = str(actual_port)
    os.environ["PREFERRED_PROVIDER"] = provider

    # Freeze proxy id for request handlers. It is also exported through the
    # environment so the uvicorn worker (which reimports the module when the
    # app is passed as a string) picks it up.
    PROXY_ID = effective_proxy_id
    if effective_proxy_id is not None:
        os.environ["FORGE_PROXY_ID"] = effective_proxy_id

    # Initialize in this module for direct/app-object runs; the imported
    # uvicorn app module initializes itself lazily via _ensure_runtime_state().
    _initialize_cost_tracker_from_config()

    provider_cfg = cfg.proxy.get_provider(provider)
    tier_models = {tier: getattr(provider_cfg.tiers, tier) for tier in ("haiku", "sonnet", "opus")}

    # --- Startup banner ---
    for banner_line in (
        "",
        "╔══════════════════════════════════════╗",
        "║ Unified LLM Proxy Server ║",
        "╚══════════════════════════════════════╝",
        "",
    ):
        echo(banner_line)

    echo(f"🌐 Server: http://{host}:{actual_port}")
    echo(f" Template: {template}")
    echo(f"📡 Provider: {provider}")
    echo(f" Log Level: {level}")
    echo(f"🔄 Reload: {'enabled' if reload else 'disabled'}")
    echo("")

    echo(" Model Tier Mappings:")
    for tier, model in tier_models.items():
        if not model:
            continue
        echo(f" {tier.capitalize():6} → {model}")
    echo("")

    echo(" Provider Settings:")
    echo(f" cache_ttl: {provider_cfg.cache_ttl}")
    if provider_cfg.base_url:
        echo(f" base_url: {provider_cfg.base_url}")
    echo("")

    config_source = (
        f" Proxy: ~/.forge/proxies/{effective_proxy_id}/proxy.yaml"
        if effective_proxy_id is not None
        else f" Template: defaults/templates/{template}.yaml"
    )
    echo(config_source)
    echo("")
    echo("Press CTRL+C to stop the server")
    echo("")

    # Map the application log level onto a uvicorn level; anything
    # unrecognized (including "off") runs uvicorn at "warning".
    level_map = {
        "off": "warning",
        "debug": "debug",
        "info": "info",
        "warning": "warning",
    }
    uvicorn_level = level_map.get(level, "warning")

    uvicorn.run(
        "forge.proxy.server:app",
        host=host,
        port=actual_port,
        log_level=uvicorn_level,
        reload=reload,
    )
|
|
1558
|
+
|
|
1559
|
+
|
|
1560
|
+
# Script entry point: run the click command only when this module is executed
# directly, not when it is imported.
if __name__ == "__main__":
    main()
|