vertex-proxy 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,3 @@
1
+ """vertex-proxy: Anthropic + Gemini proxy for Vertex AI."""
2
+
3
# Package version string; the app factory passes it to FastAPI as `version`.
__version__ = "0.2.0"
@@ -0,0 +1,68 @@
1
+ """Entry point: `python -m vertex_proxy` or `vertex-proxy` CLI."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import argparse
6
+ import logging
7
+ import sys
8
+ from pathlib import Path
9
+
10
+ import uvicorn
11
+
12
+ from .config import load_settings
13
+ from .main import build_app
14
+
15
+
16
def main() -> int:
    """Parse CLI flags, export them as settings overrides, and run the proxy.

    Each flag that was supplied is written to the matching ``VERTEX_PROXY_*``
    environment variable before ``load_settings()`` runs, so a CLI value
    replaces whatever the process environment held for that key.

    Returns:
        ``0`` once ``uvicorn.run`` returns.
    """
    parser = argparse.ArgumentParser(
        prog="vertex-proxy",
        description="Anthropic + Gemini proxy for Google Cloud Vertex AI",
    )
    parser.add_argument("--host", help="bind host (default: 127.0.0.1)")
    parser.add_argument("--port", type=int, help="bind port (default: 8787)")
    parser.add_argument("--credentials", help="path to GCP service-account JSON")
    parser.add_argument("--project-id", help="GCP project ID (inferred from creds if unset)")
    parser.add_argument(
        "--log-level",
        default=None,
        help="uvicorn log level (default: info)",
    )
    args = parser.parse_args()

    # Merge CLI into environment so Settings picks them up.
    if args.host:
        _set_env("VERTEX_PROXY_HOST", args.host)
    # Compare against None: a truthiness test would drop an explicit `--port 0`.
    if args.port is not None:
        _set_env("VERTEX_PROXY_PORT", str(args.port))
    if args.credentials:
        _set_env("VERTEX_PROXY_CREDENTIALS_PATH", str(Path(args.credentials).expanduser()))
    if args.project_id:
        _set_env("VERTEX_PROXY_PROJECT_ID", args.project_id)
    if args.log_level:
        _set_env("VERTEX_PROXY_LOG_LEVEL", args.log_level)

    cfg = load_settings()

    # uvicorn looks the level up by its lower-case name, while the logging
    # module exposes upper-case constants. Normalise once so "INFO", "Info"
    # and "info" all work, and fall back to "info" for names uvicorn rejects.
    level_name = cfg.log_level.strip().lower()
    if level_name not in {"critical", "error", "warning", "info", "debug", "trace"}:
        level_name = "info"

    logging.basicConfig(
        # "trace" has no stdlib constant, so it falls back to INFO here.
        level=getattr(logging, level_name.upper(), logging.INFO),
        format="%(asctime)s %(levelname)s %(name)s: %(message)s",
    )

    app = build_app(cfg)
    uvicorn.run(
        app,
        host=cfg.host,
        port=cfg.port,
        log_level=level_name,
        access_log=True,
    )
    return 0
59
+
60
+
61
def _set_env(key: str, val: str) -> None:
    """Store ``val`` under ``key`` in this process's environment."""
    from os import environ

    environ.update({key: val})
65
+
66
+
67
# Script entry point: use main()'s return value as the process exit status.
if __name__ == "__main__":
    sys.exit(main())
vertex_proxy/auth.py ADDED
@@ -0,0 +1,134 @@
1
+ """GCP service-account auth with background token refresh.
2
+
3
+ Vertex AI uses short-lived OAuth access tokens (60-minute TTL) derived from a
4
+ service-account JSON key. This module handles the refresh loop so callers
5
+ always see a valid token.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import asyncio
11
+ import logging
12
+ import time
13
+ from pathlib import Path
14
+
15
+ from google.auth.transport.requests import Request as GoogleAuthRequest
16
+ from google.oauth2 import service_account
17
+
18
# Module-level logger named after this module.
logger = logging.getLogger(__name__)

# Vertex AI only needs cloud-platform scope.
# Passed as `scopes=` to both credential loaders in TokenManager.start().
SCOPES = ["https://www.googleapis.com/auth/cloud-platform"]
22
+
23
+
24
+ class TokenManager:
25
+ """Holds a refreshing GCP access token.
26
+
27
+ Call ``start()`` once at app startup. Access the current token via
28
+ ``token`` or ``await get_token()``. Call ``stop()`` at shutdown.
29
+ """
30
+
31
+ def __init__(
32
+ self,
33
+ credentials_path: Path | None,
34
+ refresh_seconds: int = 3000,
35
+ ) -> None:
36
+ self._credentials_path = credentials_path
37
+ self._refresh_seconds = refresh_seconds
38
+ self._credentials: service_account.Credentials | None = None
39
+ self._project_id: str | None = None
40
+ self._refresh_task: asyncio.Task[None] | None = None
41
+ self._stop_event = asyncio.Event()
42
+ self._last_refresh: float = 0.0
43
+
44
+ @property
45
+ def project_id(self) -> str | None:
46
+ """GCP project ID extracted from the service-account key."""
47
+ return self._project_id
48
+
49
+ @property
50
+ def token(self) -> str:
51
+ """Current access token. Raises if uninitialised."""
52
+ if self._credentials is None or self._credentials.token is None:
53
+ raise RuntimeError("TokenManager not initialised; call start() first")
54
+ return self._credentials.token
55
+
56
+ async def start(self) -> None:
57
+ """Load credentials and kick off the background refresh loop."""
58
+ if self._credentials is not None:
59
+ return
60
+
61
+ if self._credentials_path is None:
62
+ # Fall back to Application Default Credentials
63
+ import google.auth
64
+
65
+ creds, project = google.auth.default(scopes=SCOPES)
66
+ self._credentials = creds # type: ignore[assignment]
67
+ self._project_id = project
68
+ logger.info("loaded Application Default Credentials; project=%s", project)
69
+ else:
70
+ path = Path(self._credentials_path).expanduser()
71
+ self._credentials = service_account.Credentials.from_service_account_file(
72
+ str(path), scopes=SCOPES
73
+ )
74
+ self._project_id = self._credentials.project_id
75
+ logger.info(
76
+ "loaded service-account key from %s; project=%s",
77
+ path,
78
+ self._project_id,
79
+ )
80
+
81
+ # Initial refresh; blocks until we have a token.
82
+ await self._do_refresh()
83
+ self._refresh_task = asyncio.create_task(self._refresh_loop(), name="token-refresh")
84
+
85
+ async def stop(self) -> None:
86
+ """Signal the refresh loop to stop and await it."""
87
+ self._stop_event.set()
88
+ if self._refresh_task is not None:
89
+ self._refresh_task.cancel()
90
+ try:
91
+ await self._refresh_task
92
+ except (asyncio.CancelledError, Exception):
93
+ pass
94
+
95
+ async def get_token(self) -> str:
96
+ """Return a valid access token, refreshing if needed."""
97
+ if self._credentials is None:
98
+ raise RuntimeError("TokenManager not initialised")
99
+ # If the token is close to expiry, force a refresh now.
100
+ if self._credentials.expired or self._credentials.token is None:
101
+ await self._do_refresh()
102
+ return self._credentials.token or ""
103
+
104
+ # --- internal ----------------------------------------------------------
105
+
106
+ async def _do_refresh(self) -> None:
107
+ """Run the blocking google-auth refresh in a worker thread."""
108
+ if self._credentials is None:
109
+ raise RuntimeError("no credentials loaded")
110
+
111
+ def _sync_refresh() -> None:
112
+ request = GoogleAuthRequest()
113
+ assert self._credentials is not None
114
+ self._credentials.refresh(request)
115
+
116
+ await asyncio.get_running_loop().run_in_executor(None, _sync_refresh)
117
+ self._last_refresh = time.time()
118
+ expiry = getattr(self._credentials, "expiry", None)
119
+ logger.info("refreshed access token; expires=%s", expiry)
120
+
121
+ async def _refresh_loop(self) -> None:
122
+ """Background task: refresh the token every N seconds until stopped."""
123
+ while not self._stop_event.is_set():
124
+ try:
125
+ await asyncio.wait_for(self._stop_event.wait(), timeout=self._refresh_seconds)
126
+ # If we got here without timeout, stop was requested.
127
+ return
128
+ except TimeoutError:
129
+ # Normal path: time to refresh.
130
+ try:
131
+ await self._do_refresh()
132
+ except Exception as exc: # noqa: BLE001
133
+ logger.error("token refresh failed: %s", exc, exc_info=True)
134
+ # Don't crash the loop; try again next interval.
vertex_proxy/config.py ADDED
@@ -0,0 +1,104 @@
1
+ """Configuration loaded from environment variables."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from pathlib import Path
6
+
7
+ from pydantic_settings import BaseSettings, SettingsConfigDict
8
+
9
+
10
class Settings(BaseSettings):
    """Runtime configuration for vertex-proxy.

    Values are read from ``VERTEX_PROXY_*`` environment variables and from a
    ``.env`` file; unknown keys are ignored.
    """

    model_config = SettingsConfigDict(
        env_prefix="VERTEX_PROXY_",
        env_file=".env",
        env_file_encoding="utf-8",
        extra="ignore",
    )

    # --- GCP ---
    # Path to service-account JSON. When unset, the token manager falls back
    # to Application Default Credentials.
    credentials_path: Path | None = None
    project_id: str | None = None
    # Region for Claude (Anthropic) models. us-east5 is the primary serving region.
    anthropic_region: str = "us-east5"
    # Region for Gemini models. us-central1 has the widest coverage.
    gemini_region: str = "us-central1"

    # --- Server ---
    host: str = "127.0.0.1"
    port: int = 8787
    log_level: str = "info"

    # Optional bearer-token auth on the proxy itself. When set, every request
    # must include `Authorization: Bearer <this value>`. Leave unset for
    # localhost-only deploys (the default). Set it if you expose the proxy on
    # a LAN or reverse-proxy it to the internet.
    api_key: str | None = None

    # Prometheus-format metrics endpoint. Adds request counters + token
    # counters by model and provider. Off by default to keep the footprint
    # minimal; enable by setting VERTEX_PROXY_METRICS_ENABLED=true.
    metrics_enabled: bool = False

    # --- Auth refresh ---
    # Access tokens live 60 minutes. Refresh at this interval to stay ahead.
    token_refresh_seconds: int = 3000  # 50 minutes

    # --- Model aliases ---
    # Map canonical Anthropic model names → Vertex publisher model IDs.
    # Keep this list explicit; we want to know exactly what we're routing.
    # Hermes/Claude-Code typically request `claude-sonnet-4-5-20250929`; Vertex
    # uses `claude-sonnet-4-5@20250929`. The proxy translates.
    #
    # Each model family has its own version stamp. Opus 4.5 and Haiku 4.5 were
    # previously mapped to Sonnet 4.5's `20250929` stamp, which is not a
    # published version of either model.
    # NOTE(review): stamps below follow Anthropic's published model IDs;
    # confirm against the Vertex Model Garden cards before release.
    anthropic_model_aliases: dict[str, str] = {
        # Sonnet 4.5
        "claude-sonnet-4-5": "claude-sonnet-4-5@20250929",
        "claude-sonnet-4-5-20250929": "claude-sonnet-4-5@20250929",
        # Opus 4.5
        "claude-opus-4-5": "claude-opus-4-5@20251101",
        "claude-opus-4-5-20251101": "claude-opus-4-5@20251101",
        # Legacy key kept so existing client configs keep resolving.
        "claude-opus-4-5-20250929": "claude-opus-4-5@20251101",
        # Haiku 4.5
        "claude-haiku-4-5": "claude-haiku-4-5@20251001",
        "claude-haiku-4-5-20251001": "claude-haiku-4-5@20251001",
        # Legacy key kept so existing client configs keep resolving.
        "claude-haiku-4-5-20250929": "claude-haiku-4-5@20251001",
    }

    # Map canonical Gemini model names → Vertex publisher model IDs.
    gemini_model_aliases: dict[str, str] = {
        "gemini-2.5-pro": "gemini-2.5-pro",
        "gemini-2.5-flash": "gemini-2.5-flash",
        "gemini-2.0-flash": "gemini-2.0-flash-001",
    }

    # Region for Vertex MaaS (Model as a Service) open-source partner models:
    # Kimi K2.5, GLM 5, MiniMax-M2.5, Qwen 3.5, Grok 4.20, etc.
    # Vertex typically serves these via the global endpoint or us-central1.
    maas_region: str = "us-central1"

    # Map canonical MaaS model names → Vertex publisher/model path fragments.
    # Path shape on Vertex MaaS is:
    #   publishers/{PUBLISHER}/models/{MODEL_ID}
    # We store the full path fragment so different publishers can coexist.
    # Check each model's "How to use" tab in Model Garden for the exact shape.
    maas_model_aliases: dict[str, str] = {
        # Moonshot (Kimi)
        "kimi-k2.5": "publishers/moonshotai/models/kimi-k2.5",
        "kimi-k2": "publishers/moonshotai/models/kimi-k2",
        # Zhipu (GLM)
        "glm-5": "publishers/zhipu/models/glm-5",
        "glm-5.1": "publishers/zhipu/models/glm-5.1",
        "glm-4.6": "publishers/zhipu/models/glm-4.6",
        # MiniMax
        "minimax-m2.5": "publishers/minimax/models/minimax-m2.5",
        "minimax-m1": "publishers/minimax/models/minimax-m1",
        # Alibaba (Qwen)
        "qwen3.5": "publishers/qwen/models/qwen3.5",
        "qwen-3": "publishers/qwen/models/qwen-3",
        # xAI (Grok on Vertex)
        "grok-4.20": "publishers/xai/models/grok-4.20",
        "grok-4.1-fast": "publishers/xai/models/grok-4.1-fast",
    }
101
+
102
+
103
def load_settings() -> Settings:
    """Build a fresh ``Settings`` instance from the current environment."""
    settings = Settings()
    return settings
vertex_proxy/main.py ADDED
@@ -0,0 +1,604 @@
1
+ """vertex-proxy FastAPI app.
2
+
3
+ Exposes:
4
+ - POST /anthropic/v1/messages : Anthropic-compatible, forwards to Vertex.
5
+ - POST /gemini/v1beta/models/{m}:generateContent : Gemini-compatible, forwards to Vertex.
6
+ - GET /health : liveness + token status.
7
+ - GET /v1/models : list routable models.
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import json
13
+ import logging
14
+ import threading
15
+ import time
16
+ from collections import Counter
17
+ from collections.abc import AsyncGenerator
18
+ from contextlib import asynccontextmanager
19
+ from typing import Any
20
+
21
+ import httpx
22
+ from fastapi import Depends, FastAPI, HTTPException, Request
23
+ from fastapi.responses import JSONResponse, PlainTextResponse, StreamingResponse
24
+ from fastapi.security import HTTPAuthorizationCredentials, HTTPBearer
25
+
26
+ from . import __version__
27
+ from .auth import TokenManager
28
+ from .config import Settings, load_settings
29
+
30
# Module-level logger named after this module.
logger = logging.getLogger(__name__)

# Timeout for the shared client created in the lifespan handler:
# 120 s overall, 10 s to connect.
DEFAULT_HTTP_TIMEOUT = httpx.Timeout(120.0, connect=10.0)
# Vertex streaming responses can legitimately go quiet for longer than the
# default read window while the model is thinking. Keep connect/write/pool
# bounded, but do not abort a live stream just because no chunk arrived.
STREAM_HTTP_TIMEOUT = httpx.Timeout(connect=10.0, read=None, write=120.0, pool=120.0)
37
+
38
+
39
+ # --- Metrics (Prometheus-format, tiny in-memory counters) -------------------
40
+ # We deliberately don't pull in prometheus_client to keep the dep footprint
41
+ # minimal. This is good enough for a local proxy; use a real metrics library
42
+ # for production multi-instance deployments.
43
+
44
+
45
class _Metrics:
    """Thread-safe in-memory counters rendered in Prometheus text format."""

    def __init__(self) -> None:
        self._lock = threading.Lock()
        # Keyed by (route, model, status-as-string).
        self._requests: Counter[tuple[str, str, str]] = Counter()
        # Keyed by model name.
        self._tokens_in: Counter[str] = Counter()
        self._tokens_out: Counter[str] = Counter()
        self._started_at = time.time()

    @staticmethod
    def _escape_label(value: str) -> str:
        """Escape a label value for the Prometheus text exposition format.

        Backslash, double quote and line feed must be escaped inside a quoted
        label value. Model names come from client requests, so an unescaped
        value could break the output or inject extra sample lines.
        """
        return str(value).replace("\\", "\\\\").replace('"', '\\"').replace("\n", "\\n")

    def record_request(self, route: str, model: str, status: int) -> None:
        """Count one request for the given route, model and HTTP status."""
        with self._lock:
            self._requests[(route, model, str(status))] += 1

    def record_tokens(self, model: str, prompt: int, completion: int) -> None:
        """Add prompt and completion token counts to the model's totals."""
        with self._lock:
            self._tokens_in[model] += prompt
            self._tokens_out[model] += completion

    def render(self) -> str:
        """Render Prometheus exposition format."""
        esc = self._escape_label
        lines = [
            "# HELP vertex_proxy_uptime_seconds Seconds since proxy start",
            "# TYPE vertex_proxy_uptime_seconds gauge",
            f"vertex_proxy_uptime_seconds {time.time() - self._started_at:.0f}",
            "# HELP vertex_proxy_requests_total Total requests by route, model, and status",
            "# TYPE vertex_proxy_requests_total counter",
        ]
        with self._lock:
            for (route, model, status), count in self._requests.items():
                lines.append(
                    f'vertex_proxy_requests_total{{route="{esc(route)}",'
                    f'model="{esc(model)}",status="{esc(status)}"}} {count}'
                )
            lines.append("# HELP vertex_proxy_tokens_in_total Prompt tokens forwarded")
            lines.append("# TYPE vertex_proxy_tokens_in_total counter")
            for model, count in self._tokens_in.items():
                lines.append(f'vertex_proxy_tokens_in_total{{model="{esc(model)}"}} {count}')
            lines.append("# HELP vertex_proxy_tokens_out_total Completion tokens returned")
            lines.append("# TYPE vertex_proxy_tokens_out_total counter")
            for model, count in self._tokens_out.items():
                lines.append(f'vertex_proxy_tokens_out_total{{model="{esc(model)}"}} {count}')
        return "\n".join(lines) + "\n"
85
+
86
+
87
# Single process-wide metrics registry shared by all request handlers.
_METRICS = _Metrics()
88
+
89
+
90
+ # --- app factory ------------------------------------------------------------
91
+
92
+
93
def build_app(settings: Settings | None = None) -> FastAPI:
    """Build the FastAPI application and register every proxy route.

    Args:
        settings: Pre-loaded configuration. When omitted, ``load_settings()``
            is called.

    Returns:
        The configured app. The token manager is started, and the shared
        ``httpx`` client created, in the app's lifespan handler rather than
        here.
    """
    # Function-scope import: only the API-key comparison below needs it.
    import hmac

    cfg = settings or load_settings()
    token_mgr = TokenManager(
        credentials_path=cfg.credentials_path,
        refresh_seconds=cfg.token_refresh_seconds,
    )

    @asynccontextmanager
    async def lifespan(app: FastAPI) -> AsyncGenerator[None, None]:
        await token_mgr.start()
        try:
            # Resolve project ID from credentials if not explicitly configured.
            if cfg.project_id is None:
                cfg.project_id = token_mgr.project_id
            if not cfg.project_id:
                raise RuntimeError(
                    "no GCP project_id: set VERTEX_PROXY_PROJECT_ID "
                    "or use a service-account key that includes project_id"
                )
            logger.info("vertex-proxy ready; project=%s", cfg.project_id)
            app.state.token_mgr = token_mgr
            app.state.cfg = cfg
            app.state.http = httpx.AsyncClient(timeout=DEFAULT_HTTP_TIMEOUT)
            try:
                yield
            finally:
                await app.state.http.aclose()
        finally:
            # Runs on normal shutdown and when startup fails after start(),
            # so the background refresh task is never left running.
            await token_mgr.stop()

    app = FastAPI(
        title="vertex-proxy",
        description="Anthropic + Gemini API-compatible proxy for Google Cloud Vertex AI",
        version=__version__,
        lifespan=lifespan,
    )

    # --- optional bearer-token auth on the proxy itself ------------------------
    # When VERTEX_PROXY_API_KEY is set, every non-health route requires it.
    # Use when exposing the proxy on a LAN or reverse-proxying to the internet.
    bearer = HTTPBearer(auto_error=False)

    async def require_api_key(
        creds: HTTPAuthorizationCredentials | None = Depends(bearer),  # noqa: B008
    ) -> None:
        if not cfg.api_key:
            return  # auth not required
        presented = creds.credentials if creds is not None else ""
        # Constant-time comparison so response timing does not reveal how much
        # of the key matched. Compare bytes: compare_digest rejects non-ASCII str.
        if not hmac.compare_digest(presented.encode("utf-8"), cfg.api_key.encode("utf-8")):
            raise HTTPException(
                status_code=401,
                detail="missing or invalid bearer token",
                headers={"WWW-Authenticate": "Bearer"},
            )

    protected = [Depends(require_api_key)]

    # --- health ----------------------------------------------------------------

    @app.get("/health")
    async def health() -> dict[str, Any]:
        try:
            # Try to get a token; proves auth is working.
            await token_mgr.get_token()
            return {"status": "ok", "project": cfg.project_id}
        except Exception as exc:  # noqa: BLE001
            return JSONResponse(
                status_code=503,
                content={"status": "unhealthy", "error": str(exc)},
            )

    # --- metrics (Prometheus, opt-in) ----------------------------------------
    # Protected like every other non-health route: the output lists model
    # names and token volumes.

    @app.get("/metrics", dependencies=protected)
    async def metrics() -> PlainTextResponse:
        if not cfg.metrics_enabled:
            raise HTTPException(
                status_code=404,
                detail="metrics disabled; set VERTEX_PROXY_METRICS_ENABLED=true to enable",
            )
        return PlainTextResponse(_METRICS.render(), media_type="text/plain; version=0.0.4")

    @app.get("/v1/models", dependencies=protected)
    async def list_models() -> dict[str, Any]:
        # (provider label, region, alias table), in the order they are listed.
        catalog = (
            ("anthropic-vertex", cfg.anthropic_region, cfg.anthropic_model_aliases),
            ("gemini-vertex", cfg.gemini_region, cfg.gemini_model_aliases),
            ("maas-vertex", cfg.maas_region, cfg.maas_model_aliases),
        )
        return {
            "object": "list",
            "data": [
                {
                    "id": alias,
                    "object": "model",
                    "vertex_model_id": target,
                    "provider": provider,
                    "region": region,
                }
                for provider, region, aliases in catalog
                for alias, target in aliases.items()
            ],
        }

    # --- Anthropic routes ------------------------------------------------------

    @app.post("/anthropic/v1/messages", dependencies=protected)
    async def anthropic_messages(request: Request) -> Any:
        return await _handle_anthropic(request, cfg, token_mgr)

    # Also accept /v1/messages directly (some clients won't let you override path).
    @app.post("/v1/messages", dependencies=protected)
    async def anthropic_messages_root(request: Request) -> Any:
        return await _handle_anthropic(request, cfg, token_mgr)

    # --- Gemini routes ---------------------------------------------------------
    # Gemini SDK hits /v1beta/models/{model}:generateContent and :streamGenerateContent.
    # We pass-through both.

    @app.post("/gemini/v1beta/models/{model_and_action:path}", dependencies=protected)
    async def gemini_generate(model_and_action: str, request: Request) -> Any:
        return await _handle_gemini(model_and_action, request, cfg, token_mgr)

    @app.post("/v1beta/models/{model_and_action:path}", dependencies=protected)
    async def gemini_generate_root(model_and_action: str, request: Request) -> Any:
        return await _handle_gemini(model_and_action, request, cfg, token_mgr)

    # --- OpenAI-compatible route for Vertex MaaS models ------------------------
    # Kimi K2.5, GLM 5, MiniMax-M2.5, Qwen 3.5, Grok 4.20, etc.
    # Vertex exposes these through an OpenAI Chat Completions-compatible
    # endpoint at /v1beta1/.../endpoints/openapi/chat/completions.

    @app.post("/openai/v1/chat/completions", dependencies=protected)
    async def openai_chat_completions(request: Request) -> Any:
        return await _handle_openai(request, cfg, token_mgr)

    @app.post("/v1/chat/completions", dependencies=protected)
    async def openai_chat_completions_root(request: Request) -> Any:
        return await _handle_openai(request, cfg, token_mgr)

    # Some OpenAI clients (notably Hermes's internal one) drop the /v1 prefix
    # when you set base_url to the server root. Accept that shape too.
    @app.post("/chat/completions", dependencies=protected)
    async def openai_chat_completions_bare(request: Request) -> Any:
        return await _handle_openai(request, cfg, token_mgr)

    # /v1/models/{model}: some clients probe for a specific model's existence
    # before dispatching. Return minimal metadata so they don't bail.
    # Protected like /v1/models so an unauthenticated caller cannot enumerate
    # the configured aliases one probe at a time.
    @app.get("/v1/models/{model_id:path}", dependencies=protected)
    async def get_model(model_id: str) -> dict[str, Any]:
        if (
            model_id in cfg.anthropic_model_aliases
            or model_id in cfg.gemini_model_aliases
            or model_id in cfg.maas_model_aliases
            or model_id.startswith("google/")
        ):
            return {"id": model_id, "object": "model", "owned_by": "vertex-proxy"}
        raise HTTPException(status_code=404, detail=f"model '{model_id}' not found")

    # --- OpenAI-client URL tolerance ------------------------------------------
    # OpenAI-style clients construct their final URL by appending a fixed suffix
    # ("/chat/completions" for inference, "/models" or "/v1/models" for model
    # discovery) onto whatever base_url the user configured. A user who wants
    # Gemini traffic naturally sets base_url to ".../gemini", but that prefix is
    # the *native* generateContent route, so the appended "/chat/completions"
    # would 404 (see issue #1). Gemini is reachable through Vertex's OpenAI-compat
    # endpoint inside _handle_openai, which keys off the request body's `model`
    # and ignores the URL prefix entirely. So we mount the OpenAI-compat handler
    # (and the discovery endpoints) under the "/gemini" and "/openai" prefixes as
    # well as the bare root, letting any reasonable base_url choice work.
    _chat_alias_paths = (
        "/openai/chat/completions",
        "/gemini/v1/chat/completions",
        "/gemini/chat/completions",
    )
    for _path in _chat_alias_paths:
        app.add_api_route(
            _path,
            openai_chat_completions,
            methods=["POST"],
            dependencies=protected,
        )

    # Model-catalog discovery under the same prefixes. Clients probe these before
    # dispatching; a 404 produces noisy logs and "could not fetch models" warnings
    # (and a few clients refuse to proceed). Mirror /v1/models everywhere a client
    # is likely to look.
    _models_list_alias_paths = (
        "/models",
        "/openai/v1/models",
        "/openai/models",
        "/gemini/v1/models",
        "/gemini/models",
        "/api/v1/models",
        "/openai/api/v1/models",
        "/gemini/api/v1/models",
    )
    for _path in _models_list_alias_paths:
        app.add_api_route(
            _path,
            list_models,
            methods=["GET"],
            dependencies=protected,
        )

    _model_probe_alias_paths = (
        "/openai/v1/models/{model_id:path}",
        "/gemini/v1/models/{model_id:path}",
    )
    for _path in _model_probe_alias_paths:
        app.add_api_route(
            _path,
            get_model,
            methods=["GET"],
            dependencies=protected,
        )

    return app
317
+
318
+
319
+ # --- Anthropic handler ------------------------------------------------------
320
+
321
+
322
async def _handle_anthropic(request: Request, cfg: Settings, tm: TokenManager) -> Any:
    """Forward an Anthropic Messages request to Claude on Vertex AI.

    The ``model`` field is resolved through ``cfg.anthropic_model_aliases``,
    removed from the body, and placed in the upstream URL instead.

    Returns:
        A ``StreamingResponse`` when the body sets ``stream``, otherwise the
        upstream status and JSON body.

    Raises:
        HTTPException: 400 for a malformed body or unknown model, 502 when the
            non-streaming upstream call fails at the transport level.
    """
    # Function-scope import: only the model-name check below needs it.
    import re

    try:
        body = await request.json()
    except Exception as exc:
        raise HTTPException(status_code=400, detail="request body must be JSON") from exc
    # Valid JSON is not necessarily an object; `.get` below needs a dict.
    if not isinstance(body, dict):
        raise HTTPException(status_code=400, detail="request body must be a JSON object")

    raw_model = body.get("model")
    # A non-string model (number, list, ...) is treated as missing.
    requested_model = raw_model.strip() if isinstance(raw_model, str) else ""
    if not requested_model:
        raise HTTPException(status_code=400, detail="missing 'model' in request body")

    # Alias resolution.
    vertex_model = cfg.anthropic_model_aliases.get(requested_model)
    if vertex_model is None:
        # Not an alias, so the name came straight from the client and is about
        # to be placed in the upstream URL path. Allow only characters that
        # cannot add path segments, a query string or a fragment.
        safe_name = re.fullmatch(r"[A-Za-z0-9._@-]+", requested_model) is not None
        vertex_model = requested_model if safe_name else ""
    if "@" not in vertex_model:
        # Accept a bare name only if it's an exact match; otherwise fail loud.
        raise HTTPException(
            status_code=400,
            detail=f"unknown anthropic model '{requested_model}'. "
            f"known aliases: {sorted(cfg.anthropic_model_aliases.keys())}",
        )

    # Anthropic-on-Vertex wants `anthropic_version` and removes `model`.
    upstream_body = {k: v for k, v in body.items() if k != "model"}
    upstream_body.setdefault("anthropic_version", "vertex-2023-10-16")

    streaming = bool(body.get("stream"))
    # Vertex endpoint: :streamRawPredict for streaming, :rawPredict for one-shot.
    action = "streamRawPredict" if streaming else "rawPredict"
    url = (
        f"https://{cfg.anthropic_region}-aiplatform.googleapis.com/v1/projects/"
        f"{cfg.project_id}/locations/{cfg.anthropic_region}/publishers/anthropic/"
        f"models/{vertex_model}:{action}"
    )

    token = await tm.get_token()
    headers = {
        "Authorization": f"Bearer {token}",
        "Content-Type": "application/json",
    }

    logger.info(
        "anthropic: model=%s → vertex_model=%s streaming=%s",
        requested_model,
        vertex_model,
        streaming,
    )

    http: httpx.AsyncClient = request.app.state.http
    if streaming:
        # Counted as 200 because the response status is committed before the
        # upstream status is known.
        _METRICS.record_request("anthropic", requested_model, 200)
        return StreamingResponse(
            _stream_bytes(http, url, headers, upstream_body),
            media_type="text/event-stream",
        )

    try:
        resp = await http.post(url, headers=headers, json=upstream_body)
    except httpx.HTTPError as exc:
        logger.error("anthropic upstream error: %s", exc)
        raise HTTPException(status_code=502, detail=f"upstream error: {exc}") from exc

    return _passthrough_response(resp, route="anthropic", model=requested_model)
383
+
384
+
385
+ # --- Gemini handler ---------------------------------------------------------
386
+
387
+
388
async def _handle_gemini(
    model_and_action: str, request: Request, cfg: Settings, tm: TokenManager
) -> Any:
    """Forward a native Gemini request to Vertex AI.

    Args:
        model_and_action: Path tail such as ``gemini-2.5-pro:generateContent``
            or ``gemini-2.5-flash:streamGenerateContent``.
        request: Incoming request; its JSON body and query string are
            forwarded upstream.
        cfg: Proxy settings (region, project, Gemini aliases).
        tm: Source of the upstream bearer token.

    Returns:
        A ``StreamingResponse`` when the action name contains "stream",
        otherwise the upstream status and JSON body.

    Raises:
        HTTPException: 400 for a malformed path, 502 when the non-streaming
            upstream call fails at the transport level.
    """
    # Function-scope import: only the path validation below needs it.
    import re

    if ":" not in model_and_action:
        raise HTTPException(
            status_code=400,
            detail="gemini path must include action (e.g., ':generateContent')",
        )
    requested_model, action = model_and_action.rsplit(":", 1)

    vertex_model = cfg.gemini_model_aliases.get(requested_model)
    if vertex_model is None:
        # Not an alias, so the name comes straight from the request path (the
        # route uses a `:path` converter, so it may contain "/", "?" or "#").
        # It is about to be placed in the upstream URL; allow only characters
        # that cannot alter the URL's structure.
        if re.fullmatch(r"[A-Za-z0-9._-]+", requested_model) is None:
            raise HTTPException(
                status_code=400,
                detail="gemini model name contains unsupported characters",
            )
        vertex_model = requested_model
    if re.fullmatch(r"[A-Za-z]+", action) is None:
        raise HTTPException(
            status_code=400,
            detail="gemini action must be alphabetic (e.g., ':generateContent')",
        )
    streaming = "stream" in action.lower()

    try:
        body = await request.json()
    except Exception:  # noqa: BLE001
        # Best-effort, as before: an empty or non-JSON body is forwarded as {}.
        logger.debug("gemini: request body is not JSON; forwarding an empty object")
        body = {}

    url = (
        f"https://{cfg.gemini_region}-aiplatform.googleapis.com/v1/projects/"
        f"{cfg.project_id}/locations/{cfg.gemini_region}/publishers/google/"
        f"models/{vertex_model}:{action}"
    )
    # Pass through query params (e.g., alt=sse).
    if request.url.query:
        url = f"{url}?{request.url.query}"

    token = await tm.get_token()
    headers = {
        "Authorization": f"Bearer {token}",
        "Content-Type": "application/json",
    }

    logger.info(
        "gemini: model=%s action=%s streaming=%s",
        requested_model,
        action,
        streaming,
    )

    http: httpx.AsyncClient = request.app.state.http
    if streaming:
        # Counted as 200 because the response status is committed before the
        # upstream status is known.
        _METRICS.record_request("gemini", requested_model, 200)
        return StreamingResponse(
            _stream_bytes(http, url, headers, body),
            media_type="text/event-stream",
        )

    try:
        resp = await http.post(url, headers=headers, json=body)
    except httpx.HTTPError as exc:
        logger.error("gemini upstream error: %s", exc)
        raise HTTPException(status_code=502, detail=f"upstream error: {exc}") from exc

    return _passthrough_response(resp, route="gemini", model=requested_model)
444
+
445
+
446
+ # --- OpenAI-compatible (Vertex MaaS) handler -------------------------------
447
+
448
+
449
async def _handle_openai(request: Request, cfg: Settings, tm: TokenManager) -> Any:
    """Forward an OpenAI Chat Completions request to Vertex AI.

    A model that is a configured Gemini alias, or that starts with
    ``google/``, goes to Vertex's OpenAI-compatible endpoint. Every other
    model is looked up in ``cfg.maas_model_aliases`` (Moonshot/Kimi,
    Zhipu/GLM, MiniMax, Alibaba/Qwen, xAI/Grok).

    Returns:
        A ``StreamingResponse`` when the body sets ``stream``, otherwise the
        upstream status and JSON body.

    Raises:
        HTTPException: 400 for a malformed body or unknown model, 502 when the
            non-streaming upstream call fails at the transport level.
    """
    try:
        body = await request.json()
    except Exception as exc:
        raise HTTPException(status_code=400, detail="request body must be JSON") from exc
    # Valid JSON is not necessarily an object; `.get` below needs a dict.
    if not isinstance(body, dict):
        raise HTTPException(status_code=400, detail="request body must be a JSON object")

    raw_model = body.get("model")
    # A non-string model (number, list, ...) is treated as missing.
    requested_model = raw_model.strip() if isinstance(raw_model, str) else ""
    if not requested_model:
        raise HTTPException(status_code=400, detail="missing 'model' in request body")

    streaming = bool(body.get("stream"))
    upstream_body = dict(body)

    # --- routing: Gemini via Vertex OpenAI-compat, or MaaS partner model. ---
    if requested_model in cfg.gemini_model_aliases or requested_model.startswith("google/"):
        # Gemini models through Vertex's OpenAI-compat endpoint.
        # See: https://cloud.google.com/vertex-ai/generative-ai/docs/multimodal/call-gemini-using-openai-library
        bare_model = requested_model.removeprefix("google/")
        if not bare_model:
            # "google/" alone names no model.
            raise HTTPException(status_code=400, detail="missing 'model' in request body")
        vertex_model = cfg.gemini_model_aliases.get(bare_model, bare_model)
        url = (
            f"https://{cfg.gemini_region}-aiplatform.googleapis.com/v1beta1/projects/"
            f"{cfg.project_id}/locations/{cfg.gemini_region}/endpoints/openapi/chat/completions"
        )
        upstream_body["model"] = f"google/{vertex_model}"
        logger.info(
            "openai→gemini: model=%s → %s streaming=%s",
            requested_model,
            upstream_body["model"],
            streaming,
        )
    else:
        # MaaS partner models (Kimi, GLM, MiniMax, Qwen, Grok).
        path_fragment = cfg.maas_model_aliases.get(requested_model)
        if path_fragment is None:
            raise HTTPException(
                status_code=400,
                detail=(
                    f"unknown MaaS model '{requested_model}'. "
                    f"known aliases: {sorted(cfg.maas_model_aliases.keys())} "
                    f"or gemini: {sorted(cfg.gemini_model_aliases.keys())}"
                ),
            )
        url = (
            f"https://{cfg.maas_region}-aiplatform.googleapis.com/v1beta1/projects/"
            f"{cfg.project_id}/locations/{cfg.maas_region}/{path_fragment}/chat/completions"
        )
        # Upstream gets the last path segment of the fragment as the model.
        upstream_body["model"] = path_fragment.rsplit("/", 1)[-1]
        logger.info(
            "openai→maas: model=%s → path=%s streaming=%s",
            requested_model,
            path_fragment,
            streaming,
        )

    # Fetch the token only after routing succeeded, so a request for an
    # unknown model is rejected without touching the credentials.
    token = await tm.get_token()
    headers = {
        "Authorization": f"Bearer {token}",
        "Content-Type": "application/json",
    }

    http: httpx.AsyncClient = request.app.state.http
    if streaming:
        # Counted as 200 because the response status is committed before the
        # upstream status is known.
        _METRICS.record_request("openai", requested_model, 200)
        return StreamingResponse(
            _stream_bytes(http, url, headers, upstream_body),
            media_type="text/event-stream",
        )

    try:
        resp = await http.post(url, headers=headers, json=upstream_body)
    except httpx.HTTPError as exc:
        logger.error("maas upstream error: %s", exc)
        raise HTTPException(status_code=502, detail=f"upstream error: {exc}") from exc

    return _passthrough_response(resp, route="openai", model=requested_model)
528
+
529
+
530
+ # --- helpers ----------------------------------------------------------------
531
+
532
+
533
async def _stream_bytes(
    http: httpx.AsyncClient,
    url: str,
    headers: dict[str, str],
    body: dict[str, Any],
) -> AsyncGenerator[bytes, None]:
    """POST ``body`` to ``url`` and yield the upstream response bytes as they arrive.

    Upstream failures are reported in-band as one ``event: error`` SSE frame
    (built by ``_stream_error``) instead of raising. By the time this
    generator runs, the client-facing response has already started with a 200
    status.

    Yields:
        Raw upstream chunks, or one error frame when the upstream status is
        >= 400 or an ``httpx`` error occurs.
    """
    try:
        async with http.stream(
            "POST",
            url,
            headers=headers,
            json=body,
            timeout=STREAM_HTTP_TIMEOUT,
        ) as r:
            if r.status_code >= 400:
                # StreamingResponse has already committed to a 200 status by
                # the time this generator runs, so emit a structured SSE error
                # instead of raising and leaving the client with a broken chunk.
                # aread() collects the body in one call, replacing the manual
                # `bytes +=` accumulation loop.
                err_body = await r.aread()
                detail = err_body.decode("utf-8", errors="replace")[:2000]
                logger.warning("upstream stream returned %s: %s", r.status_code, detail)
                yield _stream_error("upstream_http_error", detail, status_code=r.status_code)
                return
            async for chunk in r.aiter_bytes():
                yield chunk
    except httpx.ReadTimeout as exc:
        logger.warning("upstream stream read timeout: %s", exc)
        yield _stream_error(
            "upstream_read_timeout",
            "upstream stream stalled before completion",
        )
    except httpx.HTTPError as exc:
        logger.error("upstream stream error: %s", exc)
        yield _stream_error("upstream_stream_error", str(exc))
569
+
570
+
571
+ def _stream_error(error_type: str, message: str, status_code: int | None = None) -> bytes:
572
+ payload: dict[str, Any] = {
573
+ "error": {
574
+ "type": error_type,
575
+ "message": message[:2000],
576
+ }
577
+ }
578
+ if status_code is not None:
579
+ payload["error"]["status_code"] = status_code
580
+ return f"event: error\ndata: {json.dumps(payload)}\n\n".encode()
581
+
582
+
583
def _usage_token_counts(payload: Any) -> tuple[int, int]:
    """Extract (prompt, completion) token counts from an upstream payload.

    Reads the OpenAI/Anthropic-style ``usage`` object first, then the Gemini
    ``usageMetadata`` object. Missing or malformed values count as 0, so
    metrics bookkeeping can never fail a request.
    """

    def _as_count(value: Any) -> int:
        try:
            return max(int(value or 0), 0)
        except (TypeError, ValueError):
            return 0

    if not isinstance(payload, dict):
        return 0, 0

    usage = payload.get("usage")
    if isinstance(usage, dict):
        return (
            _as_count(usage.get("prompt_tokens") or usage.get("input_tokens")),
            _as_count(usage.get("completion_tokens") or usage.get("output_tokens")),
        )

    # Native Gemini generateContent responses report usage under this key.
    gemini_usage = payload.get("usageMetadata")
    if isinstance(gemini_usage, dict):
        return (
            _as_count(gemini_usage.get("promptTokenCount")),
            _as_count(gemini_usage.get("candidatesTokenCount")),
        )

    return 0, 0


def _passthrough_response(resp: httpx.Response, route: str = "", model: str = "") -> JSONResponse:
    """Forward upstream status + JSON body to the client.

    A body that is not valid JSON is forwarded as ``{"raw": <first 4000
    characters>}``. When both ``route`` and ``model`` are given, the request
    is counted and any token usage found in the payload is recorded. This
    happens whether or not the metrics endpoint is enabled.
    """
    try:
        payload = resp.json()
    except ValueError:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        # Not JSON; forward as text wrapped.
        payload = {"raw": resp.text[:4000]}

    if route and model:
        _METRICS.record_request(route, model, resp.status_code)
        prompt, completion = _usage_token_counts(payload)
        if prompt or completion:
            _METRICS.record_tokens(model, prompt, completion)

    return JSONResponse(status_code=resp.status_code, content=payload)
@@ -0,0 +1,332 @@
1
+ Metadata-Version: 2.4
2
+ Name: vertex-proxy
3
+ Version: 0.2.0
4
+ Summary: Anthropic + Gemini + OpenAI API-compatible proxy for Google Cloud Vertex AI. Bridges static-URL API consumers to Vertex AI's service-account auth.
5
+ Project-URL: Homepage, https://github.com/prasadus92/vertex-proxy
6
+ Project-URL: Author, https://prasad.tech
7
+ Project-URL: Issues, https://github.com/prasadus92/vertex-proxy/issues
8
+ Project-URL: Changelog, https://github.com/prasadus92/vertex-proxy/blob/main/CHANGELOG.md
9
+ Author-email: Prasad Subrahmanya <prasad@luminik.io>
10
+ License: MIT
11
+ License-File: LICENSE
12
+ Keywords: anthropic,api-proxy,claude,fastapi,gcp,gemini,google-cloud,llm,llm-proxy,openai,openai-compatible,proxy,service-account,vertex-ai
13
+ Classifier: Development Status :: 4 - Beta
14
+ Classifier: Intended Audience :: Developers
15
+ Classifier: License :: OSI Approved :: MIT License
16
+ Classifier: Programming Language :: Python :: 3.11
17
+ Classifier: Programming Language :: Python :: 3.12
18
+ Classifier: Topic :: Software Development :: Libraries
19
+ Requires-Python: >=3.11
20
+ Requires-Dist: fastapi>=0.109
21
+ Requires-Dist: google-auth>=2.28
22
+ Requires-Dist: httpx>=0.26
23
+ Requires-Dist: pydantic-settings>=2.2
24
+ Requires-Dist: pydantic>=2.6
25
+ Requires-Dist: requests>=2.31
26
+ Requires-Dist: uvicorn[standard]>=0.27
27
+ Provides-Extra: dev
28
+ Requires-Dist: pytest-asyncio>=0.23; extra == 'dev'
29
+ Requires-Dist: pytest>=8.0; extra == 'dev'
30
+ Requires-Dist: ruff>=0.3; extra == 'dev'
31
+ Description-Content-Type: text/markdown
32
+
33
+ # vertex-proxy
34
+
35
+ [![CI](https://github.com/prasadus92/vertex-proxy/actions/workflows/ci.yml/badge.svg)](https://github.com/prasadus92/vertex-proxy/actions/workflows/ci.yml)
36
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](LICENSE)
37
+ [![Python 3.11+](https://img.shields.io/badge/python-3.11+-blue.svg)](https://www.python.org/downloads/)
38
+
39
+ A small, local-only proxy that bridges **any tool speaking the Anthropic Messages API, Gemini API, or OpenAI Chat Completions API** to **Google Cloud Vertex AI**, so you can point existing clients at Vertex without changing their code.
40
+
41
+ ## What this is for
42
+
43
+ You have a tool (Claude Code, Hermes Agent, opencode, Cline, Continue.dev, a custom SDK integration, etc.) that already knows how to talk to:
44
+
45
+ - `api.anthropic.com`
46
+ - `generativelanguage.googleapis.com`
47
+ - any OpenAI-compatible endpoint
48
+
49
+ You want that same tool to hit Vertex AI instead, maybe because you want to burn GCP credits, unify billing, or get higher quotas than the public APIs offer.
50
+
51
+ The problem: Vertex uses **short-lived OAuth access tokens** from a service-account key. Most tools expect a static `Authorization: Bearer xxx` header. Nobody wants to rebuild auth in every client.
52
+
53
+ vertex-proxy runs on `127.0.0.1:8787`, handles the auth refresh loop, and translates between the public API shapes and Vertex's publisher-model endpoints.
54
+
55
+ ```
56
+ ┌──────────────┐ Anthropic/Gemini/OpenAI ┌──────────────┐ GCP auth ┌────────────┐
57
+ │ your tool │ ──────────────────────────► │ vertex-proxy │ ──────────► │ Vertex AI │
58
+ └──────────────┘ localhost:8787 └──────────────┘ SA JWT └────────────┘
59
+ ```
60
+
61
+ No client changes. Small, dependency-light Python. MIT licensed.
62
+
63
+ ## Install
64
+
65
+ Python 3.11+, a GCP project with Vertex AI API enabled, and a service-account JSON key with `roles/aiplatform.user`.
66
+
67
+ ```bash
68
+ pipx install vertex-proxy
69
+ # or: uv tool install vertex-proxy
70
+ # or run it without installing: uvx vertex-proxy
71
+ ```
72
+
73
+ ### From source (for development)
74
+
75
+ ```bash
76
+ git clone https://github.com/prasadus92/vertex-proxy.git
77
+ cd vertex-proxy
78
+ python -m venv .venv
79
+ .venv/bin/pip install -e .
80
+ ```
81
+
82
+ ## Run
83
+
84
+ ```bash
85
+ export VERTEX_PROXY_CREDENTIALS_PATH=/path/to/service-account.json
86
+ export VERTEX_PROXY_PROJECT_ID=your-gcp-project
87
+ vertex-proxy
88
+ # → listening on http://127.0.0.1:8787
89
+ ```
90
+
91
+ Or inline:
92
+
93
+ ```bash
94
+ vertex-proxy \
95
+ --credentials ~/.vertex/key.json \
96
+ --project-id my-project \
97
+ --port 8787
98
+ ```
99
+
100
+ (From a source checkout, the command is `.venv/bin/vertex-proxy`.)
101
+
102
+ Verify:
103
+
104
+ ```bash
105
+ curl http://127.0.0.1:8787/health
106
+ # {"status":"ok","project":"my-project"}
107
+
108
+ curl -X POST http://127.0.0.1:8787/gemini/v1beta/models/gemini-2.5-flash:generateContent \
109
+ -H "Content-Type: application/json" \
110
+ -d '{"contents":[{"role":"user","parts":[{"text":"hello"}]}]}'
111
+ ```
112
+
113
+ ## Endpoints
114
+
115
+ | Path | API compat | Vertex backend |
116
+ |---|---|---|
117
+ | `POST /anthropic/v1/messages` | Anthropic Messages API | `publishers/anthropic/models/{model}:rawPredict` |
118
+ | `POST /gemini/v1beta/models/{m}:{action}` | Gemini generateContent API | `publishers/google/models/{m}:{action}` |
119
+ | `POST /openai/v1/chat/completions` | OpenAI Chat Completions | Gemini (via Vertex OpenAI-compat) + MaaS partner models (Kimi, GLM, MiniMax, Qwen, Grok) |
120
+ | `GET /v1/models` | - | Lists routable models |
121
+ | `GET /health` | - | Liveness + auth check |
122
+
123
+ The OpenAI Chat Completions shape is also accepted under the `/gemini` prefix and the bare root, so clients that build their URL from a `base_url` of `.../openai`, `.../gemini`, or the server root all reach the same handler. Model-discovery probes (`/v1/models`, `/models`) are mirrored under those prefixes too.
124
+
125
+ Streaming is supported on all routes (Anthropic, Gemini, and the OpenAI-compat route).
126
+ Streaming requests use an upstream client with no read timeout, so long Vertex generations are not cut off during idle periods.
127
+
128
+ ## Pre-configured models
129
+
130
+ All aliases live in [`vertex_proxy/config.py`](vertex_proxy/config.py); extend as needed.
131
+
132
+ **Anthropic** (on Vertex, `us-east5` by default)
133
+ - `claude-sonnet-4-5-20250929` → `claude-sonnet-4-5@20250929`
134
+ - `claude-opus-4-5-20250929` → `claude-opus-4-5@20250929`
135
+ - `claude-haiku-4-5-20250929` → `claude-haiku-4-5@20250929`
136
+
137
+ **Gemini** (on Vertex, `us-central1` by default)
138
+ - `gemini-2.5-pro`, `gemini-2.5-flash`, `gemini-2.0-flash`
139
+
140
+ **MaaS partner models** (OpenAI-compatible route)
141
+ - `kimi-k2.5`, `kimi-k2` (Moonshot)
142
+ - `glm-5`, `glm-5.1`, `glm-4.6` (Zhipu)
143
+ - `minimax-m2.5`, `minimax-m1` (MiniMax)
144
+ - `qwen3.5`, `qwen-3` (Alibaba)
145
+ - `grok-4.20`, `grok-4.1-fast` (xAI)
146
+
147
+ ## Recipes
148
+
149
+ ### Claude Code CLI
150
+
151
+ Point Claude Code at the proxy via `ANTHROPIC_BASE_URL`:
152
+
153
+ ```bash
154
+ export ANTHROPIC_BASE_URL=http://127.0.0.1:8787/anthropic
155
+ export ANTHROPIC_AUTH_TOKEN=bypass # proxy ignores this; Vertex auth is server-side
156
+ claude
157
+ ```
158
+
159
+ Your local Claude Code session now bills against your GCP project instead of api.anthropic.com.
160
+
161
+ ### Hermes Agent
162
+
163
+ Add to `~/.hermes/config.yaml`:
164
+
165
+ ```yaml
166
+ custom_providers:
167
+ - name: vertex-gemini
168
+ # Hermes's openai_chat transport appends /chat/completions (and probes
169
+ # /v1/models) onto base_url. Gemini is served through Vertex's OpenAI-compat
170
+ # layer, so any of these bases work: .../openai, .../gemini, or the bare root.
171
+ base_url: http://127.0.0.1:8787/openai
172
+ transport: openai_chat
173
+
174
+ - name: vertex-anthropic
175
+ base_url: http://127.0.0.1:8787/anthropic
176
+ transport: anthropic_messages
177
+
178
+ fallback_model:
179
+ provider: vertex-gemini
180
+ model: gemini-2.5-pro
181
+ ```
182
+
183
+ No Hermes source changes are required; this uses Hermes's existing `custom_providers` mechanism. The `openai_chat` transport routes through the proxy's OpenAI-compat handler, which dispatches Gemini models to Vertex's OpenAI-compatible endpoint based on the request body's `model`.
184
+
185
+ ### opencode / Cline / any Anthropic-SDK client
186
+
187
+ Set the base URL environment variable the client supports (usually one of `ANTHROPIC_BASE_URL`, `ANTHROPIC_API_URL`, or the equivalent in your client's config):
188
+
189
+ ```bash
190
+ export ANTHROPIC_BASE_URL=http://127.0.0.1:8787/anthropic
191
+ ```
192
+
193
+ ## Run as a service (macOS launchd)
194
+
195
+ ```bash
196
+ cd launchd
197
+ ./install.sh --credentials /path/to/key.json --project my-gcp-project
198
+ ```
199
+
200
+ This renders the plist template, copies it to `~/Library/LaunchAgents/`, loads it, and does a health check. Logs go to `~/Library/Logs/vertex-proxy.{log,err}`.
201
+
202
+ Stop:
203
+ ```bash
204
+ launchctl unload ~/Library/LaunchAgents/ai.hermes.vertex-proxy.plist
205
+ ```
206
+
207
+ For Linux, the same pattern works with systemd; see [`examples/systemd.service`](examples/systemd.service).
208
+
209
+ ## Configuration reference
210
+
211
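+ All settings can be set through environment variables with the `VERTEX_PROXY_` prefix. The most common ones are also available as CLI flags: `--host`, `--port`, `--credentials`, `--project-id`, and `--log-level`.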
212
+
213
+ | Env var | Default | Purpose |
214
+ |---|---|---|
215
+ | `VERTEX_PROXY_CREDENTIALS_PATH` | - | Service-account JSON path (falls back to ADC) |
216
+ | `VERTEX_PROXY_PROJECT_ID` | inferred from key | GCP project ID |
217
+ | `VERTEX_PROXY_ANTHROPIC_REGION` | `us-east5` | Region for Claude |
218
+ | `VERTEX_PROXY_GEMINI_REGION` | `us-central1` | Region for Gemini |
219
+ | `VERTEX_PROXY_MAAS_REGION` | `us-central1` | Region for Kimi / GLM / MiniMax / Qwen / Grok |
220
+ | `VERTEX_PROXY_HOST` | `127.0.0.1` | Bind host |
221
+ | `VERTEX_PROXY_PORT` | `8787` | Bind port |
222
+ | `VERTEX_PROXY_TOKEN_REFRESH_SECONDS` | `3000` | Token refresh interval (50 min) |
223
+ | `VERTEX_PROXY_LOG_LEVEL` | `info` | uvicorn log level |
224
+
225
+ ## A word on GCP credits
226
+
227
+ **GCP promotional credits (startup, free trial, partner) typically do NOT cover Google Cloud Marketplace purchases.** On Vertex AI, this matters because:
228
+
229
+ - **First-party Google models** (Gemini 2.5 Pro / Flash, Gemma) are billed as "Vertex AI API" usage → **credits cover ✅**
230
+ - **Partner models** (Claude, Kimi, GLM, MiniMax, Grok) are typically billed via GCP Marketplace → **credits usually don't cover ❌**
231
+
232
+ The "Promotional credits" section of your model's agreement page in Google Cloud Console will tell you explicitly. Quote from a typical Claude-on-Vertex agreement:
233
+
234
+ > *Most Google Cloud promotional credits don't apply to Google Cloud Marketplace purchases.*
235
+
236
+ If credit-burn is your goal, point vertex-proxy at Gemini. If billing unification is your goal, vertex-proxy works for everything.
237
+
238
+ ## Security
239
+
240
+ vertex-proxy binds to `127.0.0.1` by default and **ships with no authentication**. It's designed as a local-loopback shim; anyone who can reach it can spend your GCP credits via your service account.
241
+
242
+ Do not expose it to a public interface. If you need remote access, put it behind a reverse proxy with proper auth (nginx + basic auth, Tailscale, Cloud Run with IAP, etc.).
243
+
244
+ ## Status
245
+
246
+ - [x] Anthropic Messages API → Vertex Claude (with streaming)
247
+ - [x] Gemini generateContent API → Vertex Gemini (with streaming)
248
+ - [x] OpenAI Chat Completions → Vertex Gemini via Vertex's OpenAI-compat layer
249
+ - [x] OpenAI Chat Completions → Vertex MaaS partner models (Kimi, GLM, MiniMax, Qwen, Grok)
250
+ - [x] Multiple URL shapes accepted for OpenAI client compatibility: chat completions under the `/openai`, `/gemini`, and bare-root prefixes (e.g. `/openai/v1/chat/completions`, `/gemini/chat/completions`, `/chat/completions`), plus model-discovery (`/v1/models`, `/models`) mirrored under the same prefixes
251
+ - [x] Automatic GCP service-account token refresh
252
+ - [x] launchd (macOS) + systemd (Linux) service recipes
253
+ - [x] Dockerfile + docker-compose for containerized deploy
254
+ - [x] Optional bearer-token auth on the proxy itself (for remote deploys)
255
+ - [x] Prometheus metrics endpoint at `/metrics`
256
+ - [x] `pipx` / `uv` / `uvx` install via PyPI (tag-triggered OIDC Trusted Publishing release workflow)
257
+ - [x] 22 unit tests, GitHub Actions CI on Python 3.11 + 3.12
258
+
259
+ ### Tested with
260
+ - [x] Hermes Agent: verified end-to-end with live Gemini 2.5 Flash dispatch
261
+ - [x] Claude Code CLI: via `ANTHROPIC_BASE_URL` env
262
+ - [x] Direct `curl` against all routes
263
+
264
+ ## Troubleshooting
265
+
266
+ ### Client reports incomplete chunked read during streaming
267
+
268
+ This usually means the upstream Vertex stream was interrupted. Current streaming routes keep the upstream read open without a fixed read timeout and return a structured SSE error if Vertex still fails mid-stream, so clients should receive a clean error event instead of a broken HTTP chunk.
269
+
270
+ ### 404 "model not found" on Claude routes
271
+
272
+ Most Vertex AI Claude model endpoints require one-time enablement in Model Garden. Go to https://console.cloud.google.com/vertex-ai/publishers/anthropic/model-garden and click ENABLE on the specific model (Sonnet, Opus, Haiku). Accept the Marketplace T&Cs. Your service account can then call them.
273
+
274
+ Note: GCP promotional credits typically don't cover Marketplace models. See "A word on GCP credits" above.
275
+
276
+ ### 404 "model not found" on MaaS routes (Kimi, GLM, MiniMax, Qwen, Grok)
277
+
278
+ Same as Claude: Vertex partner models require Model Garden enablement per model. Additionally, the MaaS path in `config.py` is a best-effort guess at Vertex's URL shape for these partners. If you hit 404s after enablement, check the "How to use" tab on the model's page in Model Garden and update the `maas_model_aliases` entry with the exact path fragment Google shows.
279
+
280
+ ### 401 / 403 on all routes
281
+
282
+ The most likely cause is that your service account lacks `roles/aiplatform.user`. Grant it:
283
+ ```
284
+ gcloud projects add-iam-policy-binding YOUR_PROJECT \
285
+ --member="serviceAccount:YOUR_SA@YOUR_PROJECT.iam.gserviceaccount.com" \
286
+ --role="roles/aiplatform.user"
287
+ ```
288
+
289
+ ### Gemini 2.5 returns empty content with `reasoning_tokens` populated
290
+
291
+ Gemini 2.5 models use an internal "thinking" budget that counts against `max_tokens`. If `max_tokens` is too low, the model may use all its budget on thinking and return no visible output. Raise `max_tokens` to at least 100 for anything beyond trivial replies.
292
+
293
+ ### Hermes (or any OpenAI-chat client) returns `404 {'detail': 'Not Found'}` for Gemini
294
+
295
+ This happens when an OpenAI-chat client is pointed at the `/gemini` base. That client builds its request URL by appending `/chat/completions` (so it actually calls `/gemini/chat/completions`), but `/gemini` is the *native* `generateContent` route, which has no `chat/completions` handler. `curl` against `/gemini/v1beta/models/...:generateContent` works because that's the native shape; the OpenAI-chat client uses a different shape.
296
+
297
+ The proxy now accepts the OpenAI-chat shape under the `/gemini` and `/openai` prefixes as well as the bare root, so `transport: openai_chat` works against any of these bases. If you're on an older build, point the provider's `base_url` at `http://127.0.0.1:8787/openai` (or the bare `http://127.0.0.1:8787`) instead of `.../gemini`. Gemini still routes correctly because the OpenAI-compat handler dispatches by the request body's `model`.
298
+
299
+ ### Request works with `curl` but fails from my OpenAI client
300
+
301
+ Your client is probably sending requests to a URL shape the shim didn't expect. The shim accepts `/chat/completions`, `/v1/chat/completions`, `/openai/v1/chat/completions`, `/openai/chat/completions`, `/gemini/v1/chat/completions`, and `/gemini/chat/completions` for OpenAI-compatible traffic, and mirrors model discovery (`/v1/models`, `/models`) under the same prefixes. If your client sends something else, file an issue with the exact URL shape and we'll add it.
302
+
303
+ ### Token refresh errors in logs
304
+
305
+ The background refresh task logs errors but doesn't crash the process. If you see repeated refresh failures, check:
306
+ 1. Service account JSON path is correct (`VERTEX_PROXY_CREDENTIALS_PATH`)
307
+ 2. Machine clock is in sync (GCP JWT exchange is clock-sensitive)
308
+ 3. Service account isn't disabled or rotated in GCP IAM
309
+
310
+ ## Comparison with alternatives
311
+
312
+ | Tool | What it does | Fit |
313
+ |---|---|---|
314
+ | **vertex-proxy** (this) | Bridge existing Anthropic/Gemini/OpenAI clients to Vertex AI with auto-auth | You already use a tool with configurable base URL and want to point it at Vertex without rewriting auth |
315
+ | **LiteLLM** | Full-featured multi-provider router with caching, budgets, observability | Managing many providers centrally with policies; heavier dependency |
316
+ | **openai-compat-server** (various) | OpenAI shape over arbitrary backend | Similar to one route of vertex-proxy; doesn't handle GCP SA auth natively |
317
+ | **Vertex AI Python SDK** | Direct first-party Google SDK | You're writing new code and want to talk Vertex directly |
318
+ | **Anthropic Python SDK with Vertex backend** | First-party SDK with Vertex mode flag | You're writing new Anthropic code and control the client |
319
+
320
+ Use vertex-proxy when you have an **existing** tool you can't modify and need to redirect its traffic to Vertex.
321
+
322
+ ## Contributing
323
+
324
+ See [CONTRIBUTING.md](CONTRIBUTING.md). PRs welcome.
325
+
326
+ ## License
327
+
328
+ MIT. See [LICENSE](LICENSE).
329
+
330
+ ## Credits
331
+
332
+ Built by Prasad Subrahmanya ([prasad.tech](https://prasad.tech) · [@prasadus92](https://github.com/prasadus92)) as part of solving the "Hermes fallback model" problem for [Luminik](https://luminik.io), then extracted into a standalone tool because the shim turned out to be useful beyond Hermes.
@@ -0,0 +1,10 @@
1
+ vertex_proxy/__init__.py,sha256=ToDJZ3kXyPkqgK-G0GJpMglTTwGKsMgwgfNRIuGL7Xc,83
2
+ vertex_proxy/__main__.py,sha256=2juNiHQpWzvqM6AeaD1h4WJSDpQed3JbPRgiPC7jZLQ,1894
3
+ vertex_proxy/auth.py,sha256=enqOFZ8OyyUP0o0c6dSAP6qAnUB_eQDmnEhN-nsfBeg,5141
4
+ vertex_proxy/config.py,sha256=idTrMsd5YZDVjwbpyG7Hvb0BHSNkftsg3p0q7lIIFbY,4159
5
+ vertex_proxy/main.py,sha256=Hr4f5FAgjDsudZLfupr-DFUbji8QSeAzbQzhL7BxUM0,24155
6
+ vertex_proxy-0.2.0.dist-info/METADATA,sha256=OY64FURWVVMg053s-VVCa2pwBZ6sFyuCurZWw9zEUb8,16655
7
+ vertex_proxy-0.2.0.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
8
+ vertex_proxy-0.2.0.dist-info/entry_points.txt,sha256=otfhuFttbqSSH01ZkC118Uz49HOPIkvKR0socZxi21c,60
9
+ vertex_proxy-0.2.0.dist-info/licenses/LICENSE,sha256=W00aDguL-XCsFfMdUU3fwd8Tq9a061MCZC6afJmtqwc,1075
10
+ vertex_proxy-0.2.0.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: hatchling 1.30.1
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ vertex-proxy = vertex_proxy.__main__:main
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Prasad Subrahmanya
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.