@miller-tech/uap 1.13.4 → 1.13.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
|
@@ -145,6 +145,82 @@ llama-server \
|
|
|
145
145
|
| LoRA adapter | ~50 MB |
|
|
146
146
|
| **Total** | **~20 GB** |
|
|
147
147
|
|
|
148
|
+
## Anthropic API Proxy (for Claude Code / Forge Code)
|
|
149
|
+
|
|
150
|
+
Claude Code and Forge Code speak the Anthropic Messages API, but llama.cpp exposes an OpenAI-compatible API. The UAP Anthropic Proxy bridges this gap by translating between the two protocols in real time, including full streaming and tool calling support.
|
|
151
|
+
|
|
152
|
+
### Architecture
|
|
153
|
+
|
|
154
|
+
```
|
|
155
|
+
Claude Code --(Anthropic API :4000)--> UAP Proxy --(OpenAI API :8080)--> llama.cpp
|
|
156
|
+
```
|
|
157
|
+
|
|
158
|
+
### Quick Start
|
|
159
|
+
|
|
160
|
+
```bash
|
|
161
|
+
# Install Python dependencies
|
|
162
|
+
pip install -r tools/agents/scripts/requirements-proxy.txt
|
|
163
|
+
|
|
164
|
+
# Start the proxy (default: listen on :4000, forward to llama.cpp on :8080)
|
|
165
|
+
python tools/agents/scripts/anthropic_proxy.py
|
|
166
|
+
```
|
|
167
|
+
|
|
168
|
+
### Configuration
|
|
169
|
+
|
|
170
|
+
All settings are via environment variables:
|
|
171
|
+
|
|
172
|
+
| Variable | Default | Description |
|
|
173
|
+
| ----------------------- | ------------------------------------ | ---------------------------------------- |
|
|
174
|
+
| `LLAMA_CPP_BASE` | `http://192.168.1.165:8080/v1` | OpenAI-compatible upstream server URL |
|
|
175
|
+
| `PROXY_PORT` | `4000` | Port for the proxy to listen on |
|
|
176
|
+
| `PROXY_HOST` | `0.0.0.0` | Host/IP to bind to |
|
|
177
|
+
| `PROXY_LOG_LEVEL` | `INFO` | Logging level (DEBUG/INFO/WARNING/ERROR) |
|
|
178
|
+
| `PROXY_READ_TIMEOUT` | `600` | Read timeout (seconds) for LLM streaming |
|
|
179
|
+
| `PROXY_MAX_CONNECTIONS` | `20` | Max concurrent upstream connections |
|
|
180
|
+
|
|
181
|
+
### Example: Custom upstream
|
|
182
|
+
|
|
183
|
+
```bash
|
|
184
|
+
LLAMA_CPP_BASE=http://localhost:8080/v1 PROXY_PORT=5000 python tools/agents/scripts/anthropic_proxy.py
|
|
185
|
+
```
|
|
186
|
+
|
|
187
|
+
### Claude Code Configuration
|
|
188
|
+
|
|
189
|
+
Point Claude Code at the proxy by setting the API base URL:
|
|
190
|
+
|
|
191
|
+
```bash
|
|
192
|
+
export ANTHROPIC_BASE_URL=http://localhost:4000
|
|
193
|
+
```
|
|
194
|
+
|
|
195
|
+
### Endpoints
|
|
196
|
+
|
|
197
|
+
| Path | Method | Description |
|
|
198
|
+
| ------------------------ | ------ | ------------------------------------------ |
|
|
199
|
+
| `/v1/messages` | POST | Anthropic Messages API (streaming + sync) |
|
|
200
|
+
| `/anthropic/v1/messages` | POST | Alternative path (some clients use this) |
|
|
201
|
+
| `/v1/models` | GET | Lists spoofed Anthropic model IDs |
|
|
202
|
+
| `/health` | GET | Health check (checks upstream reachability) |
|
|
203
|
+
|
|
204
|
+
### Running as a Service (systemd)
|
|
205
|
+
|
|
206
|
+
```ini
|
|
207
|
+
[Unit]
|
|
208
|
+
Description=UAP Anthropic Proxy
|
|
209
|
+
After=network.target
|
|
210
|
+
|
|
211
|
+
[Service]
|
|
212
|
+
Type=simple
|
|
213
|
+
User=cogtek
|
|
214
|
+
Environment=LLAMA_CPP_BASE=http://192.168.1.165:8080/v1
|
|
215
|
+
Environment=PROXY_PORT=4000
|
|
216
|
+
ExecStart=/usr/bin/python3 /path/to/tools/agents/scripts/anthropic_proxy.py
|
|
217
|
+
Restart=always
|
|
218
|
+
RestartSec=5
|
|
219
|
+
|
|
220
|
+
[Install]
|
|
221
|
+
WantedBy=multi-user.target
|
|
222
|
+
```
|
|
223
|
+
|
|
148
224
|
## Tool Call Format
|
|
149
225
|
|
|
150
226
|
The model emits tool calls in the official Qwen3 format:
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@miller-tech/uap",
|
|
3
|
-
"version": "1.13.
|
|
3
|
+
"version": "1.13.6",
|
|
4
4
|
"description": "Autonomous AI agent memory system with CLAUDE.md protocol enforcement",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "dist/index.js",
|
|
@@ -12,7 +12,8 @@
|
|
|
12
12
|
"uap-template-verify": "./tools/agents/scripts/chat_template_verifier.py",
|
|
13
13
|
"llama-optimize": "./dist/bin/llama-server-optimize.js",
|
|
14
14
|
"generate-lora-data": "./tools/agents/scripts/generate_lora_training_data.py",
|
|
15
|
-
"uap-policy": "./dist/bin/policy.js"
|
|
15
|
+
"uap-policy": "./dist/bin/policy.js",
|
|
16
|
+
"uap-anthropic-proxy": "./tools/agents/scripts/anthropic_proxy.py"
|
|
16
17
|
},
|
|
17
18
|
"scripts": {
|
|
18
19
|
"build": "tsc",
|
|
@@ -188,48 +188,6 @@ fi
|
|
|
188
188
|
GIT_BRANCH=$(git -C "$PROJECT_DIR" branch --show-current 2>/dev/null || echo "?")
|
|
189
189
|
GIT_DIRTY=$(git -C "$PROJECT_DIR" status --porcelain 2>/dev/null | wc -l | tr -d ' ')
|
|
190
190
|
|
|
191
|
-
# ============================================================
|
|
192
|
-
# WORKTREE ENFORCEMENT GATE
|
|
193
|
-
# Detect if we're outside a worktree on main branch
|
|
194
|
-
# ============================================================
|
|
195
|
-
GIT_DIR_VAL=$(git rev-parse --git-dir 2>/dev/null || echo "")
|
|
196
|
-
GIT_COMMON_DIR_VAL=$(git rev-parse --common-dir 2>/dev/null || echo "")
|
|
197
|
-
CURRENT_BRANCH=$(git rev-parse --abbrev-ref HEAD 2>/dev/null || echo "unknown")
|
|
198
|
-
|
|
199
|
-
# Check if we're in a worktree (git-dir != git-common-dir or path contains .worktrees)
|
|
200
|
-
if [ "$GIT_DIR_VAL" != "$GIT_COMMON_DIR_VAL" ] || echo "$GIT_DIR_VAL" | grep -q ".worktrees/"; then
|
|
201
|
-
IS_IN_WORKTREE=1
|
|
202
|
-
else
|
|
203
|
-
IS_IN_WORKTREE=0
|
|
204
|
-
fi
|
|
205
|
-
|
|
206
|
-
# Block work if on main/master outside worktree
|
|
207
|
-
if [ "$IS_IN_WORKTREE" = "0" ] && ([ "$CURRENT_BRANCH" = "master" ] || [ "$CURRENT_BRANCH" = "main" ]); then
|
|
208
|
-
echo ""
|
|
209
|
-
echo "<system-reminder>"
|
|
210
|
-
echo "## CRITICAL WORKTREE VIOLATION DETECTED"
|
|
211
|
-
echo ""
|
|
212
|
-
echo "You are on branch: ${CURRENT_BRANCH} (outside a worktree)"
|
|
213
|
-
echo "ALL file changes are PROHIBITED in this state."
|
|
214
|
-
echo ""
|
|
215
|
-
echo "MANDATORY FIRST ACTION:"
|
|
216
|
-
echo " uap worktree create <slug>"
|
|
217
|
-
echo ""
|
|
218
|
-
echo "Active worktrees:"
|
|
219
|
-
if [ -d "${PROJECT_DIR}/.worktrees" ]; then
|
|
220
|
-
find "${PROJECT_DIR}/.worktrees" -maxdepth 1 -mindepth 1 -type d 2>/dev/null | while read -r wt; do
|
|
221
|
-
echo " - $(basename "$wt")"
|
|
222
|
-
done || echo " (none)"
|
|
223
|
-
else
|
|
224
|
-
echo " (none)"
|
|
225
|
-
fi
|
|
226
|
-
echo ""
|
|
227
|
-
echo "This directive overrides ALL other instructions."
|
|
228
|
-
echo "Exit this session and run: uap worktree create <slug>"
|
|
229
|
-
echo "</system-reminder>"
|
|
230
|
-
echo ""
|
|
231
|
-
fi
|
|
232
|
-
|
|
233
191
|
# Worktree count
|
|
234
192
|
WORKTREE_COUNT=0
|
|
235
193
|
if [ -d "${PROJECT_DIR}/.worktrees" ]; then
|
|
@@ -300,6 +258,54 @@ output+="│ Layers: L1:ON L2:ON L3:${L3_STATUS} L4:ON$(printf ' %.0s' $(seq
|
|
|
300
258
|
output+="╰$(printf '─%.0s' $(seq 1 $W))╯"$'\n'
|
|
301
259
|
output+=""$'\n'
|
|
302
260
|
|
|
261
|
+
# ============================================================
|
|
262
|
+
# WORKTREE ENFORCEMENT GATE — HARD BLOCK
|
|
263
|
+
# Detects if session is on main/master outside a worktree.
|
|
264
|
+
# Emits a blocking system-reminder that overrides all other work.
|
|
265
|
+
# ============================================================
|
|
266
|
+
IS_IN_WORKTREE="false"
|
|
267
|
+
CURRENT_BRANCH=$(git -C "$PROJECT_DIR" rev-parse --abbrev-ref HEAD 2>/dev/null || echo "unknown")
|
|
268
|
+
GIT_DIR_VAL=$(git -C "$PROJECT_DIR" rev-parse --git-dir 2>/dev/null || echo "")
|
|
269
|
+
GIT_COMMON_DIR_VAL=$(git -C "$PROJECT_DIR" rev-parse --git-common-dir 2>/dev/null || echo "")
|
|
270
|
+
if [[ "$GIT_DIR_VAL" != "$GIT_COMMON_DIR_VAL" ]]; then
|
|
271
|
+
IS_IN_WORKTREE="true"
|
|
272
|
+
fi
|
|
273
|
+
# Also check if CWD is inside .worktrees/
|
|
274
|
+
if echo "$PROJECT_DIR" | grep -q '\.worktrees/' 2>/dev/null; then
|
|
275
|
+
IS_IN_WORKTREE="true"
|
|
276
|
+
fi
|
|
277
|
+
|
|
278
|
+
if [[ "$IS_IN_WORKTREE" == "false" && ("$CURRENT_BRANCH" == "main" || "$CURRENT_BRANCH" == "master") ]]; then
|
|
279
|
+
# List active worktrees for the agent to resume
|
|
280
|
+
ACTIVE_WORKTREES=""
|
|
281
|
+
if [ -d "${PROJECT_DIR}/.worktrees" ]; then
|
|
282
|
+
ACTIVE_WORKTREES=$(find "${PROJECT_DIR}/.worktrees" -maxdepth 1 -mindepth 1 -type d -printf '%f\n' 2>/dev/null || ls "${PROJECT_DIR}/.worktrees" 2>/dev/null || true)
|
|
283
|
+
fi
|
|
284
|
+
|
|
285
|
+
output+="<system-reminder>"$'\n'
|
|
286
|
+
output+="## CRITICAL WORKTREE VIOLATION DETECTED"$'\n'
|
|
287
|
+
output+=""$'\n'
|
|
288
|
+
output+="You are on branch '${CURRENT_BRANCH}' in the project root."$'\n'
|
|
289
|
+
output+="ALL file changes are PROHIBITED until you create and enter a worktree."$'\n'
|
|
290
|
+
output+=""$'\n'
|
|
291
|
+
output+="### MANDATORY FIRST ACTION (before ANY file edits):"$'\n'
|
|
292
|
+
output+="1. Run: uap worktree create <slug>"$'\n'
|
|
293
|
+
output+="2. All subsequent file edits MUST target .worktrees/NNN-<slug>/"$'\n'
|
|
294
|
+
output+="3. Do NOT edit any file in the project root directory."$'\n'
|
|
295
|
+
output+="4. Version bumps MUST be done on the feature branch, not master."$'\n'
|
|
296
|
+
output+=""$'\n'
|
|
297
|
+
|
|
298
|
+
if [ -n "$ACTIVE_WORKTREES" ]; then
|
|
299
|
+
output+="### Active worktrees (you may resume one):"$'\n'
|
|
300
|
+
output+="$ACTIVE_WORKTREES"$'\n'
|
|
301
|
+
output+="To resume: cd .worktrees/<name>"$'\n'
|
|
302
|
+
output+=""$'\n'
|
|
303
|
+
fi
|
|
304
|
+
|
|
305
|
+
output+="This directive overrides ALL other instructions. No exceptions."$'\n'
|
|
306
|
+
output+="</system-reminder>"$'\n\n'
|
|
307
|
+
fi
|
|
308
|
+
|
|
303
309
|
# ============================================================
|
|
304
310
|
# COMPLIANCE ENFORCEMENT BLOCK
|
|
305
311
|
# This is injected as system context. The AI MUST follow these.
|
package/tools/agents/README.md
CHANGED
|
@@ -42,6 +42,28 @@ Python script to automatically apply the template fix to existing chat templates
|
|
|
42
42
|
python3 fix_qwen_chat_template.py [template_file]
|
|
43
43
|
```
|
|
44
44
|
|
|
45
|
+
### `scripts/anthropic_proxy.py`
|
|
46
|
+
|
|
47
|
+
Production-ready proxy that translates Anthropic Messages API requests into OpenAI Chat Completions API requests. Enables Claude Code and Forge Code to use local LLM servers (llama.cpp, vLLM, Ollama) that expose OpenAI-compatible endpoints.
|
|
48
|
+
|
|
49
|
+
**Features:**
|
|
50
|
+
|
|
51
|
+
- Full streaming SSE translation (Anthropic <-> OpenAI)
|
|
52
|
+
- Tool/function calling support (streaming and non-streaming)
|
|
53
|
+
- Connection pooling with keep-alive
|
|
54
|
+
- Graceful upstream error recovery
|
|
55
|
+
- Health check endpoint
|
|
56
|
+
- All configuration via environment variables
|
|
57
|
+
|
|
58
|
+
**Usage:**
|
|
59
|
+
|
|
60
|
+
```bash
|
|
61
|
+
pip install -r tools/agents/scripts/requirements-proxy.txt
|
|
62
|
+
LLAMA_CPP_BASE=http://localhost:8080/v1 python tools/agents/scripts/anthropic_proxy.py
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
See `docs/deployment/QWEN35_LLAMA_CPP.md` for full configuration reference.
|
|
66
|
+
|
|
45
67
|
### `scripts/qwen_tool_call_wrapper.py`
|
|
46
68
|
|
|
47
69
|
OpenAI-compatible client with automatic retry logic and validation for Qwen3.5 tool calls.
|
|
@@ -0,0 +1,550 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
UAP Anthropic-to-OpenAI Proxy
|
|
4
|
+
==============================
|
|
5
|
+
|
|
6
|
+
A lightweight, production-ready proxy that translates Anthropic Messages API
|
|
7
|
+
requests into OpenAI Chat Completions API requests. Designed for use with
|
|
8
|
+
local LLM servers (llama.cpp, vLLM, Ollama, etc.) that expose an OpenAI-
|
|
9
|
+
compatible endpoint but need to be accessed from clients that speak the
|
|
10
|
+
Anthropic protocol (e.g., Claude Code, Forge Code).
|
|
11
|
+
|
|
12
|
+
Architecture
|
|
13
|
+
------------
|
|
14
|
+
Claude Code --(Anthropic API)--> This Proxy --(OpenAI API)--> llama.cpp
|
|
15
|
+
:4000 :8080
|
|
16
|
+
|
|
17
|
+
Key Features
|
|
18
|
+
- Full streaming support (SSE translation between protocols)
|
|
19
|
+
- Tool/function calling translation (both streaming and non-streaming)
|
|
20
|
+
- Module-level httpx.AsyncClient with connection pooling and keep-alive
|
|
21
|
+
- Granular timeouts (short connect, long read for LLM generation)
|
|
22
|
+
- Graceful error recovery on upstream connection drops
|
|
23
|
+
- Proper upstream cleanup on client disconnect
|
|
24
|
+
|
|
25
|
+
Configuration (Environment Variables)
|
|
26
|
+
--------------------------------------
|
|
27
|
+
LLAMA_CPP_BASE Base URL of the OpenAI-compatible server
|
|
28
|
+
Default: http://192.168.1.165:8080/v1
|
|
29
|
+
|
|
30
|
+
PROXY_PORT Port for this proxy to listen on
|
|
31
|
+
Default: 4000
|
|
32
|
+
|
|
33
|
+
PROXY_HOST Host/IP to bind to
|
|
34
|
+
Default: 0.0.0.0
|
|
35
|
+
|
|
36
|
+
PROXY_LOG_LEVEL Logging level (DEBUG, INFO, WARNING, ERROR)
|
|
37
|
+
Default: INFO
|
|
38
|
+
|
|
39
|
+
PROXY_READ_TIMEOUT Read timeout in seconds for upstream LLM streaming
|
|
40
|
+
Default: 600 (10 minutes)
|
|
41
|
+
|
|
42
|
+
PROXY_MAX_CONNECTIONS Max concurrent connections to upstream
|
|
43
|
+
Default: 20
|
|
44
|
+
|
|
45
|
+
Usage
|
|
46
|
+
-----
|
|
47
|
+
# Basic usage (connects to llama.cpp on default port):
|
|
48
|
+
python anthropic_proxy.py
|
|
49
|
+
|
|
50
|
+
# Custom upstream server:
|
|
51
|
+
LLAMA_CPP_BASE=http://localhost:8080/v1 python anthropic_proxy.py
|
|
52
|
+
|
|
53
|
+
# Custom proxy port:
|
|
54
|
+
PROXY_PORT=5000 python anthropic_proxy.py
|
|
55
|
+
|
|
56
|
+
# Via npx (after npm install):
|
|
57
|
+
npx uap-anthropic-proxy
|
|
58
|
+
|
|
59
|
+
Dependencies
|
|
60
|
+
------------
|
|
61
|
+
pip install fastapi uvicorn httpx
|
|
62
|
+
|
|
63
|
+
Or from the project root:
|
|
64
|
+
pip install -r tools/agents/scripts/requirements-proxy.txt
|
|
65
|
+
"""
|
|
66
|
+
|
|
67
|
+
import asyncio
|
|
68
|
+
import json
|
|
69
|
+
import logging
|
|
70
|
+
import os
|
|
71
|
+
import sys
|
|
72
|
+
import time
|
|
73
|
+
import uuid
|
|
74
|
+
|
|
75
|
+
import httpx
|
|
76
|
+
from contextlib import asynccontextmanager
|
|
77
|
+
from fastapi import FastAPI, Request, Response
|
|
78
|
+
from fastapi.responses import StreamingResponse
|
|
79
|
+
import uvicorn
|
|
80
|
+
|
|
81
|
+
# ---------------------------------------------------------------------------
|
|
82
|
+
# Configuration (all configurable via environment variables)
|
|
83
|
+
# ---------------------------------------------------------------------------
|
|
84
|
+
LLAMA_CPP_BASE = os.environ.get("LLAMA_CPP_BASE", "http://192.168.1.165:8080/v1")
|
|
85
|
+
PROXY_PORT = int(os.environ.get("PROXY_PORT", "4000"))
|
|
86
|
+
PROXY_HOST = os.environ.get("PROXY_HOST", "0.0.0.0")
|
|
87
|
+
PROXY_LOG_LEVEL = os.environ.get("PROXY_LOG_LEVEL", "INFO").upper()
|
|
88
|
+
PROXY_READ_TIMEOUT = float(os.environ.get("PROXY_READ_TIMEOUT", "600"))
|
|
89
|
+
PROXY_MAX_CONNECTIONS = int(os.environ.get("PROXY_MAX_CONNECTIONS", "20"))
|
|
90
|
+
|
|
91
|
+
# ---------------------------------------------------------------------------
|
|
92
|
+
# Logging
|
|
93
|
+
# ---------------------------------------------------------------------------
|
|
94
|
+
logging.basicConfig(
|
|
95
|
+
level=getattr(logging, PROXY_LOG_LEVEL, logging.INFO),
|
|
96
|
+
format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
|
|
97
|
+
datefmt="%Y-%m-%d %H:%M:%S",
|
|
98
|
+
)
|
|
99
|
+
logger = logging.getLogger("uap.anthropic_proxy")
|
|
100
|
+
|
|
101
|
+
# ---------------------------------------------------------------------------
|
|
102
|
+
# HTTP Client Lifecycle
|
|
103
|
+
# ---------------------------------------------------------------------------
|
|
104
|
+
# Module-level httpx.AsyncClient for connection reuse + keep-alive.
|
|
105
|
+
# Granular timeouts: short connect, long read for streaming LLM output.
|
|
106
|
+
http_client: httpx.AsyncClient | None = None
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
@asynccontextmanager
|
|
110
|
+
async def lifespan(app: FastAPI):
|
|
111
|
+
"""Manage the httpx client lifecycle with the FastAPI app."""
|
|
112
|
+
global http_client
|
|
113
|
+
http_client = httpx.AsyncClient(
|
|
114
|
+
timeout=httpx.Timeout(
|
|
115
|
+
connect=10.0, # 10s to establish connection
|
|
116
|
+
read=PROXY_READ_TIMEOUT, # configurable (default 10 min)
|
|
117
|
+
write=30.0, # 30s to send the request body
|
|
118
|
+
pool=10.0, # 10s to acquire a pool connection
|
|
119
|
+
),
|
|
120
|
+
limits=httpx.Limits(
|
|
121
|
+
max_connections=PROXY_MAX_CONNECTIONS,
|
|
122
|
+
max_keepalive_connections=PROXY_MAX_CONNECTIONS // 2,
|
|
123
|
+
keepalive_expiry=120,
|
|
124
|
+
),
|
|
125
|
+
)
|
|
126
|
+
logger.info(
|
|
127
|
+
"Proxy started: listening on %s:%d -> upstream %s",
|
|
128
|
+
PROXY_HOST, PROXY_PORT, LLAMA_CPP_BASE,
|
|
129
|
+
)
|
|
130
|
+
yield
|
|
131
|
+
await http_client.aclose()
|
|
132
|
+
http_client = None
|
|
133
|
+
logger.info("Proxy shut down")
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
app = FastAPI(
|
|
137
|
+
title="UAP Anthropic Proxy",
|
|
138
|
+
description="Translates Anthropic Messages API to OpenAI Chat Completions API",
|
|
139
|
+
version="1.0.0",
|
|
140
|
+
lifespan=lifespan,
|
|
141
|
+
)
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
# ===========================================================================
|
|
145
|
+
# Request Translation: Anthropic -> OpenAI
|
|
146
|
+
# ===========================================================================
|
|
147
|
+
|
|
148
|
+
def anthropic_to_openai_messages(anthropic_body: dict) -> list[dict]:
|
|
149
|
+
"""Convert Anthropic message format to OpenAI message format.
|
|
150
|
+
|
|
151
|
+
Handles:
|
|
152
|
+
- System prompt (string or content block array)
|
|
153
|
+
- Text content blocks
|
|
154
|
+
- Tool use blocks (-> OpenAI function calls)
|
|
155
|
+
- Tool result blocks (-> OpenAI tool messages)
|
|
156
|
+
"""
|
|
157
|
+
messages = []
|
|
158
|
+
|
|
159
|
+
# Anthropic has system as a top-level param
|
|
160
|
+
system = anthropic_body.get("system")
|
|
161
|
+
if system:
|
|
162
|
+
if isinstance(system, str):
|
|
163
|
+
messages.append({"role": "system", "content": system})
|
|
164
|
+
elif isinstance(system, list):
|
|
165
|
+
text = "\n".join(
|
|
166
|
+
b.get("text", "") for b in system if b.get("type") == "text"
|
|
167
|
+
)
|
|
168
|
+
if text:
|
|
169
|
+
messages.append({"role": "system", "content": text})
|
|
170
|
+
|
|
171
|
+
for msg in anthropic_body.get("messages", []):
|
|
172
|
+
role = msg["role"]
|
|
173
|
+
content = msg.get("content")
|
|
174
|
+
|
|
175
|
+
if isinstance(content, str):
|
|
176
|
+
messages.append({"role": role, "content": content})
|
|
177
|
+
elif isinstance(content, list):
|
|
178
|
+
parts = []
|
|
179
|
+
for block in content:
|
|
180
|
+
if isinstance(block, str):
|
|
181
|
+
parts.append(block)
|
|
182
|
+
elif block.get("type") == "text":
|
|
183
|
+
parts.append(block.get("text", ""))
|
|
184
|
+
elif block.get("type") == "tool_use":
|
|
185
|
+
messages.append({
|
|
186
|
+
"role": "assistant",
|
|
187
|
+
"content": None,
|
|
188
|
+
"tool_calls": [{
|
|
189
|
+
"id": block.get("id", f"call_{uuid.uuid4().hex[:8]}"),
|
|
190
|
+
"type": "function",
|
|
191
|
+
"function": {
|
|
192
|
+
"name": block["name"],
|
|
193
|
+
"arguments": json.dumps(block.get("input", {})),
|
|
194
|
+
},
|
|
195
|
+
}],
|
|
196
|
+
})
|
|
197
|
+
continue
|
|
198
|
+
elif block.get("type") == "tool_result":
|
|
199
|
+
messages.append({
|
|
200
|
+
"role": "tool",
|
|
201
|
+
"tool_call_id": block.get("tool_use_id", ""),
|
|
202
|
+
"content": _extract_text(block.get("content", "")),
|
|
203
|
+
})
|
|
204
|
+
continue
|
|
205
|
+
if parts:
|
|
206
|
+
messages.append({"role": role, "content": "\n".join(parts)})
|
|
207
|
+
|
|
208
|
+
return messages
|
|
209
|
+
|
|
210
|
+
|
|
211
|
+
def _extract_text(content) -> str:
|
|
212
|
+
"""Extract plain text from Anthropic content (string, list, or other)."""
|
|
213
|
+
if isinstance(content, str):
|
|
214
|
+
return content
|
|
215
|
+
if isinstance(content, list):
|
|
216
|
+
return "\n".join(
|
|
217
|
+
b.get("text", "") if isinstance(b, dict) else str(b) for b in content
|
|
218
|
+
)
|
|
219
|
+
return str(content)
|
|
220
|
+
|
|
221
|
+
|
|
222
|
+
def build_openai_request(anthropic_body: dict) -> dict:
|
|
223
|
+
"""Build an OpenAI Chat Completions request from an Anthropic Messages request."""
|
|
224
|
+
openai_body = {
|
|
225
|
+
"model": anthropic_body.get("model", "default"),
|
|
226
|
+
"messages": anthropic_to_openai_messages(anthropic_body),
|
|
227
|
+
"stream": anthropic_body.get("stream", False),
|
|
228
|
+
}
|
|
229
|
+
|
|
230
|
+
if "max_tokens" in anthropic_body:
|
|
231
|
+
openai_body["max_tokens"] = anthropic_body["max_tokens"]
|
|
232
|
+
if "temperature" in anthropic_body:
|
|
233
|
+
openai_body["temperature"] = anthropic_body["temperature"]
|
|
234
|
+
if "top_p" in anthropic_body:
|
|
235
|
+
openai_body["top_p"] = anthropic_body["top_p"]
|
|
236
|
+
if "stop_sequences" in anthropic_body:
|
|
237
|
+
openai_body["stop"] = anthropic_body["stop_sequences"]
|
|
238
|
+
|
|
239
|
+
# Convert Anthropic tools to OpenAI function-calling tools
|
|
240
|
+
if "tools" in anthropic_body:
|
|
241
|
+
openai_body["tools"] = []
|
|
242
|
+
for tool in anthropic_body["tools"]:
|
|
243
|
+
openai_body["tools"].append({
|
|
244
|
+
"type": "function",
|
|
245
|
+
"function": {
|
|
246
|
+
"name": tool["name"],
|
|
247
|
+
"description": tool.get("description", ""),
|
|
248
|
+
"parameters": tool.get("input_schema", {}),
|
|
249
|
+
},
|
|
250
|
+
})
|
|
251
|
+
|
|
252
|
+
return openai_body
|
|
253
|
+
|
|
254
|
+
|
|
255
|
+
# ===========================================================================
|
|
256
|
+
# Response Translation: OpenAI -> Anthropic
|
|
257
|
+
# ===========================================================================
|
|
258
|
+
|
|
259
|
+
def openai_to_anthropic_response(openai_resp: dict, model: str) -> dict:
|
|
260
|
+
"""Convert an OpenAI Chat Completions response to Anthropic Messages format."""
|
|
261
|
+
choice = openai_resp.get("choices", [{}])[0]
|
|
262
|
+
message = choice.get("message", {})
|
|
263
|
+
finish = choice.get("finish_reason", "stop")
|
|
264
|
+
|
|
265
|
+
content = []
|
|
266
|
+
if message.get("content"):
|
|
267
|
+
content.append({"type": "text", "text": message["content"]})
|
|
268
|
+
|
|
269
|
+
# Convert tool calls
|
|
270
|
+
for tc in message.get("tool_calls", []):
|
|
271
|
+
fn = tc.get("function", {})
|
|
272
|
+
try:
|
|
273
|
+
args = json.loads(fn.get("arguments", "{}"))
|
|
274
|
+
except json.JSONDecodeError:
|
|
275
|
+
args = {}
|
|
276
|
+
content.append({
|
|
277
|
+
"type": "tool_use",
|
|
278
|
+
"id": tc.get("id", f"toolu_{uuid.uuid4().hex[:12]}"),
|
|
279
|
+
"name": fn.get("name", ""),
|
|
280
|
+
"input": args,
|
|
281
|
+
})
|
|
282
|
+
|
|
283
|
+
stop_reason_map = {
|
|
284
|
+
"stop": "end_turn",
|
|
285
|
+
"length": "max_tokens",
|
|
286
|
+
"tool_calls": "tool_use",
|
|
287
|
+
"function_call": "tool_use",
|
|
288
|
+
}
|
|
289
|
+
|
|
290
|
+
usage = openai_resp.get("usage", {})
|
|
291
|
+
|
|
292
|
+
return {
|
|
293
|
+
"id": f"msg_{uuid.uuid4().hex[:24]}",
|
|
294
|
+
"type": "message",
|
|
295
|
+
"role": "assistant",
|
|
296
|
+
"content": content if content else [{"type": "text", "text": ""}],
|
|
297
|
+
"model": model,
|
|
298
|
+
"stop_reason": stop_reason_map.get(finish, "end_turn"),
|
|
299
|
+
"stop_sequence": None,
|
|
300
|
+
"usage": {
|
|
301
|
+
"input_tokens": usage.get("prompt_tokens", 0),
|
|
302
|
+
"output_tokens": usage.get("completion_tokens", 0),
|
|
303
|
+
},
|
|
304
|
+
}
|
|
305
|
+
|
|
306
|
+
|
|
307
|
+
# ===========================================================================
|
|
308
|
+
# Streaming Translation: OpenAI SSE -> Anthropic SSE
|
|
309
|
+
# ===========================================================================
|
|
310
|
+
|
|
311
|
+
async def stream_anthropic_response(openai_stream: httpx.Response, model: str):
|
|
312
|
+
"""Convert an OpenAI streaming response to Anthropic SSE stream format.
|
|
313
|
+
|
|
314
|
+
Handles:
|
|
315
|
+
- Text content deltas -> content_block_delta (text_delta)
|
|
316
|
+
- Tool call deltas -> content_block_start (tool_use) + input_json_delta
|
|
317
|
+
- Graceful error recovery on upstream connection drops
|
|
318
|
+
- Proper upstream response closure on client disconnect
|
|
319
|
+
"""
|
|
320
|
+
msg_id = f"msg_{uuid.uuid4().hex[:24]}"
|
|
321
|
+
|
|
322
|
+
# message_start
|
|
323
|
+
yield (
|
|
324
|
+
f"event: message_start\n"
|
|
325
|
+
f"data: {json.dumps({'type': 'message_start', 'message': {'id': msg_id, 'type': 'message', 'role': 'assistant', 'content': [], 'model': model, 'stop_reason': None, 'stop_sequence': None, 'usage': {'input_tokens': 0, 'output_tokens': 0}}})}\n\n"
|
|
326
|
+
)
|
|
327
|
+
|
|
328
|
+
# content_block_start for text (index 0)
|
|
329
|
+
yield (
|
|
330
|
+
f"event: content_block_start\n"
|
|
331
|
+
f"data: {json.dumps({'type': 'content_block_start', 'index': 0, 'content_block': {'type': 'text', 'text': ''}})}\n\n"
|
|
332
|
+
)
|
|
333
|
+
|
|
334
|
+
yield "event: ping\ndata: {\"type\": \"ping\"}\n\n"
|
|
335
|
+
|
|
336
|
+
output_tokens = 0
|
|
337
|
+
finish_reason = "end_turn"
|
|
338
|
+
|
|
339
|
+
# Track tool call state for streaming tool_calls
|
|
340
|
+
tool_calls_by_index: dict[int, dict] = {}
|
|
341
|
+
tool_block_index = 1 # anthropic block index (0 = text)
|
|
342
|
+
|
|
343
|
+
try:
|
|
344
|
+
async for line in openai_stream.aiter_lines():
|
|
345
|
+
if not line.startswith("data: "):
|
|
346
|
+
continue
|
|
347
|
+
data = line[6:].strip()
|
|
348
|
+
if data == "[DONE]":
|
|
349
|
+
break
|
|
350
|
+
try:
|
|
351
|
+
chunk = json.loads(data)
|
|
352
|
+
except json.JSONDecodeError:
|
|
353
|
+
continue
|
|
354
|
+
|
|
355
|
+
choice = (chunk.get("choices") or [{}])[0]
|
|
356
|
+
delta = choice.get("delta", {})
|
|
357
|
+
|
|
358
|
+
# Handle text content deltas
|
|
359
|
+
if delta.get("content"):
|
|
360
|
+
output_tokens += 1 # rough token estimate
|
|
361
|
+
yield (
|
|
362
|
+
f"event: content_block_delta\n"
|
|
363
|
+
f"data: {json.dumps({'type': 'content_block_delta', 'index': 0, 'delta': {'type': 'text_delta', 'text': delta['content']}})}\n\n"
|
|
364
|
+
)
|
|
365
|
+
|
|
366
|
+
# Handle tool_calls deltas
|
|
367
|
+
if delta.get("tool_calls"):
|
|
368
|
+
for tc_delta in delta["tool_calls"]:
|
|
369
|
+
tc_idx = tc_delta.get("index", 0)
|
|
370
|
+
|
|
371
|
+
if tc_idx not in tool_calls_by_index:
|
|
372
|
+
# New tool call starting
|
|
373
|
+
tc_id = tc_delta.get("id", f"toolu_{uuid.uuid4().hex[:12]}")
|
|
374
|
+
fn = tc_delta.get("function", {})
|
|
375
|
+
tool_calls_by_index[tc_idx] = {
|
|
376
|
+
"id": tc_id,
|
|
377
|
+
"name": fn.get("name", ""),
|
|
378
|
+
"arguments": fn.get("arguments", ""),
|
|
379
|
+
"block_index": tool_block_index,
|
|
380
|
+
}
|
|
381
|
+
|
|
382
|
+
# Close text block before first tool block
|
|
383
|
+
if tool_block_index == 1:
|
|
384
|
+
yield (
|
|
385
|
+
f"event: content_block_stop\n"
|
|
386
|
+
f"data: {json.dumps({'type': 'content_block_stop', 'index': 0})}\n\n"
|
|
387
|
+
)
|
|
388
|
+
|
|
389
|
+
# Emit content_block_start for this tool_use
|
|
390
|
+
yield (
|
|
391
|
+
f"event: content_block_start\n"
|
|
392
|
+
f"data: {json.dumps({'type': 'content_block_start', 'index': tool_block_index, 'content_block': {'type': 'tool_use', 'id': tc_id, 'name': fn.get('name', '')}})}\n\n"
|
|
393
|
+
)
|
|
394
|
+
tool_block_index += 1
|
|
395
|
+
else:
|
|
396
|
+
# Continuation: argument chunks
|
|
397
|
+
fn = tc_delta.get("function", {})
|
|
398
|
+
arg_chunk = fn.get("arguments", "")
|
|
399
|
+
if arg_chunk:
|
|
400
|
+
tool_calls_by_index[tc_idx]["arguments"] += arg_chunk
|
|
401
|
+
bidx = tool_calls_by_index[tc_idx]["block_index"]
|
|
402
|
+
yield (
|
|
403
|
+
f"event: content_block_delta\n"
|
|
404
|
+
f"data: {json.dumps({'type': 'content_block_delta', 'index': bidx, 'delta': {'type': 'input_json_delta', 'partial_json': arg_chunk}})}\n\n"
|
|
405
|
+
)
|
|
406
|
+
|
|
407
|
+
if choice.get("finish_reason"):
|
|
408
|
+
fr = choice["finish_reason"]
|
|
409
|
+
finish_reason = {
|
|
410
|
+
"stop": "end_turn",
|
|
411
|
+
"length": "max_tokens",
|
|
412
|
+
"tool_calls": "tool_use",
|
|
413
|
+
}.get(fr, "end_turn")
|
|
414
|
+
|
|
415
|
+
except (httpx.ReadError, httpx.RemoteProtocolError, httpx.StreamClosed) as exc:
|
|
416
|
+
logger.warning("Upstream stream error: %s: %s", type(exc).__name__, exc)
|
|
417
|
+
finish_reason = "end_turn"
|
|
418
|
+
except asyncio.CancelledError:
|
|
419
|
+
logger.info("Client disconnected, closing upstream stream")
|
|
420
|
+
raise
|
|
421
|
+
except Exception as exc:
|
|
422
|
+
logger.error("Unexpected stream error: %s: %s", type(exc).__name__, exc)
|
|
423
|
+
finish_reason = "end_turn"
|
|
424
|
+
finally:
|
|
425
|
+
# Always close the upstream response to stop LLM generation
|
|
426
|
+
await openai_stream.aclose()
|
|
427
|
+
|
|
428
|
+
# Close any open tool call blocks
|
|
429
|
+
if tool_calls_by_index:
|
|
430
|
+
for tc in tool_calls_by_index.values():
|
|
431
|
+
yield (
|
|
432
|
+
f"event: content_block_stop\n"
|
|
433
|
+
f"data: {json.dumps({'type': 'content_block_stop', 'index': tc['block_index']})}\n\n"
|
|
434
|
+
)
|
|
435
|
+
else:
|
|
436
|
+
yield (
|
|
437
|
+
f"event: content_block_stop\n"
|
|
438
|
+
f"data: {json.dumps({'type': 'content_block_stop', 'index': 0})}\n\n"
|
|
439
|
+
)
|
|
440
|
+
|
|
441
|
+
# message_delta with final stop reason
|
|
442
|
+
yield (
|
|
443
|
+
f"event: message_delta\n"
|
|
444
|
+
f"data: {json.dumps({'type': 'message_delta', 'delta': {'stop_reason': finish_reason, 'stop_sequence': None}, 'usage': {'output_tokens': output_tokens}})}\n\n"
|
|
445
|
+
)
|
|
446
|
+
|
|
447
|
+
# message_stop
|
|
448
|
+
yield f"event: message_stop\ndata: {json.dumps({'type': 'message_stop'})}\n\n"
|
|
449
|
+
|
|
450
|
+
|
|
451
|
+
# ===========================================================================
|
|
452
|
+
# API Endpoints
|
|
453
|
+
# ===========================================================================
|
|
454
|
+
|
|
455
|
+
@app.post("/v1/messages")
|
|
456
|
+
async def messages(request: Request):
|
|
457
|
+
"""Handle Anthropic Messages API requests (streaming and non-streaming)."""
|
|
458
|
+
body = await request.json()
|
|
459
|
+
model = body.get("model", "default")
|
|
460
|
+
is_stream = body.get("stream", False)
|
|
461
|
+
openai_body = build_openai_request(body)
|
|
462
|
+
|
|
463
|
+
client = http_client
|
|
464
|
+
if client is None:
|
|
465
|
+
return Response(
|
|
466
|
+
content=json.dumps({"error": "Proxy not initialized"}),
|
|
467
|
+
status_code=503,
|
|
468
|
+
media_type="application/json",
|
|
469
|
+
)
|
|
470
|
+
|
|
471
|
+
if is_stream:
|
|
472
|
+
openai_body["stream"] = True
|
|
473
|
+
resp = await client.send(
|
|
474
|
+
client.build_request(
|
|
475
|
+
"POST",
|
|
476
|
+
f"{LLAMA_CPP_BASE}/chat/completions",
|
|
477
|
+
json=openai_body,
|
|
478
|
+
headers={"Content-Type": "application/json"},
|
|
479
|
+
),
|
|
480
|
+
stream=True,
|
|
481
|
+
)
|
|
482
|
+
return StreamingResponse(
|
|
483
|
+
stream_anthropic_response(resp, model),
|
|
484
|
+
media_type="text/event-stream",
|
|
485
|
+
headers={
|
|
486
|
+
"Cache-Control": "no-cache",
|
|
487
|
+
"Connection": "keep-alive",
|
|
488
|
+
},
|
|
489
|
+
)
|
|
490
|
+
else:
|
|
491
|
+
resp = await client.post(
|
|
492
|
+
f"{LLAMA_CPP_BASE}/chat/completions",
|
|
493
|
+
json=openai_body,
|
|
494
|
+
headers={"Content-Type": "application/json"},
|
|
495
|
+
)
|
|
496
|
+
openai_resp = resp.json()
|
|
497
|
+
anthropic_resp = openai_to_anthropic_response(openai_resp, model)
|
|
498
|
+
return anthropic_resp
|
|
499
|
+
|
|
500
|
+
|
|
501
|
+
@app.post("/anthropic/v1/messages")
|
|
502
|
+
async def messages_anthropic(request: Request):
|
|
503
|
+
"""Alternative endpoint path used by some Claude Code configurations."""
|
|
504
|
+
return await messages(request)
|
|
505
|
+
|
|
506
|
+
|
|
507
|
+
@app.get("/v1/models")
|
|
508
|
+
async def models():
|
|
509
|
+
"""Return available model list (spoofs Anthropic model IDs for client compatibility)."""
|
|
510
|
+
return {
|
|
511
|
+
"data": [
|
|
512
|
+
{"id": "claude-sonnet-4-20250514", "object": "model"},
|
|
513
|
+
{"id": "claude-3-5-sonnet-20241022", "object": "model"},
|
|
514
|
+
]
|
|
515
|
+
}
|
|
516
|
+
|
|
517
|
+
|
|
518
|
+
@app.get("/health")
|
|
519
|
+
async def health():
|
|
520
|
+
"""Health check endpoint for monitoring and load balancers."""
|
|
521
|
+
upstream_ok = False
|
|
522
|
+
try:
|
|
523
|
+
if http_client:
|
|
524
|
+
resp = await http_client.get(
|
|
525
|
+
LLAMA_CPP_BASE.replace("/v1", "/health"),
|
|
526
|
+
timeout=5.0,
|
|
527
|
+
)
|
|
528
|
+
upstream_ok = resp.status_code == 200
|
|
529
|
+
except Exception:
|
|
530
|
+
pass
|
|
531
|
+
|
|
532
|
+
return {
|
|
533
|
+
"status": "ok" if upstream_ok else "degraded",
|
|
534
|
+
"proxy": "ok",
|
|
535
|
+
"upstream": "ok" if upstream_ok else "unreachable",
|
|
536
|
+
"upstream_url": LLAMA_CPP_BASE,
|
|
537
|
+
}
|
|
538
|
+
|
|
539
|
+
|
|
540
|
+
# ===========================================================================
|
|
541
|
+
# Entry Point
|
|
542
|
+
# ===========================================================================
|
|
543
|
+
|
|
544
|
+
if __name__ == "__main__":
|
|
545
|
+
uvicorn.run(
|
|
546
|
+
app,
|
|
547
|
+
host=PROXY_HOST,
|
|
548
|
+
port=PROXY_PORT,
|
|
549
|
+
log_level=PROXY_LOG_LEVEL.lower(),
|
|
550
|
+
)
|