@miller-tech/uap 1.18.1 → 1.19.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json
CHANGED
|
@@ -85,6 +85,7 @@ import re
|
|
|
85
85
|
import sys
|
|
86
86
|
import time
|
|
87
87
|
import uuid
|
|
88
|
+
from collections import defaultdict, deque
|
|
88
89
|
from dataclasses import dataclass, field
|
|
89
90
|
|
|
90
91
|
import httpx
|
|
@@ -101,6 +102,8 @@ PROXY_PORT = int(os.environ.get("PROXY_PORT", "4000"))
|
|
|
101
102
|
PROXY_HOST = os.environ.get("PROXY_HOST", "0.0.0.0")
|
|
102
103
|
PROXY_LOG_LEVEL = os.environ.get("PROXY_LOG_LEVEL", "INFO").upper()
|
|
103
104
|
PROXY_READ_TIMEOUT = float(os.environ.get("PROXY_READ_TIMEOUT", "600"))
|
|
105
|
+
PROXY_UPSTREAM_RETRY_MAX = int(os.environ.get("PROXY_UPSTREAM_RETRY_MAX", "3"))
|
|
106
|
+
PROXY_UPSTREAM_RETRY_DELAY_SECS = float(os.environ.get("PROXY_UPSTREAM_RETRY_DELAY_SECS", "5"))
|
|
104
107
|
PROXY_MAX_CONNECTIONS = int(os.environ.get("PROXY_MAX_CONNECTIONS", "20"))
|
|
105
108
|
PROXY_CONTEXT_WINDOW = int(os.environ.get("PROXY_CONTEXT_WINDOW", "0"))
|
|
106
109
|
PROXY_CONTEXT_PRUNE_THRESHOLD = float(
|
|
@@ -116,9 +119,9 @@ PROXY_LOOP_BREAKER = os.environ.get("PROXY_LOOP_BREAKER", "on").lower() not in {
|
|
|
116
119
|
"no",
|
|
117
120
|
}
|
|
118
121
|
PROXY_LOOP_WINDOW = int(os.environ.get("PROXY_LOOP_WINDOW", "6"))
|
|
119
|
-
PROXY_LOOP_REPEAT_THRESHOLD = int(os.environ.get("PROXY_LOOP_REPEAT_THRESHOLD", "
|
|
122
|
+
PROXY_LOOP_REPEAT_THRESHOLD = int(os.environ.get("PROXY_LOOP_REPEAT_THRESHOLD", "6"))
|
|
120
123
|
PROXY_FORCED_THRESHOLD = int(os.environ.get("PROXY_FORCED_THRESHOLD", "15"))
|
|
121
|
-
PROXY_NO_PROGRESS_THRESHOLD = int(os.environ.get("PROXY_NO_PROGRESS_THRESHOLD", "
|
|
124
|
+
PROXY_NO_PROGRESS_THRESHOLD = int(os.environ.get("PROXY_NO_PROGRESS_THRESHOLD", "3"))
|
|
122
125
|
PROXY_CONTEXT_RELEASE_THRESHOLD = float(
|
|
123
126
|
os.environ.get("PROXY_CONTEXT_RELEASE_THRESHOLD", "0.90")
|
|
124
127
|
)
|
|
@@ -138,16 +141,28 @@ PROXY_TOOL_STATE_FORCED_BUDGET = int(
|
|
|
138
141
|
)
|
|
139
142
|
PROXY_TOOL_STATE_AUTO_BUDGET = int(os.environ.get("PROXY_TOOL_STATE_AUTO_BUDGET", "2"))
|
|
140
143
|
PROXY_TOOL_STATE_STAGNATION_THRESHOLD = int(
|
|
141
|
-
os.environ.get("PROXY_TOOL_STATE_STAGNATION_THRESHOLD", "
|
|
144
|
+
os.environ.get("PROXY_TOOL_STATE_STAGNATION_THRESHOLD", "9")
|
|
142
145
|
)
|
|
143
146
|
PROXY_TOOL_STATE_CYCLE_WINDOW = int(
|
|
144
147
|
os.environ.get("PROXY_TOOL_STATE_CYCLE_WINDOW", "8")
|
|
145
148
|
)
|
|
146
149
|
PROXY_TOOL_STATE_FINALIZE_THRESHOLD = int(
|
|
147
|
-
os.environ.get("PROXY_TOOL_STATE_FINALIZE_THRESHOLD", "
|
|
150
|
+
os.environ.get("PROXY_TOOL_STATE_FINALIZE_THRESHOLD", "18")
|
|
148
151
|
)
|
|
149
152
|
PROXY_TOOL_STATE_REVIEW_CYCLE_LIMIT = int(
|
|
150
|
-
os.environ.get("PROXY_TOOL_STATE_REVIEW_CYCLE_LIMIT", "
|
|
153
|
+
os.environ.get("PROXY_TOOL_STATE_REVIEW_CYCLE_LIMIT", "2")
|
|
154
|
+
)
|
|
155
|
+
PROXY_CLIENT_RATE_WINDOW_SECS = int(
|
|
156
|
+
os.environ.get("PROXY_CLIENT_RATE_WINDOW_SECS", "60")
|
|
157
|
+
)
|
|
158
|
+
PROXY_CLIENT_RATE_LOG_MIN_SECS = float(
|
|
159
|
+
os.environ.get("PROXY_CLIENT_RATE_LOG_MIN_SECS", "15")
|
|
160
|
+
)
|
|
161
|
+
PROXY_OPUS46_CTX_THRESHOLD = float(
|
|
162
|
+
os.environ.get("PROXY_OPUS46_CTX_THRESHOLD", "0.8")
|
|
163
|
+
)
|
|
164
|
+
PROXY_OPUS46_MAX_TOKENS_HIGH_CTX = int(
|
|
165
|
+
os.environ.get("PROXY_OPUS46_MAX_TOKENS_HIGH_CTX", "4096")
|
|
151
166
|
)
|
|
152
167
|
PROXY_TOOL_NARROWING_EXPAND_ON_LOOP = os.environ.get(
|
|
153
168
|
"PROXY_TOOL_NARROWING_EXPAND_ON_LOOP", "on"
|
|
@@ -323,6 +338,51 @@ logging.basicConfig(
|
|
|
323
338
|
)
|
|
324
339
|
logger = logging.getLogger("uap.anthropic_proxy")
|
|
325
340
|
|
|
341
|
+
_client_request_times: dict[str, deque[float]] = defaultdict(deque)
|
|
342
|
+
_client_rate_last_log: dict[str, float] = defaultdict(float)
|
|
343
|
+
|
|
344
|
+
|
|
345
|
+
def resolve_client_id(request: Request) -> str:
    """Return an identifier for the caller of *request*.

    Sources are tried in priority order: the ``x-uap-client-id``,
    ``x-forwarded-for`` and ``x-real-ip`` headers, then the peer address
    in ``request.client``. For a header, only the first comma-separated
    entry is used, with surrounding whitespace removed.

    Args:
        request: Incoming request; only ``headers`` and ``client`` are read.

    Returns:
        ``"<header-name>:<value>"`` for the first header that yields a
        non-empty value, otherwise ``"remote:<host>"``, or
        ``"remote:unknown"`` when no peer address is available.
    """
    # NOTE(review): these headers are supplied by the caller, so the id is
    # only as trustworthy as whatever sits in front of this service.
    for key in ("x-uap-client-id", "x-forwarded-for", "x-real-ip"):
        value = request.headers.get(key)
        if not value:
            continue
        candidate = value.split(",")[0].strip()
        # A blank first entry (e.g. "   " or ", 10.0.0.1") would otherwise
        # produce the id "<key>:", shared by every caller sending such a
        # header; fall through to the next source instead.
        if candidate:
            return f"{key}:{candidate}"
    if request.client:
        return f"remote:{request.client.host}"
    return "remote:unknown"
|
|
354
|
+
|
|
355
|
+
|
|
356
|
+
# Earliest time.time() value at which the next idle-client sweep may run.
_client_rate_next_sweep = 0.0


def _evict_idle_clients(now: float, cutoff: float) -> None:
    """Drop tracking state for clients with no request newer than *cutoff*."""
    global _client_rate_next_sweep
    # Sweep at most once per window so the O(clients) scan stays off the
    # per-request path.
    if now < _client_rate_next_sweep:
        return
    _client_rate_next_sweep = now + PROXY_CLIENT_RATE_WINDOW_SECS
    idle = [
        cid
        for cid, times in _client_request_times.items()
        if (not times or times[-1] < cutoff)
        # Keep the throttle timestamp until it has expired, so eviction
        # never causes a log line earlier than the throttle allows.
        and now - _client_rate_last_log.get(cid, 0.0) >= PROXY_CLIENT_RATE_LOG_MIN_SECS
    ]
    for cid in idle:
        del _client_request_times[cid]
        _client_rate_last_log.pop(cid, None)


def log_client_rate(client_id: str) -> int:
    """Record a request for *client_id* and return its in-window request count.

    The current time is appended to the client's timestamp deque, entries
    older than ``PROXY_CLIENT_RATE_WINDOW_SECS`` are discarded, and a
    ``CLIENT_RATE`` line is logged. When ``PROXY_CLIENT_RATE_LOG_MIN_SECS``
    is positive, that log line is emitted at most once per that many
    seconds for each client; otherwise it is emitted on every call.

    State for clients that have gone idle is evicted periodically. Without
    that, every distinct id ever seen would stay in the module-level
    dictionaries for the life of the process.

    Args:
        client_id: Key under which the request is counted.

    Returns:
        Number of requests seen for this client within the window,
        including this one, or ``0`` when rate tracking is disabled
        (``PROXY_CLIENT_RATE_WINDOW_SECS <= 0``).
    """
    if PROXY_CLIENT_RATE_WINDOW_SECS <= 0:
        return 0

    now = time.time()
    window = PROXY_CLIENT_RATE_WINDOW_SECS
    cutoff = now - window
    _evict_idle_clients(now, cutoff)

    request_times = _client_request_times[client_id]
    request_times.append(now)
    while request_times and request_times[0] < cutoff:
        request_times.popleft()
    count = len(request_times)

    throttle = PROXY_CLIENT_RATE_LOG_MIN_SECS
    if throttle > 0:
        if now - _client_rate_last_log.get(client_id, 0.0) < throttle:
            return count
        _client_rate_last_log[client_id] = now

    logger.info(
        "CLIENT_RATE: id=%s window=%ss count=%d",
        client_id,
        window,
        count,
    )
    return count
|
|
385
|
+
|
|
326
386
|
|
|
327
387
|
def _load_tool_call_grammar(path: str) -> str:
|
|
328
388
|
if not PROXY_TOOL_CALL_GRAMMAR:
|
|
@@ -1065,6 +1125,37 @@ def prune_conversation(
|
|
|
1065
1125
|
http_client: httpx.AsyncClient | None = None
|
|
1066
1126
|
|
|
1067
1127
|
|
|
1128
|
+
async def _post_with_retry(
    client: httpx.AsyncClient,
    url: str,
    payload: dict,
    headers: dict,
) -> httpx.Response:
    """POST *payload* as JSON to *url*, retrying transient upstream failures.

    ``httpx.ConnectError``, ``httpx.RemoteProtocolError`` and
    ``httpx.ReadTimeout`` trigger a retry after
    ``PROXY_UPSTREAM_RETRY_DELAY_SECS`` seconds, up to
    ``PROXY_UPSTREAM_RETRY_MAX`` attempts in total. Any other exception
    propagates immediately. HTTP error statuses are not retried here;
    the response is returned as-is.

    Args:
        client: Client used to send the request.
        url: Target URL.
        payload: Body, sent via the ``json=`` argument.
        headers: Request headers.

    Returns:
        The response from the first attempt that completes.

    Raises:
        httpx.ConnectError, httpx.RemoteProtocolError, httpx.ReadTimeout:
            The error from the final attempt once all attempts are used.
    """
    # A non-positive PROXY_UPSTREAM_RETRY_MAX used to skip the request
    # entirely and fail without contacting upstream; always try once.
    attempts = max(1, PROXY_UPSTREAM_RETRY_MAX)
    for attempt in range(1, attempts + 1):
        try:
            return await client.post(url, json=payload, headers=headers)
        except (httpx.ConnectError, httpx.RemoteProtocolError, httpx.ReadTimeout) as exc:
            if attempt == attempts:
                logger.error(
                    "Upstream connect failed after %d attempts: %s: %s",
                    attempts,
                    type(exc).__name__,
                    exc,
                )
                # Bare raise keeps the original traceback intact.
                raise
            logger.warning(
                "Upstream connect failed (attempt %d/%d): %s – retrying in %.0fs",
                attempt,
                attempts,
                type(exc).__name__,
                PROXY_UPSTREAM_RETRY_DELAY_SECS,
            )
            await asyncio.sleep(PROXY_UPSTREAM_RETRY_DELAY_SECS)
    # Not reachable: the final iteration either returns or re-raises.
    raise RuntimeError("upstream retry failed")
|
|
1157
|
+
|
|
1158
|
+
|
|
1068
1159
|
@asynccontextmanager
|
|
1069
1160
|
async def lifespan(app: FastAPI):
|
|
1070
1161
|
"""Manage the httpx client lifecycle with the FastAPI app."""
|
|
@@ -1879,6 +1970,25 @@ def build_openai_request(anthropic_body: dict, monitor: SessionMonitor) -> dict:
|
|
|
1879
1970
|
)
|
|
1880
1971
|
requested_max = max(1024, available_for_output)
|
|
1881
1972
|
|
|
1973
|
+
model_name = str(anthropic_body.get("model", "")).lower()
|
|
1974
|
+
utilization = estimated_input / ctx_window if ctx_window else 0.0
|
|
1975
|
+
if (
|
|
1976
|
+
PROXY_OPUS46_MAX_TOKENS_HIGH_CTX > 0
|
|
1977
|
+
and "opus" in model_name
|
|
1978
|
+
and "4.6" in model_name
|
|
1979
|
+
and utilization >= PROXY_OPUS46_CTX_THRESHOLD
|
|
1980
|
+
and requested_max > PROXY_OPUS46_MAX_TOKENS_HIGH_CTX
|
|
1981
|
+
):
|
|
1982
|
+
logger.warning(
|
|
1983
|
+
"MAX_TOKENS capped for Opus 4.6 at high context: %d -> %d (ctx=%d input~%d util=%.1f%%)",
|
|
1984
|
+
requested_max,
|
|
1985
|
+
PROXY_OPUS46_MAX_TOKENS_HIGH_CTX,
|
|
1986
|
+
ctx_window,
|
|
1987
|
+
estimated_input,
|
|
1988
|
+
utilization * 100,
|
|
1989
|
+
)
|
|
1990
|
+
requested_max = PROXY_OPUS46_MAX_TOKENS_HIGH_CTX
|
|
1991
|
+
|
|
1882
1992
|
openai_body["max_tokens"] = requested_max
|
|
1883
1993
|
if "temperature" in anthropic_body:
|
|
1884
1994
|
openai_body["temperature"] = anthropic_body["temperature"]
|
|
@@ -3953,6 +4063,7 @@ async def messages(request: Request):
|
|
|
3953
4063
|
body = await request.json()
|
|
3954
4064
|
model = body.get("model", "default")
|
|
3955
4065
|
is_stream = body.get("stream", False)
|
|
4066
|
+
client_id = resolve_client_id(request)
|
|
3956
4067
|
session_id = resolve_session_id(request, body)
|
|
3957
4068
|
monitor = get_session_monitor(session_id)
|
|
3958
4069
|
last_session_id = session_id
|
|
@@ -3982,8 +4093,12 @@ async def messages(request: Request):
|
|
|
3982
4093
|
last_text = last_content[:200]
|
|
3983
4094
|
else:
|
|
3984
4095
|
last_text = str(last_content)[:200]
|
|
4096
|
+
rate_count = log_client_rate(client_id)
|
|
3985
4097
|
logger.info(
|
|
3986
|
-
"REQ: stream=%s msgs=%d tools=%d max_tokens=%s last_role=%s last_content=%.200s",
|
|
4098
|
+
"REQ: client=%s rate_%ss=%d stream=%s msgs=%d tools=%d max_tokens=%s last_role=%s last_content=%.200s",
|
|
4099
|
+
client_id,
|
|
4100
|
+
PROXY_CLIENT_RATE_WINDOW_SECS,
|
|
4101
|
+
rate_count,
|
|
3987
4102
|
is_stream,
|
|
3988
4103
|
n_messages,
|
|
3989
4104
|
n_tools,
|
|
@@ -4040,11 +4155,27 @@ async def messages(request: Request):
|
|
|
4040
4155
|
strict_body = dict(openai_body)
|
|
4041
4156
|
strict_body["stream"] = False
|
|
4042
4157
|
|
|
4043
|
-
|
|
4044
|
-
|
|
4045
|
-
|
|
4046
|
-
|
|
4047
|
-
|
|
4158
|
+
try:
|
|
4159
|
+
strict_resp = await _post_with_retry(
|
|
4160
|
+
client,
|
|
4161
|
+
f"{LLAMA_CPP_BASE}/chat/completions",
|
|
4162
|
+
strict_body,
|
|
4163
|
+
{"Content-Type": "application/json"},
|
|
4164
|
+
)
|
|
4165
|
+
except Exception as exc:
|
|
4166
|
+
return Response(
|
|
4167
|
+
content=json.dumps(
|
|
4168
|
+
{
|
|
4169
|
+
"type": "error",
|
|
4170
|
+
"error": {
|
|
4171
|
+
"type": "overloaded_error",
|
|
4172
|
+
"message": f"Upstream server unavailable after {PROXY_UPSTREAM_RETRY_MAX} retries: {exc}",
|
|
4173
|
+
},
|
|
4174
|
+
}
|
|
4175
|
+
),
|
|
4176
|
+
status_code=529,
|
|
4177
|
+
media_type="application/json",
|
|
4178
|
+
)
|
|
4048
4179
|
|
|
4049
4180
|
if strict_resp.status_code != 200:
|
|
4050
4181
|
error_text = strict_resp.text[:1000]
|
|
@@ -4054,11 +4185,27 @@ async def messages(request: Request):
|
|
|
4054
4185
|
error_text,
|
|
4055
4186
|
"strict-stream",
|
|
4056
4187
|
):
|
|
4057
|
-
|
|
4058
|
-
|
|
4059
|
-
|
|
4060
|
-
|
|
4061
|
-
|
|
4188
|
+
try:
|
|
4189
|
+
strict_resp = await _post_with_retry(
|
|
4190
|
+
client,
|
|
4191
|
+
f"{LLAMA_CPP_BASE}/chat/completions",
|
|
4192
|
+
strict_body,
|
|
4193
|
+
{"Content-Type": "application/json"},
|
|
4194
|
+
)
|
|
4195
|
+
except Exception as exc:
|
|
4196
|
+
return Response(
|
|
4197
|
+
content=json.dumps(
|
|
4198
|
+
{
|
|
4199
|
+
"type": "error",
|
|
4200
|
+
"error": {
|
|
4201
|
+
"type": "overloaded_error",
|
|
4202
|
+
"message": f"Upstream server unavailable after {PROXY_UPSTREAM_RETRY_MAX} retries: {exc}",
|
|
4203
|
+
},
|
|
4204
|
+
}
|
|
4205
|
+
),
|
|
4206
|
+
status_code=529,
|
|
4207
|
+
media_type="application/json",
|
|
4208
|
+
)
|
|
4062
4209
|
|
|
4063
4210
|
if strict_resp.status_code != 200:
|
|
4064
4211
|
error_text = strict_resp.text[:1000]
|
|
@@ -4128,8 +4275,8 @@ async def messages(request: Request):
|
|
|
4128
4275
|
|
|
4129
4276
|
# Retry upstream connection with backoff to handle
|
|
4130
4277
|
# llama-server restarts gracefully instead of 500-ing to the client.
|
|
4131
|
-
MAX_UPSTREAM_RETRIES =
|
|
4132
|
-
RETRY_DELAY_SECS =
|
|
4278
|
+
MAX_UPSTREAM_RETRIES = PROXY_UPSTREAM_RETRY_MAX
|
|
4279
|
+
RETRY_DELAY_SECS = PROXY_UPSTREAM_RETRY_DELAY_SECS
|
|
4133
4280
|
last_exc: Exception | None = None
|
|
4134
4281
|
resp: httpx.Response | None = None
|
|
4135
4282
|
|
|
@@ -4147,7 +4294,7 @@ async def messages(request: Request):
|
|
|
4147
4294
|
# Connection succeeded – break out of retry loop
|
|
4148
4295
|
last_exc = None
|
|
4149
4296
|
break
|
|
4150
|
-
except (httpx.ConnectError, httpx.RemoteProtocolError) as exc:
|
|
4297
|
+
except (httpx.ConnectError, httpx.RemoteProtocolError, httpx.ReadTimeout) as exc:
|
|
4151
4298
|
last_exc = exc
|
|
4152
4299
|
if attempt < MAX_UPSTREAM_RETRIES - 1:
|
|
4153
4300
|
logger.warning(
|
|
@@ -4314,11 +4461,27 @@ async def messages(request: Request):
|
|
|
4314
4461
|
},
|
|
4315
4462
|
)
|
|
4316
4463
|
else:
|
|
4317
|
-
|
|
4318
|
-
|
|
4319
|
-
|
|
4320
|
-
|
|
4321
|
-
|
|
4464
|
+
try:
|
|
4465
|
+
resp = await _post_with_retry(
|
|
4466
|
+
client,
|
|
4467
|
+
f"{LLAMA_CPP_BASE}/chat/completions",
|
|
4468
|
+
openai_body,
|
|
4469
|
+
{"Content-Type": "application/json"},
|
|
4470
|
+
)
|
|
4471
|
+
except Exception as exc:
|
|
4472
|
+
return Response(
|
|
4473
|
+
content=json.dumps(
|
|
4474
|
+
{
|
|
4475
|
+
"type": "error",
|
|
4476
|
+
"error": {
|
|
4477
|
+
"type": "overloaded_error",
|
|
4478
|
+
"message": f"Upstream server unavailable after {PROXY_UPSTREAM_RETRY_MAX} retries: {exc}",
|
|
4479
|
+
},
|
|
4480
|
+
}
|
|
4481
|
+
),
|
|
4482
|
+
status_code=529,
|
|
4483
|
+
media_type="application/json",
|
|
4484
|
+
)
|
|
4322
4485
|
|
|
4323
4486
|
if resp.status_code != 200:
|
|
4324
4487
|
error_text = resp.text[:1000]
|
|
@@ -4328,11 +4491,27 @@ async def messages(request: Request):
|
|
|
4328
4491
|
error_text,
|
|
4329
4492
|
"non-stream",
|
|
4330
4493
|
):
|
|
4331
|
-
|
|
4332
|
-
|
|
4333
|
-
|
|
4334
|
-
|
|
4335
|
-
|
|
4494
|
+
try:
|
|
4495
|
+
resp = await _post_with_retry(
|
|
4496
|
+
client,
|
|
4497
|
+
f"{LLAMA_CPP_BASE}/chat/completions",
|
|
4498
|
+
openai_body,
|
|
4499
|
+
{"Content-Type": "application/json"},
|
|
4500
|
+
)
|
|
4501
|
+
except Exception as exc:
|
|
4502
|
+
return Response(
|
|
4503
|
+
content=json.dumps(
|
|
4504
|
+
{
|
|
4505
|
+
"type": "error",
|
|
4506
|
+
"error": {
|
|
4507
|
+
"type": "overloaded_error",
|
|
4508
|
+
"message": f"Upstream server unavailable after {PROXY_UPSTREAM_RETRY_MAX} retries: {exc}",
|
|
4509
|
+
},
|
|
4510
|
+
}
|
|
4511
|
+
),
|
|
4512
|
+
status_code=529,
|
|
4513
|
+
media_type="application/json",
|
|
4514
|
+
)
|
|
4336
4515
|
|
|
4337
4516
|
# Option B: Handle non-streaming errors too
|
|
4338
4517
|
if resp.status_code != 200:
|