@miller-tech/uap 1.18.1 → 1.19.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@miller-tech/uap",
3
- "version": "1.18.1",
3
+ "version": "1.19.1",
4
4
  "description": "Autonomous AI agent memory system with CLAUDE.md protocol enforcement",
5
5
  "type": "module",
6
6
  "main": "dist/index.js",
@@ -85,6 +85,7 @@ import re
85
85
  import sys
86
86
  import time
87
87
  import uuid
88
+ from collections import defaultdict, deque
88
89
  from dataclasses import dataclass, field
89
90
 
90
91
  import httpx
@@ -101,6 +102,8 @@ PROXY_PORT = int(os.environ.get("PROXY_PORT", "4000"))
101
102
  PROXY_HOST = os.environ.get("PROXY_HOST", "0.0.0.0")
102
103
  PROXY_LOG_LEVEL = os.environ.get("PROXY_LOG_LEVEL", "INFO").upper()
103
104
  PROXY_READ_TIMEOUT = float(os.environ.get("PROXY_READ_TIMEOUT", "600"))
105
+ PROXY_UPSTREAM_RETRY_MAX = int(os.environ.get("PROXY_UPSTREAM_RETRY_MAX", "3"))
106
+ PROXY_UPSTREAM_RETRY_DELAY_SECS = float(os.environ.get("PROXY_UPSTREAM_RETRY_DELAY_SECS", "5"))
104
107
  PROXY_MAX_CONNECTIONS = int(os.environ.get("PROXY_MAX_CONNECTIONS", "20"))
105
108
  PROXY_CONTEXT_WINDOW = int(os.environ.get("PROXY_CONTEXT_WINDOW", "0"))
106
109
  PROXY_CONTEXT_PRUNE_THRESHOLD = float(
@@ -116,9 +119,9 @@ PROXY_LOOP_BREAKER = os.environ.get("PROXY_LOOP_BREAKER", "on").lower() not in {
116
119
  "no",
117
120
  }
118
121
  PROXY_LOOP_WINDOW = int(os.environ.get("PROXY_LOOP_WINDOW", "6"))
119
- PROXY_LOOP_REPEAT_THRESHOLD = int(os.environ.get("PROXY_LOOP_REPEAT_THRESHOLD", "8"))
122
+ PROXY_LOOP_REPEAT_THRESHOLD = int(os.environ.get("PROXY_LOOP_REPEAT_THRESHOLD", "6"))
120
123
  PROXY_FORCED_THRESHOLD = int(os.environ.get("PROXY_FORCED_THRESHOLD", "15"))
121
- PROXY_NO_PROGRESS_THRESHOLD = int(os.environ.get("PROXY_NO_PROGRESS_THRESHOLD", "4"))
124
+ PROXY_NO_PROGRESS_THRESHOLD = int(os.environ.get("PROXY_NO_PROGRESS_THRESHOLD", "3"))
122
125
  PROXY_CONTEXT_RELEASE_THRESHOLD = float(
123
126
  os.environ.get("PROXY_CONTEXT_RELEASE_THRESHOLD", "0.90")
124
127
  )
@@ -138,16 +141,28 @@ PROXY_TOOL_STATE_FORCED_BUDGET = int(
138
141
  )
139
142
  PROXY_TOOL_STATE_AUTO_BUDGET = int(os.environ.get("PROXY_TOOL_STATE_AUTO_BUDGET", "2"))
140
143
  PROXY_TOOL_STATE_STAGNATION_THRESHOLD = int(
141
- os.environ.get("PROXY_TOOL_STATE_STAGNATION_THRESHOLD", "12")
144
+ os.environ.get("PROXY_TOOL_STATE_STAGNATION_THRESHOLD", "9")
142
145
  )
143
146
  PROXY_TOOL_STATE_CYCLE_WINDOW = int(
144
147
  os.environ.get("PROXY_TOOL_STATE_CYCLE_WINDOW", "8")
145
148
  )
146
149
  PROXY_TOOL_STATE_FINALIZE_THRESHOLD = int(
147
- os.environ.get("PROXY_TOOL_STATE_FINALIZE_THRESHOLD", "24")
150
+ os.environ.get("PROXY_TOOL_STATE_FINALIZE_THRESHOLD", "18")
148
151
  )
149
152
  PROXY_TOOL_STATE_REVIEW_CYCLE_LIMIT = int(
150
- os.environ.get("PROXY_TOOL_STATE_REVIEW_CYCLE_LIMIT", "3")
153
+ os.environ.get("PROXY_TOOL_STATE_REVIEW_CYCLE_LIMIT", "2")
154
+ )
155
+ PROXY_CLIENT_RATE_WINDOW_SECS = int(
156
+ os.environ.get("PROXY_CLIENT_RATE_WINDOW_SECS", "60")
157
+ )
158
+ PROXY_CLIENT_RATE_LOG_MIN_SECS = float(
159
+ os.environ.get("PROXY_CLIENT_RATE_LOG_MIN_SECS", "15")
160
+ )
161
+ PROXY_OPUS46_CTX_THRESHOLD = float(
162
+ os.environ.get("PROXY_OPUS46_CTX_THRESHOLD", "0.8")
163
+ )
164
+ PROXY_OPUS46_MAX_TOKENS_HIGH_CTX = int(
165
+ os.environ.get("PROXY_OPUS46_MAX_TOKENS_HIGH_CTX", "4096")
151
166
  )
152
167
  PROXY_TOOL_NARROWING_EXPAND_ON_LOOP = os.environ.get(
153
168
  "PROXY_TOOL_NARROWING_EXPAND_ON_LOOP", "on"
@@ -323,6 +338,51 @@ logging.basicConfig(
323
338
  )
324
339
  logger = logging.getLogger("uap.anthropic_proxy")
325
340
 
341
+ _client_request_times: dict[str, deque[float]] = defaultdict(deque)
342
+ _client_rate_last_log: dict[str, float] = defaultdict(float)
343
+
344
+
345
def resolve_client_id(request: Request) -> str:
    """Derive a stable identifier for the caller of *request*.

    Identity sources are tried in priority order: the ``x-uap-client-id``
    header, then ``x-forwarded-for``, then ``x-real-ip``.  For a
    comma-separated header value only the first entry is used.  When none
    of the headers yields a usable value, the socket peer address is used.

    Args:
        request: Incoming request; only ``request.headers`` and
            ``request.client`` are read.

    Returns:
        ``"<header-name>:<value>"`` for a header-derived id,
        ``"remote:<host>"`` for a peer-address id, or ``"remote:unknown"``
        when no peer information is available.

    Note:
        All three headers are supplied by the caller, so the id is only
        suitable for logging/diagnostics, not for authorization decisions.
    """
    for key in ("x-uap-client-id", "x-forwarded-for", "x-real-ip"):
        value = request.headers.get(key)
        if not value:
            continue
        first_entry = value.split(",")[0].strip()
        # A whitespace-only header or a leading empty list element
        # (", 10.0.0.1") would otherwise produce an empty id such as
        # "x-forwarded-for:" and merge unrelated callers into one bucket.
        if first_entry:
            return f"{key}:{first_entry}"
    if request.client:
        return f"remote:{request.client.host}"
    return "remote:unknown"
354
+
355
+
356
# Wall-clock time of the most recent idle-client sweep (0.0 = never swept).
_client_rate_last_sweep: float = 0.0


def _evict_idle_clients(now: float, cutoff: float) -> None:
    """Forget per-client state that can no longer influence any result.

    A request-time deque whose newest entry is older than *cutoff* would be
    emptied on that client's next call anyway, and a last-log timestamp at
    least ``PROXY_CLIENT_RATE_LOG_MIN_SECS`` old no longer suppresses a log
    line, so dropping either is behavior-neutral while bounding memory.
    """
    idle = [
        cid
        for cid, times in _client_request_times.items()
        if not times or times[-1] < cutoff
    ]
    for cid in idle:
        del _client_request_times[cid]
    quiet = [
        cid
        for cid, last in _client_rate_last_log.items()
        if now - last >= PROXY_CLIENT_RATE_LOG_MIN_SECS
    ]
    for cid in quiet:
        del _client_rate_last_log[cid]


def log_client_rate(client_id: str) -> int:
    """Record a request for *client_id* and return its sliding-window count.

    The current time is appended to the client's request history, entries
    older than ``PROXY_CLIENT_RATE_WINDOW_SECS`` are discarded, and a
    ``CLIENT_RATE`` line is logged.  Logging is throttled per client to at
    most once every ``PROXY_CLIENT_RATE_LOG_MIN_SECS`` seconds; a
    non-positive throttle value logs on every call.

    Args:
        client_id: Identifier of the caller whose request is being counted.

    Returns:
        Number of requests from *client_id* inside the window (including
        this one), or ``0`` when rate tracking is disabled via a
        non-positive ``PROXY_CLIENT_RATE_WINDOW_SECS``.
    """
    global _client_rate_last_sweep

    window = PROXY_CLIENT_RATE_WINDOW_SECS
    if window <= 0:
        return 0

    now = time.time()
    cutoff = now - window

    request_times = _client_request_times[client_id]
    request_times.append(now)
    while request_times and request_times[0] < cutoff:
        request_times.popleft()
    count = len(request_times)

    # Client ids come from request headers, so the set of keys is unbounded.
    # Sweep at most once per window so the cost stays amortized.
    if now - _client_rate_last_sweep >= window:
        _client_rate_last_sweep = now
        _evict_idle_clients(now, cutoff)

    throttle = PROXY_CLIENT_RATE_LOG_MIN_SECS
    if throttle > 0:
        last_log = _client_rate_last_log.get(client_id, 0.0)
        if now - last_log < throttle:
            return count
        _client_rate_last_log[client_id] = now

    logger.info(
        "CLIENT_RATE: id=%s window=%ss count=%d",
        client_id,
        window,
        count,
    )
    return count
385
+
326
386
 
327
387
  def _load_tool_call_grammar(path: str) -> str:
328
388
  if not PROXY_TOOL_CALL_GRAMMAR:
@@ -1065,6 +1125,37 @@ def prune_conversation(
1065
1125
  http_client: httpx.AsyncClient | None = None
1066
1126
 
1067
1127
 
1128
async def _post_with_retry(
    client: httpx.AsyncClient,
    url: str,
    payload: dict,
    headers: dict,
) -> httpx.Response:
    """POST *payload* as JSON to *url*, retrying transient transport errors.

    ``httpx.ConnectError``, ``httpx.RemoteProtocolError`` and
    ``httpx.ReadTimeout`` are retried after a fixed pause of
    ``PROXY_UPSTREAM_RETRY_DELAY_SECS`` seconds, for up to
    ``PROXY_UPSTREAM_RETRY_MAX`` attempts in total.  Any other exception
    propagates immediately.  HTTP error statuses are not retried here; the
    response is returned to the caller as-is.

    Args:
        client: Shared async HTTP client used to issue the request.
        url: Absolute upstream URL.
        payload: JSON-serialisable request body.
        headers: Request headers.

    Returns:
        The first response received from upstream.

    Raises:
        httpx.ConnectError, httpx.RemoteProtocolError, httpx.ReadTimeout:
            The error from the final attempt once all attempts are used up.
    """
    # A configured value of 0 (or less) previously skipped the loop entirely
    # and raised without ever contacting upstream; always try at least once.
    attempts = max(1, PROXY_UPSTREAM_RETRY_MAX)
    for attempt in range(1, attempts + 1):
        try:
            return await client.post(url, json=payload, headers=headers)
        except (
            httpx.ConnectError,
            httpx.RemoteProtocolError,
            httpx.ReadTimeout,
        ) as exc:
            # NOTE(review): a ReadTimeout means the request may already have
            # been accepted upstream, so a retry can duplicate generation
            # work -- confirm this is acceptable for the upstream server.
            if attempt == attempts:
                logger.error(
                    "Upstream connect failed after %d attempts: %s: %s",
                    PROXY_UPSTREAM_RETRY_MAX,
                    type(exc).__name__,
                    exc,
                )
                raise
            logger.warning(
                "Upstream connect failed (attempt %d/%d): %s – retrying in %.0fs",
                attempt,
                PROXY_UPSTREAM_RETRY_MAX,
                type(exc).__name__,
                PROXY_UPSTREAM_RETRY_DELAY_SECS,
            )
        # Sleep outside the handler so the caught exception is not kept
        # alive for the duration of the pause.
        await asyncio.sleep(PROXY_UPSTREAM_RETRY_DELAY_SECS)
    # Unreachable: the loop either returns a response or re-raises.
    raise RuntimeError("upstream retry failed")
1157
+
1158
+
1068
1159
  @asynccontextmanager
1069
1160
  async def lifespan(app: FastAPI):
1070
1161
  """Manage the httpx client lifecycle with the FastAPI app."""
@@ -1879,6 +1970,25 @@ def build_openai_request(anthropic_body: dict, monitor: SessionMonitor) -> dict:
1879
1970
  )
1880
1971
  requested_max = max(1024, available_for_output)
1881
1972
 
1973
+ model_name = str(anthropic_body.get("model", "")).lower()
1974
+ utilization = estimated_input / ctx_window if ctx_window else 0.0
1975
+ if (
1976
+ PROXY_OPUS46_MAX_TOKENS_HIGH_CTX > 0
1977
+ and "opus" in model_name
1978
+ and "4.6" in model_name
1979
+ and utilization >= PROXY_OPUS46_CTX_THRESHOLD
1980
+ and requested_max > PROXY_OPUS46_MAX_TOKENS_HIGH_CTX
1981
+ ):
1982
+ logger.warning(
1983
+ "MAX_TOKENS capped for Opus 4.6 at high context: %d -> %d (ctx=%d input~%d util=%.1f%%)",
1984
+ requested_max,
1985
+ PROXY_OPUS46_MAX_TOKENS_HIGH_CTX,
1986
+ ctx_window,
1987
+ estimated_input,
1988
+ utilization * 100,
1989
+ )
1990
+ requested_max = PROXY_OPUS46_MAX_TOKENS_HIGH_CTX
1991
+
1882
1992
  openai_body["max_tokens"] = requested_max
1883
1993
  if "temperature" in anthropic_body:
1884
1994
  openai_body["temperature"] = anthropic_body["temperature"]
@@ -3953,6 +4063,7 @@ async def messages(request: Request):
3953
4063
  body = await request.json()
3954
4064
  model = body.get("model", "default")
3955
4065
  is_stream = body.get("stream", False)
4066
+ client_id = resolve_client_id(request)
3956
4067
  session_id = resolve_session_id(request, body)
3957
4068
  monitor = get_session_monitor(session_id)
3958
4069
  last_session_id = session_id
@@ -3982,8 +4093,12 @@ async def messages(request: Request):
3982
4093
  last_text = last_content[:200]
3983
4094
  else:
3984
4095
  last_text = str(last_content)[:200]
4096
+ rate_count = log_client_rate(client_id)
3985
4097
  logger.info(
3986
- "REQ: stream=%s msgs=%d tools=%d max_tokens=%s last_role=%s last_content=%.200s",
4098
+ "REQ: client=%s rate_%ss=%d stream=%s msgs=%d tools=%d max_tokens=%s last_role=%s last_content=%.200s",
4099
+ client_id,
4100
+ PROXY_CLIENT_RATE_WINDOW_SECS,
4101
+ rate_count,
3987
4102
  is_stream,
3988
4103
  n_messages,
3989
4104
  n_tools,
@@ -4040,11 +4155,27 @@ async def messages(request: Request):
4040
4155
  strict_body = dict(openai_body)
4041
4156
  strict_body["stream"] = False
4042
4157
 
4043
- strict_resp = await client.post(
4044
- f"{LLAMA_CPP_BASE}/chat/completions",
4045
- json=strict_body,
4046
- headers={"Content-Type": "application/json"},
4047
- )
4158
+ try:
4159
+ strict_resp = await _post_with_retry(
4160
+ client,
4161
+ f"{LLAMA_CPP_BASE}/chat/completions",
4162
+ strict_body,
4163
+ {"Content-Type": "application/json"},
4164
+ )
4165
+ except Exception as exc:
4166
+ return Response(
4167
+ content=json.dumps(
4168
+ {
4169
+ "type": "error",
4170
+ "error": {
4171
+ "type": "overloaded_error",
4172
+ "message": f"Upstream server unavailable after {PROXY_UPSTREAM_RETRY_MAX} retries: {exc}",
4173
+ },
4174
+ }
4175
+ ),
4176
+ status_code=529,
4177
+ media_type="application/json",
4178
+ )
4048
4179
 
4049
4180
  if strict_resp.status_code != 200:
4050
4181
  error_text = strict_resp.text[:1000]
@@ -4054,11 +4185,27 @@ async def messages(request: Request):
4054
4185
  error_text,
4055
4186
  "strict-stream",
4056
4187
  ):
4057
- strict_resp = await client.post(
4058
- f"{LLAMA_CPP_BASE}/chat/completions",
4059
- json=strict_body,
4060
- headers={"Content-Type": "application/json"},
4061
- )
4188
+ try:
4189
+ strict_resp = await _post_with_retry(
4190
+ client,
4191
+ f"{LLAMA_CPP_BASE}/chat/completions",
4192
+ strict_body,
4193
+ {"Content-Type": "application/json"},
4194
+ )
4195
+ except Exception as exc:
4196
+ return Response(
4197
+ content=json.dumps(
4198
+ {
4199
+ "type": "error",
4200
+ "error": {
4201
+ "type": "overloaded_error",
4202
+ "message": f"Upstream server unavailable after {PROXY_UPSTREAM_RETRY_MAX} retries: {exc}",
4203
+ },
4204
+ }
4205
+ ),
4206
+ status_code=529,
4207
+ media_type="application/json",
4208
+ )
4062
4209
 
4063
4210
  if strict_resp.status_code != 200:
4064
4211
  error_text = strict_resp.text[:1000]
@@ -4128,8 +4275,8 @@ async def messages(request: Request):
4128
4275
 
4129
4276
  # Retry upstream connection with backoff to handle
4130
4277
  # llama-server restarts gracefully instead of 500-ing to the client.
4131
- MAX_UPSTREAM_RETRIES = 3
4132
- RETRY_DELAY_SECS = 5.0
4278
+ MAX_UPSTREAM_RETRIES = PROXY_UPSTREAM_RETRY_MAX
4279
+ RETRY_DELAY_SECS = PROXY_UPSTREAM_RETRY_DELAY_SECS
4133
4280
  last_exc: Exception | None = None
4134
4281
  resp: httpx.Response | None = None
4135
4282
 
@@ -4147,7 +4294,7 @@ async def messages(request: Request):
4147
4294
  # Connection succeeded – break out of retry loop
4148
4295
  last_exc = None
4149
4296
  break
4150
- except (httpx.ConnectError, httpx.RemoteProtocolError) as exc:
4297
+ except (httpx.ConnectError, httpx.RemoteProtocolError, httpx.ReadTimeout) as exc:
4151
4298
  last_exc = exc
4152
4299
  if attempt < MAX_UPSTREAM_RETRIES - 1:
4153
4300
  logger.warning(
@@ -4314,11 +4461,27 @@ async def messages(request: Request):
4314
4461
  },
4315
4462
  )
4316
4463
  else:
4317
- resp = await client.post(
4318
- f"{LLAMA_CPP_BASE}/chat/completions",
4319
- json=openai_body,
4320
- headers={"Content-Type": "application/json"},
4321
- )
4464
+ try:
4465
+ resp = await _post_with_retry(
4466
+ client,
4467
+ f"{LLAMA_CPP_BASE}/chat/completions",
4468
+ openai_body,
4469
+ {"Content-Type": "application/json"},
4470
+ )
4471
+ except Exception as exc:
4472
+ return Response(
4473
+ content=json.dumps(
4474
+ {
4475
+ "type": "error",
4476
+ "error": {
4477
+ "type": "overloaded_error",
4478
+ "message": f"Upstream server unavailable after {PROXY_UPSTREAM_RETRY_MAX} retries: {exc}",
4479
+ },
4480
+ }
4481
+ ),
4482
+ status_code=529,
4483
+ media_type="application/json",
4484
+ )
4322
4485
 
4323
4486
  if resp.status_code != 200:
4324
4487
  error_text = resp.text[:1000]
@@ -4328,11 +4491,27 @@ async def messages(request: Request):
4328
4491
  error_text,
4329
4492
  "non-stream",
4330
4493
  ):
4331
- resp = await client.post(
4332
- f"{LLAMA_CPP_BASE}/chat/completions",
4333
- json=openai_body,
4334
- headers={"Content-Type": "application/json"},
4335
- )
4494
+ try:
4495
+ resp = await _post_with_retry(
4496
+ client,
4497
+ f"{LLAMA_CPP_BASE}/chat/completions",
4498
+ openai_body,
4499
+ {"Content-Type": "application/json"},
4500
+ )
4501
+ except Exception as exc:
4502
+ return Response(
4503
+ content=json.dumps(
4504
+ {
4505
+ "type": "error",
4506
+ "error": {
4507
+ "type": "overloaded_error",
4508
+ "message": f"Upstream server unavailable after {PROXY_UPSTREAM_RETRY_MAX} retries: {exc}",
4509
+ },
4510
+ }
4511
+ ),
4512
+ status_code=529,
4513
+ media_type="application/json",
4514
+ )
4336
4515
 
4337
4516
  # Option B: Handle non-streaming errors too
4338
4517
  if resp.status_code != 200: