@miller-tech/uap 1.13.14 → 1.13.16
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/.tsbuildinfo +1 -1
- package/dist/cli/hooks.js +36 -3
- package/dist/cli/hooks.js.map +1 -1
- package/docs/INDEX.md +1 -0
- package/docs/benchmarks/SPECULATIVE_DECODING_JOURNEY_2026-03.md +221 -0
- package/docs/deployment/QWEN35_LLAMA_CPP.md +27 -0
- package/package.json +1 -1
- package/tools/agents/scripts/anthropic_proxy.py +890 -65
- package/tools/agents/tests/test_anthropic_proxy_streaming.py +474 -0
|
@@ -105,6 +105,9 @@ PROXY_CONTEXT_WINDOW = int(os.environ.get("PROXY_CONTEXT_WINDOW", "0"))
|
|
|
105
105
|
# Context-pruning knobs, read once at import time. A value that does not parse
# as a float makes the module import fail with ValueError.
PROXY_CONTEXT_PRUNE_THRESHOLD = float(
    os.getenv("PROXY_CONTEXT_PRUNE_THRESHOLD", "0.75")
)
# Range-checked later by _resolve_prune_target_fraction(), which substitutes
# 0.65 when the value is not strictly between 0 and 1.
PROXY_CONTEXT_PRUNE_TARGET_FRACTION = float(
    os.getenv("PROXY_CONTEXT_PRUNE_TARGET_FRACTION", "0.65")
)
|
|
108
111
|
PROXY_LOOP_BREAKER = os.environ.get("PROXY_LOOP_BREAKER", "on").lower() not in {
|
|
109
112
|
"0",
|
|
110
113
|
"false",
|
|
@@ -131,6 +134,75 @@ PROXY_STREAM_REASONING_FALLBACK = (
|
|
|
131
134
|
def _env_flag(name: str, default: str) -> bool:
    """Read a boolean feature flag from the environment.

    The value is trimmed and lower-cased. "0", "false", "off" and "no" mean
    disabled; any other non-empty value means enabled. An unset, empty or
    whitespace-only variable falls back to ``default`` so that ``FLAG=`` or
    ``FLAG="off "`` cannot switch a feature on by accident.
    """
    raw = os.environ.get(name, "").strip().lower() or default.strip().lower()
    return raw not in {"0", "false", "off", "no"}


# All settings below are read once at import time; a non-numeric value for an
# int()/float() setting makes the module import fail with ValueError.
PROXY_STREAM_REASONING_MAX_CHARS = int(
    os.environ.get("PROXY_STREAM_REASONING_MAX_CHARS", "240")
)

# Minimum max_tokens forwarded upstream (see _resolve_max_tokens_request);
# 0 disables the floor.
PROXY_MAX_TOKENS_FLOOR = int(os.environ.get("PROXY_MAX_TOKENS_FLOOR", "16384"))

# Tool narrowing (see _narrow_tools_for_request): when enabled and at least
# MIN_TOOLS tools are offered, only the KEEP best-matching tools are forwarded.
PROXY_TOOL_NARROWING = _env_flag("PROXY_TOOL_NARROWING", "off")
PROXY_TOOL_NARROWING_KEEP = int(os.environ.get("PROXY_TOOL_NARROWING_KEEP", "8"))
PROXY_TOOL_NARROWING_MIN_TOOLS = int(
    os.environ.get("PROXY_TOOL_NARROWING_MIN_TOOLS", "12")
)

# When enabled, requests are sent with enable_thinking=False.
PROXY_DISABLE_THINKING_ON_TOOL_TURNS = _env_flag(
    "PROXY_DISABLE_THINKING_ON_TOOL_TURNS", "off"
)

# Malformed tool-call guardrail (see _apply_malformed_tool_guardrail): number
# of retries plus the max_tokens cap and temperature used for each retry.
PROXY_MALFORMED_TOOL_GUARDRAIL = _env_flag("PROXY_MALFORMED_TOOL_GUARDRAIL", "on")
PROXY_MALFORMED_TOOL_RETRY_MAX = int(
    os.environ.get("PROXY_MALFORMED_TOOL_RETRY_MAX", "1")
)
PROXY_MALFORMED_TOOL_RETRY_MAX_TOKENS = int(
    os.environ.get("PROXY_MALFORMED_TOOL_RETRY_MAX_TOKENS", "2048")
)
PROXY_MALFORMED_TOOL_RETRY_TEMPERATURE = float(
    os.environ.get("PROXY_MALFORMED_TOOL_RETRY_TEMPERATURE", "0")
)
PROXY_MALFORMED_TOOL_STREAM_STRICT = _env_flag(
    "PROXY_MALFORMED_TOOL_STREAM_STRICT", "off"
)
PROXY_FORCE_NON_STREAM = _env_flag("PROXY_FORCE_NON_STREAM", "off")

# Session contamination breaker (see
# _maybe_apply_session_contamination_breaker): once the malformed-tool streak
# reaches THRESHOLD, history is trimmed to the first message plus the last
# KEEP_LAST messages.
PROXY_SESSION_CONTAMINATION_BREAKER = _env_flag(
    "PROXY_SESSION_CONTAMINATION_BREAKER", "on"
)
PROXY_SESSION_CONTAMINATION_THRESHOLD = int(
    os.environ.get("PROXY_SESSION_CONTAMINATION_THRESHOLD", "3")
)
PROXY_SESSION_CONTAMINATION_KEEP_LAST = int(
    os.environ.get("PROXY_SESSION_CONTAMINATION_KEEP_LAST", "8")
)

# Which agentic system-prompt supplement to use: "legacy" or "clean".
PROXY_AGENTIC_SUPPLEMENT_MODE = (
    os.environ.get("PROXY_AGENTIC_SUPPLEMENT_MODE", "clean").strip().lower()
)
|
|
134
206
|
|
|
135
207
|
# ---------------------------------------------------------------------------
|
|
136
208
|
# Logging
|
|
@@ -170,6 +242,8 @@ class SessionMonitor:
|
|
|
170
242
|
loop_warnings_emitted: int = 0 # How many loop warnings sent to the model
|
|
171
243
|
no_progress_streak: int = 0 # Forced tool turns without new tool_result
|
|
172
244
|
unexpected_end_turn_count: int = 0 # end_turn without tool_use in active loop
|
|
245
|
+
malformed_tool_streak: int = 0 # consecutive malformed pseudo tool payloads
|
|
246
|
+
contamination_resets: int = 0 # how many contamination resets were applied
|
|
173
247
|
last_seen_ts: float = 0.0
|
|
174
248
|
|
|
175
249
|
def record_request(self, estimated_tokens: int):
|
|
@@ -688,9 +762,20 @@ async def lifespan(app: FastAPI):
|
|
|
688
762
|
if mon.context_window <= 0:
|
|
689
763
|
mon.context_window = default_context_window
|
|
690
764
|
logger.info(
|
|
691
|
-
"Context window: %d tokens, prune threshold: %.0f%%",
|
|
765
|
+
"Context window: %d tokens, prune threshold: %.0f%%, prune target: %.0f%%",
|
|
692
766
|
default_context_window,
|
|
693
767
|
PROXY_CONTEXT_PRUNE_THRESHOLD * 100,
|
|
768
|
+
_resolve_prune_target_fraction() * 100,
|
|
769
|
+
)
|
|
770
|
+
logger.info(
|
|
771
|
+
"Guardrails: malformed=%s stream_strict=%s force_non_stream=%s tool_narrowing=%s thinking_off_on_tools=%s contamination_breaker=%s(%d)",
|
|
772
|
+
PROXY_MALFORMED_TOOL_GUARDRAIL,
|
|
773
|
+
PROXY_MALFORMED_TOOL_STREAM_STRICT,
|
|
774
|
+
PROXY_FORCE_NON_STREAM,
|
|
775
|
+
PROXY_TOOL_NARROWING,
|
|
776
|
+
PROXY_DISABLE_THINKING_ON_TOOL_TURNS,
|
|
777
|
+
PROXY_SESSION_CONTAMINATION_BREAKER,
|
|
778
|
+
PROXY_SESSION_CONTAMINATION_THRESHOLD,
|
|
694
779
|
)
|
|
695
780
|
|
|
696
781
|
yield
|
|
@@ -794,7 +879,7 @@ def _extract_text(content) -> str:
|
|
|
794
879
|
return str(content)
|
|
795
880
|
|
|
796
881
|
|
|
797
|
-
|
|
882
|
+
_AGENTIC_SYSTEM_SUPPLEMENT_LEGACY = (
|
|
798
883
|
"\n\n<agentic-protocol>\n"
|
|
799
884
|
"You are operating in an agentic coding loop with tool access. Follow these rules:\n"
|
|
800
885
|
"1. ALWAYS use tools to read, edit, write, and test code. Never just describe or explain what should be done.\n"
|
|
@@ -807,6 +892,30 @@ _AGENTIC_SYSTEM_SUPPLEMENT = (
|
|
|
807
892
|
"</agentic-protocol>"
|
|
808
893
|
)
|
|
809
894
|
|
|
895
|
+
# "Clean" agentic-protocol supplement: the opening tag, a one-line preamble,
# seven numbered rules and the closing tag, newline-separated and preceded by
# a blank line.
_AGENTIC_SYSTEM_SUPPLEMENT_CLEAN = "\n\n" + "\n".join(
    (
        "<agentic-protocol>",
        "You are operating in an agentic coding loop with tool access. Follow these rules:",
        "1. Use tools for concrete work (read, edit, write, test) instead of stopping at analysis.",
        "2. When a fix is identified, take the next tool action immediately.",
        "3. Return final text only when the task is complete and verified.",
        "4. Never output protocol fragments or raw tool schema in assistant text.",
        "5. Never emit literal tag artifacts such as </parameter>, <tool_call>, or <function=...>.",
        "6. When a tool is needed, emit a valid tool call object instead of prose about tool-call formatting.",
        "7. If a tool call fails, adapt and try another approach.",
        "</agentic-protocol>",
    )
)
|
|
907
|
+
|
|
908
|
+
# Pick the active supplement. Only "legacy" selects the legacy text; "clean"
# and any unrecognised mode use the clean text, the latter with a warning.
if PROXY_AGENTIC_SUPPLEMENT_MODE not in ("legacy", "clean"):
    logger.warning(
        "Unknown PROXY_AGENTIC_SUPPLEMENT_MODE=%r; using clean supplement",
        PROXY_AGENTIC_SUPPLEMENT_MODE,
    )
_AGENTIC_SYSTEM_SUPPLEMENT = (
    _AGENTIC_SYSTEM_SUPPLEMENT_LEGACY
    if PROXY_AGENTIC_SUPPLEMENT_MODE == "legacy"
    else _AGENTIC_SYSTEM_SUPPLEMENT_CLEAN
)
|
|
918
|
+
|
|
810
919
|
|
|
811
920
|
def _content_fingerprint(content) -> str:
|
|
812
921
|
if isinstance(content, str):
|
|
@@ -878,6 +987,87 @@ def _last_user_has_tool_result(anthropic_body: dict) -> bool:
|
|
|
878
987
|
return False
|
|
879
988
|
|
|
880
989
|
|
|
990
|
+
def _convert_anthropic_tools_to_openai(anthropic_tools: list[dict]) -> list[dict]:
    """Translate Anthropic tool definitions into OpenAI function tools.

    Each dict entry becomes ``{"type": "function", "function": {...}}`` with
    ``name``, ``description`` and ``parameters`` taken from the Anthropic
    ``name``, ``description`` and ``input_schema`` keys (defaulting to "", ""
    and ``{}``).

    ``None`` is treated as an empty list and non-dict entries are skipped, in
    line with how _tool_schema_map_from_anthropic_body reads the same field.
    """
    return [
        {
            "type": "function",
            "function": {
                "name": tool.get("name", ""),
                "description": tool.get("description", ""),
                "parameters": tool.get("input_schema", {}),
            },
        }
        for tool in anthropic_tools or []
        if isinstance(tool, dict)
    ]
|
|
1004
|
+
|
|
1005
|
+
|
|
1006
|
+
def _latest_user_text(anthropic_body: dict) -> str:
    """Return the text of the most recent ``user`` message.

    The message content is passed through ``_extract_text``; an empty string
    is returned when the request has no user message.
    """
    history = anthropic_body.get("messages", [])
    newest_user = next(
        (entry for entry in reversed(history) if entry.get("role") == "user"),
        None,
    )
    if newest_user is None:
        return ""
    return _extract_text(newest_user.get("content", ""))
|
|
1012
|
+
|
|
1013
|
+
|
|
1014
|
+
def _tokenize_for_tool_ranking(text: str) -> set[str]:
    """Return the distinct lower-cased tokens in ``text``.

    A token is a run of at least two ASCII letters, digits or underscores.
    """
    words = re.findall(r"[a-zA-Z0-9_]{2,}", text)
    return {word.lower() for word in words}
|
|
1016
|
+
|
|
1017
|
+
|
|
1018
|
+
def _narrow_tools_for_request(
    anthropic_body: dict, openai_tools: list[dict]
) -> list[dict]:
    """Reduce the tool list to the entries most relevant to the latest user text.

    The list is returned unchanged when narrowing is disabled, when fewer than
    PROXY_TOOL_NARROWING_MIN_TOOLS tools are offered, or when the keep count
    already covers every tool. Otherwise each tool is scored against the
    latest user message and the PROXY_TOOL_NARROWING_KEEP best are returned in
    their original order. If the user text yields no tokens, the first
    ``keep`` tools are returned.
    """
    if not PROXY_TOOL_NARROWING:
        return openai_tools

    total = len(openai_tools)
    if total < max(1, PROXY_TOOL_NARROWING_MIN_TOOLS):
        return openai_tools

    limit = max(1, PROXY_TOOL_NARROWING_KEEP)
    if limit >= total:
        return openai_tools

    query_text = _latest_user_text(anthropic_body).lower()
    query_tokens = _tokenize_for_tool_ranking(query_text)
    if not query_tokens:
        narrowed = openai_tools[:limit]
        logger.info(
            "TOOL NARROWING: %d -> %d tools (no query tokens)",
            total,
            len(narrowed),
        )
        return narrowed

    def relevance(tool: dict) -> int:
        # 3 points per token shared with the tool's name + description,
        # +4 when the tool name occurs verbatim in the user text,
        # +1 when any query token is a substring of the tool name.
        fn = tool.get("function", {})
        name = fn.get("name", "")
        desc = fn.get("description", "")
        tool_tokens = _tokenize_for_tool_ranking(f"{name} {desc}".lower())
        points = 3 * len(query_tokens & tool_tokens)
        if name:
            lowered_name = name.lower()
            if lowered_name in query_text:
                points += 4
            if any(tok in lowered_name for tok in query_tokens):
                points += 1
        return points

    # Highest score first; equal scores keep the earlier tool.
    ranked = sorted(
        enumerate(openai_tools),
        key=lambda pair: (-relevance(pair[1]), pair[0]),
    )
    chosen_ids = {id(tool) for _, tool in ranked[:limit]}
    narrowed = [tool for tool in openai_tools if id(tool) in chosen_ids]

    logger.info(
        "TOOL NARROWING: %d -> %d tools (top=%s)",
        total,
        len(narrowed),
        [entry.get("function", {}).get("name", "") for entry in narrowed[:4]],
    )
    return narrowed
|
|
1069
|
+
|
|
1070
|
+
|
|
881
1071
|
def build_openai_request(anthropic_body: dict, monitor: SessionMonitor) -> dict:
|
|
882
1072
|
"""Build an OpenAI Chat Completions request from an Anthropic Messages request."""
|
|
883
1073
|
openai_body = {
|
|
@@ -901,10 +1091,10 @@ def build_openai_request(anthropic_body: dict, monitor: SessionMonitor) -> dict:
|
|
|
901
1091
|
)
|
|
902
1092
|
|
|
903
1093
|
if "max_tokens" in anthropic_body:
|
|
904
|
-
# Enforce minimum floor for thinking mode: model needs
|
|
905
|
-
# reasoning (<think>...</think>) plus
|
|
906
|
-
#
|
|
907
|
-
requested_max =
|
|
1094
|
+
# Enforce configurable minimum floor for thinking mode: model needs
|
|
1095
|
+
# tokens for reasoning (<think>...</think>) plus actual response/tool
|
|
1096
|
+
# calls. Set PROXY_MAX_TOKENS_FLOOR=0 to disable this floor.
|
|
1097
|
+
requested_max = _resolve_max_tokens_request(anthropic_body["max_tokens"])
|
|
908
1098
|
|
|
909
1099
|
# Option E: Smart max_tokens capping — prevent the response from
|
|
910
1100
|
# consuming so many tokens that the NEXT turn's input won't fit.
|
|
@@ -948,18 +1138,12 @@ def build_openai_request(anthropic_body: dict, monitor: SessionMonitor) -> dict:
|
|
|
948
1138
|
|
|
949
1139
|
# Convert Anthropic tools to OpenAI function-calling tools
|
|
950
1140
|
if "tools" in anthropic_body:
|
|
951
|
-
openai_body["tools"] =
|
|
952
|
-
|
|
953
|
-
|
|
954
|
-
|
|
955
|
-
|
|
956
|
-
|
|
957
|
-
"name": tool["name"],
|
|
958
|
-
"description": tool.get("description", ""),
|
|
959
|
-
"parameters": tool.get("input_schema", {}),
|
|
960
|
-
},
|
|
961
|
-
}
|
|
962
|
-
)
|
|
1141
|
+
openai_body["tools"] = _convert_anthropic_tools_to_openai(
|
|
1142
|
+
anthropic_body.get("tools", [])
|
|
1143
|
+
)
|
|
1144
|
+
openai_body["tools"] = _narrow_tools_for_request(
|
|
1145
|
+
anthropic_body, openai_body["tools"]
|
|
1146
|
+
)
|
|
963
1147
|
|
|
964
1148
|
# Smart tool_choice: force tool calls during the agentic loop to
|
|
965
1149
|
# prevent the model from producing text-only end_turn responses that
|
|
@@ -1016,6 +1200,12 @@ def build_openai_request(anthropic_body: dict, monitor: SessionMonitor) -> dict:
|
|
|
1016
1200
|
monitor.consecutive_forced_count = 0
|
|
1017
1201
|
monitor.no_progress_streak = 0
|
|
1018
1202
|
|
|
1203
|
+
if PROXY_DISABLE_THINKING_ON_TOOL_TURNS:
|
|
1204
|
+
openai_body["enable_thinking"] = False
|
|
1205
|
+
logger.info(
|
|
1206
|
+
"Thinking disabled for tool turn (PROXY_DISABLE_THINKING_ON_TOOL_TURNS=on)"
|
|
1207
|
+
)
|
|
1208
|
+
|
|
1019
1209
|
return openai_body
|
|
1020
1210
|
|
|
1021
1211
|
|
|
@@ -1066,6 +1256,24 @@ def _is_unexpected_end_turn(openai_resp: dict, anthropic_body: dict) -> bool:
|
|
|
1066
1256
|
return has_tool_results or _last_assistant_was_text_only(anthropic_body)
|
|
1067
1257
|
|
|
1068
1258
|
|
|
1259
|
+
def _resolve_max_tokens_request(requested_max_tokens: int) -> int:
    """Apply the configured floor to a client-requested ``max_tokens``.

    The request is coerced to an int of at least 1. A PROXY_MAX_TOKENS_FLOOR
    of zero or below disables the floor; otherwise the larger of the request
    and the floor is returned.
    """
    wanted = max(1, int(requested_max_tokens))
    minimum = max(0, PROXY_MAX_TOKENS_FLOOR)
    return wanted if minimum == 0 else max(wanted, minimum)
|
|
1265
|
+
|
|
1266
|
+
|
|
1267
|
+
def _resolve_prune_target_fraction() -> float:
    """Return the configured prune target fraction, or 0.65 if it is invalid.

    A value is valid only when it lies strictly between 0 and 1; anything
    else is logged as a warning and replaced by 0.65.
    """
    fraction = PROXY_CONTEXT_PRUNE_TARGET_FRACTION
    if not (0.0 < fraction < 1.0):
        logger.warning(
            "Invalid PROXY_CONTEXT_PRUNE_TARGET_FRACTION=%s; using default 0.65",
            fraction,
        )
        return 0.65
    return fraction
|
|
1275
|
+
|
|
1276
|
+
|
|
1069
1277
|
def _sanitize_reasoning_fallback_text(reasoning_text: str) -> str:
|
|
1070
1278
|
cleaned = re.sub(r"</?think>", "", reasoning_text, flags=re.IGNORECASE)
|
|
1071
1279
|
cleaned = re.sub(r"\s+", " ", cleaned).strip()
|
|
@@ -1132,6 +1340,463 @@ def _last_assistant_was_text_only(anthropic_body: dict) -> bool:
|
|
|
1132
1340
|
return False
|
|
1133
1341
|
|
|
1134
1342
|
|
|
1343
|
+
def _extract_openai_choice(openai_resp: dict) -> tuple[dict, dict]:
    """Return ``(choice, message)`` for the first choice of an OpenAI response.

    Both elements are always dicts. A missing, empty or non-list ``choices``
    value, a non-dict first choice, or a missing or non-dict ``message`` each
    yield ``{}`` for the affected element instead of raising.
    """
    choices = openai_resp.get("choices")
    first = choices[0] if isinstance(choices, list) and choices else {}
    choice = first if isinstance(first, dict) else {}
    message = choice.get("message")
    return choice, (message if isinstance(message, dict) else {})
|
|
1347
|
+
|
|
1348
|
+
|
|
1349
|
+
def _openai_message_text(openai_resp: dict) -> str:
    """Return the first choice's message content as a string.

    Missing or ``null`` content yields "" rather than the literal string
    "None". Non-string content is converted with ``str()``.
    """
    _, message = _extract_openai_choice(openai_resp)
    content = message.get("content")
    if content is None:
        return ""
    return content if isinstance(content, str) else str(content)
|
|
1353
|
+
|
|
1354
|
+
|
|
1355
|
+
def _extract_openai_tool_calls(openai_resp: dict) -> list[dict]:
    """Return the first choice's ``tool_calls`` list.

    An empty list is returned when the field is absent, empty, or not a list.
    """
    message = _extract_openai_choice(openai_resp)[1]
    calls = message.get("tool_calls")
    if calls and isinstance(calls, list):
        return calls
    return []
|
|
1359
|
+
|
|
1360
|
+
|
|
1361
|
+
def _openai_has_tool_calls(openai_resp: dict) -> bool:
    """Report whether the response's first choice carries any tool call."""
    return len(_extract_openai_tool_calls(openai_resp)) > 0
|
|
1363
|
+
|
|
1364
|
+
|
|
1365
|
+
def _parse_openai_function_arguments(raw_args) -> tuple[dict | None, str | None]:
|
|
1366
|
+
if isinstance(raw_args, dict):
|
|
1367
|
+
return raw_args, None
|
|
1368
|
+
if isinstance(raw_args, str):
|
|
1369
|
+
try:
|
|
1370
|
+
parsed = json.loads(raw_args)
|
|
1371
|
+
except json.JSONDecodeError:
|
|
1372
|
+
return None, "invalid_json"
|
|
1373
|
+
if not isinstance(parsed, dict):
|
|
1374
|
+
return None, "arguments_not_object"
|
|
1375
|
+
return parsed, None
|
|
1376
|
+
return None, "invalid_arguments_type"
|
|
1377
|
+
|
|
1378
|
+
|
|
1379
|
+
def _schema_type_matches(value, expected_type: str) -> bool:
    """Check a decoded JSON value against a JSON Schema primitive type name.

    Recognised names are "string", "number", "integer", "boolean", "array",
    "object" and "null"; any other name is accepted without checking.
    Booleans never count as numbers, even though ``bool`` subclasses ``int``.
    As in JSON Schema, "integer" also accepts floats with a zero fractional
    part (``3.0``), so a model that serialises an integer as ``3.0`` is not
    reported as a type mismatch.
    """
    if expected_type == "string":
        return isinstance(value, str)
    if expected_type == "boolean":
        return isinstance(value, bool)
    if expected_type == "array":
        return isinstance(value, list)
    if expected_type == "object":
        return isinstance(value, dict)
    if expected_type == "null":
        return value is None
    if expected_type in ("number", "integer"):
        if isinstance(value, bool) or not isinstance(value, (int, float)):
            return False
        if expected_type == "number":
            return True
        # is_integer() is False for inf and nan, so those are rejected too.
        return isinstance(value, int) or value.is_integer()
    return True
|
|
1395
|
+
|
|
1396
|
+
|
|
1397
|
+
def _string_contains_tool_markup(value: str) -> bool:
    """Report whether ``value`` contains a tool-call tag fragment.

    Matching is case-insensitive.
    """
    haystack = value.lower()
    for marker in ("<parameter", "</parameter", "<tool_call", "<function=", "</function"):
        if marker in haystack:
            return True
    return False
|
|
1401
|
+
|
|
1402
|
+
|
|
1403
|
+
def _validate_tool_arguments_against_schema(
    args: dict, input_schema: dict
) -> tuple[bool, str]:
    """Run lightweight checks on tool arguments against the tool's input schema.

    Returns ``(True, "")`` when nothing is wrong, otherwise ``(False, reason)``
    for the first problem found. A non-dict schema passes. Checks, in order:

    * every string entry in ``required`` must be present, not null, not blank
      if it is a string, and free of tool-call markup;
    * every supplied argument whose property declares ``type`` (a name or a
      non-empty list of names) must match it, and values declared as
      "string" must be free of tool-call markup.
    """
    if not isinstance(input_schema, dict):
        return True, ""

    required_names = input_schema.get("required") or []
    if not isinstance(required_names, list):
        required_names = []
    for name in required_names:
        if not isinstance(name, str):
            continue
        if name not in args:
            return False, f"missing required field '{name}'"
        supplied = args[name]
        if supplied is None:
            return False, f"required field '{name}' is null"
        if isinstance(supplied, str):
            if not supplied.strip():
                return False, f"required field '{name}' is empty"
            if _string_contains_tool_markup(supplied):
                return False, f"required field '{name}' contains malformed tool markup"

    prop_schemas = input_schema.get("properties") or {}
    if not isinstance(prop_schemas, dict):
        prop_schemas = {}
    for key, spec in prop_schemas.items():
        if key not in args or not isinstance(spec, dict):
            continue
        supplied = args[key]
        declared = spec.get("type")
        if isinstance(declared, str):
            if not _schema_type_matches(supplied, declared):
                return False, f"type mismatch for '{key}' (expected {declared})"
            if (
                declared == "string"
                and isinstance(supplied, str)
                and _string_contains_tool_markup(supplied)
            ):
                return False, f"string field '{key}' contains malformed tool markup"
        elif isinstance(declared, list) and declared:
            # A list of type names means the value may match any one of them.
            if not any(_schema_type_matches(supplied, option) for option in declared):
                expected_str = ",".join(str(option) for option in declared)
                return (
                    False,
                    f"type mismatch for '{key}' (expected one of {expected_str})",
                )

    return True, ""
|
|
1456
|
+
|
|
1457
|
+
|
|
1458
|
+
def _tool_schema_map_from_anthropic_body(anthropic_body: dict) -> dict[str, dict]:
    """Map each declared tool name to its ``input_schema``.

    Non-dict tool entries and entries without a non-empty string name are
    skipped. A missing or non-dict schema is stored as ``{}``. When a name
    repeats, the later definition wins.
    """
    declared = anthropic_body.get("tools", []) or []
    named = (
        (tool.get("name"), tool.get("input_schema"))
        for tool in declared
        if isinstance(tool, dict)
    )
    return {
        name: (schema if isinstance(schema, dict) else {})
        for name, schema in named
        if isinstance(name, str) and name
    }
|
|
1468
|
+
|
|
1469
|
+
|
|
1470
|
+
def _invalid_tool_call_reason(openai_resp: dict, anthropic_body: dict) -> str | None:
    """Describe the first invalid tool call in the response, if any.

    Returns ``None`` when the request declared no tools, when the response
    has no tool calls, when no tool schema could be collected, or when every
    call is acceptable. Otherwise returns a short reason naming the index of
    the offending call.
    """
    if "tools" not in anthropic_body:
        return None

    calls = _extract_openai_tool_calls(openai_resp)
    if not calls:
        return None

    known_schemas = _tool_schema_map_from_anthropic_body(anthropic_body)
    if not known_schemas:
        return None

    def problem_with(idx: int, call) -> str | None:
        # Structural checks first, then tool name, argument decoding and
        # finally the schema checks.
        if not isinstance(call, dict):
            return f"tool call {idx} is not an object"
        fn = call.get("function")
        if not isinstance(fn, dict):
            return f"tool call {idx} missing function payload"

        name = fn.get("name")
        if not isinstance(name, str) or not name:
            return f"tool call {idx} missing function name"
        if name not in known_schemas:
            return f"tool call {idx} uses unknown tool '{name}'"

        args, parse_error = _parse_openai_function_arguments(fn.get("arguments", "{}"))
        if parse_error:
            return f"tool call {idx} invalid arguments ({parse_error})"
        if args is None:
            return f"tool call {idx} has empty arguments"

        valid, reason = _validate_tool_arguments_against_schema(
            args, known_schemas[name]
        )
        if not valid:
            return f"tool call {idx} failed schema validation: {reason}"
        return None

    for idx, call in enumerate(calls):
        problem = problem_with(idx, call)
        if problem is not None:
            return problem
    return None
|
|
1506
|
+
|
|
1507
|
+
|
|
1508
|
+
def _openai_has_valid_tool_calls(openai_resp: dict, anthropic_body: dict) -> bool:
    """Report whether the response has tool calls and none of them is invalid."""
    if not _openai_has_tool_calls(openai_resp):
        return False
    return _invalid_tool_call_reason(openai_resp, anthropic_body) is None
|
|
1513
|
+
|
|
1514
|
+
|
|
1515
|
+
def _looks_malformed_tool_payload(text: str) -> bool:
    """Heuristically detect tool-call or prompt debris in assistant text.

    Matching is case-insensitive. The text is flagged when any of these hold:

    * it contains a tool-call tag fragment (``<parameter``, ``</parameter``,
      ``<tool_call`` or ``<function=``);
    * it contains both structural markers (``=\\n{"description"`` and
      ``</think>``), or one of them together with a repeated
      ``{"description"``, a repeated "you must call a tool", or the ``⎿``
      glyph;
    * it contains at least two of the known policy snippets;
    * it repeats "you must call a tool" and also contains one of the policy
      phrases checked in the final return.

    Empty text is never flagged.
    """
    if not text:
        return False

    lowered = text.lower()

    # A tag fragment is decisive on its own. This also covers the former
    # trailing "</parameter plus {"description"" check, which could never be
    # reached because "</parameter" already returns here.
    tag_fragments = ("</parameter", "<parameter", "<tool_call", "<function=")
    if any(fragment in lowered for fragment in tag_fragments):
        return True

    structural_markers = ('=\n{"description"', "</think>")
    structural_hits = sum(1 for marker in structural_markers if marker in lowered)
    if structural_hits >= 2:
        return True

    repeated_must_call = lowered.count("you must call a tool") >= 2
    if structural_hits >= 1 and (
        lowered.count('{"description"') >= 2 or repeated_must_call or "⎿" in text
    ):
        return True

    policy_snippets = (
        "do not summarize the issue and stop",
        "if you have identified a problem",
        "you must call a tool to make the fix",
        "</agentic-protocol>",
    )
    if sum(1 for snippet in policy_snippets if snippet in lowered) >= 2:
        return True

    # Policy echo loop: the instruction is repeated and quoted alongside one
    # of the accompanying phrases.
    return repeated_must_call and (
        "do not summarize the issue and stop" in lowered
        or "must call a tool to make the fix" in lowered
    )
|
|
1557
|
+
|
|
1558
|
+
|
|
1559
|
+
def _is_malformed_tool_response(openai_resp: dict, anthropic_body: dict) -> bool:
    """Decide whether a response to a tool-enabled request is malformed.

    Requests without a ``tools`` key are never malformed. Otherwise the
    response is malformed when a tool call is invalid, or when it has no tool
    calls and its text looks like tool-call debris.
    """
    if "tools" not in anthropic_body:
        return False

    if _invalid_tool_call_reason(openai_resp, anthropic_body):
        return True

    return (not _openai_has_tool_calls(openai_resp)) and _looks_malformed_tool_payload(
        _openai_message_text(openai_resp)
    )
|
|
1570
|
+
|
|
1571
|
+
|
|
1572
|
+
def _build_malformed_retry_body(openai_body: dict, anthropic_body: dict) -> dict:
    """Build the request body used to retry after a malformed tool payload.

    Works on a shallow copy of ``openai_body``: streaming is turned off, a
    tool call is required, and the retry temperature is applied. When
    PROXY_MALFORMED_TOOL_RETRY_MAX_TOKENS is positive, ``max_tokens`` is
    capped at it.
    """
    retry_body = {
        **openai_body,
        "stream": False,
        "tool_choice": "required",
        "temperature": PROXY_MALFORMED_TOOL_RETRY_TEMPERATURE,
    }

    token_cap = PROXY_MALFORMED_TOOL_RETRY_MAX_TOKENS
    if token_cap > 0:
        retry_body["max_tokens"] = min(
            int(retry_body.get("max_tokens", token_cap)), token_cap
        )

    # On malformed retry, restore full tool list to avoid starving selection.
    original_tools = anthropic_body.get("tools")
    if original_tools:
        retry_body["tools"] = _convert_anthropic_tools_to_openai(original_tools)

    if PROXY_DISABLE_THINKING_ON_TOOL_TURNS:
        retry_body["enable_thinking"] = False

    return retry_body
|
|
1596
|
+
|
|
1597
|
+
|
|
1598
|
+
def _build_clean_guardrail_openai_response(openai_resp: dict) -> dict:
    """Build a text-only stand-in for a response that could not be repaired.

    ``id``, ``object``, ``created``, ``model`` and ``usage`` are copied from
    ``openai_resp`` when present and otherwise get generated or placeholder
    values. The single choice is a fixed assistant message with
    ``finish_reason`` "stop".
    """
    apology = {
        "role": "assistant",
        "content": (
            "I could not produce a valid tool-call format in this turn. "
            "Please continue; I will issue exactly one valid tool call next."
        ),
    }
    only_choice = {"index": 0, "finish_reason": "stop", "message": apology}

    fallback_id = f"chatcmpl_{uuid.uuid4().hex[:12]}"
    fallback_created = int(time.time())
    return {
        "id": openai_resp.get("id", fallback_id),
        "object": openai_resp.get("object", "chat.completion"),
        "created": openai_resp.get("created", fallback_created),
        "model": openai_resp.get("model", "unknown"),
        "choices": [only_choice],
        "usage": openai_resp.get("usage", {}),
    }
|
|
1619
|
+
|
|
1620
|
+
|
|
1621
|
+
async def _apply_unexpected_end_turn_guardrail(
    client: httpx.AsyncClient,
    openai_resp: dict,
    openai_body: dict,
    anthropic_body: dict,
    monitor: SessionMonitor,
    session_id: str,
) -> dict:
    """Retry once with a forced tool call after an unexpected end_turn.

    Does nothing unless PROXY_GUARDRAIL_RETRY is on and
    ``_is_unexpected_end_turn`` flags the response. Otherwise it counts the
    event on ``monitor`` and re-sends the request non-streamed with
    ``tool_choice="required"``. The retried response is returned only when it
    carries valid tool calls.

    The retry is best-effort: a transport error, a non-200 status or an
    undecodable body is logged and the original ``openai_resp`` is returned.
    """
    if not PROXY_GUARDRAIL_RETRY:
        return openai_resp

    if not _is_unexpected_end_turn(openai_resp, anthropic_body):
        return openai_resp

    monitor.unexpected_end_turn_count += 1
    logger.warning(
        "GUARDRAIL: unexpected end_turn without tool_use in active loop (session=%s), retrying once with tool_choice=required",
        session_id,
    )

    retry_body = dict(openai_body)
    retry_body["tool_choice"] = "required"
    retry_body["stream"] = False

    try:
        retry_resp = await client.post(
            f"{LLAMA_CPP_BASE}/chat/completions",
            json=retry_body,
            headers={"Content-Type": "application/json"},
        )
    except httpx.HTTPError as exc:
        # The original response is still usable; a failed retry must not
        # turn it into an error.
        logger.warning(
            "GUARDRAIL retry request failed (%s); keeping original response",
            exc,
        )
        return openai_resp

    if retry_resp.status_code != 200:
        logger.warning(
            "GUARDRAIL retry upstream status=%d; keeping original response",
            retry_resp.status_code,
        )
        return openai_resp

    try:
        retry_json = retry_resp.json()
    except ValueError:
        logger.warning(
            "GUARDRAIL retry returned a non-JSON body; keeping original response"
        )
        return openai_resp

    if _openai_has_valid_tool_calls(retry_json, anthropic_body):
        logger.info("GUARDRAIL: retry produced tool_use; using retried response")
        return retry_json

    invalid_reason = _invalid_tool_call_reason(retry_json, anthropic_body)
    if invalid_reason:
        logger.warning(
            "GUARDRAIL: retry produced invalid tool_call payload (%s)",
            invalid_reason,
        )
    retry_choice, _ = _extract_openai_choice(retry_json)
    logger.info(
        "GUARDRAIL: retry returned finish_reason=%s without tool_use",
        retry_choice.get("finish_reason"),
    )

    return openai_resp
|
|
1673
|
+
|
|
1674
|
+
|
|
1675
|
+
async def _apply_malformed_tool_guardrail(
    client: httpx.AsyncClient,
    openai_resp: dict,
    openai_body: dict,
    anthropic_body: dict,
    monitor: SessionMonitor,
    session_id: str,
) -> dict:
    """Retry, then replace, a response whose tool payload is malformed.

    Does nothing when PROXY_MALFORMED_TOOL_GUARDRAIL is off. A response that
    is not malformed is returned unchanged, and the malformed streak on
    ``monitor`` is reset if it carries valid tool calls.

    For a malformed response the streak is incremented and up to
    PROXY_MALFORMED_TOOL_RETRY_MAX retries are sent using
    ``_build_malformed_retry_body``. The first retry that yields valid tool
    calls, or a response that is no longer malformed, is returned and resets
    the streak. A retry that is still malformed increments the streak again.
    A retry that fails in transport, returns a non-200 status or an
    undecodable body is logged and skipped. If no retry succeeds, a fixed
    clean text response is returned.
    """
    if not PROXY_MALFORMED_TOOL_GUARDRAIL:
        return openai_resp

    if not _is_malformed_tool_response(openai_resp, anthropic_body):
        if _openai_has_valid_tool_calls(openai_resp, anthropic_body):
            monitor.malformed_tool_streak = 0
        return openai_resp

    monitor.malformed_tool_streak += 1
    invalid_reason = _invalid_tool_call_reason(openai_resp, anthropic_body)
    if invalid_reason:
        excerpt = invalid_reason[:220]
    else:
        excerpt = _openai_message_text(openai_resp)[:220].replace("\n", " ")
    logger.warning(
        "MALFORMED TOOL PAYLOAD: session=%s streak=%d excerpt=%.220s",
        session_id,
        monitor.malformed_tool_streak,
        excerpt,
    )

    attempts = max(0, PROXY_MALFORMED_TOOL_RETRY_MAX)
    for attempt in range(attempts):
        retry_body = _build_malformed_retry_body(openai_body, anthropic_body)
        try:
            retry_resp = await client.post(
                f"{LLAMA_CPP_BASE}/chat/completions",
                json=retry_body,
                headers={"Content-Type": "application/json"},
            )
        except httpx.HTTPError as exc:
            # A failed retry must not abort the guardrail: move on to the
            # next attempt or to the clean fallback below.
            logger.warning(
                "MALFORMED RETRY failed (attempt %d/%d): %s",
                attempt + 1,
                attempts,
                exc,
            )
            continue

        if retry_resp.status_code != 200:
            logger.warning(
                "MALFORMED RETRY failed (attempt %d/%d): HTTP %d",
                attempt + 1,
                attempts,
                retry_resp.status_code,
            )
            continue

        try:
            retry_json = retry_resp.json()
        except ValueError:
            logger.warning(
                "MALFORMED RETRY failed (attempt %d/%d): response body is not valid JSON",
                attempt + 1,
                attempts,
            )
            continue

        if _openai_has_valid_tool_calls(retry_json, anthropic_body):
            monitor.malformed_tool_streak = 0
            logger.info(
                "MALFORMED RETRY success: produced tool_use (attempt %d/%d)",
                attempt + 1,
                attempts,
            )
            return retry_json

        retry_invalid_reason = _invalid_tool_call_reason(retry_json, anthropic_body)
        if retry_invalid_reason:
            logger.warning(
                "MALFORMED RETRY invalid tool_call payload (attempt %d/%d): %s",
                attempt + 1,
                attempts,
                retry_invalid_reason,
            )

        if not _is_malformed_tool_response(retry_json, anthropic_body):
            monitor.malformed_tool_streak = 0
            logger.info(
                "MALFORMED RETRY produced clean text response (attempt %d/%d)",
                attempt + 1,
                attempts,
            )
            return retry_json

        # The retry itself was malformed, so it counts towards the streak.
        monitor.malformed_tool_streak += 1

    logger.error(
        "MALFORMED TOOL PAYLOAD persisted after retries (session=%s); returning clean guardrail response",
        session_id,
    )
    return _build_clean_guardrail_openai_response(openai_resp)
|
|
1756
|
+
|
|
1757
|
+
|
|
1758
|
+
def _maybe_apply_session_contamination_breaker(
    anthropic_body: dict, monitor: SessionMonitor, session_id: str
) -> dict:
    """Trim conversation history once malformed tool output keeps recurring.

    The request body is returned untouched when the breaker is disabled, when
    ``monitor.malformed_tool_streak`` is below the configured threshold, or
    when the conversation has no more than ``keep_last + 1`` messages (in that
    last case the streak is cleared as well).

    Otherwise a shallow copy of the body is returned whose ``messages`` are:
    the first original message, a synthetic user "session reset" marker, and
    the most recent ``keep_last`` messages. The monitor's reset counter is
    incremented and its streak counters are zeroed.

    Args:
        anthropic_body: Incoming Anthropic-style request body.
        monitor: Per-session monitor whose counters are read and updated.
        session_id: Session identifier, used only for logging.

    Returns:
        Either ``anthropic_body`` itself or a trimmed shallow copy of it.
    """
    if not PROXY_SESSION_CONTAMINATION_BREAKER:
        return anthropic_body

    # A threshold below 1 would trip the breaker on every request.
    if monitor.malformed_tool_streak < max(1, PROXY_SESSION_CONTAMINATION_THRESHOLD):
        return anthropic_body

    history = anthropic_body.get("messages", [])
    tail_size = max(2, PROXY_SESSION_CONTAMINATION_KEEP_LAST)
    if len(history) <= tail_size + 1:
        # Nothing lies between the first message and the kept tail, so there
        # is nothing to drop; only clear the streak.
        monitor.malformed_tool_streak = 0
        return anthropic_body

    marker = {
        "role": "user",
        "content": (
            "[SESSION RESET: previous turns contained malformed tool-call formatting "
            "artifacts. Continue from the recent context below and emit valid tool calls only.]"
        ),
    }
    trimmed_messages = history[:1] + [marker] + history[-tail_size:]
    # Shallow copy: the caller's body keeps its original message list.
    updated_body = {**anthropic_body, "messages": trimmed_messages}

    monitor.contamination_resets += 1
    monitor.malformed_tool_streak = 0
    monitor.no_progress_streak = 0
    monitor.consecutive_forced_count = 0
    logger.warning(
        "SESSION CONTAMINATION BREAKER: session=%s reset applied, kept=%d messages",
        session_id,
        len(trimmed_messages),
    )

    return updated_body
|
|
1798
|
+
|
|
1799
|
+
|
|
1135
1800
|
# ===========================================================================
|
|
1136
1801
|
# Response Translation: OpenAI -> Anthropic
|
|
1137
1802
|
# ===========================================================================
|
|
@@ -1187,6 +1852,67 @@ def openai_to_anthropic_response(openai_resp: dict, model: str) -> dict:
|
|
|
1187
1852
|
}
|
|
1188
1853
|
|
|
1189
1854
|
|
|
1855
|
+
async def stream_anthropic_message(anthropic_resp: dict):
    """Replay a finalized Anthropic message as a sequence of SSE frames.

    Frames are yielded in this order: one ``message_start``; for every
    content block a ``content_block_start``, a ``content_block_delta`` (text
    blocks only emit it when their text is non-empty) and a
    ``content_block_stop``; then ``message_delta`` and ``message_stop``.

    Args:
        anthropic_resp: Completed Anthropic-style message dict. Missing or
            ``None`` values for ``content``, ``usage``, ``stop_reason`` and a
            tool block's ``input`` fall back to an empty text block, zero
            token counts, ``"end_turn"`` and ``{}`` respectively.

    Yields:
        str: One SSE frame per event, made of an ``event:`` line, a ``data:``
        line carrying the JSON payload, and a terminating blank line.
    """

    def sse(event: str, payload: dict) -> str:
        # Single place that defines the SSE wire format for every event.
        return f"event: {event}\ndata: {json.dumps(payload)}\n\n"

    usage = anthropic_resp.get("usage") or {}

    yield sse(
        "message_start",
        {
            "type": "message_start",
            "message": {
                "id": anthropic_resp.get("id", f"msg_{uuid.uuid4().hex[:24]}"),
                "type": "message",
                "role": "assistant",
                "content": [],
                "model": anthropic_resp.get("model", "unknown"),
                "stop_reason": None,
                "stop_sequence": None,
                # The prompt size is already known for a finalized message;
                # report it here. The final output count is sent in
                # message_delta below.
                "usage": {
                    "input_tokens": usage.get("input_tokens", 0),
                    "output_tokens": 0,
                },
            },
        },
    )

    # Always emit at least one (empty) text block so the stream has content.
    blocks = anthropic_resp.get("content") or [{"type": "text", "text": ""}]
    for index, block in enumerate(blocks):
        if block.get("type", "text") == "tool_use":
            start_block = {
                "type": "tool_use",
                "id": block.get("id", f"toolu_{uuid.uuid4().hex[:12]}"),
                "name": block.get("name", ""),
                # Streaming tool_use blocks start with an empty input; the
                # arguments follow as input_json_delta.
                "input": {},
            }
            delta = {
                "type": "input_json_delta",
                "partial_json": json.dumps(
                    block.get("input") or {}, separators=(",", ":")
                ),
            }
        else:
            # Any non-tool block is replayed as text.
            text = block.get("text", "")
            start_block = {"type": "text", "text": ""}
            delta = {"type": "text_delta", "text": text} if text else None

        yield sse(
            "content_block_start",
            {
                "type": "content_block_start",
                "index": index,
                "content_block": start_block,
            },
        )
        if delta is not None:
            yield sse(
                "content_block_delta",
                {"type": "content_block_delta", "index": index, "delta": delta},
            )
        yield sse(
            "content_block_stop", {"type": "content_block_stop", "index": index}
        )

    yield sse(
        "message_delta",
        {
            "type": "message_delta",
            "delta": {
                "stop_reason": anthropic_resp.get("stop_reason") or "end_turn",
                "stop_sequence": anthropic_resp.get("stop_sequence"),
            },
            "usage": {"output_tokens": usage.get("output_tokens", 0)},
        },
    )
    yield sse("message_stop", {"type": "message_stop"})
|
|
1914
|
+
|
|
1915
|
+
|
|
1190
1916
|
# ===========================================================================
|
|
1191
1917
|
# Streaming Translation: OpenAI SSE -> Anthropic SSE
|
|
1192
1918
|
# ===========================================================================
|
|
@@ -1400,30 +2126,43 @@ async def stream_anthropic_response(
|
|
|
1400
2126
|
[a[:200] for a in tc_args],
|
|
1401
2127
|
)
|
|
1402
2128
|
|
|
1403
|
-
|
|
1404
|
-
|
|
1405
|
-
|
|
1406
|
-
|
|
1407
|
-
|
|
1408
|
-
|
|
1409
|
-
|
|
1410
|
-
"
|
|
1411
|
-
|
|
1412
|
-
|
|
1413
|
-
{
|
|
1414
|
-
"
|
|
1415
|
-
|
|
1416
|
-
"arguments": tc.get("arguments", ""),
|
|
1417
|
-
}
|
|
2129
|
+
synthetic_openai_resp = {
|
|
2130
|
+
"choices": [
|
|
2131
|
+
{
|
|
2132
|
+
"finish_reason": "stop"
|
|
2133
|
+
if finish_reason == "end_turn"
|
|
2134
|
+
else finish_reason,
|
|
2135
|
+
"message": {
|
|
2136
|
+
"content": accumulated_text,
|
|
2137
|
+
"tool_calls": [
|
|
2138
|
+
{
|
|
2139
|
+
"function": {
|
|
2140
|
+
"name": tc["name"],
|
|
2141
|
+
"arguments": tc.get("arguments", ""),
|
|
1418
2142
|
}
|
|
1419
|
-
|
|
1420
|
-
|
|
1421
|
-
|
|
1422
|
-
}
|
|
1423
|
-
|
|
1424
|
-
|
|
1425
|
-
|
|
2143
|
+
}
|
|
2144
|
+
for tc in tool_calls_by_index.values()
|
|
2145
|
+
],
|
|
2146
|
+
},
|
|
2147
|
+
}
|
|
2148
|
+
]
|
|
2149
|
+
}
|
|
2150
|
+
|
|
2151
|
+
if _is_malformed_tool_response(synthetic_openai_resp, anthropic_body):
|
|
2152
|
+
monitor.malformed_tool_streak += 1
|
|
2153
|
+
elif (
|
|
2154
|
+
"tools" in anthropic_body
|
|
2155
|
+
and not tool_calls_by_index
|
|
2156
|
+
and (
|
|
2157
|
+
finish_reason == "max_tokens"
|
|
2158
|
+
or (finish_reason == "end_turn" and len(accumulated_text) > 512)
|
|
2159
|
+
)
|
|
1426
2160
|
):
|
|
2161
|
+
monitor.malformed_tool_streak += 1
|
|
2162
|
+
elif tool_calls_by_index:
|
|
2163
|
+
monitor.malformed_tool_streak = 0
|
|
2164
|
+
|
|
2165
|
+
if _is_unexpected_end_turn(synthetic_openai_resp, anthropic_body):
|
|
1427
2166
|
monitor.unexpected_end_turn_count += 1
|
|
1428
2167
|
|
|
1429
2168
|
# message_delta with final stop reason
|
|
@@ -1460,6 +2199,8 @@ async def messages(request: Request):
|
|
|
1460
2199
|
monitor = get_session_monitor(session_id)
|
|
1461
2200
|
last_session_id = session_id
|
|
1462
2201
|
|
|
2202
|
+
body = _maybe_apply_session_contamination_breaker(body, monitor, session_id)
|
|
2203
|
+
|
|
1463
2204
|
# Debug: log request summary
|
|
1464
2205
|
n_messages = len(body.get("messages", []))
|
|
1465
2206
|
n_tools = len(body.get("tools", []))
|
|
@@ -1500,7 +2241,9 @@ async def messages(request: Request):
|
|
|
1500
2241
|
utilization * 100,
|
|
1501
2242
|
PROXY_CONTEXT_PRUNE_THRESHOLD * 100,
|
|
1502
2243
|
)
|
|
1503
|
-
body = prune_conversation(
|
|
2244
|
+
body = prune_conversation(
|
|
2245
|
+
body, ctx_window, target_fraction=_resolve_prune_target_fraction()
|
|
2246
|
+
)
|
|
1504
2247
|
monitor.prune_count += 1
|
|
1505
2248
|
# Re-estimate after pruning
|
|
1506
2249
|
estimated_tokens = estimate_total_tokens(body)
|
|
@@ -1522,6 +2265,79 @@ async def messages(request: Request):
|
|
|
1522
2265
|
media_type="application/json",
|
|
1523
2266
|
)
|
|
1524
2267
|
|
|
2268
|
+
use_guarded_non_stream = is_stream and (
|
|
2269
|
+
PROXY_FORCE_NON_STREAM
|
|
2270
|
+
or (PROXY_MALFORMED_TOOL_STREAM_STRICT and "tools" in body)
|
|
2271
|
+
)
|
|
2272
|
+
if use_guarded_non_stream:
|
|
2273
|
+
strict_body = dict(openai_body)
|
|
2274
|
+
strict_body["stream"] = False
|
|
2275
|
+
|
|
2276
|
+
strict_resp = await client.post(
|
|
2277
|
+
f"{LLAMA_CPP_BASE}/chat/completions",
|
|
2278
|
+
json=strict_body,
|
|
2279
|
+
headers={"Content-Type": "application/json"},
|
|
2280
|
+
)
|
|
2281
|
+
|
|
2282
|
+
if strict_resp.status_code != 200:
|
|
2283
|
+
error_text = strict_resp.text[:1000]
|
|
2284
|
+
logger.error(
|
|
2285
|
+
"Upstream HTTP %d (strict-stream): %s",
|
|
2286
|
+
strict_resp.status_code,
|
|
2287
|
+
error_text,
|
|
2288
|
+
)
|
|
2289
|
+
return Response(
|
|
2290
|
+
content=json.dumps(
|
|
2291
|
+
{
|
|
2292
|
+
"type": "error",
|
|
2293
|
+
"error": {
|
|
2294
|
+
"type": "overloaded_error",
|
|
2295
|
+
"message": f"Upstream error (HTTP {strict_resp.status_code}): {error_text[:500]}",
|
|
2296
|
+
},
|
|
2297
|
+
}
|
|
2298
|
+
),
|
|
2299
|
+
status_code=529,
|
|
2300
|
+
media_type="application/json",
|
|
2301
|
+
)
|
|
2302
|
+
|
|
2303
|
+
openai_resp = strict_resp.json()
|
|
2304
|
+
openai_resp = await _apply_unexpected_end_turn_guardrail(
|
|
2305
|
+
client,
|
|
2306
|
+
openai_resp,
|
|
2307
|
+
strict_body,
|
|
2308
|
+
body,
|
|
2309
|
+
monitor,
|
|
2310
|
+
session_id,
|
|
2311
|
+
)
|
|
2312
|
+
openai_resp = await _apply_malformed_tool_guardrail(
|
|
2313
|
+
client,
|
|
2314
|
+
openai_resp,
|
|
2315
|
+
strict_body,
|
|
2316
|
+
body,
|
|
2317
|
+
monitor,
|
|
2318
|
+
session_id,
|
|
2319
|
+
)
|
|
2320
|
+
|
|
2321
|
+
anthropic_resp = openai_to_anthropic_response(openai_resp, model)
|
|
2322
|
+
monitor.record_response(anthropic_resp.get("usage", {}).get("output_tokens", 0))
|
|
2323
|
+
if PROXY_FORCE_NON_STREAM:
|
|
2324
|
+
logger.info(
|
|
2325
|
+
"FORCED NON-STREAM: served stream response via guarded non-stream path"
|
|
2326
|
+
)
|
|
2327
|
+
else:
|
|
2328
|
+
logger.info(
|
|
2329
|
+
"STRICT STREAM GUARDRAIL: served stream response via guarded non-stream path"
|
|
2330
|
+
)
|
|
2331
|
+
|
|
2332
|
+
return StreamingResponse(
|
|
2333
|
+
stream_anthropic_message(anthropic_resp),
|
|
2334
|
+
media_type="text/event-stream",
|
|
2335
|
+
headers={
|
|
2336
|
+
"Cache-Control": "no-cache",
|
|
2337
|
+
"Connection": "keep-alive",
|
|
2338
|
+
},
|
|
2339
|
+
)
|
|
2340
|
+
|
|
1525
2341
|
if is_stream:
|
|
1526
2342
|
openai_body["stream"] = True
|
|
1527
2343
|
|
|
@@ -1711,32 +2527,39 @@ async def messages(request: Request):
|
|
|
1711
2527
|
)
|
|
1712
2528
|
|
|
1713
2529
|
openai_resp = resp.json()
|
|
2530
|
+
openai_resp = await _apply_unexpected_end_turn_guardrail(
|
|
2531
|
+
client,
|
|
2532
|
+
openai_resp,
|
|
2533
|
+
openai_body,
|
|
2534
|
+
body,
|
|
2535
|
+
monitor,
|
|
2536
|
+
session_id,
|
|
2537
|
+
)
|
|
2538
|
+
openai_resp = await _apply_malformed_tool_guardrail(
|
|
2539
|
+
client,
|
|
2540
|
+
openai_resp,
|
|
2541
|
+
openai_body,
|
|
2542
|
+
body,
|
|
2543
|
+
monitor,
|
|
2544
|
+
session_id,
|
|
2545
|
+
)
|
|
1714
2546
|
|
|
1715
|
-
|
|
1716
|
-
|
|
1717
|
-
|
|
1718
|
-
|
|
1719
|
-
|
|
1720
|
-
|
|
1721
|
-
|
|
1722
|
-
|
|
1723
|
-
|
|
1724
|
-
|
|
1725
|
-
|
|
1726
|
-
retry_resp = await client.post(
|
|
1727
|
-
f"{LLAMA_CPP_BASE}/chat/completions",
|
|
1728
|
-
json=retry_body,
|
|
1729
|
-
headers={"Content-Type": "application/json"},
|
|
2547
|
+
choice, _ = _extract_openai_choice(openai_resp)
|
|
2548
|
+
finish_reason = choice.get("finish_reason", "")
|
|
2549
|
+
if (
|
|
2550
|
+
"tools" in body
|
|
2551
|
+
and not _openai_has_tool_calls(openai_resp)
|
|
2552
|
+
and (
|
|
2553
|
+
finish_reason in {"length", "max_tokens"}
|
|
2554
|
+
or (
|
|
2555
|
+
finish_reason in {"stop", "end_turn"}
|
|
2556
|
+
and len(_openai_message_text(openai_resp)) > 512
|
|
2557
|
+
)
|
|
1730
2558
|
)
|
|
1731
|
-
|
|
1732
|
-
|
|
1733
|
-
|
|
1734
|
-
|
|
1735
|
-
if retry_message.get("tool_calls"):
|
|
1736
|
-
openai_resp = retry_json
|
|
1737
|
-
logger.info(
|
|
1738
|
-
"GUARDRAIL: retry produced tool_use; using retried response"
|
|
1739
|
-
)
|
|
2559
|
+
):
|
|
2560
|
+
monitor.malformed_tool_streak += 1
|
|
2561
|
+
elif _openai_has_tool_calls(openai_resp):
|
|
2562
|
+
monitor.malformed_tool_streak = 0
|
|
1740
2563
|
|
|
1741
2564
|
anthropic_resp = openai_to_anthropic_response(openai_resp, model)
|
|
1742
2565
|
|
|
@@ -1826,6 +2649,8 @@ async def context_status(request: Request):
|
|
|
1826
2649
|
"no_progress_streak": monitor.no_progress_streak,
|
|
1827
2650
|
"loop_warnings_emitted": monitor.loop_warnings_emitted,
|
|
1828
2651
|
"unexpected_end_turn_count": monitor.unexpected_end_turn_count,
|
|
2652
|
+
"malformed_tool_streak": monitor.malformed_tool_streak,
|
|
2653
|
+
"contamination_resets": monitor.contamination_resets,
|
|
1829
2654
|
"tool_call_history_len": len(monitor.tool_call_history),
|
|
1830
2655
|
"is_looping": monitor.detect_tool_loop(window=PROXY_LOOP_WINDOW)[0],
|
|
1831
2656
|
"loop_repeat_count": monitor.detect_tool_loop(window=PROXY_LOOP_WINDOW)[1],
|