@miller-tech/uap 1.20.32 → 1.20.34
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/config/model-profiles/qwen35.json +6 -5
- package/dist/.tsbuildinfo +1 -1
- package/dist/bin/cli.js +6 -1
- package/dist/bin/cli.js.map +1 -1
- package/dist/cli/hooks.js +30 -7
- package/dist/cli/hooks.js.map +1 -1
- package/dist/cli/policy.d.ts.map +1 -1
- package/dist/cli/policy.js +26 -0
- package/dist/cli/policy.js.map +1 -1
- package/dist/dashboard/data-seeder.d.ts.map +1 -1
- package/dist/dashboard/data-seeder.js +72 -3
- package/dist/dashboard/data-seeder.js.map +1 -1
- package/dist/dashboard/data-service.js +1 -1
- package/dist/dashboard/data-service.js.map +1 -1
- package/dist/dashboard/server.js +1 -1
- package/dist/dashboard/server.js.map +1 -1
- package/dist/index.d.ts +15 -1
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +14 -0
- package/dist/index.js.map +1 -1
- package/dist/types/index.d.ts +20 -0
- package/dist/types/index.d.ts.map +1 -1
- package/dist/types/index.js +20 -0
- package/dist/types/index.js.map +1 -1
- package/docs/AGENTS.md +423 -0
- package/docs/AGENTS.md</path>CLAUDE.md</path>/home/cogtek/dev/miller-tech/universal-agent-protocol/docs/INDEX.md</path>/home/cogtek/dev/miller-tech/universal-agent-protocol/docs/reference/API_REFERENCE.md</path>/home/cogtek/dev/miller-tech/universal-agent-protocol/docs/reference/UAP_CLI_REFERENCE.md</path>src/index.ts</path>/src/cli/worktree.ts</path>/src/coordination/deploy-batcher.ts</path>/src/policies/policy-gate.ts</path>/src/memory/model-router.ts</path>/src/memory/embeddings.ts</path>/src/models/types.ts</path>/src/types/coordination.ts</path>/src/utils/logger.ts</path>/src/utils/config-loader.ts</path>/src/utils/performance-monitor.ts</path>/src/utils/concurrency.ts</path>/src/utils/concurrency-pool.ts</path>/src/utils/string-similarity.ts</path>/src/utils/rate-limiter.ts</path>/src/utils/system-resources.ts</path>/src/utils/adaptive-cache.ts</path>/src/utils/lazy-imports.ts</path>/src/utils/merge-claude-md.ts</path>/src/utils/stopwords.ts</path>/src/utils/config-loader.ts</path>/src/utils/performance-monitor.ts</path>/src/utils/concurrency.ts</path>/src/utils/concurrency-pool.ts</path>/src/utils/string-similarity.ts</path>/src/utils/rate-limiter.ts</path>/src/utils/system-resources.ts</path>/src/utils/adaptive-cache.ts</path>/src/utils/lazy-imports.ts</path>/src/utils/merge-claude-md.ts</path>/src/utils/stopwords.ts</path> +433 -0
- package/docs/DOCUMENTATION_AUDIT_REPORT.md +131 -0
- package/docs/GETTING_STARTED.md +288 -0
- package/docs/INDEX.md +272 -42
- package/docs/PROJECT_ANALYSIS_REPORT.md +510 -0
- package/docs/architecture/SYSTEM_ANALYSIS.md +220 -1003
- package/docs/blog/local-coding-agents.md +266 -0
- package/docs/blog/x-thread.md +254 -0
- package/docs/deployment/DEPLOY_BATCHER_ANALYSIS.md +15 -647
- package/docs/getting-started/OVERVIEW.md +10 -30
- package/docs/getting-started/SETUP.md +183 -9
- package/docs/pr/UPSTREAM_PRS.md +424 -0
- package/docs/reference/CONFIGURATION.md +208 -0
- package/docs/reference/DATABASE_SCHEMA.md +344 -0
- package/docs/reference/PATTERN_LIBRARY.md +636 -0
- package/package.json +1 -1
- package/templates/hooks/uap-policy-gate.sh +36 -0
- package/tools/agents/claude_local_agent.py +92 -0
- package/tools/agents/opencode_uap_agent.py +3 -0
- package/tools/agents/scripts/anthropic_proxy.py +654 -20
- package/tools/agents/uap_agent.py +1 -1
|
@@ -166,6 +166,12 @@ PROXY_TOOL_STATE_FINALIZE_THRESHOLD = int(
|
|
|
166
166
|
PROXY_TOOL_STATE_REVIEW_CYCLE_LIMIT = int(
|
|
167
167
|
os.environ.get("PROXY_TOOL_STATE_REVIEW_CYCLE_LIMIT", "3")
|
|
168
168
|
)
|
|
169
|
+
# Force finalize after N consecutive forced_budget_exhausted events where
|
|
170
|
+
# neither cycling nor stagnation was detected — catches "distinct but
|
|
171
|
+
# unproductive" tool spam that defeats per-tool cycle detection.
|
|
172
|
+
PROXY_UNPRODUCTIVE_EXHAUSTION_LIMIT = int(
|
|
173
|
+
os.environ.get("PROXY_UNPRODUCTIVE_EXHAUSTION_LIMIT", "2")
|
|
174
|
+
)
|
|
169
175
|
PROXY_COMPLETION_RECOVERY_MAX = int(
|
|
170
176
|
os.environ.get("PROXY_COMPLETION_RECOVERY_MAX", "3")
|
|
171
177
|
)
|
|
@@ -205,6 +211,13 @@ PROXY_SESSION_TTL_SECS = int(os.environ.get("PROXY_SESSION_TTL_SECS", "7200"))
|
|
|
205
211
|
PROXY_FINALIZE_CONTINUATION_MAX = int(
|
|
206
212
|
os.environ.get("PROXY_FINALIZE_CONTINUATION_MAX", "3")
|
|
207
213
|
)
|
|
214
|
+
# Session-level cap: after N total finalize continuations in a session (even
|
|
215
|
+
# across "fresh user text" state resets), stop injecting synthetic tools and
|
|
216
|
+
# let the response terminate naturally. Catches runaway loops that dodge the
|
|
217
|
+
# per-cycle cap by triggering state resets.
|
|
218
|
+
PROXY_FINALIZE_SESSION_HARD_CAP = int(
|
|
219
|
+
os.environ.get("PROXY_FINALIZE_SESSION_HARD_CAP", "3")
|
|
220
|
+
)
|
|
208
221
|
PROXY_STREAM_REASONING_FALLBACK = (
|
|
209
222
|
os.environ.get("PROXY_STREAM_REASONING_FALLBACK", "off").strip().lower()
|
|
210
223
|
)
|
|
@@ -234,6 +247,14 @@ PROXY_DISABLE_THINKING_ON_TOOL_TURNS = os.environ.get(
|
|
|
234
247
|
"off",
|
|
235
248
|
"no",
|
|
236
249
|
}
|
|
250
|
+
PROXY_DISABLE_SPEC_ON_TOOL_TURNS = os.environ.get(
|
|
251
|
+
"PROXY_DISABLE_SPEC_ON_TOOL_TURNS", "off"
|
|
252
|
+
).lower() not in {
|
|
253
|
+
"0",
|
|
254
|
+
"false",
|
|
255
|
+
"off",
|
|
256
|
+
"no",
|
|
257
|
+
}
|
|
237
258
|
PROXY_MALFORMED_TOOL_GUARDRAIL = os.environ.get(
|
|
238
259
|
"PROXY_MALFORMED_TOOL_GUARDRAIL", "on"
|
|
239
260
|
).lower() not in {
|
|
@@ -654,6 +675,7 @@ class SessionMonitor:
|
|
|
654
675
|
tool_state_stagnation_streak: int = 0
|
|
655
676
|
tool_state_transitions: int = 0
|
|
656
677
|
tool_state_review_cycles: int = 0
|
|
678
|
+
tool_state_unproductive_exhaustion_streak: int = 0
|
|
657
679
|
last_tool_fingerprint: str = ""
|
|
658
680
|
cycling_tool_names: list = field(default_factory=list)
|
|
659
681
|
session_banned_tools: set = field(default_factory=set) # tools banned for entire session after repeated cycling
|
|
@@ -661,6 +683,7 @@ class SessionMonitor:
|
|
|
661
683
|
last_response_garbled: bool = False # previous turn had garbled/malformed output
|
|
662
684
|
finalize_turn_active: bool = False
|
|
663
685
|
finalize_continuation_count: int = 0
|
|
686
|
+
finalize_hard_stop_count: int = 0 # monotonic, not reset by fresh user text
|
|
664
687
|
finalize_synthetic_tool_id: str = ""
|
|
665
688
|
completion_required: bool = False
|
|
666
689
|
completion_pending: bool = False
|
|
@@ -898,6 +921,7 @@ class SessionMonitor:
|
|
|
898
921
|
self.tool_state_auto_budget_remaining = 0
|
|
899
922
|
self.tool_state_stagnation_streak = 0
|
|
900
923
|
self.tool_state_review_cycles = 0
|
|
924
|
+
self.tool_state_unproductive_exhaustion_streak = 0
|
|
901
925
|
self.cycling_tool_names = []
|
|
902
926
|
self.last_tool_fingerprint = ""
|
|
903
927
|
self.reset_tool_targets()
|
|
@@ -906,7 +930,10 @@ class SessionMonitor:
|
|
|
906
930
|
self.completion_required = _should_enforce_completion_contract(anthropic_body)
|
|
907
931
|
self.completion_progress_signals = _count_completion_progress_signals(anthropic_body)
|
|
908
932
|
blockers = _completion_blockers(
|
|
909
|
-
anthropic_body,
|
|
933
|
+
anthropic_body,
|
|
934
|
+
has_tool_results,
|
|
935
|
+
phase=self.tool_turn_phase,
|
|
936
|
+
finalize_fired=(self.finalize_hard_stop_count > 0),
|
|
910
937
|
)
|
|
911
938
|
self.completion_blockers = blockers
|
|
912
939
|
self.completion_pending = self.completion_required and bool(blockers)
|
|
@@ -1046,6 +1073,8 @@ class SessionMonitor:
|
|
|
1046
1073
|
session_monitors: dict[str, SessionMonitor] = {}
|
|
1047
1074
|
default_context_window = 0
|
|
1048
1075
|
last_session_id = ""
|
|
1076
|
+
_last_ctx_recheck_ts: float = 0.0
|
|
1077
|
+
_CTX_RECHECK_INTERVAL: float = 60.0 # Re-detect context window every 60s
|
|
1049
1078
|
|
|
1050
1079
|
|
|
1051
1080
|
def _cleanup_stale_monitors(now_ts: float) -> None:
|
|
@@ -1058,6 +1087,39 @@ def _cleanup_stale_monitors(now_ts: float) -> None:
|
|
|
1058
1087
|
session_monitors.pop(sid, None)
|
|
1059
1088
|
|
|
1060
1089
|
|
|
1090
|
+
async def _maybe_recheck_context_window() -> None:
    """Re-query the upstream server's context window at most once per interval.

    Handles an upstream server that was restarted with a different
    ``--ctx-size`` mid-session: when the first slot reported by the ``/slots``
    endpoint carries a positive ``n_ctx`` that differs from
    ``default_context_window``, the module default and every live session
    monitor are updated and a warning is logged.

    Rate-limited: returns immediately when fewer than ``_CTX_RECHECK_INTERVAL``
    seconds have passed since the previous attempt. When a probe does run it
    awaits one HTTP GET with a 2 second timeout. The probe is best-effort;
    failures are logged at debug level and never raised to the caller.
    """
    global default_context_window, _last_ctx_recheck_ts

    now = time.time()
    if now - _last_ctx_recheck_ts < _CTX_RECHECK_INTERVAL:
        return
    # Stamp the attempt before doing any work so a failing or slow probe is
    # only retried after the next interval, not on every request.
    _last_ctx_recheck_ts = now
    if http_client is None:
        return

    slots_url = LLAMA_CPP_BASE.replace("/v1", "/slots")
    try:
        resp = await http_client.get(slots_url, timeout=2.0)
        if resp.status_code != 200:
            return
        slots = resp.json()
    except Exception as exc:
        # Non-critical: keep the current value and retry next interval, but
        # leave a trace instead of swallowing the error silently.
        logger.debug("Context window recheck failed (%s): %s", slots_url, exc)
        return

    # Validate the payload shape explicitly rather than relying on a blanket
    # exception handler to absorb AttributeError/TypeError.
    if not slots or not isinstance(slots, list) or not isinstance(slots[0], dict):
        return
    n_ctx = slots[0].get("n_ctx", 0)
    if isinstance(n_ctx, bool) or not isinstance(n_ctx, int) or n_ctx <= 0:
        return
    if n_ctx == default_context_window:
        return

    old = default_context_window
    default_context_window = n_ctx
    for mon in session_monitors.values():
        mon.context_window = n_ctx
    logger.warning(
        "Context window changed: %d → %d (upstream server restarted?)",
        old, n_ctx,
    )
|
|
1121
|
+
|
|
1122
|
+
|
|
1061
1123
|
def get_session_monitor(session_id: str) -> SessionMonitor:
|
|
1062
1124
|
now_ts = time.time()
|
|
1063
1125
|
_cleanup_stale_monitors(now_ts)
|
|
@@ -1852,6 +1914,9 @@ else:
|
|
|
1852
1914
|
|
|
1853
1915
|
|
|
1854
1916
|
def _content_fingerprint(content) -> str:
|
|
1917
|
+
"""Return a STABLE fingerprint for content. Must not include volatile
|
|
1918
|
+
identifiers (tool_use_ids change per-turn), otherwise session stickiness
|
|
1919
|
+
breaks in agentic loops with stateful guardrails."""
|
|
1855
1920
|
if isinstance(content, str):
|
|
1856
1921
|
return content[:512]
|
|
1857
1922
|
if isinstance(content, list):
|
|
@@ -1866,7 +1931,10 @@ def _content_fingerprint(content) -> str:
|
|
|
1866
1931
|
elif btype == "tool_use":
|
|
1867
1932
|
parts.append(f"tool:{block.get('name', '')}")
|
|
1868
1933
|
elif btype == "tool_result":
|
|
1869
|
-
|
|
1934
|
+
# Stable: use tool name + first 64 chars of content, not tool_use_id
|
|
1935
|
+
inner = block.get("content", "")
|
|
1936
|
+
inner_text = _extract_text(inner) if not isinstance(inner, str) else inner
|
|
1937
|
+
parts.append(f"result:{inner_text[:64]}")
|
|
1870
1938
|
return "\n".join(parts)[:1024]
|
|
1871
1939
|
return str(content)[:512]
|
|
1872
1940
|
|
|
@@ -1893,14 +1961,26 @@ def resolve_session_id(request: Request, anthropic_body: dict) -> str:
|
|
|
1893
1961
|
first_user = ""
|
|
1894
1962
|
for msg in anthropic_body.get("messages", []):
|
|
1895
1963
|
if msg.get("role") == "user":
|
|
1896
|
-
|
|
1964
|
+
# Only hash TEXT content of first user message, not tool_result blocks
|
|
1965
|
+
# (which may appear in /anthropic/v1/messages passthrough scenarios)
|
|
1966
|
+
content = msg.get("content", "")
|
|
1967
|
+
if isinstance(content, str):
|
|
1968
|
+
first_user = content[:512]
|
|
1969
|
+
elif isinstance(content, list):
|
|
1970
|
+
text_parts = [
|
|
1971
|
+
b.get("text", "") for b in content
|
|
1972
|
+
if isinstance(b, dict) and b.get("type") == "text"
|
|
1973
|
+
]
|
|
1974
|
+
first_user = "\n".join(text_parts)[:512]
|
|
1897
1975
|
break
|
|
1898
1976
|
|
|
1899
|
-
|
|
1977
|
+
# Deliberately exclude `system` from fingerprint — clients often inject
|
|
1978
|
+
# volatile context (timestamps, cwd, session markers) into system prompts
|
|
1979
|
+
# which would break session stickiness for ongoing conversations.
|
|
1900
1980
|
model = anthropic_body.get("model", "default")
|
|
1901
1981
|
remote = request.client.host if request.client else "unknown"
|
|
1902
1982
|
digest = hashlib.sha256(
|
|
1903
|
-
f"{remote}|{model}|{
|
|
1983
|
+
f"{remote}|{model}|{first_user}".encode(
|
|
1904
1984
|
"utf-8", errors="ignore"
|
|
1905
1985
|
)
|
|
1906
1986
|
).hexdigest()[:20]
|
|
@@ -1965,7 +2045,10 @@ def _should_enforce_completion_contract(anthropic_body: dict) -> bool:
|
|
|
1965
2045
|
|
|
1966
2046
|
|
|
1967
2047
|
def _completion_blockers(
|
|
1968
|
-
anthropic_body: dict,
|
|
2048
|
+
anthropic_body: dict,
|
|
2049
|
+
has_tool_results: bool,
|
|
2050
|
+
phase: str = "",
|
|
2051
|
+
finalize_fired: bool = False,
|
|
1969
2052
|
) -> list[str]:
|
|
1970
2053
|
blockers: list[str] = []
|
|
1971
2054
|
progress = _count_completion_progress_signals(anthropic_body)
|
|
@@ -1977,9 +2060,12 @@ def _completion_blockers(
|
|
|
1977
2060
|
if last_user_has_result:
|
|
1978
2061
|
blockers.append("awaiting_post_tool_followup")
|
|
1979
2062
|
elif _last_assistant_was_text_only(anthropic_body):
|
|
1980
|
-
#
|
|
1981
|
-
#
|
|
1982
|
-
|
|
2063
|
+
# Suppress in two cases:
|
|
2064
|
+
# 1. Currently in finalize phase — text-only is expected
|
|
2065
|
+
# 2. A finalize fired earlier this session — means the state machine
|
|
2066
|
+
# already wrapped up the loop, don't re-trigger it (was causing
|
|
2067
|
+
# finalize -> review -> cycle -> finalize -> review... infinite loop)
|
|
2068
|
+
if phase != "finalize" and not finalize_fired:
|
|
1983
2069
|
blockers.append("text_only_after_tool_results")
|
|
1984
2070
|
|
|
1985
2071
|
return blockers
|
|
@@ -2020,6 +2106,212 @@ def _sanitize_tool_schema_for_llama(schema):
|
|
|
2020
2106
|
return _walk(schema), removed
|
|
2021
2107
|
|
|
2022
2108
|
|
|
2109
|
+
def _openai_text_parts(content) -> list[str]:
    """Return the text fragments of an OpenAI ``content`` list, in order."""
    parts: list[str] = []
    for block in content:
        if isinstance(block, dict) and block.get("type") == "text":
            parts.append(block.get("text", ""))
        elif isinstance(block, str):
            parts.append(block)
    return parts


def _openai_tool_call_to_tool_use(tool_call: dict) -> dict:
    """Convert one OpenAI ``tool_calls`` entry into an Anthropic ``tool_use`` block."""
    fn = tool_call.get("function") or {}
    if not isinstance(fn, dict):
        fn = {}

    raw_args = fn.get("arguments")
    if isinstance(raw_args, dict):
        # Some clients send already-decoded arguments; keep them as-is.
        args = raw_args
    else:
        try:
            args = json.loads(raw_args or "{}")
        except (ValueError, TypeError):
            args = {}
        # Anthropic ``input`` must be an object; "null", "[...]" etc. are not.
        if not isinstance(args, dict):
            args = {}

    return {
        "type": "tool_use",
        # ``or`` (not a .get default) so a null/empty id is also replaced.
        "id": tool_call.get("id") or f"toolu_{uuid.uuid4().hex[:12]}",
        "name": fn.get("name") or "",
        "input": args,
    }


def _openai_max_tokens(openai_body: dict, default: int = 4096) -> int:
    """Read the token limit from ``max_tokens`` or ``max_completion_tokens``.

    Falls back to *default* when the value is missing, zero, or not numeric.
    """
    raw = openai_body.get("max_tokens")
    if raw is None:
        raw = openai_body.get("max_completion_tokens")
    try:
        value = int(raw) if raw is not None else 0
    except (TypeError, ValueError, OverflowError):
        value = 0
    return value or default


def openai_to_anthropic_request(openai_body: dict) -> dict:
    """Convert an OpenAI Chat Completions request to an Anthropic Messages request.

    Inverse of anthropic_to_openai_messages. Used by /v1/chat/completions passthrough
    to let OpenAI-shaped clients (Forge, etc.) benefit from the Anthropic-path
    guardrails (loop detection, tool narrowing, cycle breaking, etc.).

    Mapping performed:
      * ``system`` messages are concatenated into the top-level ``system`` string.
      * ``tool`` messages become ``user`` messages holding one ``tool_result`` block.
      * ``assistant`` messages become text blocks followed by ``tool_use`` blocks.
      * Any other role is treated as ``user``; only text content is carried over.
      * ``temperature``/``top_p``/``top_k``/``stream`` are copied and ``stop`` is
        renamed to ``stop_sequences`` (a single string is wrapped in a list);
        keys whose value is ``None`` are omitted.
      * OpenAI function tools and ``tool_choice`` are translated to their
        Anthropic equivalents; ``tool_choice="none"`` removes the tools.

    Args:
        openai_body: Decoded JSON body of a Chat Completions request.

    Returns:
        A dict shaped like an Anthropic Messages request. ``model`` defaults to
        ``"default"`` and ``max_tokens`` to 4096 when not supplied.
    """
    anthropic_messages: list[dict] = []
    system_text_parts: list[str] = []

    for msg in openai_body.get("messages") or []:
        if not isinstance(msg, dict):
            continue
        role = msg.get("role", "")
        content = msg.get("content")

        if role == "system":
            if isinstance(content, str):
                system_text_parts.append(content)
            elif isinstance(content, list):
                system_text_parts.extend(_openai_text_parts(content))
            continue

        if role == "tool":
            # OpenAI tool response -> Anthropic user message with tool_result block
            if isinstance(content, str):
                tool_text = content
            elif content is None:
                tool_text = ""
            else:
                tool_text = _extract_text(content)
            anthropic_messages.append(
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "tool_result",
                            "tool_use_id": msg.get("tool_call_id", ""),
                            "content": tool_text,
                        }
                    ],
                }
            )
            continue

        if role == "assistant":
            blocks: list[dict] = []
            if isinstance(content, str) and content:
                blocks.append({"type": "text", "text": content})
            elif isinstance(content, list):
                blocks.extend(
                    {"type": "text", "text": text}
                    for text in _openai_text_parts(content)
                )
            blocks.extend(
                _openai_tool_call_to_tool_use(tc)
                for tc in msg.get("tool_calls") or []
                if isinstance(tc, dict)
            )
            anthropic_messages.append(
                {"role": "assistant", "content": blocks if blocks else ""}
            )
            continue

        # role == "user" (or unknown -> treat as user)
        if isinstance(content, str):
            anthropic_messages.append({"role": "user", "content": content})
        elif isinstance(content, list):
            blocks = [
                {"type": "text", "text": text}
                for text in _openai_text_parts(content)
            ]
            anthropic_messages.append(
                {"role": "user", "content": blocks if blocks else ""}
            )
        else:
            anthropic_messages.append({"role": "user", "content": ""})

    anthropic_body: dict = {
        "model": openai_body.get("model", "default"),
        "messages": anthropic_messages,
        "max_tokens": _openai_max_tokens(openai_body),
    }
    if system_text_parts:
        anthropic_body["system"] = "\n\n".join(p for p in system_text_parts if p)

    for key_o, key_a in (
        ("temperature", "temperature"),
        ("top_p", "top_p"),
        ("top_k", "top_k"),
        ("stop", "stop_sequences"),
        ("stream", "stream"),
    ):
        val = openai_body.get(key_o)
        if val is None:
            # Absent and explicit-null both mean "use the default".
            continue
        if key_a == "stop_sequences" and isinstance(val, str):
            val = [val]
        anthropic_body[key_a] = val

    # Convert OpenAI tools -> Anthropic tools
    anthropic_tools = []
    for tool in openai_body.get("tools") or []:
        fn = (tool.get("function") or {}) if isinstance(tool, dict) else {}
        if not isinstance(fn, dict) or not fn.get("name"):
            continue
        anthropic_tools.append(
            {
                "name": fn["name"],
                "description": fn.get("description") or "",
                "input_schema": fn.get("parameters")
                or {"type": "object", "properties": {}},
            }
        )
    if anthropic_tools:
        anthropic_body["tools"] = anthropic_tools

    tool_choice = openai_body.get("tool_choice")
    if tool_choice == "none":
        anthropic_body.pop("tools", None)
    elif tool_choice == "required":
        anthropic_body["tool_choice"] = {"type": "any"}
    elif isinstance(tool_choice, dict) and tool_choice.get("type") == "function":
        anthropic_body["tool_choice"] = {
            "type": "tool",
            "name": (tool_choice.get("function") or {}).get("name", ""),
        }

    return anthropic_body
|
|
2250
|
+
|
|
2251
|
+
|
|
2252
|
+
def anthropic_to_openai_response(anthropic_resp: dict) -> dict:
    """Convert an Anthropic Messages response to OpenAI Chat Completions format.

    Text blocks are concatenated into ``message.content`` (``None`` when there
    is no text), ``tool_use`` blocks become ``message.tool_calls`` with their
    input serialized as a JSON string, ``stop_reason`` is mapped onto
    ``finish_reason`` and token usage is renamed to the OpenAI field names.

    Args:
        anthropic_resp: Decoded Anthropic Messages response body.

    Returns:
        A ``chat.completion`` dict with a single choice at index 0.
    """
    finish_map = {
        "end_turn": "stop",
        "stop_sequence": "stop",
        "max_tokens": "length",
        "tool_use": "tool_calls",
    }

    content_blocks = anthropic_resp.get("content") or []
    if isinstance(content_blocks, str):
        # Plain-string content is equivalent to a single text block.
        content_blocks = [{"type": "text", "text": content_blocks}]

    text_parts: list[str] = []
    tool_calls: list[dict] = []
    for block in content_blocks:
        if not isinstance(block, dict):
            continue
        btype = block.get("type")
        if btype == "text":
            text_parts.append(block.get("text") or "")
        elif btype == "tool_use":
            tool_calls.append(
                {
                    # ``or`` so a null/empty id is replaced as well as a missing one.
                    "id": block.get("id") or f"call_{uuid.uuid4().hex[:12]}",
                    "type": "function",
                    "function": {
                        "name": block.get("name") or "",
                        "arguments": json.dumps(block.get("input", {}) or {}),
                    },
                }
            )

    finish_reason = finish_map.get(anthropic_resp.get("stop_reason", "end_turn"), "stop")
    if tool_calls and finish_reason == "stop":
        # OpenAI clients key off finish_reason to decide whether to execute
        # tool calls; never report a plain stop when calls are present.
        finish_reason = "tool_calls"

    message: dict = {
        "role": "assistant",
        "content": "".join(text_parts) if text_parts else None,
    }
    if tool_calls:
        message["tool_calls"] = tool_calls

    usage = anthropic_resp.get("usage") or {}
    # Token counts may be present but null; treat those as zero so the sum
    # below cannot raise.
    prompt_tokens = usage.get("input_tokens") or 0
    completion_tokens = usage.get("output_tokens") or 0

    return {
        "id": anthropic_resp.get("id") or f"chatcmpl-{uuid.uuid4().hex[:12]}",
        "object": "chat.completion",
        "created": int(time.time()),
        "model": anthropic_resp.get("model") or "unknown",
        "choices": [
            {
                "index": 0,
                "message": message,
                "finish_reason": finish_reason,
            }
        ],
        "usage": {
            "prompt_tokens": prompt_tokens,
            "completion_tokens": completion_tokens,
            "total_tokens": prompt_tokens + completion_tokens,
        },
    }
|
|
2313
|
+
|
|
2314
|
+
|
|
2023
2315
|
def _convert_anthropic_tools_to_openai(anthropic_tools: list[dict]) -> list[dict]:
|
|
2024
2316
|
converted = []
|
|
2025
2317
|
removed_pattern_fields = 0
|
|
@@ -2313,11 +2605,27 @@ def _resolve_state_machine_tool_choice(
|
|
|
2313
2605
|
|
|
2314
2606
|
if monitor.tool_state_forced_budget_remaining <= 0:
|
|
2315
2607
|
monitor.set_tool_turn_phase("review", reason="forced_budget_exhausted")
|
|
2316
|
-
# Only count toward review cycle limit if there was an actual
|
|
2317
|
-
# cycle/stagnation detected. Budget exhaustion alone means the
|
|
2318
|
-
# model is working — it just used all its turns — not cycling.
|
|
2319
2608
|
if cycle_looping or stagnating:
|
|
2320
2609
|
monitor.tool_state_review_cycles += 1
|
|
2610
|
+
monitor.tool_state_unproductive_exhaustion_streak = 0
|
|
2611
|
+
else:
|
|
2612
|
+
# Track consecutive unproductive exhaustions. Even without a
|
|
2613
|
+
# detected cycle, if the model burns through the forced budget
|
|
2614
|
+
# repeatedly with distinct-but-useless tool calls, treat it as
|
|
2615
|
+
# a loop and force finalize. Catches the 35B-A3B failure mode
|
|
2616
|
+
# where different short tool calls defeat per-tool cycle
|
|
2617
|
+
# detection.
|
|
2618
|
+
monitor.tool_state_unproductive_exhaustion_streak += 1
|
|
2619
|
+
if monitor.tool_state_unproductive_exhaustion_streak >= PROXY_UNPRODUCTIVE_EXHAUSTION_LIMIT:
|
|
2620
|
+
logger.warning(
|
|
2621
|
+
"TOOL STATE MACHINE: %d consecutive unproductive budget exhaustions — forcing finalize",
|
|
2622
|
+
monitor.tool_state_unproductive_exhaustion_streak,
|
|
2623
|
+
)
|
|
2624
|
+
monitor.set_tool_turn_phase("finalize", reason="unproductive_exhaustion")
|
|
2625
|
+
monitor.tool_state_unproductive_exhaustion_streak = 0
|
|
2626
|
+
monitor.tool_state_forced_budget_remaining = 0
|
|
2627
|
+
monitor.tool_state_auto_budget_remaining = 0
|
|
2628
|
+
return "finalize", "unproductive_exhaustion"
|
|
2321
2629
|
monitor.tool_state_auto_budget_remaining = max(
|
|
2322
2630
|
1, PROXY_TOOL_STATE_AUTO_BUDGET
|
|
2323
2631
|
)
|
|
@@ -2325,10 +2633,11 @@ def _resolve_state_machine_tool_choice(
|
|
|
2325
2633
|
1, PROXY_TOOL_STATE_FORCED_BUDGET // 2
|
|
2326
2634
|
)
|
|
2327
2635
|
logger.warning(
|
|
2328
|
-
"TOOL STATE MACHINE: forced budget exhausted, entering review (cycles=%d cycling=%s stagnating=%s)",
|
|
2636
|
+
"TOOL STATE MACHINE: forced budget exhausted, entering review (cycles=%d cycling=%s stagnating=%s unprod_exh=%d)",
|
|
2329
2637
|
monitor.tool_state_review_cycles,
|
|
2330
2638
|
cycle_looping,
|
|
2331
2639
|
stagnating,
|
|
2640
|
+
monitor.tool_state_unproductive_exhaustion_streak,
|
|
2332
2641
|
)
|
|
2333
2642
|
return "required", "forced_budget_exhausted"
|
|
2334
2643
|
|
|
@@ -2612,6 +2921,8 @@ def build_openai_request(
|
|
|
2612
2921
|
# Skip all further tool_choice logic — no tools this turn
|
|
2613
2922
|
if PROXY_DISABLE_THINKING_ON_TOOL_TURNS:
|
|
2614
2923
|
openai_body["enable_thinking"] = False
|
|
2924
|
+
if PROXY_DISABLE_SPEC_ON_TOOL_TURNS:
|
|
2925
|
+
openai_body["speculative.n_max"] = 0
|
|
2615
2926
|
return openai_body
|
|
2616
2927
|
|
|
2617
2928
|
# Check if forced-tool dampener or loop breaker should override tool_choice
|
|
@@ -2638,6 +2949,7 @@ def build_openai_request(
|
|
|
2638
2949
|
openai_body.pop("tool_choice", None)
|
|
2639
2950
|
openai_body.pop("tools", None)
|
|
2640
2951
|
monitor.finalize_turn_active = True
|
|
2952
|
+
monitor.finalize_hard_stop_count += 1 # monotonic marker: a finalize fired this session
|
|
2641
2953
|
monitor.consecutive_forced_count = 0
|
|
2642
2954
|
monitor.no_progress_streak = 0
|
|
2643
2955
|
# Option 3: Inject explicit "no tool calls" instruction to reduce XML leak
|
|
@@ -2732,11 +3044,21 @@ def build_openai_request(
|
|
|
2732
3044
|
elif state_reason in {"fresh_user_text", "inactive_loop"} and n_msgs <= 1:
|
|
2733
3045
|
monitor.consecutive_forced_count = 0
|
|
2734
3046
|
monitor.no_progress_streak = 0
|
|
2735
|
-
|
|
2736
|
-
|
|
2737
|
-
|
|
2738
|
-
|
|
2739
|
-
|
|
3047
|
+
# Force tool_choice=required on first turn to ensure local models
|
|
3048
|
+
# produce a tool call instead of plain text (cold-start fix)
|
|
3049
|
+
if has_tools and n_msgs == 1:
|
|
3050
|
+
openai_body["tool_choice"] = "required"
|
|
3051
|
+
logger.info(
|
|
3052
|
+
"tool_choice forced to 'required' on first turn (reason=%s n_msgs=%d cold_start_fix=true)",
|
|
3053
|
+
state_reason,
|
|
3054
|
+
n_msgs,
|
|
3055
|
+
)
|
|
3056
|
+
else:
|
|
3057
|
+
logger.info(
|
|
3058
|
+
"tool_choice left unchanged after state reset (reason=%s n_msgs=%d)",
|
|
3059
|
+
state_reason,
|
|
3060
|
+
n_msgs,
|
|
3061
|
+
)
|
|
2740
3062
|
elif monitor.should_release_tool_choice():
|
|
2741
3063
|
openai_body["tool_choice"] = "auto"
|
|
2742
3064
|
monitor.consecutive_forced_count = 0
|
|
@@ -2773,6 +3095,12 @@ def build_openai_request(
|
|
|
2773
3095
|
"Thinking disabled for tool turn (PROXY_DISABLE_THINKING_ON_TOOL_TURNS=on)"
|
|
2774
3096
|
)
|
|
2775
3097
|
|
|
3098
|
+
if PROXY_DISABLE_SPEC_ON_TOOL_TURNS:
|
|
3099
|
+
openai_body["speculative.n_max"] = 0
|
|
3100
|
+
logger.info(
|
|
3101
|
+
"Spec decoding disabled for tool turn (PROXY_DISABLE_SPEC_ON_TOOL_TURNS=on)"
|
|
3102
|
+
)
|
|
3103
|
+
|
|
2776
3104
|
_apply_tool_call_grammar(openai_body, grammar_override=profile_grammar)
|
|
2777
3105
|
|
|
2778
3106
|
return openai_body
|
|
@@ -5212,6 +5540,18 @@ def _inject_synthetic_continuation(
|
|
|
5212
5540
|
Appends a no-op Read("/dev/null") tool_use block and changes stop_reason
|
|
5213
5541
|
from "end_turn" to "tool_use" so the client continues sending requests.
|
|
5214
5542
|
"""
|
|
5543
|
+
# Session-level hard cap: if we've already done N continuations in this
|
|
5544
|
+
# session (counter is monotonic, survives fresh-user-text resets), stop
|
|
5545
|
+
# injecting and let the response terminate. This catches runaway loops
|
|
5546
|
+
# that dodge the per-cycle cap via state resets.
|
|
5547
|
+
if monitor.finalize_hard_stop_count >= PROXY_FINALIZE_SESSION_HARD_CAP:
|
|
5548
|
+
logger.warning(
|
|
5549
|
+
"FINALIZE CONTINUATION: session hard cap reached (%d/%d) — not injecting, allowing termination",
|
|
5550
|
+
monitor.finalize_hard_stop_count,
|
|
5551
|
+
PROXY_FINALIZE_SESSION_HARD_CAP,
|
|
5552
|
+
)
|
|
5553
|
+
return anthropic_resp
|
|
5554
|
+
|
|
5215
5555
|
# Pick a safe tool the client knows about (case-insensitive match,
|
|
5216
5556
|
# then use the client's actual casing for the tool name)
|
|
5217
5557
|
if _client_has_tool(anthropic_body, "read"):
|
|
@@ -5227,6 +5567,7 @@ def _inject_synthetic_continuation(
|
|
|
5227
5567
|
synthetic_id = f"toolu_{uuid.uuid4().hex[:12]}"
|
|
5228
5568
|
monitor.finalize_synthetic_tool_id = synthetic_id
|
|
5229
5569
|
monitor.finalize_continuation_count += 1
|
|
5570
|
+
monitor.finalize_hard_stop_count += 1
|
|
5230
5571
|
|
|
5231
5572
|
content = anthropic_resp.get("content", [])
|
|
5232
5573
|
content.append({
|
|
@@ -5239,11 +5580,13 @@ def _inject_synthetic_continuation(
|
|
|
5239
5580
|
anthropic_resp["stop_reason"] = "tool_use"
|
|
5240
5581
|
|
|
5241
5582
|
logger.info(
|
|
5242
|
-
"FINALIZE CONTINUATION: injected synthetic %s tool_use id=%s (count=%d/%d)",
|
|
5583
|
+
"FINALIZE CONTINUATION: injected synthetic %s tool_use id=%s (count=%d/%d, session=%d/%d)",
|
|
5243
5584
|
tool_name,
|
|
5244
5585
|
synthetic_id,
|
|
5245
5586
|
monitor.finalize_continuation_count,
|
|
5246
5587
|
PROXY_FINALIZE_CONTINUATION_MAX,
|
|
5588
|
+
monitor.finalize_hard_stop_count,
|
|
5589
|
+
PROXY_FINALIZE_SESSION_HARD_CAP,
|
|
5247
5590
|
)
|
|
5248
5591
|
return anthropic_resp
|
|
5249
5592
|
|
|
@@ -5804,6 +6147,10 @@ async def messages(request: Request):
|
|
|
5804
6147
|
is_stream = body.get("stream", False)
|
|
5805
6148
|
model = body.get("model", "default")
|
|
5806
6149
|
client_id = resolve_client_id(request)
|
|
6150
|
+
|
|
6151
|
+
# Periodically re-detect context window from upstream (handles server restarts)
|
|
6152
|
+
await _maybe_recheck_context_window()
|
|
6153
|
+
|
|
5807
6154
|
if _should_passthrough_model(model):
|
|
5808
6155
|
logger.info("PASSTHROUGH: model=%s -> %s", model, ANTHROPIC_API_BASE)
|
|
5809
6156
|
return await _passthrough_anthropic_request(request, body, is_stream)
|
|
@@ -5861,8 +6208,9 @@ async def messages(request: Request):
|
|
|
5861
6208
|
last_text = str(last_content)[:200]
|
|
5862
6209
|
rate_count = log_client_rate(client_id)
|
|
5863
6210
|
logger.info(
|
|
5864
|
-
"REQ: client=%s rate_%ss=%d stream=%s msgs=%d tools=%d max_tokens=%s last_role=%s last_content=%.200s",
|
|
6211
|
+
"REQ: client=%s sess=%s rate_%ss=%d stream=%s msgs=%d tools=%d max_tokens=%s last_role=%s last_content=%.200s",
|
|
5865
6212
|
client_id,
|
|
6213
|
+
session_id,
|
|
5866
6214
|
PROXY_CLIENT_RATE_WINDOW_SECS,
|
|
5867
6215
|
rate_count,
|
|
5868
6216
|
is_stream,
|
|
@@ -6532,6 +6880,292 @@ async def messages_anthropic(request: Request):
|
|
|
6532
6880
|
return await messages(request)
|
|
6533
6881
|
|
|
6534
6882
|
|
|
6883
|
+
@app.post("/v1/chat/completions")
async def chat_completions(request: Request):
    """OpenAI-compatible chat/completions endpoint for clients like Forge
    that require the OpenAI API shape.

    FULL GUARDRAIL PATH: the OpenAI request is converted to Anthropic format
    with ``openai_to_anthropic_request``, handed to the existing ``messages()``
    handler through a synthetic ``Request``, and the Anthropic response is
    converted back with ``anthropic_to_openai_response``.

    Streaming is down-converted: the pipeline always runs with
    ``stream=False`` and, if the client asked for streaming, the completed
    reply is re-emitted as a short OpenAI SSE chunk sequence (role, content,
    tool calls, finish reason, ``[DONE]``). It is not token-by-token.

    Returns:
        A JSON ``Response`` (400 for an unusable request body, the inner
        status or 502 when no Anthropic message could be extracted, 200
        otherwise), or a ``StreamingResponse`` when the client requested
        streaming.
    """
    body_bytes = await request.body()
    try:
        openai_body = json.loads(body_bytes) if body_bytes else {}
    except (ValueError, TypeError):
        return Response(
            content=b'{"error":{"message":"invalid JSON","type":"invalid_request_error"}}',
            status_code=400,
            media_type="application/json",
        )
    # Valid JSON that is not an object (list, string, number, null) would
    # otherwise crash on the .get() calls below and surface as a 500.
    if not isinstance(openai_body, dict):
        return Response(
            content=b'{"error":{"message":"request body must be a JSON object","type":"invalid_request_error"}}',
            status_code=400,
            media_type="application/json",
        )

    requested_stream = bool(openai_body.get("stream", False))
    model = openai_body.get("model", "default")
    client_id = resolve_client_id(request)

    logger.info(
        "CHAT (guarded): client=%s model=%s stream=%s msgs=%d tools=%d",
        client_id,
        model,
        requested_stream,
        # `or []` tolerates an explicit null, same as the tools count below.
        len(openai_body.get("messages") or []),
        len(openai_body.get("tools", []) or []),
    )

    # Convert OpenAI request -> Anthropic request
    anthropic_body = openai_to_anthropic_request(openai_body)
    # Force non-streaming through the pipeline; we re-stream at the end if the
    # client wanted streaming. This keeps guardrail logic simpler/consistent.
    anthropic_body["stream"] = False

    # Build a synthetic Request that the existing messages() handler can consume
    fake_body_bytes = json.dumps(anthropic_body).encode("utf-8")

    async def receive():
        return {"type": "http.request", "body": fake_body_bytes, "more_body": False}

    fake_scope = dict(request.scope)
    # Preserve client/headers but override the body + path
    fake_scope["path"] = "/v1/messages"
    fake_scope["raw_path"] = b"/v1/messages"
    # Strip content-length since the body changes
    fake_scope["headers"] = [
        (k, v)
        for (k, v) in fake_scope.get("headers", [])
        if k.lower() != b"content-length"
    ]
    fake_request = Request(fake_scope, receive)

    # Run the full guarded Anthropic pipeline
    inner_resp = await messages(fake_request)

    # Extract the Anthropic-format JSON from whatever messages() returned.
    # StreamingResponse is tested before Response on purpose, so a streaming
    # reply is drained rather than read through `.body`.
    anthropic_resp_dict: dict | None = None
    status_code = 200
    if isinstance(inner_resp, StreamingResponse):
        # Pipeline shouldn't stream because we set stream=False, but defensively
        # consume the stream and parse the final message event.
        chunks: list[bytes] = []
        async for chunk in inner_resp.body_iterator:
            if isinstance(chunk, bytes):
                chunks.append(chunk)
            elif isinstance(chunk, str):
                chunks.append(chunk.encode("utf-8"))
        raw = b"".join(chunks)
        # Try to parse as JSON directly first, then fall back to SSE parsing
        try:
            anthropic_resp_dict = json.loads(raw)
        except (ValueError, TypeError):
            anthropic_resp_dict = _parse_anthropic_sse_to_message(raw)
    elif isinstance(inner_resp, Response):
        status_code = inner_resp.status_code
        try:
            anthropic_resp_dict = json.loads(inner_resp.body)
        except (ValueError, TypeError):
            anthropic_resp_dict = None
    elif isinstance(inner_resp, dict):
        anthropic_resp_dict = inner_resp

    # A JSON payload that is not an object cannot be an Anthropic message;
    # treat it like a missing one instead of crashing further down.
    if not isinstance(anthropic_resp_dict, dict):
        anthropic_resp_dict = None

    if anthropic_resp_dict is None or "content" not in anthropic_resp_dict:
        # Upstream error: forward as-is in OpenAI error shape
        err_msg = "upstream returned no message"
        if anthropic_resp_dict is not None:
            err = anthropic_resp_dict.get("error")
            if isinstance(err, dict):
                err_msg = err.get("message") or err_msg
            elif isinstance(err, str) and err:
                # Some error payloads carry a bare string instead of an object.
                err_msg = err
        return Response(
            content=json.dumps({"error": {"message": err_msg, "type": "upstream_error"}}).encode(),
            status_code=status_code if status_code >= 400 else 502,
            media_type="application/json",
        )

    # Ensure model field is set for response
    anthropic_resp_dict.setdefault("model", model)
    openai_resp = anthropic_to_openai_response(anthropic_resp_dict)

    if not requested_stream:
        return Response(
            content=json.dumps(openai_resp).encode(),
            status_code=200,
            media_type="application/json",
        )

    # Client requested streaming: emit the response as OpenAI SSE chunks
    async def emit_openai_stream():
        resp_id = openai_resp["id"]
        created = openai_resp["created"]
        model_name = openai_resp["model"]
        choice = openai_resp["choices"][0]
        message = choice["message"]

        def sse_chunk(delta: dict, finish_reason=None) -> bytes:
            """Serialize one chat.completion.chunk carrying *delta* as an SSE frame."""
            payload = {
                "id": resp_id,
                "object": "chat.completion.chunk",
                "created": created,
                "model": model_name,
                "choices": [{"index": 0, "delta": delta, "finish_reason": finish_reason}],
            }
            return f"data: {json.dumps(payload)}\n\n".encode()

        # Opening chunk: role
        yield sse_chunk({"role": "assistant"})

        # Content chunk (the whole text at once; see docstring)
        if message.get("content"):
            yield sse_chunk({"content": message["content"]})

        # Tool call chunks, one per call with complete arguments
        for idx, tc in enumerate(message.get("tool_calls", []) or []):
            yield sse_chunk(
                {
                    "tool_calls": [
                        {
                            "index": idx,
                            "id": tc["id"],
                            "type": "function",
                            "function": {
                                "name": tc["function"]["name"],
                                "arguments": tc["function"]["arguments"],
                            },
                        }
                    ]
                }
            )

        # Final chunk with finish_reason, then the stream terminator
        yield sse_chunk({}, choice["finish_reason"])
        yield b"data: [DONE]\n\n"

    return StreamingResponse(
        emit_openai_stream(),
        media_type="text/event-stream",
        headers={"Cache-Control": "no-cache", "X-Accel-Buffering": "no"},
    )
|
|
7078
|
+
|
|
7079
|
+
|
|
7080
|
+
def _parse_anthropic_sse_to_message(raw: bytes) -> dict | None:
|
|
7081
|
+
"""Parse a concatenated Anthropic SSE stream into a final message dict.
|
|
7082
|
+
Used as a fallback when messages() returns a StreamingResponse despite stream=False.
|
|
7083
|
+
"""
|
|
7084
|
+
try:
|
|
7085
|
+
text = raw.decode("utf-8", errors="replace")
|
|
7086
|
+
except Exception:
|
|
7087
|
+
return None
|
|
7088
|
+
|
|
7089
|
+
text_parts: list[str] = []
|
|
7090
|
+
tool_uses: list[dict] = []
|
|
7091
|
+
usage = {"input_tokens": 0, "output_tokens": 0}
|
|
7092
|
+
stop_reason = "end_turn"
|
|
7093
|
+
model = "unknown"
|
|
7094
|
+
message_id = f"msg_{uuid.uuid4().hex[:24]}"
|
|
7095
|
+
|
|
7096
|
+
current_block: dict | None = None
|
|
7097
|
+
current_json_buffer = ""
|
|
7098
|
+
|
|
7099
|
+
for line in text.splitlines():
|
|
7100
|
+
if not line.startswith("data:"):
|
|
7101
|
+
continue
|
|
7102
|
+
payload = line[5:].strip()
|
|
7103
|
+
if not payload or payload == "[DONE]":
|
|
7104
|
+
continue
|
|
7105
|
+
try:
|
|
7106
|
+
evt = json.loads(payload)
|
|
7107
|
+
except (ValueError, TypeError):
|
|
7108
|
+
continue
|
|
7109
|
+
etype = evt.get("type")
|
|
7110
|
+
if etype == "message_start":
|
|
7111
|
+
m = evt.get("message", {}) or {}
|
|
7112
|
+
message_id = m.get("id", message_id)
|
|
7113
|
+
model = m.get("model", model)
|
|
7114
|
+
if "usage" in m:
|
|
7115
|
+
usage.update(m["usage"])
|
|
7116
|
+
elif etype == "content_block_start":
|
|
7117
|
+
current_block = evt.get("content_block", {})
|
|
7118
|
+
current_json_buffer = ""
|
|
7119
|
+
if current_block.get("type") == "text":
|
|
7120
|
+
text_parts.append(current_block.get("text", ""))
|
|
7121
|
+
elif etype == "content_block_delta":
|
|
7122
|
+
d = evt.get("delta", {}) or {}
|
|
7123
|
+
if d.get("type") == "text_delta":
|
|
7124
|
+
text_parts.append(d.get("text", ""))
|
|
7125
|
+
elif d.get("type") == "input_json_delta":
|
|
7126
|
+
current_json_buffer += d.get("partial_json", "")
|
|
7127
|
+
elif etype == "content_block_stop":
|
|
7128
|
+
if current_block and current_block.get("type") == "tool_use":
|
|
7129
|
+
try:
|
|
7130
|
+
input_obj = json.loads(current_json_buffer) if current_json_buffer else {}
|
|
7131
|
+
except (ValueError, TypeError):
|
|
7132
|
+
input_obj = {}
|
|
7133
|
+
tool_uses.append(
|
|
7134
|
+
{
|
|
7135
|
+
"type": "tool_use",
|
|
7136
|
+
"id": current_block.get("id", f"toolu_{uuid.uuid4().hex[:12]}"),
|
|
7137
|
+
"name": current_block.get("name", ""),
|
|
7138
|
+
"input": input_obj,
|
|
7139
|
+
}
|
|
7140
|
+
)
|
|
7141
|
+
current_block = None
|
|
7142
|
+
current_json_buffer = ""
|
|
7143
|
+
elif etype == "message_delta":
|
|
7144
|
+
d = evt.get("delta", {}) or {}
|
|
7145
|
+
if "stop_reason" in d:
|
|
7146
|
+
stop_reason = d["stop_reason"] or stop_reason
|
|
7147
|
+
u = evt.get("usage", {}) or {}
|
|
7148
|
+
if u:
|
|
7149
|
+
usage.update(u)
|
|
7150
|
+
|
|
7151
|
+
content: list[dict] = []
|
|
7152
|
+
joined_text = "".join(text_parts)
|
|
7153
|
+
if joined_text:
|
|
7154
|
+
content.append({"type": "text", "text": joined_text})
|
|
7155
|
+
content.extend(tool_uses)
|
|
7156
|
+
|
|
7157
|
+
return {
|
|
7158
|
+
"id": message_id,
|
|
7159
|
+
"type": "message",
|
|
7160
|
+
"role": "assistant",
|
|
7161
|
+
"content": content if content else [{"type": "text", "text": ""}],
|
|
7162
|
+
"model": model,
|
|
7163
|
+
"stop_reason": stop_reason,
|
|
7164
|
+
"stop_sequence": None,
|
|
7165
|
+
"usage": usage,
|
|
7166
|
+
}
|
|
7167
|
+
|
|
7168
|
+
|
|
6535
7169
|
@app.get("/v1/models")
|
|
6536
7170
|
async def models():
|
|
6537
7171
|
"""Return available model list (spoofs Anthropic model IDs for client compatibility)."""
|