@miller-tech/uap 1.20.32 → 1.20.34

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46) hide show
  1. package/config/model-profiles/qwen35.json +6 -5
  2. package/dist/.tsbuildinfo +1 -1
  3. package/dist/bin/cli.js +6 -1
  4. package/dist/bin/cli.js.map +1 -1
  5. package/dist/cli/hooks.js +30 -7
  6. package/dist/cli/hooks.js.map +1 -1
  7. package/dist/cli/policy.d.ts.map +1 -1
  8. package/dist/cli/policy.js +26 -0
  9. package/dist/cli/policy.js.map +1 -1
  10. package/dist/dashboard/data-seeder.d.ts.map +1 -1
  11. package/dist/dashboard/data-seeder.js +72 -3
  12. package/dist/dashboard/data-seeder.js.map +1 -1
  13. package/dist/dashboard/data-service.js +1 -1
  14. package/dist/dashboard/data-service.js.map +1 -1
  15. package/dist/dashboard/server.js +1 -1
  16. package/dist/dashboard/server.js.map +1 -1
  17. package/dist/index.d.ts +15 -1
  18. package/dist/index.d.ts.map +1 -1
  19. package/dist/index.js +14 -0
  20. package/dist/index.js.map +1 -1
  21. package/dist/types/index.d.ts +20 -0
  22. package/dist/types/index.d.ts.map +1 -1
  23. package/dist/types/index.js +20 -0
  24. package/dist/types/index.js.map +1 -1
  25. package/docs/AGENTS.md +423 -0
  26. package/docs/AGENTS.md</path>CLAUDE.md</path>/home/cogtek/dev/miller-tech/universal-agent-protocol/docs/INDEX.md</path>/home/cogtek/dev/miller-tech/universal-agent-protocol/docs/reference/API_REFERENCE.md</path>/home/cogtek/dev/miller-tech/universal-agent-protocol/docs/reference/UAP_CLI_REFERENCE.md</path>src/index.ts</path>/src/cli/worktree.ts</path>/src/coordination/deploy-batcher.ts</path>/src/policies/policy-gate.ts</path>/src/memory/model-router.ts</path>/src/memory/embeddings.ts</path>/src/models/types.ts</path>/src/types/coordination.ts</path>/src/utils/logger.ts</path>/src/utils/config-loader.ts</path>/src/utils/performance-monitor.ts</path>/src/utils/concurrency.ts</path>/src/utils/concurrency-pool.ts</path>/src/utils/string-similarity.ts</path>/src/utils/rate-limiter.ts</path>/src/utils/system-resources.ts</path>/src/utils/adaptive-cache.ts</path>/src/utils/lazy-imports.ts</path>/src/utils/merge-claude-md.ts</path>/src/utils/stopwords.ts</path>/src/utils/config-loader.ts</path>/src/utils/performance-monitor.ts</path>/src/utils/concurrency.ts</path>/src/utils/concurrency-pool.ts</path>/src/utils/string-similarity.ts</path>/src/utils/rate-limiter.ts</path>/src/utils/system-resources.ts</path>/src/utils/adaptive-cache.ts</path>/src/utils/lazy-imports.ts</path>/src/utils/merge-claude-md.ts</path>/src/utils/stopwords.ts</path> +433 -0
  27. package/docs/DOCUMENTATION_AUDIT_REPORT.md +131 -0
  28. package/docs/GETTING_STARTED.md +288 -0
  29. package/docs/INDEX.md +272 -42
  30. package/docs/PROJECT_ANALYSIS_REPORT.md +510 -0
  31. package/docs/architecture/SYSTEM_ANALYSIS.md +220 -1003
  32. package/docs/blog/local-coding-agents.md +266 -0
  33. package/docs/blog/x-thread.md +254 -0
  34. package/docs/deployment/DEPLOY_BATCHER_ANALYSIS.md +15 -647
  35. package/docs/getting-started/OVERVIEW.md +10 -30
  36. package/docs/getting-started/SETUP.md +183 -9
  37. package/docs/pr/UPSTREAM_PRS.md +424 -0
  38. package/docs/reference/CONFIGURATION.md +208 -0
  39. package/docs/reference/DATABASE_SCHEMA.md +344 -0
  40. package/docs/reference/PATTERN_LIBRARY.md +636 -0
  41. package/package.json +1 -1
  42. package/templates/hooks/uap-policy-gate.sh +36 -0
  43. package/tools/agents/claude_local_agent.py +92 -0
  44. package/tools/agents/opencode_uap_agent.py +3 -0
  45. package/tools/agents/scripts/anthropic_proxy.py +654 -20
  46. package/tools/agents/uap_agent.py +1 -1
@@ -166,6 +166,12 @@ PROXY_TOOL_STATE_FINALIZE_THRESHOLD = int(
166
166
  PROXY_TOOL_STATE_REVIEW_CYCLE_LIMIT = int(
167
167
  os.environ.get("PROXY_TOOL_STATE_REVIEW_CYCLE_LIMIT", "3")
168
168
  )
169
+ # Force finalize after N consecutive forced_budget_exhausted events where
170
+ # neither cycling nor stagnation was detected — catches "distinct but
171
+ # unproductive" tool spam that defeats per-tool cycle detection.
172
+ PROXY_UNPRODUCTIVE_EXHAUSTION_LIMIT = int(
173
+ os.environ.get("PROXY_UNPRODUCTIVE_EXHAUSTION_LIMIT", "2")
174
+ )
169
175
  PROXY_COMPLETION_RECOVERY_MAX = int(
170
176
  os.environ.get("PROXY_COMPLETION_RECOVERY_MAX", "3")
171
177
  )
@@ -205,6 +211,13 @@ PROXY_SESSION_TTL_SECS = int(os.environ.get("PROXY_SESSION_TTL_SECS", "7200"))
205
211
  PROXY_FINALIZE_CONTINUATION_MAX = int(
206
212
  os.environ.get("PROXY_FINALIZE_CONTINUATION_MAX", "3")
207
213
  )
214
+ # Session-level cap: after N total finalize continuations in a session (even
215
+ # across "fresh user text" state resets), stop injecting synthetic tools and
216
+ # let the response terminate naturally. Catches runaway loops that dodge the
217
+ # per-cycle cap by triggering state resets.
218
+ PROXY_FINALIZE_SESSION_HARD_CAP = int(
219
+ os.environ.get("PROXY_FINALIZE_SESSION_HARD_CAP", "3")
220
+ )
208
221
  PROXY_STREAM_REASONING_FALLBACK = (
209
222
  os.environ.get("PROXY_STREAM_REASONING_FALLBACK", "off").strip().lower()
210
223
  )
@@ -234,6 +247,14 @@ PROXY_DISABLE_THINKING_ON_TOOL_TURNS = os.environ.get(
234
247
  "off",
235
248
  "no",
236
249
  }
250
+ PROXY_DISABLE_SPEC_ON_TOOL_TURNS = os.environ.get(
251
+ "PROXY_DISABLE_SPEC_ON_TOOL_TURNS", "off"
252
+ ).lower() not in {
253
+ "0",
254
+ "false",
255
+ "off",
256
+ "no",
257
+ }
237
258
  PROXY_MALFORMED_TOOL_GUARDRAIL = os.environ.get(
238
259
  "PROXY_MALFORMED_TOOL_GUARDRAIL", "on"
239
260
  ).lower() not in {
@@ -654,6 +675,7 @@ class SessionMonitor:
654
675
  tool_state_stagnation_streak: int = 0
655
676
  tool_state_transitions: int = 0
656
677
  tool_state_review_cycles: int = 0
678
+ tool_state_unproductive_exhaustion_streak: int = 0
657
679
  last_tool_fingerprint: str = ""
658
680
  cycling_tool_names: list = field(default_factory=list)
659
681
  session_banned_tools: set = field(default_factory=set) # tools banned for entire session after repeated cycling
@@ -661,6 +683,7 @@ class SessionMonitor:
661
683
  last_response_garbled: bool = False # previous turn had garbled/malformed output
662
684
  finalize_turn_active: bool = False
663
685
  finalize_continuation_count: int = 0
686
+ finalize_hard_stop_count: int = 0 # monotonic, not reset by fresh user text
664
687
  finalize_synthetic_tool_id: str = ""
665
688
  completion_required: bool = False
666
689
  completion_pending: bool = False
@@ -898,6 +921,7 @@ class SessionMonitor:
898
921
  self.tool_state_auto_budget_remaining = 0
899
922
  self.tool_state_stagnation_streak = 0
900
923
  self.tool_state_review_cycles = 0
924
+ self.tool_state_unproductive_exhaustion_streak = 0
901
925
  self.cycling_tool_names = []
902
926
  self.last_tool_fingerprint = ""
903
927
  self.reset_tool_targets()
@@ -906,7 +930,10 @@ class SessionMonitor:
906
930
  self.completion_required = _should_enforce_completion_contract(anthropic_body)
907
931
  self.completion_progress_signals = _count_completion_progress_signals(anthropic_body)
908
932
  blockers = _completion_blockers(
909
- anthropic_body, has_tool_results, phase=self.tool_turn_phase
933
+ anthropic_body,
934
+ has_tool_results,
935
+ phase=self.tool_turn_phase,
936
+ finalize_fired=(self.finalize_hard_stop_count > 0),
910
937
  )
911
938
  self.completion_blockers = blockers
912
939
  self.completion_pending = self.completion_required and bool(blockers)
@@ -1046,6 +1073,8 @@ class SessionMonitor:
1046
1073
  session_monitors: dict[str, SessionMonitor] = {}
1047
1074
  default_context_window = 0
1048
1075
  last_session_id = ""
1076
+ _last_ctx_recheck_ts: float = 0.0
1077
+ _CTX_RECHECK_INTERVAL: float = 60.0 # Re-detect context window every 60s
1049
1078
 
1050
1079
 
1051
1080
  def _cleanup_stale_monitors(now_ts: float) -> None:
@@ -1058,6 +1087,39 @@ def _cleanup_stale_monitors(now_ts: float) -> None:
1058
1087
  session_monitors.pop(sid, None)
1059
1088
 
1060
1089
 
1090
async def _maybe_recheck_context_window() -> None:
    """Periodically re-query the upstream server's context window.

    Handles upstream restarts with a different --ctx-size mid-session: when
    the ``n_ctx`` reported by the first entry of the ``/slots`` response
    differs from ``default_context_window``, the global and the
    ``context_window`` of every monitor in ``session_monitors`` are updated.

    Rate-limited: returns immediately unless ``_CTX_RECHECK_INTERVAL`` seconds
    have elapsed since the previous attempt. Best-effort: failures are logged
    at debug level and retried on the next interval; nothing is raised.
    """
    global default_context_window, _last_ctx_recheck_ts

    now = time.time()
    if now - _last_ctx_recheck_ts < _CTX_RECHECK_INTERVAL:
        return
    # Stamp before doing any work so a slow or failing upstream is polled at
    # most once per interval instead of on every incoming request.
    _last_ctx_recheck_ts = now
    if http_client is None:
        return

    # Strip only a trailing "/v1" segment; a blanket str.replace would also
    # rewrite "/v1" appearing elsewhere in the URL and would produce no
    # "/slots" path at all when the base has no "/v1" suffix.
    base = LLAMA_CPP_BASE.rstrip("/")
    if base.endswith("/v1"):
        base = base[: -len("/v1")]
    slots_url = f"{base}/slots"

    try:
        resp = await http_client.get(slots_url, timeout=2.0)
        if resp.status_code != 200:
            return
        slots = resp.json()
    except Exception:
        # Non-critical: keep the current value and retry next interval, but
        # leave a trace so persistent failures can be diagnosed.
        logger.debug("Context window recheck failed", exc_info=True)
        return

    # Validate the payload shape explicitly instead of relying on a broad
    # except to hide AttributeError/TypeError from unexpected responses.
    if not (isinstance(slots, list) and slots and isinstance(slots[0], dict)):
        return
    n_ctx = slots[0].get("n_ctx", 0)
    if not isinstance(n_ctx, int) or n_ctx <= 0:
        return
    if n_ctx == default_context_window:
        return

    old = default_context_window
    default_context_window = n_ctx
    for mon in session_monitors.values():
        mon.context_window = n_ctx
    logger.warning(
        "Context window changed: %d → %d (upstream server restarted?)",
        old,
        n_ctx,
    )
1121
+
1122
+
1061
1123
  def get_session_monitor(session_id: str) -> SessionMonitor:
1062
1124
  now_ts = time.time()
1063
1125
  _cleanup_stale_monitors(now_ts)
@@ -1852,6 +1914,9 @@ else:
1852
1914
 
1853
1915
 
1854
1916
  def _content_fingerprint(content) -> str:
1917
+ """Return a STABLE fingerprint for content. Must not include volatile
1918
+ identifiers (tool_use_ids change per-turn), otherwise session stickiness
1919
+ breaks in agentic loops with stateful guardrails."""
1855
1920
  if isinstance(content, str):
1856
1921
  return content[:512]
1857
1922
  if isinstance(content, list):
@@ -1866,7 +1931,10 @@ def _content_fingerprint(content) -> str:
1866
1931
  elif btype == "tool_use":
1867
1932
  parts.append(f"tool:{block.get('name', '')}")
1868
1933
  elif btype == "tool_result":
1869
- parts.append(f"result:{block.get('tool_use_id', '')}")
1934
+ # Stable: use the first 64 chars of the result content, not tool_use_id
1935
+ inner = block.get("content", "")
1936
+ inner_text = _extract_text(inner) if not isinstance(inner, str) else inner
1937
+ parts.append(f"result:{inner_text[:64]}")
1870
1938
  return "\n".join(parts)[:1024]
1871
1939
  return str(content)[:512]
1872
1940
 
@@ -1893,14 +1961,26 @@ def resolve_session_id(request: Request, anthropic_body: dict) -> str:
1893
1961
  first_user = ""
1894
1962
  for msg in anthropic_body.get("messages", []):
1895
1963
  if msg.get("role") == "user":
1896
- first_user = _content_fingerprint(msg.get("content", ""))
1964
+ # Only hash TEXT content of first user message, not tool_result blocks
1965
+ # (which may appear in /anthropic/v1/messages passthrough scenarios)
1966
+ content = msg.get("content", "")
1967
+ if isinstance(content, str):
1968
+ first_user = content[:512]
1969
+ elif isinstance(content, list):
1970
+ text_parts = [
1971
+ b.get("text", "") for b in content
1972
+ if isinstance(b, dict) and b.get("type") == "text"
1973
+ ]
1974
+ first_user = "\n".join(text_parts)[:512]
1897
1975
  break
1898
1976
 
1899
- system_fingerprint = _content_fingerprint(anthropic_body.get("system", ""))
1977
+ # Deliberately exclude `system` from fingerprint — clients often inject
1978
+ # volatile context (timestamps, cwd, session markers) into system prompts
1979
+ # which would break session stickiness for ongoing conversations.
1900
1980
  model = anthropic_body.get("model", "default")
1901
1981
  remote = request.client.host if request.client else "unknown"
1902
1982
  digest = hashlib.sha256(
1903
- f"{remote}|{model}|{system_fingerprint}|{first_user}".encode(
1983
+ f"{remote}|{model}|{first_user}".encode(
1904
1984
  "utf-8", errors="ignore"
1905
1985
  )
1906
1986
  ).hexdigest()[:20]
@@ -1965,7 +2045,10 @@ def _should_enforce_completion_contract(anthropic_body: dict) -> bool:
1965
2045
 
1966
2046
 
1967
2047
  def _completion_blockers(
1968
- anthropic_body: dict, has_tool_results: bool, phase: str = ""
2048
+ anthropic_body: dict,
2049
+ has_tool_results: bool,
2050
+ phase: str = "",
2051
+ finalize_fired: bool = False,
1969
2052
  ) -> list[str]:
1970
2053
  blockers: list[str] = []
1971
2054
  progress = _count_completion_progress_signals(anthropic_body)
@@ -1977,9 +2060,12 @@ def _completion_blockers(
1977
2060
  if last_user_has_result:
1978
2061
  blockers.append("awaiting_post_tool_followup")
1979
2062
  elif _last_assistant_was_text_only(anthropic_body):
1980
- # Option 2: Suppress during finalize — text-only is expected behavior
1981
- # for finalize turns, so blocking on it causes infinite ping-pong.
1982
- if phase != "finalize":
2063
+ # Suppress in two cases:
2064
+ # 1. Currently in finalize phase — text-only is expected
2065
+ # 2. A finalize fired earlier this session — means the state machine
2066
+ # already wrapped up the loop, don't re-trigger it (was causing
2067
+ # finalize -> review -> cycle -> finalize -> review... infinite loop)
2068
+ if phase != "finalize" and not finalize_fired:
1983
2069
  blockers.append("text_only_after_tool_results")
1984
2070
 
1985
2071
  return blockers
@@ -2020,6 +2106,212 @@ def _sanitize_tool_schema_for_llama(schema):
2020
2106
  return _walk(schema), removed
2021
2107
 
2022
2108
 
2109
def _openai_content_to_text_blocks(content) -> list[dict]:
    """Return Anthropic text blocks for the text parts of an OpenAI content list."""
    blocks: list[dict] = []
    for part in content:
        if isinstance(part, dict) and part.get("type") == "text":
            blocks.append({"type": "text", "text": part.get("text", "")})
        elif isinstance(part, str):
            blocks.append({"type": "text", "text": part})
    return blocks


def _parse_openai_tool_arguments(raw_args) -> dict:
    """Return tool-call arguments as a dict, or ``{}`` when they are unusable."""
    if isinstance(raw_args, dict):
        # Some clients send already-parsed arguments instead of a JSON string.
        return raw_args
    try:
        parsed = json.loads(raw_args or "{}")
    except (ValueError, TypeError):
        return {}
    # Valid JSON that is not an object (e.g. "[1]" or "null") cannot be used
    # as a tool_use input.
    return parsed if isinstance(parsed, dict) else {}


def openai_to_anthropic_request(openai_body: dict) -> dict:
    """Convert an OpenAI Chat Completions request to an Anthropic Messages request.

    Inverse of anthropic_to_openai_messages. Used by /v1/chat/completions passthrough
    to let OpenAI-shaped clients (Forge, etc.) benefit from the Anthropic-path
    guardrails (loop detection, tool narrowing, cycle breaking, etc.).

    Conversion rules:
      * ``system`` messages are collected and joined (blank-line separated)
        into the top-level ``system`` field.
      * ``tool`` messages become ``user`` messages with one ``tool_result`` block.
      * ``assistant`` messages become text blocks followed by one ``tool_use``
        block per tool call.
      * Any other role is treated as ``user``.
      * Only text parts of list-form content are kept; other part types
        (images, audio, ...) are dropped.
      * ``max_tokens`` falls back to ``max_completion_tokens`` and then 4096.
      * ``tool_choice`` "none" removes the tools, "required" maps to
        ``{"type": "any"}``, and a named function maps to ``{"type": "tool"}``.

    Args:
        openai_body: Parsed JSON body of an OpenAI chat-completions request.

    Returns:
        A new dict shaped like an Anthropic Messages request.
    """
    anthropic_messages: list[dict] = []
    system_text_parts: list[str] = []

    for msg in openai_body.get("messages", []):
        role = msg.get("role", "")
        content = msg.get("content")

        if role == "system":
            if isinstance(content, str):
                system_text_parts.append(content)
            elif isinstance(content, list):
                system_text_parts.extend(
                    block["text"]
                    for block in _openai_content_to_text_blocks(content)
                )
            continue

        if role == "tool":
            # OpenAI tool response -> Anthropic user message with tool_result block
            tool_text = content if isinstance(content, str) else _extract_text(content)
            anthropic_messages.append(
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "tool_result",
                            "tool_use_id": msg.get("tool_call_id", ""),
                            "content": tool_text,
                        }
                    ],
                }
            )
            continue

        if role == "assistant":
            blocks: list[dict] = []
            if isinstance(content, str):
                if content:
                    blocks.append({"type": "text", "text": content})
            elif isinstance(content, list):
                blocks.extend(_openai_content_to_text_blocks(content))

            for tc in msg.get("tool_calls") or []:
                if not isinstance(tc, dict):
                    continue
                fn = tc.get("function") or {}
                blocks.append(
                    {
                        "type": "tool_use",
                        # `or` (not a .get default) so an explicit null/empty
                        # id still receives a generated one.
                        "id": tc.get("id") or f"toolu_{uuid.uuid4().hex[:12]}",
                        "name": fn.get("name", ""),
                        "input": _parse_openai_tool_arguments(fn.get("arguments")),
                    }
                )

            anthropic_messages.append(
                {"role": "assistant", "content": blocks if blocks else ""}
            )
            continue

        # role == "user" (or unknown -> treat as user)
        if isinstance(content, str):
            user_content = content
        elif isinstance(content, list):
            user_content = _openai_content_to_text_blocks(content) or ""
        else:
            user_content = ""
        anthropic_messages.append({"role": "user", "content": user_content})

    # Newer OpenAI clients send max_completion_tokens instead of max_tokens.
    max_tokens = (
        openai_body.get("max_tokens")
        or openai_body.get("max_completion_tokens")
        or 4096
    )
    anthropic_body: dict = {
        "model": openai_body.get("model", "default"),
        "messages": anthropic_messages,
        "max_tokens": int(max_tokens),
    }
    if system_text_parts:
        anthropic_body["system"] = "\n\n".join(p for p in system_text_parts if p)

    for key_o, key_a in (
        ("temperature", "temperature"),
        ("top_p", "top_p"),
        ("top_k", "top_k"),
        ("stop", "stop_sequences"),
        ("stream", "stream"),
    ):
        if key_o in openai_body:
            val = openai_body[key_o]
            if key_a == "stop_sequences" and isinstance(val, str):
                val = [val]
            anthropic_body[key_a] = val

    # Convert OpenAI tools -> Anthropic tools
    anthropic_tools = []
    for tool in openai_body.get("tools") or []:
        fn = (tool.get("function") or {}) if isinstance(tool, dict) else {}
        if not isinstance(fn, dict) or not fn.get("name"):
            continue
        anthropic_tools.append(
            {
                "name": fn.get("name", ""),
                "description": fn.get("description", ""),
                # A missing, null or empty schema becomes an empty object schema.
                "input_schema": fn.get("parameters")
                or {"type": "object", "properties": {}},
            }
        )
    if anthropic_tools:
        anthropic_body["tools"] = anthropic_tools

    tool_choice = openai_body.get("tool_choice")
    if tool_choice == "none":
        anthropic_body.pop("tools", None)
    elif tool_choice == "required":
        anthropic_body["tool_choice"] = {"type": "any"}
    elif isinstance(tool_choice, dict) and tool_choice.get("type") == "function":
        anthropic_body["tool_choice"] = {
            "type": "tool",
            "name": (tool_choice.get("function") or {}).get("name", ""),
        }

    return anthropic_body
2250
+
2251
+
2252
def anthropic_to_openai_response(anthropic_resp: dict) -> dict:
    """Convert an Anthropic Messages response to OpenAI Chat Completions format.

    Text blocks are concatenated into ``message.content`` (``None`` when there
    are none) and each ``tool_use`` block becomes an OpenAI function tool call
    whose ``arguments`` is the JSON-encoded block ``input``. ``stop_reason`` is
    mapped to ``finish_reason``; unknown or missing values map to ``"stop"``.

    Args:
        anthropic_resp: Parsed Anthropic Messages response body.

    Returns:
        A ``chat.completion`` dict with exactly one choice and a ``usage``
        section derived from ``input_tokens`` / ``output_tokens``.
    """
    content_blocks = anthropic_resp.get("content") or []
    if isinstance(content_blocks, str):
        # Tolerate the plain-string content form instead of dropping the text.
        content_blocks = [{"type": "text", "text": content_blocks}]

    text_parts: list[str] = []
    tool_calls: list[dict] = []
    for block in content_blocks:
        if not isinstance(block, dict):
            continue
        btype = block.get("type")
        if btype == "text":
            # `or ""` guards against an explicit null, which would break join().
            text_parts.append(block.get("text") or "")
        elif btype == "tool_use":
            tool_calls.append(
                {
                    # `or` so a present-but-empty id still gets a generated one.
                    "id": block.get("id") or f"call_{uuid.uuid4().hex[:12]}",
                    "type": "function",
                    "function": {
                        "name": block.get("name", ""),
                        "arguments": json.dumps(block.get("input", {}) or {}),
                    },
                }
            )

    finish_map = {
        "end_turn": "stop",
        "stop_sequence": "stop",
        "max_tokens": "length",
        "tool_use": "tool_calls",
    }
    stop_reason = anthropic_resp.get("stop_reason", "end_turn")
    finish_reason = finish_map.get(stop_reason, "stop")

    message: dict = {
        "role": "assistant",
        "content": "".join(text_parts) if text_parts else None,
    }
    if tool_calls:
        message["tool_calls"] = tool_calls

    usage = anthropic_resp.get("usage") or {}
    # Null counters are treated as 0 so the total never raises TypeError.
    prompt_tokens = usage.get("input_tokens") or 0
    completion_tokens = usage.get("output_tokens") or 0

    return {
        "id": anthropic_resp.get("id") or f"chatcmpl-{uuid.uuid4().hex[:12]}",
        "object": "chat.completion",
        "created": int(time.time()),
        "model": anthropic_resp.get("model", "unknown"),
        "choices": [
            {
                "index": 0,
                "message": message,
                "finish_reason": finish_reason,
            }
        ],
        "usage": {
            "prompt_tokens": prompt_tokens,
            "completion_tokens": completion_tokens,
            "total_tokens": prompt_tokens + completion_tokens,
        },
    }
2313
+
2314
+
2023
2315
  def _convert_anthropic_tools_to_openai(anthropic_tools: list[dict]) -> list[dict]:
2024
2316
  converted = []
2025
2317
  removed_pattern_fields = 0
@@ -2313,11 +2605,27 @@ def _resolve_state_machine_tool_choice(
2313
2605
 
2314
2606
  if monitor.tool_state_forced_budget_remaining <= 0:
2315
2607
  monitor.set_tool_turn_phase("review", reason="forced_budget_exhausted")
2316
- # Only count toward review cycle limit if there was an actual
2317
- # cycle/stagnation detected. Budget exhaustion alone means the
2318
- # model is working — it just used all its turns — not cycling.
2319
2608
  if cycle_looping or stagnating:
2320
2609
  monitor.tool_state_review_cycles += 1
2610
+ monitor.tool_state_unproductive_exhaustion_streak = 0
2611
+ else:
2612
+ # Track consecutive unproductive exhaustions. Even without a
2613
+ # detected cycle, if the model burns through the forced budget
2614
+ # repeatedly with distinct-but-useless tool calls, treat it as
2615
+ # a loop and force finalize. Catches the 35B-A3B failure mode
2616
+ # where different short tool calls defeat per-tool cycle
2617
+ # detection.
2618
+ monitor.tool_state_unproductive_exhaustion_streak += 1
2619
+ if monitor.tool_state_unproductive_exhaustion_streak >= PROXY_UNPRODUCTIVE_EXHAUSTION_LIMIT:
2620
+ logger.warning(
2621
+ "TOOL STATE MACHINE: %d consecutive unproductive budget exhaustions — forcing finalize",
2622
+ monitor.tool_state_unproductive_exhaustion_streak,
2623
+ )
2624
+ monitor.set_tool_turn_phase("finalize", reason="unproductive_exhaustion")
2625
+ monitor.tool_state_unproductive_exhaustion_streak = 0
2626
+ monitor.tool_state_forced_budget_remaining = 0
2627
+ monitor.tool_state_auto_budget_remaining = 0
2628
+ return "finalize", "unproductive_exhaustion"
2321
2629
  monitor.tool_state_auto_budget_remaining = max(
2322
2630
  1, PROXY_TOOL_STATE_AUTO_BUDGET
2323
2631
  )
@@ -2325,10 +2633,11 @@ def _resolve_state_machine_tool_choice(
2325
2633
  1, PROXY_TOOL_STATE_FORCED_BUDGET // 2
2326
2634
  )
2327
2635
  logger.warning(
2328
- "TOOL STATE MACHINE: forced budget exhausted, entering review (cycles=%d cycling=%s stagnating=%s)",
2636
+ "TOOL STATE MACHINE: forced budget exhausted, entering review (cycles=%d cycling=%s stagnating=%s unprod_exh=%d)",
2329
2637
  monitor.tool_state_review_cycles,
2330
2638
  cycle_looping,
2331
2639
  stagnating,
2640
+ monitor.tool_state_unproductive_exhaustion_streak,
2332
2641
  )
2333
2642
  return "required", "forced_budget_exhausted"
2334
2643
 
@@ -2612,6 +2921,8 @@ def build_openai_request(
2612
2921
  # Skip all further tool_choice logic — no tools this turn
2613
2922
  if PROXY_DISABLE_THINKING_ON_TOOL_TURNS:
2614
2923
  openai_body["enable_thinking"] = False
2924
+ if PROXY_DISABLE_SPEC_ON_TOOL_TURNS:
2925
+ openai_body["speculative.n_max"] = 0
2615
2926
  return openai_body
2616
2927
 
2617
2928
  # Check if forced-tool dampener or loop breaker should override tool_choice
@@ -2638,6 +2949,7 @@ def build_openai_request(
2638
2949
  openai_body.pop("tool_choice", None)
2639
2950
  openai_body.pop("tools", None)
2640
2951
  monitor.finalize_turn_active = True
2952
+ monitor.finalize_hard_stop_count += 1 # monotonic marker: a finalize fired this session
2641
2953
  monitor.consecutive_forced_count = 0
2642
2954
  monitor.no_progress_streak = 0
2643
2955
  # Option 3: Inject explicit "no tool calls" instruction to reduce XML leak
@@ -2732,11 +3044,21 @@ def build_openai_request(
2732
3044
  elif state_reason in {"fresh_user_text", "inactive_loop"} and n_msgs <= 1:
2733
3045
  monitor.consecutive_forced_count = 0
2734
3046
  monitor.no_progress_streak = 0
2735
- logger.info(
2736
- "tool_choice left unchanged after state reset (reason=%s n_msgs=%d)",
2737
- state_reason,
2738
- n_msgs,
2739
- )
3047
+ # Force tool_choice=required on first turn to ensure local models
3048
+ # produce a tool call instead of plain text (cold-start fix)
3049
+ if has_tools and n_msgs == 1:
3050
+ openai_body["tool_choice"] = "required"
3051
+ logger.info(
3052
+ "tool_choice forced to 'required' on first turn (reason=%s n_msgs=%d cold_start_fix=true)",
3053
+ state_reason,
3054
+ n_msgs,
3055
+ )
3056
+ else:
3057
+ logger.info(
3058
+ "tool_choice left unchanged after state reset (reason=%s n_msgs=%d)",
3059
+ state_reason,
3060
+ n_msgs,
3061
+ )
2740
3062
  elif monitor.should_release_tool_choice():
2741
3063
  openai_body["tool_choice"] = "auto"
2742
3064
  monitor.consecutive_forced_count = 0
@@ -2773,6 +3095,12 @@ def build_openai_request(
2773
3095
  "Thinking disabled for tool turn (PROXY_DISABLE_THINKING_ON_TOOL_TURNS=on)"
2774
3096
  )
2775
3097
 
3098
+ if PROXY_DISABLE_SPEC_ON_TOOL_TURNS:
3099
+ openai_body["speculative.n_max"] = 0
3100
+ logger.info(
3101
+ "Spec decoding disabled for tool turn (PROXY_DISABLE_SPEC_ON_TOOL_TURNS=on)"
3102
+ )
3103
+
2776
3104
  _apply_tool_call_grammar(openai_body, grammar_override=profile_grammar)
2777
3105
 
2778
3106
  return openai_body
@@ -5212,6 +5540,18 @@ def _inject_synthetic_continuation(
5212
5540
  Appends a no-op Read("/dev/null") tool_use block and changes stop_reason
5213
5541
  from "end_turn" to "tool_use" so the client continues sending requests.
5214
5542
  """
5543
+ # Session-level hard cap: if we've already done N continuations in this
5544
+ # session (counter is monotonic, survives fresh-user-text resets), stop
5545
+ # injecting and let the response terminate. This catches runaway loops
5546
+ # that dodge the per-cycle cap via state resets.
5547
+ if monitor.finalize_hard_stop_count >= PROXY_FINALIZE_SESSION_HARD_CAP:
5548
+ logger.warning(
5549
+ "FINALIZE CONTINUATION: session hard cap reached (%d/%d) — not injecting, allowing termination",
5550
+ monitor.finalize_hard_stop_count,
5551
+ PROXY_FINALIZE_SESSION_HARD_CAP,
5552
+ )
5553
+ return anthropic_resp
5554
+
5215
5555
  # Pick a safe tool the client knows about (case-insensitive match,
5216
5556
  # then use the client's actual casing for the tool name)
5217
5557
  if _client_has_tool(anthropic_body, "read"):
@@ -5227,6 +5567,7 @@ def _inject_synthetic_continuation(
5227
5567
  synthetic_id = f"toolu_{uuid.uuid4().hex[:12]}"
5228
5568
  monitor.finalize_synthetic_tool_id = synthetic_id
5229
5569
  monitor.finalize_continuation_count += 1
5570
+ monitor.finalize_hard_stop_count += 1
5230
5571
 
5231
5572
  content = anthropic_resp.get("content", [])
5232
5573
  content.append({
@@ -5239,11 +5580,13 @@ def _inject_synthetic_continuation(
5239
5580
  anthropic_resp["stop_reason"] = "tool_use"
5240
5581
 
5241
5582
  logger.info(
5242
- "FINALIZE CONTINUATION: injected synthetic %s tool_use id=%s (count=%d/%d)",
5583
+ "FINALIZE CONTINUATION: injected synthetic %s tool_use id=%s (count=%d/%d, session=%d/%d)",
5243
5584
  tool_name,
5244
5585
  synthetic_id,
5245
5586
  monitor.finalize_continuation_count,
5246
5587
  PROXY_FINALIZE_CONTINUATION_MAX,
5588
+ monitor.finalize_hard_stop_count,
5589
+ PROXY_FINALIZE_SESSION_HARD_CAP,
5247
5590
  )
5248
5591
  return anthropic_resp
5249
5592
 
@@ -5804,6 +6147,10 @@ async def messages(request: Request):
5804
6147
  is_stream = body.get("stream", False)
5805
6148
  model = body.get("model", "default")
5806
6149
  client_id = resolve_client_id(request)
6150
+
6151
+ # Periodically re-detect context window from upstream (handles server restarts)
6152
+ await _maybe_recheck_context_window()
6153
+
5807
6154
  if _should_passthrough_model(model):
5808
6155
  logger.info("PASSTHROUGH: model=%s -> %s", model, ANTHROPIC_API_BASE)
5809
6156
  return await _passthrough_anthropic_request(request, body, is_stream)
@@ -5861,8 +6208,9 @@ async def messages(request: Request):
5861
6208
  last_text = str(last_content)[:200]
5862
6209
  rate_count = log_client_rate(client_id)
5863
6210
  logger.info(
5864
- "REQ: client=%s rate_%ss=%d stream=%s msgs=%d tools=%d max_tokens=%s last_role=%s last_content=%.200s",
6211
+ "REQ: client=%s sess=%s rate_%ss=%d stream=%s msgs=%d tools=%d max_tokens=%s last_role=%s last_content=%.200s",
5865
6212
  client_id,
6213
+ session_id,
5866
6214
  PROXY_CLIENT_RATE_WINDOW_SECS,
5867
6215
  rate_count,
5868
6216
  is_stream,
@@ -6532,6 +6880,292 @@ async def messages_anthropic(request: Request):
6532
6880
  return await messages(request)
6533
6881
 
6534
6882
 
6883
@app.post("/v1/chat/completions")
async def chat_completions(request: Request):
    """OpenAI-compatible chat/completions endpoint for clients like Forge
    that require the OpenAI API shape.

    FULL GUARDRAIL PATH: Converts the OpenAI request to Anthropic format,
    runs the full /v1/messages pipeline (loop detection, tool narrowing,
    cycle breaking, malformed tool retry, context pruning, etc.), then
    converts the Anthropic response back to OpenAI format.

    Streaming is down-converted to a single final OpenAI SSE chunk sequence
    built from the completed Anthropic response (not token-by-token from
    upstream). This preserves guardrails at the cost of stream granularity.

    Args:
        request: Incoming HTTP request whose body is an OpenAI
            chat/completions JSON object.

    Returns:
        A JSON ``Response`` in OpenAI shape (or an OpenAI-shaped error), or a
        ``StreamingResponse`` of ``chat.completion.chunk`` SSE events when the
        client asked for ``stream``.
    """

    def _error_response(message: str, error_type: str, status: int) -> Response:
        """Build an OpenAI-shaped JSON error response."""
        payload = {"error": {"message": message, "type": error_type}}
        return Response(
            content=json.dumps(payload).encode(),
            status_code=status,
            media_type="application/json",
        )

    def _count(value) -> int:
        """Length used for logging only; tolerates null or non-list fields."""
        return len(value) if isinstance(value, (list, tuple)) else 0

    body_bytes = await request.body()
    try:
        openai_body = json.loads(body_bytes) if body_bytes else {}
    except (ValueError, TypeError):
        return Response(
            content=b'{"error":{"message":"invalid JSON","type":"invalid_request_error"}}',
            status_code=400,
            media_type="application/json",
        )
    # Valid JSON that is not an object (list, string, number) would otherwise
    # crash on the .get() calls below and surface as a 500.
    if not isinstance(openai_body, dict):
        return _error_response(
            "request body must be a JSON object", "invalid_request_error", 400
        )

    requested_stream = bool(openai_body.get("stream", False))
    model = openai_body.get("model", "default")
    client_id = resolve_client_id(request)

    logger.info(
        "CHAT (guarded): client=%s model=%s stream=%s msgs=%d tools=%d",
        client_id,
        model,
        requested_stream,
        _count(openai_body.get("messages")),
        _count(openai_body.get("tools")),
    )

    # Convert OpenAI request -> Anthropic request
    anthropic_body = openai_to_anthropic_request(openai_body)
    # Force non-streaming through the pipeline; we re-stream at the end if the
    # client wanted streaming. This keeps guardrail logic simpler/consistent.
    anthropic_body["stream"] = False

    # Build a synthetic Request that the existing messages() handler can consume
    fake_body_bytes = json.dumps(anthropic_body).encode("utf-8")

    async def receive():
        return {"type": "http.request", "body": fake_body_bytes, "more_body": False}

    fake_scope = dict(request.scope)
    # Preserve client/headers but override the body + path
    fake_scope["path"] = "/v1/messages"
    fake_scope["raw_path"] = b"/v1/messages"
    # Strip content-length since the body changes
    fake_scope["headers"] = [
        (k, v)
        for (k, v) in fake_scope.get("headers", [])
        if k.lower() != b"content-length"
    ]
    fake_request = Request(fake_scope, receive)

    # Run the full guarded Anthropic pipeline
    inner_resp = await messages(fake_request)

    # Extract the Anthropic-format JSON from whatever messages() returned
    parsed = None
    status_code = 200
    # Checked before the generic Response branch: Starlette's
    # StreamingResponse is itself a Response subclass.
    if isinstance(inner_resp, StreamingResponse):
        # Pipeline shouldn't stream because we set stream=False, but defensively
        # consume the stream and parse the final message event.
        chunks: list[bytes] = []
        async for chunk in inner_resp.body_iterator:
            if isinstance(chunk, bytes):
                chunks.append(chunk)
            elif isinstance(chunk, str):
                chunks.append(chunk.encode("utf-8"))
        raw = b"".join(chunks)
        # Try to parse as JSON directly first, then fall back to SSE parsing
        try:
            parsed = json.loads(raw)
        except (ValueError, TypeError):
            parsed = _parse_anthropic_sse_to_message(raw)
    elif isinstance(inner_resp, Response):
        status_code = inner_resp.status_code
        try:
            parsed = json.loads(inner_resp.body)
        except (ValueError, TypeError):
            parsed = None
    elif isinstance(inner_resp, dict):
        parsed = inner_resp

    # Anything that is not a JSON object cannot be an Anthropic message.
    anthropic_resp_dict: dict | None = parsed if isinstance(parsed, dict) else None

    if anthropic_resp_dict is None or "content" not in anthropic_resp_dict:
        # Upstream error: forward as-is in OpenAI error shape
        err_msg = "upstream returned no message"
        upstream_error = anthropic_resp_dict.get("error") if anthropic_resp_dict else None
        if isinstance(upstream_error, dict):
            err_msg = upstream_error.get("message") or err_msg
        elif isinstance(upstream_error, str) and upstream_error:
            # Some upstreams report {"error": "text"} rather than an object.
            err_msg = upstream_error
        return _error_response(
            err_msg,
            "upstream_error",
            status_code if status_code >= 400 else 502,
        )

    # Ensure model field is set for response
    anthropic_resp_dict.setdefault("model", model)
    openai_resp = anthropic_to_openai_response(anthropic_resp_dict)

    if not requested_stream:
        return Response(
            content=json.dumps(openai_resp).encode(),
            status_code=200,
            media_type="application/json",
        )

    # Client requested streaming: emit the response as OpenAI SSE chunks
    async def emit_openai_stream():
        choice = openai_resp["choices"][0]
        message = choice["message"]

        def sse_chunk(delta: dict, finish_reason) -> bytes:
            """Serialize one chat.completion.chunk event for choice index 0."""
            chunk_obj = {
                "id": openai_resp["id"],
                "object": "chat.completion.chunk",
                "created": openai_resp["created"],
                "model": openai_resp["model"],
                "choices": [
                    {"index": 0, "delta": delta, "finish_reason": finish_reason}
                ],
            }
            return f"data: {json.dumps(chunk_obj)}\n\n".encode()

        # Opening chunk: role
        yield sse_chunk({"role": "assistant"}, None)

        # Content chunk
        if message.get("content"):
            yield sse_chunk({"content": message["content"]}, None)

        # Tool call chunks (one complete call per chunk)
        for idx, tc in enumerate(message.get("tool_calls", []) or []):
            tool_delta = {
                "tool_calls": [
                    {
                        "index": idx,
                        "id": tc["id"],
                        "type": "function",
                        "function": {
                            "name": tc["function"]["name"],
                            "arguments": tc["function"]["arguments"],
                        },
                    }
                ]
            }
            yield sse_chunk(tool_delta, None)

        # Final chunk with finish_reason
        yield sse_chunk({}, choice["finish_reason"])
        yield b"data: [DONE]\n\n"

    return StreamingResponse(
        emit_openai_stream(),
        media_type="text/event-stream",
        headers={"Cache-Control": "no-cache", "X-Accel-Buffering": "no"},
    )
7078
+
7079
+
7080
+ def _parse_anthropic_sse_to_message(raw: bytes) -> dict | None:
7081
+ """Parse a concatenated Anthropic SSE stream into a final message dict.
7082
+ Used as a fallback when messages() returns a StreamingResponse despite stream=False.
7083
+ """
7084
+ try:
7085
+ text = raw.decode("utf-8", errors="replace")
7086
+ except Exception:
7087
+ return None
7088
+
7089
+ text_parts: list[str] = []
7090
+ tool_uses: list[dict] = []
7091
+ usage = {"input_tokens": 0, "output_tokens": 0}
7092
+ stop_reason = "end_turn"
7093
+ model = "unknown"
7094
+ message_id = f"msg_{uuid.uuid4().hex[:24]}"
7095
+
7096
+ current_block: dict | None = None
7097
+ current_json_buffer = ""
7098
+
7099
+ for line in text.splitlines():
7100
+ if not line.startswith("data:"):
7101
+ continue
7102
+ payload = line[5:].strip()
7103
+ if not payload or payload == "[DONE]":
7104
+ continue
7105
+ try:
7106
+ evt = json.loads(payload)
7107
+ except (ValueError, TypeError):
7108
+ continue
7109
+ etype = evt.get("type")
7110
+ if etype == "message_start":
7111
+ m = evt.get("message", {}) or {}
7112
+ message_id = m.get("id", message_id)
7113
+ model = m.get("model", model)
7114
+ if "usage" in m:
7115
+ usage.update(m["usage"])
7116
+ elif etype == "content_block_start":
7117
+ current_block = evt.get("content_block", {})
7118
+ current_json_buffer = ""
7119
+ if current_block.get("type") == "text":
7120
+ text_parts.append(current_block.get("text", ""))
7121
+ elif etype == "content_block_delta":
7122
+ d = evt.get("delta", {}) or {}
7123
+ if d.get("type") == "text_delta":
7124
+ text_parts.append(d.get("text", ""))
7125
+ elif d.get("type") == "input_json_delta":
7126
+ current_json_buffer += d.get("partial_json", "")
7127
+ elif etype == "content_block_stop":
7128
+ if current_block and current_block.get("type") == "tool_use":
7129
+ try:
7130
+ input_obj = json.loads(current_json_buffer) if current_json_buffer else {}
7131
+ except (ValueError, TypeError):
7132
+ input_obj = {}
7133
+ tool_uses.append(
7134
+ {
7135
+ "type": "tool_use",
7136
+ "id": current_block.get("id", f"toolu_{uuid.uuid4().hex[:12]}"),
7137
+ "name": current_block.get("name", ""),
7138
+ "input": input_obj,
7139
+ }
7140
+ )
7141
+ current_block = None
7142
+ current_json_buffer = ""
7143
+ elif etype == "message_delta":
7144
+ d = evt.get("delta", {}) or {}
7145
+ if "stop_reason" in d:
7146
+ stop_reason = d["stop_reason"] or stop_reason
7147
+ u = evt.get("usage", {}) or {}
7148
+ if u:
7149
+ usage.update(u)
7150
+
7151
+ content: list[dict] = []
7152
+ joined_text = "".join(text_parts)
7153
+ if joined_text:
7154
+ content.append({"type": "text", "text": joined_text})
7155
+ content.extend(tool_uses)
7156
+
7157
+ return {
7158
+ "id": message_id,
7159
+ "type": "message",
7160
+ "role": "assistant",
7161
+ "content": content if content else [{"type": "text", "text": ""}],
7162
+ "model": model,
7163
+ "stop_reason": stop_reason,
7164
+ "stop_sequence": None,
7165
+ "usage": usage,
7166
+ }
7167
+
7168
+
6535
7169
  @app.get("/v1/models")
6536
7170
  async def models():
6537
7171
  """Return available model list (spoofs Anthropic model IDs for client compatibility)."""