caudate-cli 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (153) hide show
  1. api/__init__.py +5 -0
  2. api/anthropic_compat.py +1518 -0
  3. api/artifact_viewer.py +366 -0
  4. api/caudate_middleware.py +618 -0
  5. api/forge_bootstrapper_routes.py +377 -0
  6. api/forge_routes.py +630 -0
  7. api/forge_system_routes.py +294 -0
  8. api/openai_compat.py +1993 -0
  9. api/server.py +667 -0
  10. api/storyboard_page.py +677 -0
  11. caudate_cli-0.1.0.dist-info/METADATA +354 -0
  12. caudate_cli-0.1.0.dist-info/RECORD +153 -0
  13. caudate_cli-0.1.0.dist-info/WHEEL +5 -0
  14. caudate_cli-0.1.0.dist-info/entry_points.txt +2 -0
  15. caudate_cli-0.1.0.dist-info/licenses/LICENSE +21 -0
  16. caudate_cli-0.1.0.dist-info/top_level.txt +14 -0
  17. cognos_mcp/__init__.py +4 -0
  18. cognos_mcp/bridge.py +41 -0
  19. cognos_mcp/client.py +70 -0
  20. cognos_mcp/config.py +49 -0
  21. cognos_mcp/server.py +66 -0
  22. config.py +82 -0
  23. core/__init__.py +0 -0
  24. core/agent.py +468 -0
  25. core/agentic_loop.py +731 -0
  26. core/anthropic_auth.py +91 -0
  27. core/background.py +113 -0
  28. core/banner.py +134 -0
  29. core/bootstrap.py +292 -0
  30. core/citations.py +131 -0
  31. core/compaction.py +109 -0
  32. core/constitution.py +198 -0
  33. core/diff_viewer.py +87 -0
  34. core/export.py +85 -0
  35. core/file_refs.py +119 -0
  36. core/files.py +199 -0
  37. core/hooks.py +209 -0
  38. core/image.py +599 -0
  39. core/input.py +91 -0
  40. core/loop.py +238 -0
  41. core/memory_md.py +147 -0
  42. core/notifications.py +99 -0
  43. core/ownership.py +181 -0
  44. core/paste.py +81 -0
  45. core/permissions.py +210 -0
  46. core/plan_mode.py +215 -0
  47. core/sandbox_prompt.py +185 -0
  48. core/scheduler.py +195 -0
  49. core/schemas.py +202 -0
  50. core/session.py +90 -0
  51. core/settings.py +132 -0
  52. core/skills.py +398 -0
  53. core/slash_commands.py +977 -0
  54. core/statusline.py +61 -0
  55. core/subagent.py +300 -0
  56. core/thinking.py +50 -0
  57. core/updater.py +122 -0
  58. core/usage.py +109 -0
  59. core/worktree.py +93 -0
  60. execution/__init__.py +0 -0
  61. execution/executor.py +329 -0
  62. execution/plugins.py +108 -0
  63. execution/tools/__init__.py +0 -0
  64. execution/tools/agent_tool.py +107 -0
  65. execution/tools/agentic_tool.py +297 -0
  66. execution/tools/artifact_tool.py +191 -0
  67. execution/tools/ask_user_question_tool.py +137 -0
  68. execution/tools/base.py +81 -0
  69. execution/tools/calculator_tool.py +137 -0
  70. execution/tools/cognos_card_tool.py +124 -0
  71. execution/tools/cron_tool.py +215 -0
  72. execution/tools/datetime_tool.py +215 -0
  73. execution/tools/describe_image_tool.py +161 -0
  74. execution/tools/draw_tool.py +164 -0
  75. execution/tools/edit_image_tool.py +262 -0
  76. execution/tools/edit_tool.py +245 -0
  77. execution/tools/file_tool.py +90 -0
  78. execution/tools/find_anywhere_tool.py +255 -0
  79. execution/tools/forge_feature_tools.py +377 -0
  80. execution/tools/glob_tool.py +59 -0
  81. execution/tools/grep_tool.py +89 -0
  82. execution/tools/http_request_tool.py +224 -0
  83. execution/tools/load_skill_tool.py +104 -0
  84. execution/tools/longcat_avatar_tool.py +384 -0
  85. execution/tools/mcp_tool.py +100 -0
  86. execution/tools/notebook_tool.py +279 -0
  87. execution/tools/openapi_tool.py +440 -0
  88. execution/tools/plan_mode_tool.py +95 -0
  89. execution/tools/push_notification_tool.py +157 -0
  90. execution/tools/python_tool.py +61 -0
  91. execution/tools/respond_tool.py +40 -0
  92. execution/tools/sandbox_tool.py +378 -0
  93. execution/tools/search_tool.py +153 -0
  94. execution/tools/semantic_search_tool.py +106 -0
  95. execution/tools/shell_tool.py +283 -0
  96. execution/tools/speak_tool.py +134 -0
  97. execution/tools/storyboard_tool.py +727 -0
  98. execution/tools/system_info_tool.py +212 -0
  99. execution/tools/task_tool.py +323 -0
  100. execution/tools/think_tool.py +49 -0
  101. execution/tools/transcribe_audio_tool.py +86 -0
  102. execution/tools/update_memory_tool.py +92 -0
  103. execution/tools/web_fetch_tool.py +82 -0
  104. execution/tools/worktree_tool.py +174 -0
  105. llm/__init__.py +0 -0
  106. llm/fallback.py +116 -0
  107. llm/models.py +320 -0
  108. llm/provider.py +1356 -0
  109. llm/router.py +373 -0
  110. main.py +1889 -0
  111. memory/__init__.py +0 -0
  112. memory/episodic.py +99 -0
  113. memory/procedural.py +145 -0
  114. memory/semantic.py +71 -0
  115. memory/working.py +64 -0
  116. nn/__init__.py +43 -0
  117. nn/auto_evolve.py +245 -0
  118. nn/caudate.py +136 -0
  119. nn/config.py +141 -0
  120. nn/consolidator.py +81 -0
  121. nn/data.py +1635 -0
  122. nn/encoder.py +258 -0
  123. nn/forge_advisor.py +303 -0
  124. nn/format.py +235 -0
  125. nn/heads.py +432 -0
  126. nn/observer.py +994 -0
  127. nn/policy.py +214 -0
  128. nn/runtime.py +343 -0
  129. nn/scorer.py +175 -0
  130. nn/trainer.py +515 -0
  131. nn/vision.py +352 -0
  132. personality/__init__.py +23 -0
  133. personality/engine.py +129 -0
  134. personality/identity.py +144 -0
  135. personality/inner_voice.py +100 -0
  136. personality/mood.py +205 -0
  137. planning/__init__.py +0 -0
  138. planning/dev_server.py +221 -0
  139. planning/forge_models.py +718 -0
  140. planning/orchestrator.py +1363 -0
  141. planning/planner.py +451 -0
  142. planning/task_graph.py +61 -0
  143. reflection/__init__.py +0 -0
  144. reflection/meta_learner.py +156 -0
  145. reflection/reflector.py +127 -0
  146. ui/__init__.py +5 -0
  147. ui/display.py +88 -0
  148. voice/__init__.py +0 -0
  149. voice/conversation.py +125 -0
  150. voice/listener.py +111 -0
  151. voice/speaker.py +59 -0
  152. voice/stt.py +126 -0
  153. voice/tts.py +214 -0
@@ -0,0 +1,1518 @@
1
+ """Anthropic Messages API compatibility layer.
2
+
3
+ Lets Claude Code (or any other Anthropic-format client) point at Cognos
4
+ and get answers back as if Cognos were Anthropic. Internally:
5
+
6
+ incoming /v1/messages (Anthropic schema)
7
+
8
+
9
+ translate to Cognos's internal message format
10
+
11
+
12
+ route through CognosAgent.llm (DualLLMProvider — Caudate + dual-brain
13
+ routing + fallback chain + prompt caching all engaged)
14
+
15
+
16
+ translate response back to Anthropic schema (regular or SSE stream)
17
+
18
+
19
+ Claude Code consumes it the same way it consumes a real Anthropic call
20
+
21
+ This is a pure LLM proxy — Claude Code keeps doing its own tool
22
+ execution. Cognos does NOT run its agentic loop here. The benefit is
23
+ that Claude Code's well-engineered REPL + tool stack is preserved, and
24
+ Cognos contributes its routing brain, memory, and (eventually) Caudate
25
+ predictions to every call.
26
+
27
+ Set Claude Code's env to use Cognos:
28
+
29
+ export ANTHROPIC_BASE_URL=http://127.0.0.1:8000
30
+ export ANTHROPIC_AUTH_TOKEN=cognos
31
+ export ANTHROPIC_API_KEY=""
32
+ claude # now talks to Cognos instead of Anthropic
33
+ """
34
+
35
+ from __future__ import annotations
36
+
37
+ import asyncio
38
+ import base64
39
+ import json
40
+ import logging
41
+ import os
42
+ import time
43
+ import uuid
44
+ from typing import Any, AsyncIterator
45
+
46
+ import httpx
47
+ from fastapi import APIRouter, HTTPException, Request
48
+ from fastapi.responses import JSONResponse, StreamingResponse
49
+
50
+ from api.caudate_middleware import CaudateMiddleware
51
+ from core.schemas import StreamEvent, ToolUseBlock
52
+ from llm.provider import LLMProvider
53
+
54
+
55
+ # ---- Anthropic passthrough -------------------------------------------
56
+ # When system1 (or the requested model) is an Anthropic Claude id,
57
+ # Cognos forwards the request to api.anthropic.com using the caller's
58
+ # original Authorization header (the user's Claude Code subscription
59
+ # token) instead of routing through LiteLLM/Ollama. Caudate still
60
+ # observes — she sees Opus's behaviour and learns from it.
61
+
62
+ ANTHROPIC_API_URL = "https://api.anthropic.com/v1/messages"
63
+ _ANTHROPIC_VERSION_DEFAULT = "2023-06-01"
64
+
65
+ # Optionally force-enable extended thinking on the upstream request so
66
+ # Caudate observes Opus's reasoning channel. Off by default; opt in with
67
+ # COGNOS_FORCE_THINKING=1 (see _maybe_force_thinking).
68
+ _FORCE_THINKING_DEFAULT_BUDGET = 4096
69
+ _FORCE_THINKING_MIN_MAX_TOKENS = 1024 # don't force on tiny requests
70
+
71
+ # Headers we forward verbatim from the incoming request to Anthropic.
72
+ # Anything else (host, content-length, accept-encoding, x-forwarded-*)
73
+ # is dropped so httpx can compute its own.
74
+ _FORWARD_REQUEST_HEADERS = {
75
+ "authorization", "anthropic-version", "anthropic-beta",
76
+ "x-api-key", "anthropic-dangerous-direct-browser-access",
77
+ }
78
+
79
+
80
+ def _resolve_anthropic_model(requested_model: str | None) -> str | None:
81
+ """Return the Anthropic model id to send upstream, or None if this
82
+ request should NOT use the Anthropic passthrough.
83
+
84
+ Decision order:
85
+ 1. requested_model is a Cognos-internal id (has a `[...]` suffix
86
+ like "claude-opus-4-7[1m]" that Claude Code self-reports) → None,
87
+ route locally via the dual-brain path (these ids 404 upstream)
88
+ 2. requested_model is already a real claude id → use as-is
89
+ 3. requested_model has the "anthropic/" prefix → strip it
90
+ 4. settings.system1 starts with "anthropic/" → use that resolved id
91
+ (catches the case where the client picked a "cognos-*" alias)
92
+ 5. otherwise → None (fall through to local LiteLLM path)
93
+ """
94
+ if requested_model:
95
+ m = requested_model.strip()
96
+ # Claude Code identifies itself with a bracketed context-window
97
+ # suffix (e.g. "claude-opus-4-7[1m]"). That id is not in the
98
+ # Anthropic public catalog and forwarding it 404s — drop to
99
+ # local dual-brain routing instead.
100
+ if "[" in m and m.endswith("]"):
101
+ return None
102
+ if m.startswith("claude-"):
103
+ return m
104
+ if m.startswith("anthropic/"):
105
+ return m.split("/", 1)[1]
106
+ # Settings-driven fallback: if system1 is configured for Anthropic,
107
+ # any client request lands on Opus regardless of which "cognos-*"
108
+ # alias they picked.
109
+ try:
110
+ from core.settings import Settings
111
+ s1 = (Settings.load().get("system1") or "")
112
+ if s1.startswith("anthropic/"):
113
+ return s1.split("/", 1)[1]
114
+ except Exception:
115
+ pass
116
+ return None
117
+
118
+
119
+ # Capability matrix per model family. Claude Code sends a fistful of
120
+ # request fields that only the newer/larger models accept (extended
121
+ # thinking, the effort knob, etc). When the resolved upstream model
122
+ # doesn't support a given field, we strip it before forwarding — else
123
+ # Anthropic returns 400s like `adaptive thinking is not supported on
124
+ # this model` or `does not support the effort parameter`.
125
+ _THINKING_CAPABLE_PREFIXES = (
126
+ "claude-opus-",
127
+ "claude-sonnet-",
128
+ )
129
+ _EFFORT_CAPABLE_PREFIXES = (
130
+ "claude-opus-",
131
+ "claude-sonnet-",
132
+ )
133
+
134
+
135
+ def _model_supports_thinking(model_id: str) -> bool:
136
+ if not model_id:
137
+ return False
138
+ return any(model_id.startswith(p) for p in _THINKING_CAPABLE_PREFIXES)
139
+
140
+
141
+ def _model_supports_effort(model_id: str) -> bool:
142
+ if not model_id:
143
+ return False
144
+ return any(model_id.startswith(p) for p in _EFFORT_CAPABLE_PREFIXES)
145
+
146
+
147
+ def _strip_thinking_dependent_context_management(
148
+ upstream_body: dict[str, Any],
149
+ ) -> dict[str, Any]:
150
+ """Remove `context_management.edits` entries whose `type` mentions
151
+ 'thinking' — they require thinking to be enabled and Anthropic
152
+ rejects with 400 when it's been stripped (e.g. Haiku path).
153
+
154
+ If the resulting edits list is empty, drop the whole
155
+ `context_management` field; otherwise keep it with the surviving
156
+ edits."""
157
+ cm = upstream_body.get("context_management")
158
+ if not isinstance(cm, dict):
159
+ return upstream_body
160
+ edits = cm.get("edits")
161
+ if not isinstance(edits, list):
162
+ return upstream_body
163
+ surviving = [
164
+ e for e in edits
165
+ if not (isinstance(e, dict)
166
+ and isinstance(e.get("type"), str)
167
+ and "thinking" in e["type"])
168
+ ]
169
+ if not surviving:
170
+ upstream_body.pop("context_management", None)
171
+ else:
172
+ upstream_body["context_management"] = {**cm, "edits": surviving}
173
+ return upstream_body
174
+
175
+
176
def _strip_unsupported_thinking(
    upstream_body: dict[str, Any], model_id: str,
) -> dict[str, Any]:
    """Remove `thinking` (and thinking-dependent context_management edits)
    when `model_id` is not thinking-capable; otherwise return the body as-is.
    """
    if _model_supports_thinking(model_id):
        return upstream_body
    upstream_body.pop("thinking", None)
    return _strip_thinking_dependent_context_management(upstream_body)
185
+
186
+
187
def _strip_unsupported_fields(
    upstream_body: dict[str, Any], model_id: str,
) -> dict[str, Any]:
    """Strip request fields that the resolved upstream model doesn't accept.

    Runs the thinking check first, then the effort check. `effort` is
    removed both from the top level and from inside `output_config`; an
    `output_config` left empty by that removal is dropped entirely.
    """
    body = _strip_unsupported_thinking(upstream_body, model_id)
    if _model_supports_effort(model_id):
        return body

    body.pop("effort", None)
    output_cfg = body.get("output_config")
    if not isinstance(output_cfg, dict) or "effort" not in output_cfg:
        return body

    remaining = {key: val for key, val in output_cfg.items() if key != "effort"}
    if remaining:
        body["output_config"] = remaining
    else:
        body.pop("output_config", None)
    return body
207
+
208
+
209
+ def _maybe_force_thinking(upstream_body: dict[str, Any]) -> dict[str, Any]:
210
+ """Inject `thinking: {type: enabled}` if the caller didn't ask for it.
211
+
212
+ Caudate's training depends on observing Opus's reasoning channel —
213
+ not just the visible answer. Without this, Claude Code's default
214
+ request shape leaves thinking off and Caudate sees only the surface
215
+ output.
216
+
217
+ Skips when:
218
+ - caller already set `thinking` (respect explicit choice)
219
+ - max_tokens is very small (thinking budget would starve the answer)
220
+ - COGNOS_FORCE_THINKING=0 in env (escape hatch)
221
+ """
222
+ # Default OFF: Claude Code already requests thinking when it wants
223
+ # it (and Anthropic rejects requests where forced budget brushes
224
+ # against max_tokens, which produces 400s on title-gen / tab-
225
+ # completion calls). Opt in via COGNOS_FORCE_THINKING=1.
226
+ if os.environ.get("COGNOS_FORCE_THINKING", "0") != "1":
227
+ return upstream_body
228
+ if "thinking" in upstream_body:
229
+ return upstream_body
230
+ max_tokens = int(upstream_body.get("max_tokens") or 0)
231
+ # Need enough headroom: thinking + answer must both fit in max_tokens.
232
+ # Skip unless we can guarantee a budget *and* leave 1024 tokens free.
233
+ if not max_tokens or max_tokens < (_FORCE_THINKING_MIN_MAX_TOKENS + 1024):
234
+ return upstream_body
235
+ budget = min(_FORCE_THINKING_DEFAULT_BUDGET, max_tokens - 1024)
236
+ if budget < 1024:
237
+ return upstream_body
238
+ upstream_body["thinking"] = {"type": "enabled", "budget_tokens": budget}
239
+ if "temperature" in upstream_body and upstream_body["temperature"] != 1:
240
+ upstream_body["temperature"] = 1
241
+ upstream_body.pop("top_p", None)
242
+ upstream_body.pop("top_k", None)
243
+ return upstream_body
244
+
245
+
246
def _filter_forward_headers(request: Request) -> dict[str, str]:
    """Build the header dict sent upstream from the incoming request.

    Only headers whose lower-cased name is in `_FORWARD_REQUEST_HEADERS`
    are copied. `anthropic-version` falls back to the module default when
    the caller didn't send one, and `content-type` is always set to JSON.
    """
    forwarded: dict[str, str] = {
        name: value
        for name, value in request.headers.items()
        if name.lower() in _FORWARD_REQUEST_HEADERS
    }
    if "anthropic-version" not in forwarded:
        forwarded["anthropic-version"] = _ANTHROPIC_VERSION_DEFAULT
    forwarded["content-type"] = "application/json"
    return forwarded
255
+
256
+
257
+ def _parse_sse_event(raw_event: str) -> tuple[str | None, dict[str, Any] | None]:
258
+ """Parse one SSE event block ('event: X\\ndata: {...}') into (type, data)."""
259
+ event_type: str | None = None
260
+ data_lines: list[str] = []
261
+ for line in raw_event.splitlines():
262
+ if line.startswith("event:"):
263
+ event_type = line[6:].strip()
264
+ elif line.startswith("data:"):
265
+ data_lines.append(line[5:].strip())
266
+ if not data_lines:
267
+ return event_type, None
268
+ try:
269
+ data = json.loads("\n".join(data_lines))
270
+ except Exception:
271
+ return event_type, None
272
+ return event_type, data
273
+
274
+
275
async def _passthrough_anthropic_stream(
    *,
    upstream_body: dict[str, Any],
    headers: dict[str, str],
    middleware: CaudateMiddleware | None,
    turn_ctx: Any,
) -> AsyncIterator[bytes]:
    """Forward streaming /v1/messages to api.anthropic.com.

    Bytes are forwarded to the client unchanged so SSE event ordering
    and field shape are preserved exactly. Each event is *also* parsed
    in-flight so Caudate observes text/thinking/tool deltas as they
    happen.

    Args:
        upstream_body: JSON body POSTed to `ANTHROPIC_API_URL`.
        headers: Request headers sent upstream.
        middleware: Observer for the turn; observation is skipped when
            this or `turn_ctx` is None.
        turn_ctx: Opaque per-turn context handed back to the middleware.

    Yields:
        Raw upstream chunks. If upstream answers with status >= 400, or
        an exception escapes, a single SSE `error` event is yielded
        instead. `middleware.end_turn` is always called on exit with the
        error flag.
    """
    # Function-scope import: only this tee path needs it.
    import codecs

    error_occurred = False
    block_types: dict[int, str] = {}  # index -> block type ("text"/"thinking"/"tool_use")
    block_tool_names: dict[int, str] = {}  # index -> tool name (for stop-event lookup)
    block_tool_inputs: dict[int, str] = {}  # index -> accumulated input_json string
    pending_chunk = ""
    # Network chunks can end in the middle of a multi-byte UTF-8 sequence.
    # An incremental decoder carries the partial bytes over to the next
    # chunk instead of turning them into replacement characters in the
    # text the middleware observes. Forwarded bytes are unaffected.
    decoder = codecs.getincrementaldecoder("utf-8")(errors="replace")

    try:
        async with httpx.AsyncClient(timeout=httpx.Timeout(300.0, connect=15.0)) as client:
            async with client.stream(
                "POST", ANTHROPIC_API_URL,
                headers=headers,
                json=upstream_body,
            ) as resp:
                if resp.status_code >= 400:
                    error_occurred = True
                    body_bytes = await resp.aread()
                    error_payload = {
                        "type": "error",
                        "error": {
                            "type": "api_error",
                            "message": body_bytes.decode("utf-8", errors="replace"),
                        },
                    }
                    yield f"event: error\ndata: {json.dumps(error_payload)}\n\n".encode()
                    return

                async for chunk in resp.aiter_bytes():
                    if not chunk:
                        continue
                    yield chunk
                    # Tee into Caudate. Accumulate until we see a blank
                    # line which terminates one SSE event.
                    pending_chunk += decoder.decode(chunk)
                    while "\n\n" in pending_chunk:
                        raw_event, pending_chunk = pending_chunk.split("\n\n", 1)
                        if not raw_event.strip():
                            continue
                        evt_type, data = _parse_sse_event(raw_event)
                        if data is None or middleware is None or turn_ctx is None:
                            continue
                        try:
                            if evt_type == "content_block_start":
                                idx = int(data.get("index", -1))
                                cb = data.get("content_block") or {}
                                block_types[idx] = cb.get("type", "")
                                if cb.get("type") == "tool_use":
                                    name = cb.get("name", "")
                                    block_tool_names[idx] = name
                                    block_tool_inputs[idx] = ""
                                    middleware.observe_tool_use(turn_ctx, name)
                            elif evt_type == "content_block_delta":
                                idx = int(data.get("index", -1))
                                delta = data.get("delta") or {}
                                dtype = delta.get("type")
                                if dtype == "text_delta":
                                    middleware.observe_response_text(turn_ctx, delta.get("text", ""))
                                elif dtype == "thinking_delta":
                                    middleware.observe_thinking(turn_ctx, delta.get("thinking", ""))
                                elif dtype == "input_json_delta":
                                    # Accumulate the streamed JSON of a
                                    # tool_use block so we can capture
                                    # questions/inputs (e.g.
                                    # AskUserQuestion's question + options)
                                    # for Caudate's training context.
                                    block_tool_inputs[idx] = (
                                        block_tool_inputs.get(idx, "")
                                        + (delta.get("partial_json") or "")
                                    )
                            elif evt_type == "content_block_stop":
                                idx = int(data.get("index", -1))
                                if block_types.get(idx) == "tool_use":
                                    name = block_tool_names.get(idx, "")
                                    raw = block_tool_inputs.get(idx, "")
                                    if raw:
                                        # Surface the structured input as
                                        # part of the response text so
                                        # Caudate's text-encoder sees it.
                                        middleware.observe_response_text(
                                            turn_ctx,
                                            f"\n[tool_use {name}({raw[:1000]})]\n",
                                        )
                        except Exception as e:
                            # Observation is best-effort; never break the
                            # client-facing stream because of it.
                            logger.debug("caudate observe (passthrough stream) failed: %s", e)
    except Exception as e:
        logger.exception("Anthropic passthrough stream failed")
        error_occurred = True
        failure_payload = {
            "type": "error",
            "error": {"type": "api_error", "message": str(e)},
        }
        yield f"event: error\ndata: {json.dumps(failure_payload)}\n\n".encode()
    finally:
        if middleware is not None and turn_ctx is not None:
            middleware.end_turn(turn_ctx, error=error_occurred)
380
+
381
+
382
async def _passthrough_anthropic_nonstream(
    *,
    upstream_body: dict[str, Any],
    headers: dict[str, str],
    middleware: CaudateMiddleware | None,
    turn_ctx: Any,
) -> JSONResponse:
    """Forward a non-streaming /v1/messages call and observe the reply.

    Upstream status >= 400 is returned to the caller with the same status
    code, wrapped in an `api_error` envelope whose message is the upstream
    body text. Any exception is logged and re-raised as HTTP 502. When
    both `middleware` and `turn_ctx` are provided, the response's content
    blocks are reported to the middleware and the turn is ended.
    """
    observing = middleware is not None and turn_ctx is not None

    def _finish(failed: bool) -> None:
        if observing:
            middleware.end_turn(turn_ctx, error=failed)

    def _observe(payload: dict[str, Any]) -> None:
        for block in payload.get("content") or []:
            kind = block.get("type")
            if kind == "text":
                middleware.observe_response_text(turn_ctx, block.get("text", ""))
            elif kind == "thinking":
                middleware.observe_thinking(turn_ctx, block.get("thinking", ""))
            elif kind == "tool_use":
                tool_name = block.get("name", "")
                middleware.observe_tool_use(turn_ctx, tool_name)
                # Also surface the tool's structured input (truncated) so
                # the observer sees the intent, not just the tool name.
                try:
                    rendered = json.dumps(block.get("input") or {}, ensure_ascii=False)[:1000]
                except Exception:
                    rendered = str(block.get("input"))[:1000]
                if rendered:
                    middleware.observe_response_text(
                        turn_ctx, f"\n[tool_use {tool_name}({rendered})]\n",
                    )

    try:
        async with httpx.AsyncClient(timeout=httpx.Timeout(300.0, connect=15.0)) as client:
            resp = await client.post(
                ANTHROPIC_API_URL, headers=headers, json=upstream_body,
            )
            status = resp.status_code
            if status >= 400:
                _finish(True)
                return JSONResponse(
                    status_code=status,
                    content={"type": "error", "error": {
                        "type": "api_error",
                        "message": resp.text,
                    }},
                )
            data = resp.json()
            if observing:
                try:
                    _observe(data)
                except Exception as e:
                    # Observation is best-effort; the response still goes out.
                    logger.debug(f"caudate observe (passthrough nonstream) failed: {e}")
                _finish(False)
            return JSONResponse(content=data, status_code=status)
    except Exception as e:
        logger.exception("Anthropic passthrough nonstream failed")
        _finish(True)
        raise HTTPException(502, f"Anthropic upstream error: {e}")
437
+
438
+ logger = logging.getLogger(__name__)
439
+
440
+
441
+ # ---- Translation helpers ---------------------------------------------
442
+
443
+
444
+ def _translate_anthropic_to_internal(
445
+ body: dict[str, Any],
446
+ ) -> tuple[list[dict[str, Any]], list[dict[str, Any]] | None]:
447
+ """Anthropic /v1/messages body → Cognos messages list + tools list.
448
+
449
+ Returns (messages, tools_or_None). Messages are in OpenAI/LiteLLM
450
+ shape since that's what `LLMProvider.chat()` expects underneath.
451
+ """
452
+ raw_messages = body.get("messages") or []
453
+ raw_system = body.get("system")
454
+ raw_tools = body.get("tools") or []
455
+
456
+ out: list[dict[str, Any]] = []
457
+
458
+ # System prompt — can be a string or a list of text blocks
459
+ if isinstance(raw_system, str) and raw_system:
460
+ out.append({"role": "system", "content": raw_system})
461
+ elif isinstance(raw_system, list):
462
+ text = "".join(
463
+ b.get("text", "") for b in raw_system
464
+ if isinstance(b, dict) and b.get("type") == "text"
465
+ )
466
+ if text:
467
+ out.append({"role": "system", "content": text})
468
+
469
+ # Convert each message
470
+ for msg in raw_messages:
471
+ role = msg.get("role")
472
+ content = msg.get("content")
473
+ if isinstance(content, str):
474
+ out.append({"role": role, "content": content})
475
+ continue
476
+
477
+ if not isinstance(content, list):
478
+ continue
479
+
480
+ # Multi-block content — translate each block
481
+ text_parts: list[dict[str, Any]] = []
482
+ tool_uses: list[dict[str, Any]] = []
483
+ tool_results: list[dict[str, Any]] = []
484
+
485
+ for block in content:
486
+ if not isinstance(block, dict):
487
+ continue
488
+ btype = block.get("type")
489
+ if btype == "text":
490
+ text_parts.append({"type": "text", "text": block.get("text", "")})
491
+ elif btype == "image":
492
+ src = block.get("source") or {}
493
+ if src.get("type") == "base64":
494
+ media_type = src.get("media_type", "image/png")
495
+ data = src.get("data", "")
496
+ text_parts.append({
497
+ "type": "image_url",
498
+ "image_url": {"url": f"data:{media_type};base64,{data}"},
499
+ })
500
+ elif src.get("type") == "url":
501
+ text_parts.append({
502
+ "type": "image_url",
503
+ "image_url": {"url": src.get("url", "")},
504
+ })
505
+ elif btype == "tool_use":
506
+ tool_uses.append({
507
+ "id": block.get("id", ""),
508
+ "type": "function",
509
+ "function": {
510
+ "name": block.get("name", ""),
511
+ "arguments": json.dumps(block.get("input") or {}),
512
+ },
513
+ })
514
+ elif btype == "tool_result":
515
+ # Anthropic puts tool results in user messages; OpenAI
516
+ # uses a separate "tool" role.
517
+ rc = block.get("content")
518
+ if isinstance(rc, list):
519
+ rc = "\n".join(
520
+ b.get("text", "") for b in rc
521
+ if isinstance(b, dict) and b.get("type") == "text"
522
+ )
523
+ tool_results.append({
524
+ "role": "tool",
525
+ "tool_call_id": block.get("tool_use_id", ""),
526
+ "content": str(rc or ""),
527
+ })
528
+
529
+ # Emit assistant message with tool_calls (OpenAI shape)
530
+ if role == "assistant":
531
+ entry: dict[str, Any] = {"role": "assistant"}
532
+ if text_parts:
533
+ entry["content"] = (
534
+ text_parts[0]["text"] if len(text_parts) == 1
535
+ and text_parts[0]["type"] == "text"
536
+ else text_parts
537
+ )
538
+ else:
539
+ entry["content"] = ""
540
+ if tool_uses:
541
+ entry["tool_calls"] = tool_uses
542
+ out.append(entry)
543
+ else:
544
+ # User message — text + images stay together; tool_results
545
+ # become separate "tool" role messages right after.
546
+ if text_parts:
547
+ payload: Any = (
548
+ text_parts[0]["text"] if len(text_parts) == 1
549
+ and text_parts[0]["type"] == "text"
550
+ else text_parts
551
+ )
552
+ out.append({"role": "user", "content": payload})
553
+ for tr in tool_results:
554
+ out.append(tr)
555
+
556
+ tools_translated: list[dict[str, Any]] | None = None
557
+ if raw_tools:
558
+ tools_translated = [
559
+ {
560
+ "type": "function",
561
+ "function": {
562
+ "name": t.get("name", ""),
563
+ "description": t.get("description", ""),
564
+ "parameters": t.get("input_schema") or {
565
+ "type": "object", "properties": {}, "required": [],
566
+ },
567
+ },
568
+ }
569
+ for t in raw_tools if isinstance(t, dict)
570
+ ]
571
+
572
+ return out, tools_translated
573
+
574
+
575
+ def _build_anthropic_response(
576
+ *,
577
+ thinking: str = "",
578
+ text: str,
579
+ tool_calls: list[ToolUseBlock],
580
+ model: str,
581
+ usage: dict[str, int],
582
+ stop_reason: str | None,
583
+ ) -> dict[str, Any]:
584
+ """Build the non-streaming /v1/messages response."""
585
+ blocks: list[dict[str, Any]] = []
586
+ # Thinking block first — Anthropic's spec puts thinking before text
587
+ # so clients (Claude Code) render the reasoning above the answer.
588
+ if thinking:
589
+ blocks.append({"type": "thinking", "thinking": thinking})
590
+ if text:
591
+ blocks.append({"type": "text", "text": text})
592
+ for tc in tool_calls:
593
+ blocks.append({
594
+ "type": "tool_use",
595
+ "id": tc.id or f"toolu_{uuid.uuid4().hex[:12]}",
596
+ "name": tc.name,
597
+ "input": tc.input or {},
598
+ })
599
+
600
+ # Translate stop reason
601
+ stop_map = {
602
+ "stop": "end_turn", "length": "max_tokens",
603
+ "tool_calls": "tool_use", "tool_use": "tool_use",
604
+ }
605
+ anthropic_stop = stop_map.get(stop_reason or "stop", "end_turn")
606
+ if tool_calls and anthropic_stop == "end_turn":
607
+ anthropic_stop = "tool_use"
608
+
609
+ return {
610
+ "id": f"msg_{uuid.uuid4().hex[:24]}",
611
+ "type": "message",
612
+ "role": "assistant",
613
+ "model": model,
614
+ "content": blocks or [{"type": "text", "text": ""}],
615
+ "stop_reason": anthropic_stop,
616
+ "stop_sequence": None,
617
+ "usage": {
618
+ "input_tokens": usage.get("prompt_tokens", 0),
619
+ "output_tokens": usage.get("completion_tokens", 0),
620
+ },
621
+ }
622
+
623
+
624
+ # ---- Streaming SSE generator -----------------------------------------
625
+
626
+
627
+ async def _stream_anthropic_events(
628
+ llm: LLMProvider,
629
+ messages: list[dict[str, Any]],
630
+ tools: list[dict[str, Any]] | None,
631
+ max_tokens: int,
632
+ temperature: float | None,
633
+ requested_model: str,
634
+ middleware: CaudateMiddleware | None = None,
635
+ turn_ctx: Any = None,
636
+ ) -> AsyncIterator[bytes]:
637
+ """Cognos stream → Anthropic SSE format.
638
+
639
+ Anthropic emits a strict sequence of events:
640
+ message_start, content_block_start (text), [content_block_delta]+
641
+ content_block_stop, [tool blocks], message_delta, message_stop
642
+ """
643
+ msg_id = f"msg_{uuid.uuid4().hex[:24]}"
644
+
645
+ def _sse(event: str, data: dict[str, Any]) -> bytes:
646
+ return f"event: {event}\ndata: {json.dumps(data)}\n\n".encode()
647
+
648
+ # message_start
649
+ yield _sse("message_start", {
650
+ "type": "message_start",
651
+ "message": {
652
+ "id": msg_id, "type": "message", "role": "assistant",
653
+ "model": requested_model, "content": [],
654
+ "stop_reason": None, "stop_sequence": None,
655
+ "usage": {"input_tokens": 0, "output_tokens": 1},
656
+ },
657
+ })
658
+
659
+ # We track which content blocks we've opened.
660
+ # Block layout follows Anthropic's spec: thinking (if any) at index 0,
661
+ # then text at the next index, then tool_use blocks after that.
662
+ thinking_open = False
663
+ thinking_index = 0
664
+ text_open = False
665
+ text_index = 0
666
+ tool_blocks: list[tuple[int, ToolUseBlock]] = []
667
+ next_index = 0
668
+ output_text = ""
669
+ stop_reason: str | None = None
670
+
671
+ def _close_thinking() -> bytes | None:
672
+ nonlocal thinking_open
673
+ if thinking_open:
674
+ thinking_open = False
675
+ return _sse("content_block_stop", {
676
+ "type": "content_block_stop", "index": thinking_index,
677
+ })
678
+ return None
679
+
680
+ error_occurred = False
681
+ try:
682
+ async for event in llm.stream(
683
+ messages=messages, tools=tools,
684
+ max_tokens=max_tokens, temperature=temperature,
685
+ ):
686
+ if event.type == "thinking_delta" and event.delta:
687
+ # Feed thinking into the middleware too — it's signal
688
+ # for tool-intent inference even though it's not visible.
689
+ if middleware is not None and turn_ctx is not None:
690
+ middleware.observe_thinking(turn_ctx, event.delta)
691
+ # First thinking chunk: open the thinking block
692
+ if not thinking_open:
693
+ yield _sse("content_block_start", {
694
+ "type": "content_block_start",
695
+ "index": thinking_index,
696
+ "content_block": {"type": "thinking", "thinking": ""},
697
+ })
698
+ thinking_open = True
699
+ text_index = thinking_index + 1
700
+ next_index = max(next_index, thinking_index + 1)
701
+ yield _sse("content_block_delta", {
702
+ "type": "content_block_delta",
703
+ "index": thinking_index,
704
+ "delta": {"type": "thinking_delta", "thinking": event.delta},
705
+ })
706
+ elif event.type == "text_delta" and event.delta:
707
+ # If thinking was open and we're switching to text, close it
708
+ close_evt = _close_thinking()
709
+ if close_evt:
710
+ yield close_evt
711
+ if not text_open:
712
+ yield _sse("content_block_start", {
713
+ "type": "content_block_start",
714
+ "index": text_index,
715
+ "content_block": {"type": "text", "text": ""},
716
+ })
717
+ text_open = True
718
+ next_index = max(next_index, text_index + 1)
719
+ output_text += event.delta
720
+ if middleware is not None and turn_ctx is not None:
721
+ middleware.observe_response_text(turn_ctx, event.delta)
722
+ yield _sse("content_block_delta", {
723
+ "type": "content_block_delta",
724
+ "index": text_index,
725
+ "delta": {"type": "text_delta", "text": event.delta},
726
+ })
727
+ elif event.type == "tool_use_end":
728
+ # Close any open thinking block before emitting tool blocks
729
+ close_evt = _close_thinking()
730
+ if close_evt:
731
+ yield close_evt
732
+ # Cognos emits the whole tool call at once. Anthropic
733
+ # wants a content_block_start + input_json_delta + stop.
734
+ idx = next_index
735
+ next_index += 1
736
+ tc = ToolUseBlock(
737
+ id=event.tool_use_id or f"toolu_{uuid.uuid4().hex[:12]}",
738
+ name=event.tool_name or "",
739
+ input=event.tool_input or {},
740
+ )
741
+ tool_blocks.append((idx, tc))
742
+ if middleware is not None and turn_ctx is not None:
743
+ middleware.observe_tool_use(turn_ctx, tc.name)
744
+ yield _sse("content_block_start", {
745
+ "type": "content_block_start",
746
+ "index": idx,
747
+ "content_block": {
748
+ "type": "tool_use",
749
+ "id": tc.id,
750
+ "name": tc.name,
751
+ "input": {},
752
+ },
753
+ })
754
+ yield _sse("content_block_delta", {
755
+ "type": "content_block_delta",
756
+ "index": idx,
757
+ "delta": {
758
+ "type": "input_json_delta",
759
+ "partial_json": json.dumps(tc.input),
760
+ },
761
+ })
762
+ yield _sse("content_block_stop", {
763
+ "type": "content_block_stop", "index": idx,
764
+ })
765
+ elif event.type == "message_stop":
766
+ stop_reason = event.stop_reason
767
+ except Exception as e:
768
+ logger.exception("Stream upstream failed")
769
+ error_occurred = True
770
+ # Emit an error event so the client knows
771
+ yield _sse("error", {
772
+ "type": "error",
773
+ "error": {"type": "api_error", "message": str(e)},
774
+ })
775
+ if middleware is not None and turn_ctx is not None:
776
+ middleware.end_turn(turn_ctx, error=True)
777
+ return
778
+
779
+ if thinking_open:
780
+ yield _sse("content_block_stop", {
781
+ "type": "content_block_stop", "index": thinking_index,
782
+ })
783
+ thinking_open = False
784
+ if text_open:
785
+ yield _sse("content_block_stop", {
786
+ "type": "content_block_stop", "index": text_index,
787
+ })
788
+
789
+ # Translate stop reason
790
+ stop_map = {
791
+ "stop": "end_turn", "length": "max_tokens",
792
+ "tool_calls": "tool_use", "tool_use": "tool_use",
793
+ }
794
+ anthropic_stop = stop_map.get(stop_reason or "stop", "end_turn")
795
+ if tool_blocks and anthropic_stop == "end_turn":
796
+ anthropic_stop = "tool_use"
797
+
798
+ # Approximate output token count — we don't have it exact for streaming
799
+ output_tokens = max(1, len(output_text.split()))
800
+
801
+ yield _sse("message_delta", {
802
+ "type": "message_delta",
803
+ "delta": {"stop_reason": anthropic_stop, "stop_sequence": None},
804
+ "usage": {"output_tokens": output_tokens},
805
+ })
806
+ yield _sse("message_stop", {"type": "message_stop"})
807
+
808
+ # Close the Caudate turn — derive reward, push samples, write episode,
809
+ # potentially trigger auto-train.
810
+ if middleware is not None and turn_ctx is not None and not error_occurred:
811
+ middleware.end_turn(turn_ctx, error=False)
812
+
813
+
814
+ # ---- Dual-brain arbitration on /v1/messages -------------------------
815
+ # Mirror of the Open-WebUI arbitration in api/openai_compat.py, but the
816
+ # response/stream shape is Anthropic, not OpenAI. Both brains are run
817
+ # in parallel and a heuristic scorer picks the winner; both drafts are
818
+ # captured for Caudate's preference-learning corpus
819
+ # (data/nn/arbitrations.jsonl). This is the substrate for Phase 4 of
820
+ # CAUDATE_EVOLUTION.md (the conductor) — same data shape regardless of
821
+ # which client speaks to Cognos.
822
+
823
+
824
def _anthropic_response_from_llm_response(
    *,
    resp: Any,  # LLMResponse
    requested_model: str,
) -> dict[str, Any]:
    """Translate an internal `LLMResponse` into an Anthropic /v1/messages body.

    When the response has thinking but no visible content, the thinking is
    surfaced as the text block (prefixed with a notice) so the user always
    sees something; in that case it is NOT also sent as a thinking block.

    Args:
        resp: Object exposing ``content``, ``tool_calls``, ``usage``,
            ``stop_reason`` and optionally ``thinking``.
        requested_model: Model id echoed back to the client.

    Returns:
        Whatever ``_build_anthropic_response`` produces for these fields.
    """
    text = resp.content or ""
    thinking = getattr(resp, "thinking", "") or ""

    # Track the fallback with an explicit flag instead of rebuilding the
    # placeholder string and comparing against it afterwards.
    thinking_surfaced_as_text = bool(thinking) and not text
    if thinking_surfaced_as_text:
        text = (
            f"[thinking — model didn't finish before max_tokens]\n\n{thinking}"
        )

    return _build_anthropic_response(
        thinking="" if thinking_surfaced_as_text else thinking,
        text=text,
        tool_calls=resp.tool_calls or [],
        model=requested_model,
        usage=resp.usage or {},
        stop_reason=resp.stop_reason,
    )
846
+
847
+
848
async def _anthropic_arbitrate_nonstream(
    *,
    body: dict[str, Any],
    internal_msgs: list[dict[str, Any]],
    internal_tools: list[dict[str, Any]] | None,
    max_tokens: int,
    temperature: float | None,
    requested_model: str,
    agent: Any,
    middleware: CaudateMiddleware,
    turn_ctx: Any,
) -> JSONResponse:
    """Answer a non-streaming dual-brain request in Anthropic format.

    If ``agent.llm`` is a ``DualLLMProvider`` the call goes through
    ``_dual_brain_arbitrate``; otherwise it falls back to a single
    ``agent.llm.chat`` call. Either way the call runs inside
    ``subscription_auth_scope()`` (per the original note: so the
    Anthropic-side brain can use the user's OAuth token).

    Args:
        body: Raw request body (unused here; kept for signature parity with
            the streaming variant).
        internal_msgs: Messages already translated to the internal format.
        internal_tools: Tools in the internal format, or ``None``.
        max_tokens: Generation cap forwarded to the provider.
        temperature: Sampling temperature, or ``None`` for provider default.
        requested_model: Model id echoed back to the client.
        agent: Object exposing ``.llm``.
        middleware: Caudate middleware that observes the turn.
        turn_ctx: Turn context returned by ``middleware.begin_turn``.

    Returns:
        A ``JSONResponse`` carrying the Anthropic /v1/messages body.

    Raises:
        HTTPException: 500 when the upstream call fails. The turn is closed
            with ``error=True`` first, matching the local non-stream path.
    """
    from api.openai_compat import _dual_brain_arbitrate
    from core.anthropic_auth import subscription_auth_scope
    from llm.router import DualLLMProvider

    try:
        with subscription_auth_scope():
            if isinstance(agent.llm, DualLLMProvider):
                resp = await _dual_brain_arbitrate(
                    llm=agent.llm,
                    messages=internal_msgs, tools=internal_tools,
                    max_tokens=max_tokens, temperature=temperature,
                    middleware=middleware, turn_ctx=turn_ctx,
                )
            else:
                # No dual-brain wired — fall back to a single chat call.
                resp = await agent.llm.chat(
                    messages=internal_msgs, tools=internal_tools,
                    max_tokens=max_tokens, temperature=temperature,
                )
    except Exception as e:
        # Without this the turn opened by begin_turn() would never be closed.
        logger.exception("anthropic-arbitrate non-stream call failed")
        middleware.end_turn(turn_ctx, error=True)
        raise HTTPException(500, f"LLM error: {e}") from e

    # _dual_brain_arbitrate is handed the middleware to record both drafts;
    # the chosen response still has to go through the normal observation
    # path here.
    middleware.observe_response_text(turn_ctx, resp.content or "")
    if getattr(resp, "thinking", None):
        middleware.observe_thinking(turn_ctx, resp.thinking)
    for tc in resp.tool_calls or []:
        middleware.observe_tool_use(turn_ctx, tc.name)
    middleware.end_turn(turn_ctx, error=False)

    return JSONResponse(_anthropic_response_from_llm_response(
        resp=resp, requested_model=requested_model,
    ))
897
+
898
+
899
async def _anthropic_arbitrate_stream(
    *,
    body: dict[str, Any],
    internal_msgs: list[dict[str, Any]],
    internal_tools: list[dict[str, Any]] | None,
    max_tokens: int,
    temperature: float | None,
    requested_model: str,
    agent: Any,
    middleware: CaudateMiddleware,
    turn_ctx: Any,
) -> AsyncIterator[bytes]:
    """Stream an Anthropic-format SSE response for a dual-brain request.

    Dual-brain path: one provider is streamed live to the client while the
    other runs ``chat()`` as a background task; once the visible stream has
    finished, both drafts are scored and logged through
    ``middleware.observe_arbitration``. The visible stream is routed to the
    provider whose model name does not look like a "thinking" model when
    exactly the fast one does.

    Single-brain fallback (``agent.llm`` is not a ``DualLLMProvider``): one
    buffered ``chat()`` call is replayed as a fake stream, including any
    tool calls the model produced.

    Content blocks are emitted strictly one after another (each block is
    stopped before the next one starts) with sequentially allocated indices.

    Args:
        body: Raw request body (unused; kept for the caller's keyword set).
        internal_msgs: Messages already translated to the internal format.
        internal_tools: Tools in the internal format, or ``None``.
        max_tokens: Generation cap forwarded to the providers.
        temperature: Sampling temperature, or ``None``.
        requested_model: Model id echoed back in ``message_start``.
        agent: Object exposing ``.llm``.
        middleware: Caudate middleware that observes the turn.
        turn_ctx: Turn context returned by ``middleware.begin_turn``.

    Yields:
        UTF-8 encoded SSE frames.
    """
    import asyncio
    from types import SimpleNamespace

    from api.openai_compat import _score_draft
    from core.anthropic_auth import subscription_auth_scope
    from llm.router import DualLLMProvider

    msg_id = f"msg_{uuid.uuid4().hex[:24]}"
    stop_map = {
        "stop": "end_turn", "length": "max_tokens",
        "tool_calls": "tool_use", "tool_use": "tool_use",
    }

    def _sse(event_name: str, data: dict[str, Any]) -> bytes:
        return f"event: {event_name}\ndata: {json.dumps(data)}\n\n".encode()

    def _message_start() -> bytes:
        return _sse("message_start", {
            "type": "message_start",
            "message": {
                "id": msg_id, "type": "message", "role": "assistant",
                "model": requested_model, "content": [],
                "stop_reason": None, "stop_sequence": None,
                "usage": {"input_tokens": 0, "output_tokens": 1},
            },
        })

    def _tool_use_events(index: int, tc: Any) -> list[bytes]:
        # The providers hand over a complete tool call, so it is replayed as
        # start + a single input_json_delta + stop.
        return [
            _sse("content_block_start", {
                "type": "content_block_start",
                "index": index,
                "content_block": {
                    "type": "tool_use", "id": tc.id,
                    "name": tc.name, "input": {},
                },
            }),
            _sse("content_block_delta", {
                "type": "content_block_delta",
                "index": index,
                "delta": {
                    "type": "input_json_delta",
                    "partial_json": json.dumps(tc.input),
                },
            }),
            _sse("content_block_stop", {
                "type": "content_block_stop", "index": index,
            }),
        ]

    def _closing_events(raw_stop: str | None, has_tools: bool,
                        text: str) -> list[bytes]:
        anthropic_stop = stop_map.get(raw_stop or "stop", "end_turn")
        if has_tools and anthropic_stop == "end_turn":
            anthropic_stop = "tool_use"
        # Approximate output token count — exact usage is not available here.
        output_tokens = max(1, len(text.split()))
        return [
            _sse("message_delta", {
                "type": "message_delta",
                "delta": {"stop_reason": anthropic_stop, "stop_sequence": None},
                "usage": {"output_tokens": output_tokens},
            }),
            _sse("message_stop", {"type": "message_stop"}),
        ]

    def _error_event(exc: Exception) -> bytes:
        return _sse("error", {
            "type": "error",
            "error": {"type": "api_error", "message": str(exc)},
        })

    # ---- No dual-brain — fall back to a single buffered call --------------
    if not isinstance(agent.llm, DualLLMProvider):
        try:
            with subscription_auth_scope():
                resp = await agent.llm.chat(
                    messages=internal_msgs, tools=internal_tools,
                    max_tokens=max_tokens, temperature=temperature,
                )
        except Exception as e:
            logger.exception("anthropic-arbitrate single fallback failed")
            middleware.end_turn(turn_ctx, error=True)
            yield _error_event(e)
            return

        text = resp.content or ""
        thinking = getattr(resp, "thinking", "") or ""
        if not text and thinking:
            text = f"[thinking — model didn't finish before max_tokens]\n\n{thinking}"

        # NOTE(review): assumes tool calls expose id/name/input like
        # ToolUseBlock does — confirm against LLMResponse.
        fallback_tools = [
            ToolUseBlock(
                id=getattr(tc, "id", None) or f"toolu_{uuid.uuid4().hex[:12]}",
                name=getattr(tc, "name", "") or "",
                input=getattr(tc, "input", None) or {},
            )
            for tc in resp.tool_calls or []
        ]

        middleware.observe_response_text(turn_ctx, text)
        for tc in fallback_tools:
            middleware.observe_tool_use(turn_ctx, tc.name)
        middleware.end_turn(turn_ctx, error=False)

        # Single-shot fake stream for the fallback case.
        yield _message_start()
        block_index = 0
        if text:
            yield _sse("content_block_start", {
                "type": "content_block_start", "index": block_index,
                "content_block": {"type": "text", "text": ""},
            })
            step = 80
            for i in range(0, len(text), step):
                yield _sse("content_block_delta", {
                    "type": "content_block_delta", "index": block_index,
                    "delta": {"type": "text_delta", "text": text[i:i + step]},
                })
            yield _sse("content_block_stop", {
                "type": "content_block_stop", "index": block_index,
            })
            block_index += 1
        # Tool calls used to be observed but never sent to the client.
        for tc in fallback_tools:
            for frame in _tool_use_events(block_index, tc):
                yield frame
            block_index += 1
        for frame in _closing_events(
            getattr(resp, "stop_reason", None), bool(fallback_tools), text,
        ):
            yield frame
        return

    # ---- Dual-brain path --------------------------------------------------
    # Stream the provider expected to produce visible text first; the other
    # one runs in the background. Both are still engaged for arbitration.
    s1, s2 = agent.llm.router.fast, agent.llm.router.slow
    thinking_hints = ("kimi", "deepseek", "qwen3", "o1", "o3")

    def _looks_like_thinking_model(provider: Any) -> bool:
        name = (provider.model or "").lower()
        return any(hint in name for hint in thinking_hints)

    if _looks_like_thinking_model(s1) and not _looks_like_thinking_model(s2):
        stream_provider, bg_provider = s2, s1
        stream_label = "slow"
    else:
        stream_provider, bg_provider = s1, s2
        stream_label = "fast"

    # Kick off the background draft.
    bg_task = asyncio.create_task(
        _slow_with_subscription_scope(
            bg_provider, internal_msgs, internal_tools,
            max_tokens, temperature,
        )
    )

    stream_text_buf: list[str] = []
    stream_tool_calls: list[ToolUseBlock] = []
    stop_reason: str | None = None

    # Only one thinking/text block is open at a time; indices are handed out
    # in emission order so they can never collide with tool_use blocks.
    open_kind: str | None = None
    open_index = 0
    next_index = 0

    def _close_open_block() -> list[bytes]:
        nonlocal open_kind
        if open_kind is None:
            return []
        open_kind = None
        return [_sse("content_block_stop", {
            "type": "content_block_stop", "index": open_index,
        })]

    def _ensure_block(kind: str) -> list[bytes]:
        nonlocal open_kind, open_index, next_index
        if open_kind == kind:
            return []
        frames = _close_open_block()
        open_index = next_index
        next_index += 1
        open_kind = kind
        content_block = (
            {"type": "thinking", "thinking": ""} if kind == "thinking"
            else {"type": "text", "text": ""}
        )
        frames.append(_sse("content_block_start", {
            "type": "content_block_start",
            "index": open_index,
            "content_block": content_block,
        }))
        return frames

    try:
        # message_start event — arrives immediately.
        yield _message_start()

        try:
            with subscription_auth_scope():
                async for event in stream_provider.stream(
                    messages=internal_msgs, tools=internal_tools,
                    max_tokens=max_tokens, temperature=temperature,
                ):
                    if event.type == "thinking_delta" and event.delta:
                        middleware.observe_thinking(turn_ctx, event.delta)
                        for frame in _ensure_block("thinking"):
                            yield frame
                        yield _sse("content_block_delta", {
                            "type": "content_block_delta",
                            "index": open_index,
                            "delta": {"type": "thinking_delta", "thinking": event.delta},
                        })
                    elif event.type == "text_delta" and event.delta:
                        for frame in _ensure_block("text"):
                            yield frame
                        stream_text_buf.append(event.delta)
                        middleware.observe_response_text(turn_ctx, event.delta)
                        yield _sse("content_block_delta", {
                            "type": "content_block_delta",
                            "index": open_index,
                            "delta": {"type": "text_delta", "text": event.delta},
                        })
                    elif event.type == "tool_use_end":
                        # Finish whatever block is open before the tool block.
                        for frame in _close_open_block():
                            yield frame
                        idx = next_index
                        next_index += 1
                        tc = ToolUseBlock(
                            id=event.tool_use_id or f"toolu_{uuid.uuid4().hex[:12]}",
                            name=event.tool_name or "",
                            input=event.tool_input or {},
                        )
                        stream_tool_calls.append(tc)
                        middleware.observe_tool_use(turn_ctx, tc.name)
                        for frame in _tool_use_events(idx, tc):
                            yield frame
                    elif event.type == "message_stop":
                        stop_reason = event.stop_reason
        except Exception as e:
            logger.exception("stream-leg failed in arbitrate")
            middleware.end_turn(turn_ctx, error=True)
            yield _error_event(e)
            return

        for frame in _close_open_block():
            yield frame

        streamed_text = "".join(stream_text_buf)
        for frame in _closing_events(
            stop_reason, bool(stream_tool_calls), streamed_text,
        ):
            yield frame

        # Background: wait for the other brain, score both, log the
        # arbitration. The client has already received the full stream.
        try:
            bg_resp = await bg_task
        except Exception as e:
            logger.warning(f"bg-draft failed: {e}")
            bg_resp = None

        try:
            streamed_resp = SimpleNamespace(
                content=streamed_text,
                thinking="",
                tool_calls=stream_tool_calls,
            )
            streamed_score = _score_draft(streamed_resp)  # type: ignore[arg-type]
            bg_score = _score_draft(bg_resp) if bg_resp else 0.0
            bg_text = (bg_resp.content or "") if bg_resp else ""
            if stream_label == "fast":
                fast_text, fast_score = streamed_text, streamed_score
                slow_text, slow_score = bg_text, bg_score
            else:
                slow_text, slow_score = streamed_text, streamed_score
                fast_text, fast_score = bg_text, bg_score
            # Near-ties (< 0.02 apart) go to the fast brain.
            winner = ("fast" if fast_score >= slow_score
                      or abs(fast_score - slow_score) < 0.02
                      else "slow")
            middleware.observe_arbitration(
                turn_ctx,
                fast_text=fast_text,
                slow_text=slow_text,
                fast_score=fast_score,
                slow_score=slow_score,
                winner=winner,
                fast_model=s1.model,
                slow_model=s2.model,
            )
        except Exception as e:
            logger.debug(f"arbitration log failed: {e}")
        middleware.end_turn(turn_ctx, error=False)
    finally:
        # Covers the error path AND early generator close (client
        # disconnect): never leave the background draft running orphaned.
        if not bg_task.done():
            bg_task.cancel()
1183
+
1184
+
1185
async def _slow_with_subscription_scope(
    provider, messages, tools, max_tokens, temperature,
):
    """Await ``provider.chat(...)`` with the subscription auth scope active.

    Used by the streaming arbitration path to run its buffered background
    draft. Per the original note, the scope is what lets anthropic/* models
    authenticate via OAuth.

    Returns:
        Whatever ``provider.chat`` returns.
    """
    from core.anthropic_auth import subscription_auth_scope

    chat_kwargs = {
        "messages": messages,
        "tools": tools,
        "max_tokens": max_tokens,
        "temperature": temperature,
    }
    with subscription_auth_scope():
        draft = await provider.chat(**chat_kwargs)
        return draft
1197
+
1198
+
1199
+ # ---- FastAPI router --------------------------------------------------
1200
+
1201
+
1202
+ def build_router() -> APIRouter:
1203
+ """Build the /v1/messages router. Uses a process-wide CognosAgent so
1204
+ state (memory, Caudate, mood) carries across requests."""
1205
+ router = APIRouter()
1206
+
1207
+ # Lazy singleton — first request builds it, subsequent reuse.
1208
+ _agent_box: dict[str, Any] = {"agent": None, "middleware": None}
1209
+
1210
def _get_agent():
    """Return the process-wide ``(agent, middleware)`` pair.

    The first call builds a ``CognosAgent`` plus its ``CaudateMiddleware``
    and stores both in ``_agent_box``; later calls reuse them.
    """
    if _agent_box["agent"] is None:
        from core.agent import CognosAgent
        agent = CognosAgent(
            mode="agentic",
            permission_mode="bypass",  # Claude Code handles permissions
            personality=True,  # keep mood signal alive for Caudate
        )
        _agent_box["agent"] = agent
        _agent_box["middleware"] = CaudateMiddleware(agent)
        cau = getattr(agent, "caudate", None)
        cau_status = (cau.policy.level.label
                      if cau and cau.policy else "unavailable")
        # The log line must never fail the first request: not every provider
        # is guaranteed to expose `.model` (the endpoint itself reads it
        # with a getattr default).
        llm_model = getattr(agent.llm, "model", "unknown")
        logger.info(
            f"Anthropic-compat singleton agent ready, "
            f"llm={llm_model}, caudate={cau_status}"
        )
    return _agent_box["agent"], _agent_box["middleware"]
1228
+
1229
@router.post("/v1/messages")
async def messages_endpoint(request: Request):
    """Handle an Anthropic-style ``POST /v1/messages`` request.

    Routing, in order:
      1. ``cognos-dual-brain`` / ``cognos-collab`` -> dual-brain arbitration.
      2. Model ids that ``_resolve_anthropic_model`` maps to a Claude id ->
         passthrough to the upstream Anthropic API.
      3. Everything else -> the agent's own LLM (streaming or buffered).

    Raises:
        HTTPException: 400 for an unparsable body, a non-object body, a
            failed message translation, or a non-numeric ``max_tokens`` /
            ``temperature``; 500 when the local LLM call fails.
    """
    try:
        body = await request.json()
    except Exception as exc:
        raise HTTPException(400, "Invalid JSON body") from exc

    if not isinstance(body, dict):
        raise HTTPException(400, "Body must be a JSON object")

    try:
        internal_msgs, internal_tools = _translate_anthropic_to_internal(body)
    except Exception as e:
        logger.exception("Anthropic→internal translation failed")
        raise HTTPException(400, f"Bad message format: {e}") from e

    # Inject sandbox-awareness hint so the LLM scaffolds new files
    # into cognos/sandbox/ by default. Idempotent across multi-turn.
    from core.sandbox_prompt import inject_sandbox_hint
    internal_msgs = inject_sandbox_hint(internal_msgs)

    # Client-supplied numbers: a malformed value is a client error (400),
    # not an internal one.
    try:
        max_tokens = int(body.get("max_tokens") or 4096)
        temperature = body.get("temperature")
        if temperature is not None:
            temperature = float(temperature)
    except (TypeError, ValueError) as e:
        raise HTTPException(
            400, f"Invalid max_tokens/temperature: {e}"
        ) from e
    requested_model = body.get("model") or "cognos"
    stream = bool(body.get("stream", False))

    agent, middleware = _get_agent()
    llm = agent.llm

    # ---- Dual-brain arbitration branch (Phase 4 substrate) --------
    # `cognos-dual-brain` (and the `cognos-collab` alias) trigger
    # parallel arbitration through both system1 and system2. This path
    # has its own non-passthrough flow because both providers must be
    # dispatched, not just forwarded upstream.
    rm_lower = (requested_model or "").lower()
    if rm_lower in ("cognos-dual-brain", "cognos-collab"):
        if hasattr(llm, "router"):
            llm_router = getattr(llm, "router", None)
            fast_model = getattr(getattr(llm_router, "fast", None), "model", "?")
            slow_model = getattr(getattr(llm_router, "slow", None), "model", "?")
            model_source = f"dual[fast={fast_model},slow={slow_model}]"
        else:
            model_source = "unknown"
        turn_ctx = middleware.begin_turn(
            internal_msgs, internal_tools,
            model_source=model_source,
        )
        internal_msgs = middleware.maybe_inject_hint(internal_msgs, turn_ctx)
        arbitrate_kwargs = dict(
            body=body,
            internal_msgs=internal_msgs,
            internal_tools=internal_tools,
            max_tokens=max_tokens,
            temperature=temperature,
            requested_model=requested_model,
            agent=agent,
            middleware=middleware,
            turn_ctx=turn_ctx,
        )
        if stream:
            return StreamingResponse(
                _anthropic_arbitrate_stream(**arbitrate_kwargs),
                media_type="text/event-stream",
            )
        return await _anthropic_arbitrate_nonstream(**arbitrate_kwargs)

    # ---- Anthropic passthrough branch -----------------------------
    # If the resolved primary brain is an Anthropic Claude id, forward
    # the request upstream using the caller's own auth header. The
    # middleware still observes the turn.
    passthrough_model = _resolve_anthropic_model(requested_model)
    if passthrough_model is not None:
        turn_ctx = middleware.begin_turn(
            internal_msgs, internal_tools,
            model_source=f"anthropic/{passthrough_model}",
        )
        internal_msgs_with_hint = middleware.maybe_inject_hint(internal_msgs, turn_ctx)
        # Start from the caller's body so any client-set fields (system,
        # tools, tool_choice, metadata, top_p, top_k...) survive — only
        # the model id is overridden.
        upstream_body = dict(body)
        upstream_body["model"] = passthrough_model
        # Diagnostics only — kept at DEBUG so normal traffic does not
        # produce a WARNING pair per request.
        logger.debug(
            f"[passthrough pre-strip] model={passthrough_model} "
            f"top_keys={sorted(upstream_body.keys())} "
            f"has_effort={'effort' in upstream_body} "
            f"has_thinking={'thinking' in upstream_body}"
        )
        upstream_body = _strip_unsupported_fields(upstream_body, passthrough_model)
        upstream_body = _maybe_force_thinking(upstream_body)
        logger.debug(
            f"[passthrough post-strip] top_keys={sorted(upstream_body.keys())} "
            f"has_effort={'effort' in upstream_body} "
            f"has_thinking={'thinking' in upstream_body}"
        )
        # If a hint was injected (a new message list came back), mirror
        # it into the system prompt of the upstream body as well.
        if internal_msgs_with_hint is not internal_msgs:
            injected_system = ""
            for m in internal_msgs_with_hint:
                if m.get("role") != "system":
                    continue
                c = m.get("content")
                if isinstance(c, str):
                    injected_system = c
                elif isinstance(c, list):
                    injected_system = " ".join(
                        b.get("text", "") for b in c
                        if isinstance(b, dict) and b.get("type") == "text"
                    )
                break  # only the first system message is considered
            if injected_system:
                # Keep the shape the client used: block list stays a
                # block list, anything else becomes a plain string.
                if isinstance(body.get("system"), list):
                    upstream_body["system"] = [
                        {"type": "text", "text": injected_system}
                    ]
                else:
                    upstream_body["system"] = injected_system

        headers = _filter_forward_headers(request)
        forwarded_names = {k.lower() for k in headers}
        # Fall back to ANTHROPIC_API_KEY only when the client supplied no
        # credentials at all — never overwrite a key the caller sent.
        if ("authorization" not in forwarded_names
                and "x-api-key" not in forwarded_names):
            env_key = os.environ.get("ANTHROPIC_API_KEY")
            if env_key:
                headers["x-api-key"] = env_key

        if stream:
            return StreamingResponse(
                _passthrough_anthropic_stream(
                    upstream_body=upstream_body,
                    headers=headers,
                    middleware=middleware,
                    turn_ctx=turn_ctx,
                ),
                media_type="text/event-stream",
            )
        return await _passthrough_anthropic_nonstream(
            upstream_body=upstream_body,
            headers=headers,
            middleware=middleware,
            turn_ctx=turn_ctx,
        )

    # ---- Local LiteLLM path (Ollama / OpenAI / etc.) --------------
    # Open Caudate's turn for this request.
    turn_ctx = middleware.begin_turn(
        internal_msgs, internal_tools,
        model_source=getattr(llm, "model", "unknown"),
    )

    # WHISPER+ : prepend Caudate's hint to the system message
    internal_msgs = middleware.maybe_inject_hint(internal_msgs, turn_ctx)

    if stream:
        return StreamingResponse(
            _stream_anthropic_events(
                llm=llm,
                messages=internal_msgs,
                tools=internal_tools,
                max_tokens=max_tokens,
                temperature=temperature,
                requested_model=requested_model,
                middleware=middleware,
                turn_ctx=turn_ctx,
            ),
            media_type="text/event-stream",
        )

    # Non-streaming
    try:
        resp = await llm.chat(
            messages=internal_msgs,
            tools=internal_tools,
            max_tokens=max_tokens,
            temperature=temperature,
        )
    except Exception as e:
        logger.exception("LLM call failed")
        middleware.end_turn(turn_ctx, error=True)
        raise HTTPException(500, f"LLM error: {e}") from e

    # Feed the response back into Caudate's observer. `tool_calls` is
    # guarded against None, as the arbitration path already does.
    tool_calls = resp.tool_calls or []
    middleware.observe_response_text(turn_ctx, resp.content or "")
    if getattr(resp, "thinking", None):
        middleware.observe_thinking(turn_ctx, resp.thinking)
    for tc in tool_calls:
        middleware.observe_tool_use(turn_ctx, tc.name)
    middleware.end_turn(turn_ctx, error=False)

    return JSONResponse(_build_anthropic_response(
        thinking=getattr(resp, "thinking", "") or "",
        text=resp.content,
        tool_calls=tool_calls,
        model=requested_model,
        usage=resp.usage,
        stop_reason=resp.stop_reason,
    ))
1441
+
1442
@router.get("/v1/models")
async def models_endpoint():
    """Return the model catalogue in a shape both client families can parse.

    The list contains, in order: the friendly ``cognos`` alias, the real
    model ids found in settings/config (sorted), and a few Claude id stubs
    so clients with a Claude default still find their model listed.

    Every entry carries both Anthropic-shape fields (``type``,
    ``display_name``, ``created_at``) and OpenAI-shape fields (``object``,
    ``created``, ``owned_by``).
    """
    from config import LLM_MODEL
    from core.settings import Settings
    s = Settings.load()
    s1 = s.get("system1") or "(unset)"
    s2 = s.get("system2") or "(unset)"

    # Friendly cognos-* aliases — what the client's picker will display.
    # Only the single user-facing alias is listed; other aliases the
    # messages endpoint understands stay unlisted to keep the catalog clean.
    cognos_aliases = [
        ("cognos", f"Cognos · Caudate-driven smart routing · S1={s1} + S2={s2}"),
    ]

    # Real model ids the user has — for transparency / picker support.
    real_ids: set[str] = {
        v
        for v in (s.get("system1"), s.get("system2"), s.get("model"), LLM_MODEL)
        if v
    }

    # Claude id stubs so anyone with a Claude default in their
    # ~/.claude/settings.json still works.
    claude_stubs = [
        "claude-opus-4-7", "claude-sonnet-4-6", "claude-haiku-4-5",
    ]

    # One fixed creation instant, expressed in both formats. The two values
    # must describe the same moment (previously the ISO form said 2026
    # while the Unix form said 2025).
    ts_unix = 1735689600  # 2025-01-01T00:00:00Z, intentionally older
                          # than session ids so picker sorts stable
    ts_iso = "2025-01-01T00:00:00Z"
    owner = "cognos"

    def _entry(model_id: str, display: str) -> dict[str, Any]:
        return {
            "id": model_id,
            "type": "model",    # Anthropic shape
            "object": "model",  # OpenAI shape
            "display_name": display,
            "created_at": ts_iso,
            "created": ts_unix,
            "owned_by": owner,
        }

    models: list[dict[str, Any]] = [
        *(_entry(name, display) for name, display in cognos_aliases),
        *(_entry(name, name) for name in sorted(real_ids)),
        *(_entry(name, f"{name} (alias → Cognos)") for name in claude_stubs),
    ]

    return {
        "object": "list",  # OpenAI clients require this
        "data": models,
        "first_id": models[0]["id"] if models else None,
        "last_id": models[-1]["id"] if models else None,
        "has_more": False,
    }
1517
+
1518
+ return router