caudate-cli 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- api/__init__.py +5 -0
- api/anthropic_compat.py +1518 -0
- api/artifact_viewer.py +366 -0
- api/caudate_middleware.py +618 -0
- api/forge_bootstrapper_routes.py +377 -0
- api/forge_routes.py +630 -0
- api/forge_system_routes.py +294 -0
- api/openai_compat.py +1993 -0
- api/server.py +667 -0
- api/storyboard_page.py +677 -0
- caudate_cli-0.1.0.dist-info/METADATA +354 -0
- caudate_cli-0.1.0.dist-info/RECORD +153 -0
- caudate_cli-0.1.0.dist-info/WHEEL +5 -0
- caudate_cli-0.1.0.dist-info/entry_points.txt +2 -0
- caudate_cli-0.1.0.dist-info/licenses/LICENSE +21 -0
- caudate_cli-0.1.0.dist-info/top_level.txt +14 -0
- cognos_mcp/__init__.py +4 -0
- cognos_mcp/bridge.py +41 -0
- cognos_mcp/client.py +70 -0
- cognos_mcp/config.py +49 -0
- cognos_mcp/server.py +66 -0
- config.py +82 -0
- core/__init__.py +0 -0
- core/agent.py +468 -0
- core/agentic_loop.py +731 -0
- core/anthropic_auth.py +91 -0
- core/background.py +113 -0
- core/banner.py +134 -0
- core/bootstrap.py +292 -0
- core/citations.py +131 -0
- core/compaction.py +109 -0
- core/constitution.py +198 -0
- core/diff_viewer.py +87 -0
- core/export.py +85 -0
- core/file_refs.py +119 -0
- core/files.py +199 -0
- core/hooks.py +209 -0
- core/image.py +599 -0
- core/input.py +91 -0
- core/loop.py +238 -0
- core/memory_md.py +147 -0
- core/notifications.py +99 -0
- core/ownership.py +181 -0
- core/paste.py +81 -0
- core/permissions.py +210 -0
- core/plan_mode.py +215 -0
- core/sandbox_prompt.py +185 -0
- core/scheduler.py +195 -0
- core/schemas.py +202 -0
- core/session.py +90 -0
- core/settings.py +132 -0
- core/skills.py +398 -0
- core/slash_commands.py +977 -0
- core/statusline.py +61 -0
- core/subagent.py +300 -0
- core/thinking.py +50 -0
- core/updater.py +122 -0
- core/usage.py +109 -0
- core/worktree.py +93 -0
- execution/__init__.py +0 -0
- execution/executor.py +329 -0
- execution/plugins.py +108 -0
- execution/tools/__init__.py +0 -0
- execution/tools/agent_tool.py +107 -0
- execution/tools/agentic_tool.py +297 -0
- execution/tools/artifact_tool.py +191 -0
- execution/tools/ask_user_question_tool.py +137 -0
- execution/tools/base.py +81 -0
- execution/tools/calculator_tool.py +137 -0
- execution/tools/cognos_card_tool.py +124 -0
- execution/tools/cron_tool.py +215 -0
- execution/tools/datetime_tool.py +215 -0
- execution/tools/describe_image_tool.py +161 -0
- execution/tools/draw_tool.py +164 -0
- execution/tools/edit_image_tool.py +262 -0
- execution/tools/edit_tool.py +245 -0
- execution/tools/file_tool.py +90 -0
- execution/tools/find_anywhere_tool.py +255 -0
- execution/tools/forge_feature_tools.py +377 -0
- execution/tools/glob_tool.py +59 -0
- execution/tools/grep_tool.py +89 -0
- execution/tools/http_request_tool.py +224 -0
- execution/tools/load_skill_tool.py +104 -0
- execution/tools/longcat_avatar_tool.py +384 -0
- execution/tools/mcp_tool.py +100 -0
- execution/tools/notebook_tool.py +279 -0
- execution/tools/openapi_tool.py +440 -0
- execution/tools/plan_mode_tool.py +95 -0
- execution/tools/push_notification_tool.py +157 -0
- execution/tools/python_tool.py +61 -0
- execution/tools/respond_tool.py +40 -0
- execution/tools/sandbox_tool.py +378 -0
- execution/tools/search_tool.py +153 -0
- execution/tools/semantic_search_tool.py +106 -0
- execution/tools/shell_tool.py +283 -0
- execution/tools/speak_tool.py +134 -0
- execution/tools/storyboard_tool.py +727 -0
- execution/tools/system_info_tool.py +212 -0
- execution/tools/task_tool.py +323 -0
- execution/tools/think_tool.py +49 -0
- execution/tools/transcribe_audio_tool.py +86 -0
- execution/tools/update_memory_tool.py +92 -0
- execution/tools/web_fetch_tool.py +82 -0
- execution/tools/worktree_tool.py +174 -0
- llm/__init__.py +0 -0
- llm/fallback.py +116 -0
- llm/models.py +320 -0
- llm/provider.py +1356 -0
- llm/router.py +373 -0
- main.py +1889 -0
- memory/__init__.py +0 -0
- memory/episodic.py +99 -0
- memory/procedural.py +145 -0
- memory/semantic.py +71 -0
- memory/working.py +64 -0
- nn/__init__.py +43 -0
- nn/auto_evolve.py +245 -0
- nn/caudate.py +136 -0
- nn/config.py +141 -0
- nn/consolidator.py +81 -0
- nn/data.py +1635 -0
- nn/encoder.py +258 -0
- nn/forge_advisor.py +303 -0
- nn/format.py +235 -0
- nn/heads.py +432 -0
- nn/observer.py +994 -0
- nn/policy.py +214 -0
- nn/runtime.py +343 -0
- nn/scorer.py +175 -0
- nn/trainer.py +515 -0
- nn/vision.py +352 -0
- personality/__init__.py +23 -0
- personality/engine.py +129 -0
- personality/identity.py +144 -0
- personality/inner_voice.py +100 -0
- personality/mood.py +205 -0
- planning/__init__.py +0 -0
- planning/dev_server.py +221 -0
- planning/forge_models.py +718 -0
- planning/orchestrator.py +1363 -0
- planning/planner.py +451 -0
- planning/task_graph.py +61 -0
- reflection/__init__.py +0 -0
- reflection/meta_learner.py +156 -0
- reflection/reflector.py +127 -0
- ui/__init__.py +5 -0
- ui/display.py +88 -0
- voice/__init__.py +0 -0
- voice/conversation.py +125 -0
- voice/listener.py +111 -0
- voice/speaker.py +59 -0
- voice/stt.py +126 -0
- voice/tts.py +214 -0
api/anthropic_compat.py
ADDED
|
@@ -0,0 +1,1518 @@
|
|
|
1
|
+
"""Anthropic Messages API compatibility layer.
|
|
2
|
+
|
|
3
|
+
Lets Claude Code (or any other Anthropic-format client) point at Cognos
|
|
4
|
+
and get answers back as if Cognos were Anthropic. Internally:
|
|
5
|
+
|
|
6
|
+
incoming /v1/messages (Anthropic schema)
|
|
7
|
+
│
|
|
8
|
+
▼
|
|
9
|
+
translate to Cognos's internal message format
|
|
10
|
+
│
|
|
11
|
+
▼
|
|
12
|
+
route through CognosAgent.llm (DualLLMProvider — Caudate + dual-brain
|
|
13
|
+
routing + fallback chain + prompt caching all engaged)
|
|
14
|
+
│
|
|
15
|
+
▼
|
|
16
|
+
translate response back to Anthropic schema (regular or SSE stream)
|
|
17
|
+
│
|
|
18
|
+
▼
|
|
19
|
+
Claude Code consumes it the same way it consumes a real Anthropic call
|
|
20
|
+
|
|
21
|
+
This is a pure LLM proxy — Claude Code keeps doing its own tool
|
|
22
|
+
execution. Cognos does NOT run its agentic loop here. The benefit is
|
|
23
|
+
that Claude Code's well-engineered REPL + tool stack is preserved, and
|
|
24
|
+
Cognos contributes its routing brain, memory, and (eventually) Caudate
|
|
25
|
+
predictions to every call.
|
|
26
|
+
|
|
27
|
+
Set Claude Code's env to use Cognos:
|
|
28
|
+
|
|
29
|
+
export ANTHROPIC_BASE_URL=http://127.0.0.1:8000
|
|
30
|
+
export ANTHROPIC_AUTH_TOKEN=cognos
|
|
31
|
+
export ANTHROPIC_API_KEY=""
|
|
32
|
+
claude # now talks to Cognos instead of Anthropic
|
|
33
|
+
"""
|
|
34
|
+
|
|
35
|
+
from __future__ import annotations
|
|
36
|
+
|
|
37
|
+
import asyncio
|
|
38
|
+
import base64
|
|
39
|
+
import json
|
|
40
|
+
import logging
|
|
41
|
+
import os
|
|
42
|
+
import time
|
|
43
|
+
import uuid
|
|
44
|
+
from typing import Any, AsyncIterator
|
|
45
|
+
|
|
46
|
+
import httpx
|
|
47
|
+
from fastapi import APIRouter, HTTPException, Request
|
|
48
|
+
from fastapi.responses import JSONResponse, StreamingResponse
|
|
49
|
+
|
|
50
|
+
from api.caudate_middleware import CaudateMiddleware
|
|
51
|
+
from core.schemas import StreamEvent, ToolUseBlock
|
|
52
|
+
from llm.provider import LLMProvider
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
# ---- Anthropic passthrough -------------------------------------------
|
|
56
|
+
# When system1 (or the requested model) is an Anthropic Claude id,
|
|
57
|
+
# Cognos forwards the request to api.anthropic.com using the caller's
|
|
58
|
+
# original Authorization header (the user's Claude Code subscription
|
|
59
|
+
# token) instead of routing through LiteLLM/Ollama. Caudate still
|
|
60
|
+
# observes — she sees Opus's behaviour and learns from it.
|
|
61
|
+
|
|
62
|
+
ANTHROPIC_API_URL = "https://api.anthropic.com/v1/messages"
|
|
63
|
+
_ANTHROPIC_VERSION_DEFAULT = "2023-06-01"
|
|
64
|
+
|
|
65
|
+
# Optionally force extended thinking on the upstream request so Caudate
|
|
66
|
+
# can observe Opus's reasoning channel. Off by default; opt in with
|
|
67
|
+
# COGNOS_FORCE_THINKING=1 (see _maybe_force_thinking).
|
|
68
|
+
_FORCE_THINKING_DEFAULT_BUDGET = 4096
|
|
69
|
+
_FORCE_THINKING_MIN_MAX_TOKENS = 1024 # don't force on tiny requests
|
|
70
|
+
|
|
71
|
+
# Headers we forward verbatim from the incoming request to Anthropic.
|
|
72
|
+
# Anything else (host, content-length, accept-encoding, x-forwarded-*)
|
|
73
|
+
# is dropped so httpx can compute its own.
|
|
74
|
+
_FORWARD_REQUEST_HEADERS = {
|
|
75
|
+
"authorization", "anthropic-version", "anthropic-beta",
|
|
76
|
+
"x-api-key", "anthropic-dangerous-direct-browser-access",
|
|
77
|
+
}
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def _resolve_anthropic_model(requested_model: str | None) -> str | None:
    """Return the Anthropic model id to send upstream, or None if this
    request should NOT use the Anthropic passthrough.

    Decision order:
    1. requested_model is a Cognos-internal id (has a `[...]` suffix
       like "claude-opus-4-7[1m]" that Claude Code self-reports) → None,
       route locally via the dual-brain path (these ids 404 upstream)
    2. requested_model is already a real claude id → use as-is
    3. requested_model has the "anthropic/" prefix → strip it
    4. settings.system1 starts with "anthropic/" → use that resolved id
       (catches the case where the client picked a "cognos-*" alias)
    5. otherwise → None (fall through to local LiteLLM path)

    Any failure while importing or loading settings is treated as "no
    passthrough" (None); the failure is logged at debug level rather than
    silently discarded.
    """
    if requested_model:
        m = requested_model.strip()
        # Claude Code identifies itself with a bracketed context-window
        # suffix (e.g. "claude-opus-4-7[1m]"). That id is not in the
        # Anthropic public catalog and forwarding it 404s — drop to
        # local dual-brain routing instead.
        if "[" in m and m.endswith("]"):
            return None
        if m.startswith("claude-"):
            return m
        if m.startswith("anthropic/"):
            return m.split("/", 1)[1]

    # Settings-driven fallback: if system1 is configured for Anthropic,
    # any client request lands on that model regardless of which
    # "cognos-*" alias they picked.
    try:
        # Imported lazily, as in the original, so a missing or broken
        # settings module cannot break request routing.
        from core.settings import Settings

        s1 = Settings.load().get("system1") or ""
    except Exception:
        # Best-effort lookup: keep the fallback behaviour (None) but leave
        # a trace so misconfiguration is diagnosable.
        logging.getLogger(__name__).debug(
            "could not read system1 from settings; skipping Anthropic passthrough",
            exc_info=True,
        )
        return None
    if isinstance(s1, str) and s1.startswith("anthropic/"):
        return s1.split("/", 1)[1]
    return None
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
# Capability matrix per model family. Claude Code sends a fistful of
|
|
120
|
+
# request fields that only the newer/larger models accept (extended
|
|
121
|
+
# thinking, the effort knob, etc). When the resolved upstream model
|
|
122
|
+
# doesn't support a given field, we strip it before forwarding — else
|
|
123
|
+
# Anthropic returns 400s like `adaptive thinking is not supported on
|
|
124
|
+
# this model` or `does not support the effort parameter`.
|
|
125
|
+
_THINKING_CAPABLE_PREFIXES = (
|
|
126
|
+
"claude-opus-",
|
|
127
|
+
"claude-sonnet-",
|
|
128
|
+
)
|
|
129
|
+
_EFFORT_CAPABLE_PREFIXES = (
|
|
130
|
+
"claude-opus-",
|
|
131
|
+
"claude-sonnet-",
|
|
132
|
+
)
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
def _model_supports_thinking(model_id: str) -> bool:
    """Report whether *model_id* starts with a thinking-capable prefix.

    An empty or missing id is never considered capable.
    """
    # str.startswith accepts the whole prefix tuple in one call.
    return bool(model_id) and model_id.startswith(_THINKING_CAPABLE_PREFIXES)
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
def _model_supports_effort(model_id: str) -> bool:
    """Report whether *model_id* starts with an effort-capable prefix.

    An empty or missing id is never considered capable.
    """
    # str.startswith accepts the whole prefix tuple in one call.
    return bool(model_id) and model_id.startswith(_EFFORT_CAPABLE_PREFIXES)
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
def _strip_thinking_dependent_context_management(
    upstream_body: dict[str, Any],
) -> dict[str, Any]:
    """Drop `context_management.edits` entries whose `type` contains
    'thinking'.

    The body is modified in place and also returned. When no edits
    survive, the whole `context_management` field is removed; otherwise
    it is replaced by a copy carrying only the surviving edits. Bodies
    whose `context_management` is not a dict, or whose `edits` is not a
    list, are returned untouched.
    """
    config = upstream_body.get("context_management")
    edit_list = config.get("edits") if isinstance(config, dict) else None
    if not isinstance(edit_list, list):
        return upstream_body

    def _mentions_thinking(edit: Any) -> bool:
        # Only dict edits with a string `type` can match; anything else
        # is kept as-is.
        if not isinstance(edit, dict):
            return False
        kind = edit.get("type")
        return isinstance(kind, str) and "thinking" in kind

    kept = [edit for edit in edit_list if not _mentions_thinking(edit)]
    if kept:
        upstream_body["context_management"] = {**config, "edits": kept}
    else:
        upstream_body.pop("context_management", None)
    return upstream_body
|
|
174
|
+
|
|
175
|
+
|
|
176
|
+
def _strip_unsupported_thinking(
    upstream_body: dict[str, Any], model_id: str,
) -> dict[str, Any]:
    """Remove `thinking` (and the context_management edits that depend on
    it) when the upstream model is not thinking-capable.

    Bodies for thinking-capable models are returned unchanged.
    """
    if _model_supports_thinking(model_id):
        return upstream_body
    upstream_body.pop("thinking", None)
    return _strip_thinking_dependent_context_management(upstream_body)
|
|
185
|
+
|
|
186
|
+
|
|
187
|
+
def _strip_unsupported_fields(
    upstream_body: dict[str, Any], model_id: str,
) -> dict[str, Any]:
    """Strip the Claude-Code-extended request fields that the resolved
    upstream model does not accept.

    Thinking-related fields are handled first. Then, for models that are
    not effort-capable, `effort` is removed both from the top level and
    from inside `output_config`; an `output_config` left empty by that
    removal is dropped entirely.
    """
    upstream_body = _strip_unsupported_thinking(upstream_body, model_id)
    if _model_supports_effort(model_id):
        return upstream_body

    upstream_body.pop("effort", None)
    output_config = upstream_body.get("output_config")
    if not isinstance(output_config, dict) or "effort" not in output_config:
        return upstream_body

    # Rebuild rather than mutate so the caller's nested dict is untouched.
    remaining = {
        key: value for key, value in output_config.items() if key != "effort"
    }
    if remaining:
        upstream_body["output_config"] = remaining
    else:
        upstream_body.pop("output_config", None)
    return upstream_body
|
|
207
|
+
|
|
208
|
+
|
|
209
|
+
def _maybe_force_thinking(upstream_body: dict[str, Any]) -> dict[str, Any]:
    """Inject `thinking: {type: enabled}` if the caller didn't ask for it.

    Caudate's training depends on observing Opus's reasoning channel —
    not just the visible answer.

    Off by default: Claude Code already requests thinking when it wants
    it, and Anthropic rejects requests where a forced budget brushes
    against max_tokens (400s on title-gen / tab-completion calls).
    Opt in with COGNOS_FORCE_THINKING=1.

    Even when opted in, the body is returned unchanged when:
    - the caller already set `thinking` (respect explicit choice)
    - `max_tokens` is missing, zero, not a number, or too small to fit a
      thinking budget while leaving 1024 tokens for the answer

    When thinking is injected, a `temperature` other than 1 is reset to 1
    and `top_p` / `top_k` are removed. The body is modified in place and
    also returned.
    """
    if os.environ.get("COGNOS_FORCE_THINKING", "0") != "1":
        return upstream_body
    if "thinking" in upstream_body:
        return upstream_body
    try:
        max_tokens = int(upstream_body.get("max_tokens") or 0)
    except (TypeError, ValueError):
        # Malformed max_tokens: leave the request untouched and let the
        # upstream API report the validation error.
        return upstream_body
    # Need enough headroom: thinking + answer must both fit in max_tokens.
    # Skip unless we can guarantee a budget *and* leave 1024 tokens free.
    if not max_tokens or max_tokens < (_FORCE_THINKING_MIN_MAX_TOKENS + 1024):
        return upstream_body
    budget = min(_FORCE_THINKING_DEFAULT_BUDGET, max_tokens - 1024)
    # Defensive: only reachable if the module constants are lowered.
    if budget < 1024:
        return upstream_body
    upstream_body["thinking"] = {"type": "enabled", "budget_tokens": budget}
    if "temperature" in upstream_body and upstream_body["temperature"] != 1:
        upstream_body["temperature"] = 1
    upstream_body.pop("top_p", None)
    upstream_body.pop("top_k", None)
    return upstream_body
|
|
244
|
+
|
|
245
|
+
|
|
246
|
+
def _filter_forward_headers(request: Request) -> dict[str, str]:
    """Build the header dict to send upstream.

    Keeps only the incoming headers whose lower-cased name is in
    `_FORWARD_REQUEST_HEADERS`, fills in a default `anthropic-version`
    when none was forwarded, and always sets a JSON `content-type`.
    """
    forwarded: dict[str, str] = {
        name: value
        for name, value in request.headers.items()
        if name.lower() in _FORWARD_REQUEST_HEADERS
    }
    if "anthropic-version" not in forwarded:
        forwarded["anthropic-version"] = _ANTHROPIC_VERSION_DEFAULT
    forwarded["content-type"] = "application/json"
    return forwarded
|
|
255
|
+
|
|
256
|
+
|
|
257
|
+
def _parse_sse_event(raw_event: str) -> tuple[str | None, dict[str, Any] | None]:
    """Parse one SSE event block into ``(event_type, data)``.

    *raw_event* is the text of a single event, i.e. an ``event:`` line
    and one or more ``data:`` lines without the blank-line terminator.
    Multiple ``data:`` lines are joined with newlines before decoding.

    Returns:
        ``(event_type, data)``. ``event_type`` is None when there is no
        ``event:`` line. ``data`` is None when there is no ``data:``
        line, when the payload is not valid JSON, or when it decodes to
        something other than a JSON object.
    """
    event_type: str | None = None
    data_lines: list[str] = []
    for line in raw_event.splitlines():
        if line.startswith("event:"):
            event_type = line[6:].strip()
        elif line.startswith("data:"):
            data_lines.append(line[5:].strip())
    if not data_lines:
        return event_type, None
    try:
        data = json.loads("\n".join(data_lines))
    except (ValueError, RecursionError):
        # Malformed (or pathologically nested) payload: report "no data"
        # instead of raising.
        return event_type, None
    if not isinstance(data, dict):
        # Honour the declared return type: lists, strings and numbers are
        # not usable event payloads.
        return event_type, None
    return event_type, data
|
|
273
|
+
|
|
274
|
+
|
|
275
|
+
def _passthrough_error_event(message: str) -> bytes:
    """Encode *message* as a single Anthropic-style SSE ``error`` event."""
    payload = {"type": "error", "error": {"type": "api_error", "message": message}}
    return f"event: error\ndata: {json.dumps(payload)}\n\n".encode()


def _observe_passthrough_event(
    evt_type: str | None,
    data: dict[str, Any],
    middleware: CaudateMiddleware,
    turn_ctx: Any,
    block_types: dict[int, str],
    block_tool_names: dict[int, str],
    block_tool_inputs: dict[int, str],
) -> None:
    """Feed one parsed upstream SSE event to the Caudate middleware.

    The three dicts carry per-content-block state (keyed by block index)
    across calls: the block type, the tool name, and the tool-input JSON
    text accumulated so far.
    """
    if evt_type == "content_block_start":
        idx = int(data.get("index", -1))
        cb = data.get("content_block") or {}
        block_types[idx] = cb.get("type", "")
        if cb.get("type") == "tool_use":
            name = cb.get("name", "")
            block_tool_names[idx] = name
            block_tool_inputs[idx] = ""
            middleware.observe_tool_use(turn_ctx, name)
    elif evt_type == "content_block_delta":
        idx = int(data.get("index", -1))
        delta = data.get("delta") or {}
        dtype = delta.get("type")
        if dtype == "text_delta":
            middleware.observe_response_text(turn_ctx, delta.get("text", ""))
        elif dtype == "thinking_delta":
            middleware.observe_thinking(turn_ctx, delta.get("thinking", ""))
        elif dtype == "input_json_delta":
            # Accumulate the streamed JSON of a tool_use block so the
            # full input can be surfaced when the block closes.
            block_tool_inputs[idx] = (
                block_tool_inputs.get(idx, "") + (delta.get("partial_json") or "")
            )
    elif evt_type == "content_block_stop":
        idx = int(data.get("index", -1))
        if block_types.get(idx) == "tool_use":
            name = block_tool_names.get(idx, "")
            raw = block_tool_inputs.get(idx, "")
            if raw:
                # Surface the structured input (truncated to 1000 chars)
                # as part of the observed response text.
                middleware.observe_response_text(
                    turn_ctx, f"\n[tool_use {name}({raw[:1000]})]\n",
                )


async def _passthrough_anthropic_stream(
    *,
    upstream_body: dict[str, Any],
    headers: dict[str, str],
    middleware: CaudateMiddleware | None,
    turn_ctx: Any,
) -> AsyncIterator[bytes]:
    """Forward streaming /v1/messages to api.anthropic.com.

    Upstream bytes are yielded to the client unchanged so SSE event
    ordering and field shape are preserved exactly. When both
    *middleware* and *turn_ctx* are provided, each complete event is also
    parsed in-flight and handed to the middleware (text / thinking / tool
    deltas).

    An upstream status >= 400, or any exception while streaming, is
    reported to the client as a single SSE ``error`` event. The turn is
    always closed via ``middleware.end_turn`` with ``error`` set
    accordingly.
    """
    import codecs

    error_occurred = False
    observing = middleware is not None and turn_ctx is not None
    block_types: dict[int, str] = {}  # index -> block type ("text"/"thinking"/"tool_use")
    block_tool_names: dict[int, str] = {}  # index -> tool name (for stop-event lookup)
    block_tool_inputs: dict[int, str] = {}  # index -> accumulated input_json string
    # Chunk boundaries are arbitrary and may split a multi-byte UTF-8
    # character; an incremental decoder carries the partial bytes over
    # instead of replacing them with U+FFFD.
    decoder = codecs.getincrementaldecoder("utf-8")(errors="replace")
    pending = ""

    try:
        async with httpx.AsyncClient(timeout=httpx.Timeout(300.0, connect=15.0)) as client:
            async with client.stream(
                "POST", ANTHROPIC_API_URL,
                headers=headers,
                json=upstream_body,
            ) as resp:
                if resp.status_code >= 400:
                    error_occurred = True
                    body_bytes = await resp.aread()
                    yield _passthrough_error_event(
                        body_bytes.decode("utf-8", errors="replace")
                    )
                    return

                async for chunk in resp.aiter_bytes():
                    if not chunk:
                        continue
                    yield chunk
                    if not observing:
                        # Nothing consumes the parsed events; skip the tee.
                        continue
                    # Tee into Caudate. Accumulate until we see a blank
                    # line, which terminates one SSE event.
                    pending += decoder.decode(chunk)
                    while "\n\n" in pending:
                        raw_event, pending = pending.split("\n\n", 1)
                        if not raw_event.strip():
                            continue
                        evt_type, data = _parse_sse_event(raw_event)
                        if data is None:
                            continue
                        try:
                            _observe_passthrough_event(
                                evt_type, data, middleware, turn_ctx,
                                block_types, block_tool_names, block_tool_inputs,
                            )
                        except Exception as e:
                            # Observation is best-effort and must never
                            # break the client's stream.
                            logger.debug(
                                "caudate observe (passthrough stream) failed: %s", e
                            )
    except Exception as e:
        logger.exception("Anthropic passthrough stream failed")
        error_occurred = True
        yield _passthrough_error_event(str(e))
    finally:
        if observing:
            middleware.end_turn(turn_ctx, error=error_occurred)
|
|
380
|
+
|
|
381
|
+
|
|
382
|
+
def _observe_passthrough_message(
    data: dict[str, Any], middleware: CaudateMiddleware, turn_ctx: Any,
) -> None:
    """Feed every content block of a complete upstream message to the
    Caudate middleware (text, thinking, and tool_use blocks)."""
    for block in data.get("content") or []:
        btype = block.get("type")
        if btype == "text":
            middleware.observe_response_text(turn_ctx, block.get("text", ""))
        elif btype == "thinking":
            middleware.observe_thinking(turn_ctx, block.get("thinking", ""))
        elif btype == "tool_use":
            name = block.get("name", "")
            middleware.observe_tool_use(turn_ctx, name)
            # Surface the tool's structured input (truncated to 1000
            # chars) so the observer sees the intent, not just the name.
            try:
                inp = json.dumps(block.get("input") or {}, ensure_ascii=False)[:1000]
            except (TypeError, ValueError):
                inp = str(block.get("input"))[:1000]
            if inp:
                middleware.observe_response_text(
                    turn_ctx, f"\n[tool_use {name}({inp})]\n",
                )


async def _passthrough_anthropic_nonstream(
    *,
    upstream_body: dict[str, Any],
    headers: dict[str, str],
    middleware: CaudateMiddleware | None,
    turn_ctx: Any,
) -> JSONResponse:
    """Forward non-streaming /v1/messages and observe the response.

    Returns:
        The upstream JSON body with the upstream status code on success.
        For an upstream status >= 400, an Anthropic-shaped error object
        carrying the upstream response text, with the same status code.

    Raises:
        HTTPException: 502 when the upstream call or response handling
            fails; the original exception is chained as the cause.

    When both *middleware* and *turn_ctx* are provided, the turn is closed
    exactly once via ``middleware.end_turn``.
    """
    observing = middleware is not None and turn_ctx is not None
    turn_ended = False

    def _end_turn(error: bool) -> None:
        # Guarded so a failure after the success-path end_turn cannot
        # close the same turn a second time from the except handler.
        nonlocal turn_ended
        if observing and not turn_ended:
            turn_ended = True
            middleware.end_turn(turn_ctx, error=error)

    try:
        async with httpx.AsyncClient(timeout=httpx.Timeout(300.0, connect=15.0)) as client:
            resp = await client.post(
                ANTHROPIC_API_URL, headers=headers, json=upstream_body,
            )
            if resp.status_code >= 400:
                error_response = JSONResponse(
                    status_code=resp.status_code,
                    content={"type": "error", "error": {
                        "type": "api_error",
                        "message": resp.text,
                    }},
                )
                _end_turn(error=True)
                return error_response
            data = resp.json()
            if observing:
                try:
                    _observe_passthrough_message(data, middleware, turn_ctx)
                except Exception as e:
                    # Observation is best-effort and must never fail the
                    # client's request.
                    logger.debug(
                        "caudate observe (passthrough nonstream) failed: %s", e
                    )
            # Build the response before closing the turn so a rendering
            # failure is recorded as an errored turn.
            response = JSONResponse(content=data, status_code=resp.status_code)
            _end_turn(error=False)
            return response
    except Exception as e:
        logger.exception("Anthropic passthrough nonstream failed")
        _end_turn(error=True)
        raise HTTPException(502, f"Anthropic upstream error: {e}") from e
|
|
437
|
+
|
|
438
|
+
logger = logging.getLogger(__name__)
|
|
439
|
+
|
|
440
|
+
|
|
441
|
+
# ---- Translation helpers ---------------------------------------------
|
|
442
|
+
|
|
443
|
+
|
|
444
|
+
def _collapse_content_parts(parts: list[dict[str, Any]]) -> Any:
    """Return the bare string for a lone text part, else *parts* unchanged."""
    if len(parts) == 1 and parts[0]["type"] == "text":
        return parts[0]["text"]
    return parts


def _translate_anthropic_to_internal(
    body: dict[str, Any],
) -> tuple[list[dict[str, Any]], list[dict[str, Any]] | None]:
    """Anthropic /v1/messages body → Cognos messages list + tools list.

    Returns (messages, tools_or_None). Messages are in OpenAI/LiteLLM
    shape since that's what `LLMProvider.chat()` expects underneath.

    Translation rules, as implemented below:
    - `system` (string, or list of text blocks concatenated) becomes a
      leading "system" message when non-empty.
    - String message content is passed through with its role.
    - In list content, `text` and `image` blocks become content parts, and
      `tool_use` blocks become OpenAI `tool_calls` on the assistant
      message. `tool_result` blocks become separate "tool" messages,
      emitted *before* any user text from the same message so they
      directly follow the assistant message whose `tool_calls` they
      answer.
    - Message entries that are not dicts, and content that is neither a
      string nor a list, are skipped.
    - `tools` entries are converted to OpenAI function-tool definitions;
      a missing `input_schema` defaults to an empty object schema.
    """
    raw_messages = body.get("messages") or []
    raw_system = body.get("system")
    raw_tools = body.get("tools") or []

    out: list[dict[str, Any]] = []

    # System prompt — can be a string or a list of text blocks
    if isinstance(raw_system, str) and raw_system:
        out.append({"role": "system", "content": raw_system})
    elif isinstance(raw_system, list):
        text = "".join(
            b.get("text", "") for b in raw_system
            if isinstance(b, dict) and b.get("type") == "text"
        )
        if text:
            out.append({"role": "system", "content": text})

    for msg in raw_messages:
        if not isinstance(msg, dict):
            # Malformed entry: skip it rather than crash on `.get`.
            continue
        role = msg.get("role")
        content = msg.get("content")
        if isinstance(content, str):
            out.append({"role": role, "content": content})
            continue
        if not isinstance(content, list):
            continue

        # Multi-block content — translate each block
        text_parts: list[dict[str, Any]] = []
        tool_uses: list[dict[str, Any]] = []
        tool_results: list[dict[str, Any]] = []

        for block in content:
            if not isinstance(block, dict):
                continue
            btype = block.get("type")
            if btype == "text":
                text_parts.append({"type": "text", "text": block.get("text", "")})
            elif btype == "image":
                src = block.get("source") or {}
                if src.get("type") == "base64":
                    media_type = src.get("media_type", "image/png")
                    data = src.get("data", "")
                    text_parts.append({
                        "type": "image_url",
                        "image_url": {"url": f"data:{media_type};base64,{data}"},
                    })
                elif src.get("type") == "url":
                    text_parts.append({
                        "type": "image_url",
                        "image_url": {"url": src.get("url", "")},
                    })
            elif btype == "tool_use":
                tool_uses.append({
                    "id": block.get("id", ""),
                    "type": "function",
                    "function": {
                        "name": block.get("name", ""),
                        "arguments": json.dumps(block.get("input") or {}),
                    },
                })
            elif btype == "tool_result":
                # Anthropic puts tool results in user messages; OpenAI
                # uses a separate "tool" role. Only text parts of a
                # list-valued result are kept.
                rc = block.get("content")
                if isinstance(rc, list):
                    rc = "\n".join(
                        b.get("text", "") for b in rc
                        if isinstance(b, dict) and b.get("type") == "text"
                    )
                tool_results.append({
                    "role": "tool",
                    "tool_call_id": block.get("tool_use_id", ""),
                    "content": str(rc or ""),
                })

        if role == "assistant":
            # Assistant message with tool_calls (OpenAI shape)
            entry: dict[str, Any] = {
                "role": "assistant",
                "content": _collapse_content_parts(text_parts) if text_parts else "",
            }
            if tool_uses:
                entry["tool_calls"] = tool_uses
            out.append(entry)
        else:
            # Tool results go first so each "tool" message directly follows
            # the assistant message carrying the matching tool_calls; the
            # user's own text/images come after.
            out.extend(tool_results)
            if text_parts:
                out.append({
                    "role": "user",
                    "content": _collapse_content_parts(text_parts),
                })

    tools_translated: list[dict[str, Any]] | None = None
    if raw_tools:
        tools_translated = [
            {
                "type": "function",
                "function": {
                    "name": t.get("name", ""),
                    "description": t.get("description", ""),
                    "parameters": t.get("input_schema") or {
                        "type": "object", "properties": {}, "required": [],
                    },
                },
            }
            for t in raw_tools if isinstance(t, dict)
        ]

    return out, tools_translated
|
|
573
|
+
|
|
574
|
+
|
|
575
|
+
def _build_anthropic_response(
    *,
    thinking: str = "",
    text: str,
    tool_calls: list[ToolUseBlock],
    model: str,
    usage: dict[str, int],
    stop_reason: str | None,
) -> dict[str, Any]:
    """Build the non-streaming /v1/messages response.

    Content blocks are emitted in the order thinking, text, tool_use;
    empty thinking/text are omitted, and a response with no blocks at all
    gets a single empty text block.

    *stop_reason* may be an OpenAI-style finish reason ("stop", "length",
    "tool_calls") or an Anthropic-native one ("end_turn", "max_tokens",
    "tool_use"). Missing or unknown values become "end_turn", and any
    response that carries tool calls reports "tool_use" instead of
    "end_turn".

    *usage* is read with OpenAI-style keys (`prompt_tokens`,
    `completion_tokens`); missing keys count as 0.
    """
    blocks: list[dict[str, Any]] = []
    # Thinking block first so clients render the reasoning above the answer.
    if thinking:
        blocks.append({"type": "thinking", "thinking": thinking})
    if text:
        blocks.append({"type": "text", "text": text})
    for tc in tool_calls:
        blocks.append({
            "type": "tool_use",
            # Synthesize an id when the provider did not supply one.
            "id": tc.id or f"toolu_{uuid.uuid4().hex[:12]}",
            "name": tc.name,
            "input": tc.input or {},
        })

    # Translate stop reason. Anthropic-native values pass through
    # unchanged, so a truncated reply reported as "max_tokens" is not
    # mislabelled as a normal "end_turn".
    stop_map = {
        "stop": "end_turn", "length": "max_tokens",
        "tool_calls": "tool_use", "tool_use": "tool_use",
        "end_turn": "end_turn", "max_tokens": "max_tokens",
    }
    anthropic_stop = stop_map.get(stop_reason or "stop", "end_turn")
    if tool_calls and anthropic_stop == "end_turn":
        anthropic_stop = "tool_use"

    return {
        "id": f"msg_{uuid.uuid4().hex[:24]}",
        "type": "message",
        "role": "assistant",
        "model": model,
        "content": blocks or [{"type": "text", "text": ""}],
        "stop_reason": anthropic_stop,
        "stop_sequence": None,
        "usage": {
            "input_tokens": usage.get("prompt_tokens", 0),
            "output_tokens": usage.get("completion_tokens", 0),
        },
    }
|
|
622
|
+
|
|
623
|
+
|
|
624
|
+
# ---- Streaming SSE generator -----------------------------------------
|
|
625
|
+
|
|
626
|
+
|
|
627
|
+
async def _stream_anthropic_events(
    llm: LLMProvider,
    messages: list[dict[str, Any]],
    tools: list[dict[str, Any]] | None,
    max_tokens: int,
    temperature: float | None,
    requested_model: str,
    middleware: CaudateMiddleware | None = None,
    turn_ctx: Any = None,
) -> AsyncIterator[bytes]:
    """Translate a Cognos LLM stream into Anthropic-style SSE frames.

    Emitted sequence:
        message_start,
        then one or more content blocks, each strictly
        content_block_start -> content_block_delta+ -> content_block_stop,
        then message_delta, message_stop.

    Content blocks are never interleaved: whenever the upstream switches
    between thinking, text and tool_use, the currently open block is stopped
    first and the next block takes a fresh, monotonically increasing index.

    Args:
        llm: Provider whose ``stream(...)`` yields events with a ``type`` of
            ``thinking_delta``, ``text_delta``, ``tool_use_end`` or
            ``message_stop``.
        messages: Internal-format conversation passed to the provider.
        tools: Internal-format tool definitions, or ``None``.
        max_tokens: Generation limit forwarded to the provider.
        temperature: Sampling temperature forwarded to the provider.
        requested_model: Model id echoed in ``message_start``.
        middleware: Optional Caudate observer; only used when ``turn_ctx``
            is also given.
        turn_ctx: Turn handle for ``middleware``.

    Yields:
        UTF-8 encoded SSE frames. On upstream failure a single ``error``
        frame is emitted and the stream ends without ``message_stop``.
    """
    msg_id = f"msg_{uuid.uuid4().hex[:24]}"
    observing = middleware is not None and turn_ctx is not None

    def _sse(event: str, data: dict[str, Any]) -> bytes:
        return f"event: {event}\ndata: {json.dumps(data)}\n\n".encode()

    yield _sse("message_start", {
        "type": "message_start",
        "message": {
            "id": msg_id, "type": "message", "role": "assistant",
            "model": requested_model, "content": [],
            "stop_reason": None, "stop_sequence": None,
            "usage": {"input_tokens": 0, "output_tokens": 1},
        },
    })

    # At most one streamed block ("thinking" or "text") is open at a time.
    open_kind: str | None = None
    open_index = 0
    next_index = 0
    text_parts: list[str] = []
    saw_tool_use = False
    stop_reason: str | None = None

    def _switch_block(kind: str | None) -> list[bytes]:
        """Make `kind` the open block; return the stop/start frames needed.

        ``kind=None`` just closes whatever is open.
        """
        nonlocal open_kind, open_index, next_index
        frames: list[bytes] = []
        if open_kind == kind:
            return frames
        if open_kind is not None:
            frames.append(_sse("content_block_stop", {
                "type": "content_block_stop", "index": open_index,
            }))
            open_kind = None
        if kind is not None:
            open_index = next_index
            next_index += 1
            open_kind = kind
            frames.append(_sse("content_block_start", {
                "type": "content_block_start",
                "index": open_index,
                # {"type": "thinking", "thinking": ""} or {"type": "text", "text": ""}
                "content_block": {"type": kind, kind: ""},
            }))
        return frames

    try:
        async for event in llm.stream(
            messages=messages, tools=tools,
            max_tokens=max_tokens, temperature=temperature,
        ):
            if event.type == "thinking_delta" and event.delta:
                # Thinking is fed to the middleware too: it is signal for
                # tool-intent inference even though it is not the answer.
                if observing:
                    middleware.observe_thinking(turn_ctx, event.delta)
                for frame in _switch_block("thinking"):
                    yield frame
                yield _sse("content_block_delta", {
                    "type": "content_block_delta",
                    "index": open_index,
                    "delta": {"type": "thinking_delta", "thinking": event.delta},
                })
            elif event.type == "text_delta" and event.delta:
                for frame in _switch_block("text"):
                    yield frame
                text_parts.append(event.delta)
                if observing:
                    middleware.observe_response_text(turn_ctx, event.delta)
                yield _sse("content_block_delta", {
                    "type": "content_block_delta",
                    "index": open_index,
                    "delta": {"type": "text_delta", "text": event.delta},
                })
            elif event.type == "tool_use_end":
                # Close any open thinking/text block before the tool block.
                for frame in _switch_block(None):
                    yield frame
                # Cognos emits the whole tool call at once. Anthropic wants
                # content_block_start + input_json_delta + content_block_stop.
                idx = next_index
                next_index += 1
                tc = ToolUseBlock(
                    id=event.tool_use_id or f"toolu_{uuid.uuid4().hex[:12]}",
                    name=event.tool_name or "",
                    input=event.tool_input or {},
                )
                saw_tool_use = True
                if observing:
                    middleware.observe_tool_use(turn_ctx, tc.name)
                yield _sse("content_block_start", {
                    "type": "content_block_start",
                    "index": idx,
                    "content_block": {
                        "type": "tool_use",
                        "id": tc.id,
                        "name": tc.name,
                        "input": {},
                    },
                })
                yield _sse("content_block_delta", {
                    "type": "content_block_delta",
                    "index": idx,
                    "delta": {
                        "type": "input_json_delta",
                        "partial_json": json.dumps(tc.input),
                    },
                })
                yield _sse("content_block_stop", {
                    "type": "content_block_stop", "index": idx,
                })
            elif event.type == "message_stop":
                stop_reason = event.stop_reason
    except Exception as e:
        logger.exception("Stream upstream failed")
        # Close the turn BEFORE yielding: if the client has gone away the
        # generator may never be resumed after the yield.
        if observing:
            middleware.end_turn(turn_ctx, error=True)
        # Emit an error event so the client knows.
        yield _sse("error", {
            "type": "error",
            "error": {"type": "api_error", "message": str(e)},
        })
        return

    # Stop whichever block is still open.
    for frame in _switch_block(None):
        yield frame

    # Translate stop reason.
    stop_map = {
        "stop": "end_turn", "length": "max_tokens",
        "tool_calls": "tool_use", "tool_use": "tool_use",
    }
    anthropic_stop = stop_map.get(stop_reason or "stop", "end_turn")
    if saw_tool_use and anthropic_stop == "end_turn":
        anthropic_stop = "tool_use"

    # Approximate output token count (whitespace-separated words); the
    # exact figure is not available for streaming.
    output_tokens = max(1, len("".join(text_parts).split()))

    yield _sse("message_delta", {
        "type": "message_delta",
        "delta": {"stop_reason": anthropic_stop, "stop_sequence": None},
        "usage": {"output_tokens": output_tokens},
    })
    yield _sse("message_stop", {"type": "message_stop"})

    # Close the Caudate turn now that the stream completed successfully.
    if observing:
        middleware.end_turn(turn_ctx, error=False)
|
|
812
|
+
|
|
813
|
+
|
|
814
|
+
# ---- Dual-brain arbitration on /v1/messages -------------------------
|
|
815
|
+
# Mirror of the Open-WebUI arbitration in api/openai_compat.py, but the
|
|
816
|
+
# response/stream shape is Anthropic, not OpenAI. Both brains are run
|
|
817
|
+
# in parallel and a heuristic scorer picks the winner; both drafts are
|
|
818
|
+
# captured for Caudate's preference-learning corpus
|
|
819
|
+
# (data/nn/arbitrations.jsonl). This is the substrate for Phase 4 of
|
|
820
|
+
# CAUDATE_EVOLUTION.md (the conductor) — same data shape regardless of
|
|
821
|
+
# which client speaks to Cognos.
|
|
822
|
+
|
|
823
|
+
|
|
824
|
+
def _anthropic_response_from_llm_response(
    *,
    resp: Any,  # LLMResponse
    requested_model: str,
) -> dict[str, Any]:
    """Translate an internal `LLMResponse` into an Anthropic /v1/messages body.

    When the response has no visible content but does have thinking, the
    thinking is surfaced as the text (with a marker prefix) so the user
    always sees something. In that case no separate thinking block is
    emitted, which avoids showing the same reasoning twice.

    Args:
        resp: Object exposing ``content``, ``tool_calls``, ``usage`` and
            ``stop_reason``; ``thinking`` is optional.
        requested_model: Model id echoed back to the client.

    Returns:
        The dict produced by ``_build_anthropic_response``.
    """
    text = resp.content or ""
    thinking = getattr(resp, "thinking", "") or ""
    if not text and thinking:
        # Promote thinking to the visible text and drop the thinking block.
        text = f"[thinking — model didn't finish before max_tokens]\n\n{thinking}"
        thinking = ""
    return _build_anthropic_response(
        thinking=thinking,
        text=text,
        tool_calls=resp.tool_calls or [],
        model=requested_model,
        usage=resp.usage or {},
        stop_reason=resp.stop_reason,
    )
|
|
846
|
+
|
|
847
|
+
|
|
848
|
+
async def _anthropic_arbitrate_nonstream(
    *,
    body: dict[str, Any],
    internal_msgs: list[dict[str, Any]],
    internal_tools: list[dict[str, Any]] | None,
    max_tokens: int,
    temperature: float | None,
    requested_model: str,
    agent: Any,
    middleware: CaudateMiddleware,
    turn_ctx: Any,
) -> JSONResponse:
    """Run dual-brain arbitration and return the winner as Anthropic JSON.

    When ``agent.llm`` is a ``DualLLMProvider`` the request goes through
    ``_dual_brain_arbitrate``; otherwise it falls back to a single
    ``agent.llm.chat`` call. Either way the call runs inside
    ``subscription_auth_scope()`` so the Anthropic-side brain can use the
    user's OAuth token (it would 401 with x-api-key otherwise).

    Args:
        body: Original request body (currently unused; kept for signature
            parity with the streaming variant).
        internal_msgs: Internal-format conversation.
        internal_tools: Internal-format tool definitions, or ``None``.
        max_tokens: Generation limit.
        temperature: Sampling temperature, or ``None``.
        requested_model: Model id echoed back to the client.
        agent: Agent whose ``llm`` attribute is the provider to call.
        middleware: Caudate observer for this turn.
        turn_ctx: Turn handle previously opened on ``middleware``.

    Returns:
        ``JSONResponse`` wrapping the Anthropic /v1/messages body.

    Raises:
        HTTPException: 500 when the LLM call fails. The turn is closed with
            ``error=True`` first, matching the local non-streaming path.
    """
    from api.openai_compat import _dual_brain_arbitrate
    from core.anthropic_auth import subscription_auth_scope
    from llm.router import DualLLMProvider

    try:
        with subscription_auth_scope():
            if isinstance(agent.llm, DualLLMProvider):
                resp = await _dual_brain_arbitrate(
                    llm=agent.llm,
                    messages=internal_msgs, tools=internal_tools,
                    max_tokens=max_tokens, temperature=temperature,
                    middleware=middleware, turn_ctx=turn_ctx,
                )
            else:
                # No dual-brain wired — fall back to a single chat call.
                resp = await agent.llm.chat(
                    messages=internal_msgs, tools=internal_tools,
                    max_tokens=max_tokens, temperature=temperature,
                )
    except Exception as e:
        logger.exception("LLM call failed")
        # Never leave the turn open when the upstream call blows up.
        middleware.end_turn(turn_ctx, error=True)
        raise HTTPException(500, f"LLM error: {e}") from e

    # Record the chosen response on the normal observation path.
    # _dual_brain_arbitrate is passed the middleware for its own
    # draft-level recording.
    middleware.observe_response_text(turn_ctx, resp.content or "")
    if getattr(resp, "thinking", None):
        middleware.observe_thinking(turn_ctx, resp.thinking)
    for tc in resp.tool_calls or []:
        middleware.observe_tool_use(turn_ctx, tc.name)
    middleware.end_turn(turn_ctx, error=False)

    return JSONResponse(_anthropic_response_from_llm_response(
        resp=resp, requested_model=requested_model,
    ))
|
|
897
|
+
|
|
898
|
+
|
|
899
|
+
async def _anthropic_arbitrate_stream(
    *,
    body: dict[str, Any],
    internal_msgs: list[dict[str, Any]],
    internal_tools: list[dict[str, Any]] | None,
    max_tokens: int,
    temperature: float | None,
    requested_model: str,
    agent: Any,
    middleware: CaudateMiddleware,
    turn_ctx: Any,
) -> AsyncIterator[bytes]:
    """Streaming arbitration in Anthropic SSE format.

    Dual-brain path: one brain is streamed LIVE so the user sees text
    within seconds, while the other runs in parallel as a buffered
    background draft for the preference corpus. This avoids the long
    blackout that comes from buffering both before any data flows.

    Single-brain fallback (``agent.llm`` is not a ``DualLLMProvider``):
    one ``chat`` call is made and replayed as a fake stream, including any
    tool calls and the translated stop reason.

    Content blocks are always emitted sequentially (start, deltas, stop)
    with monotonically increasing indices.

    Args:
        body: Original request body (currently unused).
        internal_msgs: Internal-format conversation.
        internal_tools: Internal-format tool definitions, or ``None``.
        max_tokens: Generation limit.
        temperature: Sampling temperature, or ``None``.
        requested_model: Model id echoed in ``message_start``.
        agent: Agent whose ``llm`` attribute is the provider.
        middleware: Caudate observer for this turn.
        turn_ctx: Turn handle previously opened on ``middleware``.

    Yields:
        UTF-8 encoded SSE frames. On failure of the visible leg a single
        ``error`` frame is emitted and the stream ends.
    """
    import asyncio
    from api.openai_compat import _score_draft
    from core.anthropic_auth import subscription_auth_scope
    from llm.router import DualLLMProvider

    msg_id = f"msg_{uuid.uuid4().hex[:24]}"
    stop_map = {
        "stop": "end_turn", "length": "max_tokens",
        "tool_calls": "tool_use", "tool_use": "tool_use",
    }

    def _sse(event_name: str, data: dict[str, Any]) -> bytes:
        return f"event: {event_name}\ndata: {json.dumps(data)}\n\n".encode()

    def _message_start() -> bytes:
        return _sse("message_start", {
            "type": "message_start",
            "message": {
                "id": msg_id, "type": "message", "role": "assistant",
                "model": requested_model, "content": [],
                "stop_reason": None, "stop_sequence": None,
                "usage": {"input_tokens": 0, "output_tokens": 1},
            },
        })

    def _tool_use_frames(idx: int, tool_id: str, name: str,
                         tool_input: Any) -> list[bytes]:
        """start + input_json_delta + stop for one complete tool call."""
        return [
            _sse("content_block_start", {
                "type": "content_block_start",
                "index": idx,
                "content_block": {
                    "type": "tool_use", "id": tool_id,
                    "name": name, "input": {},
                },
            }),
            _sse("content_block_delta", {
                "type": "content_block_delta",
                "index": idx,
                "delta": {
                    "type": "input_json_delta",
                    "partial_json": json.dumps(tool_input),
                },
            }),
            _sse("content_block_stop", {
                "type": "content_block_stop", "index": idx,
            }),
        ]

    def _closing_frames(raw_stop: str | None, has_tools: bool,
                        output_tokens: int) -> list[bytes]:
        """message_delta (translated stop reason) + message_stop."""
        anthropic_stop = stop_map.get(raw_stop or "stop", "end_turn")
        if has_tools and anthropic_stop == "end_turn":
            anthropic_stop = "tool_use"
        return [
            _sse("message_delta", {
                "type": "message_delta",
                "delta": {"stop_reason": anthropic_stop, "stop_sequence": None},
                "usage": {"output_tokens": output_tokens},
            }),
            _sse("message_stop", {"type": "message_stop"}),
        ]

    # ---- No dual-brain: single chat call replayed as a fake stream ----
    if not isinstance(agent.llm, DualLLMProvider):
        try:
            with subscription_auth_scope():
                resp = await agent.llm.chat(
                    messages=internal_msgs, tools=internal_tools,
                    max_tokens=max_tokens, temperature=temperature,
                )
        except Exception as e:
            logger.exception("anthropic-arbitrate single fallback failed")
            middleware.end_turn(turn_ctx, error=True)
            yield _sse("error", {
                "type": "error",
                "error": {"type": "api_error", "message": str(e)},
            })
            return
        text = resp.content or ""
        thinking = getattr(resp, "thinking", "") or ""
        if not text and thinking:
            text = f"[thinking — model didn't finish before max_tokens]\n\n{thinking}"
        fallback_tools = resp.tool_calls or []
        middleware.observe_response_text(turn_ctx, text)
        for tc in fallback_tools:
            middleware.observe_tool_use(turn_ctx, tc.name)
        middleware.end_turn(turn_ctx, error=False)

        yield _message_start()
        idx = 0
        if text:
            yield _sse("content_block_start", {
                "type": "content_block_start", "index": idx,
                "content_block": {"type": "text", "text": ""},
            })
            step = 80
            for i in range(0, len(text), step):
                yield _sse("content_block_delta", {
                    "type": "content_block_delta", "index": idx,
                    "delta": {"type": "text_delta", "text": text[i:i+step]},
                })
            yield _sse("content_block_stop", {
                "type": "content_block_stop", "index": idx,
            })
            idx += 1
        # Tool calls must reach the client too, otherwise the turn silently
        # loses them.
        for tc in fallback_tools:
            for frame in _tool_use_frames(
                idx,
                tc.id or f"toolu_{uuid.uuid4().hex[:12]}",
                tc.name,
                tc.input or {},
            ):
                yield frame
            idx += 1
        usage = getattr(resp, "usage", None) or {}
        out_tokens = usage.get("completion_tokens") or max(1, len(text.split()))
        for frame in _closing_frames(
            getattr(resp, "stop_reason", None), bool(fallback_tools), out_tokens,
        ):
            yield frame
        return

    # ---- Dual-brain path ----------------------------------------------
    # Stream the FAST-RESPONDING brain live and run the other as a buffered
    # background draft. A thinking model blocks visible content while it
    # reasons, so the *visible* stream is routed to the non-thinking brain
    # when exactly one of the two looks like a thinking model.
    thinking_hints = ("kimi", "deepseek", "qwen3", "o1", "o3")
    s1, s2 = agent.llm.router.fast, agent.llm.router.slow
    s1_thinks = any(h in (s1.model or "").lower() for h in thinking_hints)
    s2_thinks = any(h in (s2.model or "").lower() for h in thinking_hints)
    if s1_thinks and not s2_thinks:
        stream_provider, bg_provider = s2, s1
        stream_label = "slow"
    else:
        stream_provider, bg_provider = s1, s2
        stream_label = "fast"

    # Kick off the background draft.
    bg_task = asyncio.create_task(
        _slow_with_subscription_scope(
            bg_provider, internal_msgs, internal_tools,
            max_tokens, temperature,
        )
    )

    # message_start event — arrives immediately.
    yield _message_start()

    stream_text_buf: list[str] = []
    stream_tool_calls: list[ToolUseBlock] = []
    # At most one streamed block ("thinking" or "text") is open at a time.
    open_kind: str | None = None
    open_index = 0
    next_index = 0
    stop_reason: str | None = None

    def _switch_block(kind: str | None) -> list[bytes]:
        """Make `kind` the open block; return the stop/start frames needed.

        ``kind=None`` just closes whatever is open.
        """
        nonlocal open_kind, open_index, next_index
        frames: list[bytes] = []
        if open_kind == kind:
            return frames
        if open_kind is not None:
            frames.append(_sse("content_block_stop", {
                "type": "content_block_stop", "index": open_index,
            }))
            open_kind = None
        if kind is not None:
            open_index = next_index
            next_index += 1
            open_kind = kind
            frames.append(_sse("content_block_start", {
                "type": "content_block_start",
                "index": open_index,
                # {"type": "thinking", "thinking": ""} or {"type": "text", "text": ""}
                "content_block": {"type": kind, kind: ""},
            }))
        return frames

    try:
        with subscription_auth_scope():
            async for event in stream_provider.stream(
                messages=internal_msgs, tools=internal_tools,
                max_tokens=max_tokens, temperature=temperature,
            ):
                if event.type == "thinking_delta" and event.delta:
                    middleware.observe_thinking(turn_ctx, event.delta)
                    for frame in _switch_block("thinking"):
                        yield frame
                    yield _sse("content_block_delta", {
                        "type": "content_block_delta",
                        "index": open_index,
                        "delta": {"type": "thinking_delta", "thinking": event.delta},
                    })
                elif event.type == "text_delta" and event.delta:
                    for frame in _switch_block("text"):
                        yield frame
                    stream_text_buf.append(event.delta)
                    middleware.observe_response_text(turn_ctx, event.delta)
                    yield _sse("content_block_delta", {
                        "type": "content_block_delta",
                        "index": open_index,
                        "delta": {"type": "text_delta", "text": event.delta},
                    })
                elif event.type == "tool_use_end":
                    # Close any open thinking/text block before the tool block.
                    for frame in _switch_block(None):
                        yield frame
                    idx = next_index
                    next_index += 1
                    tc = ToolUseBlock(
                        id=event.tool_use_id or f"toolu_{uuid.uuid4().hex[:12]}",
                        name=event.tool_name or "",
                        input=event.tool_input or {},
                    )
                    stream_tool_calls.append(tc)
                    middleware.observe_tool_use(turn_ctx, tc.name)
                    for frame in _tool_use_frames(idx, tc.id, tc.name, tc.input):
                        yield frame
                elif event.type == "message_stop":
                    stop_reason = event.stop_reason
    except Exception as e:
        logger.exception("stream-leg failed in arbitrate")
        # Clean up BEFORE yielding: if the client has gone away the
        # generator may never be resumed after the yield.
        bg_task.cancel()
        middleware.end_turn(turn_ctx, error=True)
        yield _sse("error", {
            "type": "error",
            "error": {"type": "api_error", "message": str(e)},
        })
        return

    # Stop whichever block is still open.
    for frame in _switch_block(None):
        yield frame

    # Approximate output tokens as whitespace-separated words.
    output_tokens = max(1, len("".join(stream_text_buf).split()))
    for frame in _closing_frames(stop_reason, bool(stream_tool_calls), output_tokens):
        yield frame

    # Background: wait for the bg brain to finish, score both, log the
    # arbitration. The user has already seen the streamed brain; this is
    # purely for the preference corpus.
    try:
        bg_resp = await bg_task
    except Exception as e:
        logger.warning(f"bg-draft failed: {e}")
        bg_resp = None

    try:
        streamed_text = "".join(stream_text_buf)

        # Minimal stand-in carrying the attributes set below for _score_draft.
        class _FakeResp:
            content = streamed_text
            thinking = ""
            tool_calls = stream_tool_calls

        streamed_score = _score_draft(_FakeResp())  # type: ignore[arg-type]
        bg_score = _score_draft(bg_resp) if bg_resp else 0.0
        bg_text = (bg_resp.content or "") if bg_resp else ""
        if stream_label == "fast":
            fast_text, fast_score = streamed_text, streamed_score
            slow_text, slow_score = bg_text, bg_score
        else:
            slow_text, slow_score = streamed_text, streamed_score
            fast_text, fast_score = bg_text, bg_score
        # Near-ties (within 0.02) go to the fast brain.
        winner = ("fast" if fast_score >= slow_score
                  or abs(fast_score - slow_score) < 0.02
                  else "slow")
        middleware.observe_arbitration(
            turn_ctx,
            fast_text=fast_text,
            slow_text=slow_text,
            fast_score=fast_score,
            slow_score=slow_score,
            winner=winner,
            fast_model=s1.model,
            slow_model=s2.model,
        )
    except Exception as e:
        # Best effort: a logging failure must not fail the turn.
        logger.debug(f"arbitration log failed: {e}")
    middleware.end_turn(turn_ctx, error=False)
|
|
1183
|
+
|
|
1184
|
+
|
|
1185
|
+
async def _slow_with_subscription_scope(
    provider, messages, tools, max_tokens, temperature,
):
    """Await ``provider.chat(...)`` inside the subscription auth scope.

    The scope lets anthropic/* models authenticate via OAuth. The streaming
    arbitration path uses this helper for its buffered background draft.

    Returns:
        Whatever ``provider.chat`` returns.
    """
    from core.anthropic_auth import subscription_auth_scope

    chat_kwargs = {
        "messages": messages,
        "tools": tools,
        "max_tokens": max_tokens,
        "temperature": temperature,
    }
    with subscription_auth_scope():
        draft = await provider.chat(**chat_kwargs)
        return draft
|
|
1197
|
+
|
|
1198
|
+
|
|
1199
|
+
# ---- FastAPI router --------------------------------------------------
|
|
1200
|
+
|
|
1201
|
+
|
|
1202
|
+
def build_router() -> APIRouter:
|
|
1203
|
+
"""Build the /v1/messages router. Uses a process-wide CognosAgent so
|
|
1204
|
+
state (memory, Caudate, mood) carries across requests."""
|
|
1205
|
+
router = APIRouter()
|
|
1206
|
+
|
|
1207
|
+
# Lazy singleton — first request builds it, subsequent reuse.
|
|
1208
|
+
_agent_box: dict[str, Any] = {"agent": None, "middleware": None}
|
|
1209
|
+
|
|
1210
|
+
def _get_agent():
|
|
1211
|
+
if _agent_box["agent"] is None:
|
|
1212
|
+
from core.agent import CognosAgent
|
|
1213
|
+
agent = CognosAgent(
|
|
1214
|
+
mode="agentic",
|
|
1215
|
+
permission_mode="bypass", # Claude Code handles permissions
|
|
1216
|
+
personality=True, # keep mood signal alive for Caudate
|
|
1217
|
+
)
|
|
1218
|
+
_agent_box["agent"] = agent
|
|
1219
|
+
_agent_box["middleware"] = CaudateMiddleware(agent)
|
|
1220
|
+
cau = getattr(agent, "caudate", None)
|
|
1221
|
+
cau_status = (cau.policy.level.label
|
|
1222
|
+
if cau and cau.policy else "unavailable")
|
|
1223
|
+
logger.info(
|
|
1224
|
+
f"Anthropic-compat singleton agent ready, "
|
|
1225
|
+
f"llm={agent.llm.model}, caudate={cau_status}"
|
|
1226
|
+
)
|
|
1227
|
+
return _agent_box["agent"], _agent_box["middleware"]
|
|
1228
|
+
|
|
1229
|
+
@router.post("/v1/messages")
|
|
1230
|
+
async def messages_endpoint(request: Request):
|
|
1231
|
+
try:
|
|
1232
|
+
body = await request.json()
|
|
1233
|
+
except Exception:
|
|
1234
|
+
raise HTTPException(400, "Invalid JSON body")
|
|
1235
|
+
|
|
1236
|
+
if not isinstance(body, dict):
|
|
1237
|
+
raise HTTPException(400, "Body must be a JSON object")
|
|
1238
|
+
|
|
1239
|
+
try:
|
|
1240
|
+
internal_msgs, internal_tools = _translate_anthropic_to_internal(body)
|
|
1241
|
+
except Exception as e:
|
|
1242
|
+
logger.exception("Anthropic→internal translation failed")
|
|
1243
|
+
raise HTTPException(400, f"Bad message format: {e}")
|
|
1244
|
+
|
|
1245
|
+
# Inject sandbox-awareness hint so the LLM scaffolds new files
|
|
1246
|
+
# into cognos/sandbox/ by default. Idempotent across multi-turn.
|
|
1247
|
+
from core.sandbox_prompt import inject_sandbox_hint
|
|
1248
|
+
internal_msgs = inject_sandbox_hint(internal_msgs)
|
|
1249
|
+
|
|
1250
|
+
max_tokens = int(body.get("max_tokens") or 4096)
|
|
1251
|
+
temperature = body.get("temperature")
|
|
1252
|
+
if temperature is not None:
|
|
1253
|
+
temperature = float(temperature)
|
|
1254
|
+
requested_model = body.get("model") or "cognos"
|
|
1255
|
+
stream = bool(body.get("stream", False))
|
|
1256
|
+
|
|
1257
|
+
agent, middleware = _get_agent()
|
|
1258
|
+
llm = agent.llm
|
|
1259
|
+
|
|
1260
|
+
# ---- Dual-brain arbitration branch (Phase 4 substrate) --------
|
|
1261
|
+
# `cognos-dual-brain` (and the `cognos-collab` alias) trigger
|
|
1262
|
+
# parallel arbitration through both system1 and system2. The
|
|
1263
|
+
# winner is returned in Anthropic format; both drafts are
|
|
1264
|
+
# captured for Caudate's preference-learning corpus. This path
|
|
1265
|
+
# has its own non-passthrough flow because we need to dispatch
|
|
1266
|
+
# both providers in parallel, not just forward upstream.
|
|
1267
|
+
rm_lower = (requested_model or "").lower()
|
|
1268
|
+
if rm_lower in ("cognos-dual-brain", "cognos-collab"):
|
|
1269
|
+
turn_ctx = middleware.begin_turn(
|
|
1270
|
+
internal_msgs, internal_tools,
|
|
1271
|
+
model_source=f"dual[fast={getattr(getattr(llm,'router',None),'fast',type('x',(),{'model':'?'})).model},"
|
|
1272
|
+
f"slow={getattr(getattr(llm,'router',None),'slow',type('x',(),{'model':'?'})).model}]"
|
|
1273
|
+
if hasattr(llm, "router") else "unknown",
|
|
1274
|
+
)
|
|
1275
|
+
internal_msgs = middleware.maybe_inject_hint(internal_msgs, turn_ctx)
|
|
1276
|
+
if stream:
|
|
1277
|
+
return StreamingResponse(
|
|
1278
|
+
_anthropic_arbitrate_stream(
|
|
1279
|
+
body=body,
|
|
1280
|
+
internal_msgs=internal_msgs,
|
|
1281
|
+
internal_tools=internal_tools,
|
|
1282
|
+
max_tokens=max_tokens,
|
|
1283
|
+
temperature=temperature,
|
|
1284
|
+
requested_model=requested_model,
|
|
1285
|
+
agent=agent,
|
|
1286
|
+
middleware=middleware,
|
|
1287
|
+
turn_ctx=turn_ctx,
|
|
1288
|
+
),
|
|
1289
|
+
media_type="text/event-stream",
|
|
1290
|
+
)
|
|
1291
|
+
return await _anthropic_arbitrate_nonstream(
|
|
1292
|
+
body=body,
|
|
1293
|
+
internal_msgs=internal_msgs,
|
|
1294
|
+
internal_tools=internal_tools,
|
|
1295
|
+
max_tokens=max_tokens,
|
|
1296
|
+
temperature=temperature,
|
|
1297
|
+
requested_model=requested_model,
|
|
1298
|
+
agent=agent,
|
|
1299
|
+
middleware=middleware,
|
|
1300
|
+
turn_ctx=turn_ctx,
|
|
1301
|
+
)
|
|
1302
|
+
|
|
1303
|
+
# ---- Anthropic passthrough branch -----------------------------
|
|
1304
|
+
# If the resolved primary brain is an Anthropic Claude id, forward
|
|
1305
|
+
# the request to api.anthropic.com using the caller's own auth
|
|
1306
|
+
# header. Caudate observes the response so she still learns from
|
|
1307
|
+
# every turn — just from a stronger teacher.
|
|
1308
|
+
passthrough_model = _resolve_anthropic_model(requested_model)
|
|
1309
|
+
if passthrough_model is not None:
|
|
1310
|
+
turn_ctx = middleware.begin_turn(
|
|
1311
|
+
internal_msgs, internal_tools,
|
|
1312
|
+
model_source=f"anthropic/{passthrough_model}",
|
|
1313
|
+
)
|
|
1314
|
+
internal_msgs_with_hint = middleware.maybe_inject_hint(internal_msgs, turn_ctx)
|
|
1315
|
+
# Build an Anthropic-shape body to forward. We start from the
|
|
1316
|
+
# caller's body so any client-set fields (system, tools,
|
|
1317
|
+
# tool_choice, metadata, top_p, top_k...) survive — we only
|
|
1318
|
+
# override the model id.
|
|
1319
|
+
upstream_body = dict(body)
|
|
1320
|
+
upstream_body["model"] = passthrough_model
|
|
1321
|
+
# DIAG: log the keys + any 'effort' references before strip
|
|
1322
|
+
logger.warning(
|
|
1323
|
+
f"[passthrough pre-strip] model={passthrough_model} "
|
|
1324
|
+
f"top_keys={sorted(upstream_body.keys())} "
|
|
1325
|
+
f"has_effort={'effort' in upstream_body} "
|
|
1326
|
+
f"has_thinking={'thinking' in upstream_body}"
|
|
1327
|
+
)
|
|
1328
|
+
upstream_body = _strip_unsupported_fields(upstream_body, passthrough_model)
|
|
1329
|
+
upstream_body = _maybe_force_thinking(upstream_body)
|
|
1330
|
+
logger.warning(
|
|
1331
|
+
f"[passthrough post-strip] top_keys={sorted(upstream_body.keys())} "
|
|
1332
|
+
f"has_effort={'effort' in upstream_body} "
|
|
1333
|
+
f"has_thinking={'thinking' in upstream_body}"
|
|
1334
|
+
)
|
|
1335
|
+
# If Caudate injected a hint, also inject it into the system
|
|
1336
|
+
# prompt of the upstream body. Not just the internal_msgs.
|
|
1337
|
+
if internal_msgs_with_hint is not internal_msgs:
|
|
1338
|
+
# Find the system content in the hinted internal messages.
|
|
1339
|
+
injected_system = ""
|
|
1340
|
+
for m in internal_msgs_with_hint:
|
|
1341
|
+
if m.get("role") == "system":
|
|
1342
|
+
c = m.get("content")
|
|
1343
|
+
injected_system = c if isinstance(c, str) else (
|
|
1344
|
+
" ".join(b.get("text", "") for b in c
|
|
1345
|
+
if isinstance(b, dict) and b.get("type") == "text")
|
|
1346
|
+
if isinstance(c, list) else ""
|
|
1347
|
+
)
|
|
1348
|
+
break
|
|
1349
|
+
if injected_system:
|
|
1350
|
+
raw_sys = body.get("system")
|
|
1351
|
+
if isinstance(raw_sys, str) and raw_sys:
|
|
1352
|
+
upstream_body["system"] = injected_system
|
|
1353
|
+
elif isinstance(raw_sys, list):
|
|
1354
|
+
upstream_body["system"] = [
|
|
1355
|
+
{"type": "text", "text": injected_system}
|
|
1356
|
+
]
|
|
1357
|
+
else:
|
|
1358
|
+
upstream_body["system"] = injected_system
|
|
1359
|
+
|
|
1360
|
+
headers = _filter_forward_headers(request)
|
|
1361
|
+
if "authorization" not in {k.lower() for k in headers}:
|
|
1362
|
+
# Fall back to ANTHROPIC_API_KEY env var if the client
|
|
1363
|
+
# didn't send a Bearer token. Common when the caller is
|
|
1364
|
+
# something other than Claude Code.
|
|
1365
|
+
env_key = os.environ.get("ANTHROPIC_API_KEY")
|
|
1366
|
+
if env_key:
|
|
1367
|
+
headers["x-api-key"] = env_key
|
|
1368
|
+
|
|
1369
|
+
if stream:
|
|
1370
|
+
return StreamingResponse(
|
|
1371
|
+
_passthrough_anthropic_stream(
|
|
1372
|
+
upstream_body=upstream_body,
|
|
1373
|
+
headers=headers,
|
|
1374
|
+
middleware=middleware,
|
|
1375
|
+
turn_ctx=turn_ctx,
|
|
1376
|
+
),
|
|
1377
|
+
media_type="text/event-stream",
|
|
1378
|
+
)
|
|
1379
|
+
return await _passthrough_anthropic_nonstream(
|
|
1380
|
+
upstream_body=upstream_body,
|
|
1381
|
+
headers=headers,
|
|
1382
|
+
middleware=middleware,
|
|
1383
|
+
turn_ctx=turn_ctx,
|
|
1384
|
+
)
|
|
1385
|
+
|
|
1386
|
+
# ---- Local LiteLLM path (Ollama / OpenAI / etc.) --------------
|
|
1387
|
+
# Open Caudate's turn — capture state, predict, log, register
|
|
1388
|
+
# tool vocab, extract images, etc.
|
|
1389
|
+
turn_ctx = middleware.begin_turn(
|
|
1390
|
+
internal_msgs, internal_tools,
|
|
1391
|
+
model_source=getattr(llm, "model", "unknown"),
|
|
1392
|
+
)
|
|
1393
|
+
|
|
1394
|
+
# WHISPER+ : prepend Caudate's hint to the system message
|
|
1395
|
+
internal_msgs = middleware.maybe_inject_hint(internal_msgs, turn_ctx)
|
|
1396
|
+
|
|
1397
|
+
if stream:
|
|
1398
|
+
return StreamingResponse(
|
|
1399
|
+
_stream_anthropic_events(
|
|
1400
|
+
llm=llm,
|
|
1401
|
+
messages=internal_msgs,
|
|
1402
|
+
tools=internal_tools,
|
|
1403
|
+
max_tokens=max_tokens,
|
|
1404
|
+
temperature=temperature,
|
|
1405
|
+
requested_model=requested_model,
|
|
1406
|
+
middleware=middleware,
|
|
1407
|
+
turn_ctx=turn_ctx,
|
|
1408
|
+
),
|
|
1409
|
+
media_type="text/event-stream",
|
|
1410
|
+
)
|
|
1411
|
+
|
|
1412
|
+
# Non-streaming
|
|
1413
|
+
try:
|
|
1414
|
+
resp = await llm.chat(
|
|
1415
|
+
messages=internal_msgs,
|
|
1416
|
+
tools=internal_tools,
|
|
1417
|
+
max_tokens=max_tokens,
|
|
1418
|
+
temperature=temperature,
|
|
1419
|
+
)
|
|
1420
|
+
except Exception as e:
|
|
1421
|
+
logger.exception("LLM call failed")
|
|
1422
|
+
middleware.end_turn(turn_ctx, error=True)
|
|
1423
|
+
raise HTTPException(500, f"LLM error: {e}")
|
|
1424
|
+
|
|
1425
|
+
# Feed the response back into Caudate's observer
|
|
1426
|
+
middleware.observe_response_text(turn_ctx, resp.content or "")
|
|
1427
|
+
if getattr(resp, "thinking", None):
|
|
1428
|
+
middleware.observe_thinking(turn_ctx, resp.thinking)
|
|
1429
|
+
for tc in resp.tool_calls:
|
|
1430
|
+
middleware.observe_tool_use(turn_ctx, tc.name)
|
|
1431
|
+
middleware.end_turn(turn_ctx, error=False)
|
|
1432
|
+
|
|
1433
|
+
return JSONResponse(_build_anthropic_response(
|
|
1434
|
+
thinking=getattr(resp, "thinking", "") or "",
|
|
1435
|
+
text=resp.content,
|
|
1436
|
+
tool_calls=resp.tool_calls,
|
|
1437
|
+
model=requested_model,
|
|
1438
|
+
usage=resp.usage,
|
|
1439
|
+
stop_reason=resp.stop_reason,
|
|
1440
|
+
))
|
|
1441
|
+
|
|
1442
|
+
@router.get("/v1/models")
async def models_endpoint():
    """Return the model catalog in a combined Anthropic/OpenAI list shape.

    The catalog is assembled, in order, from:

    1. the friendly ``cognos`` alias (its display name embeds the
       configured ``system1`` / ``system2`` settings),
    2. the real model ids found in settings and ``LLM_MODEL``, sorted,
    3. a fixed set of Claude id stubs, so a client whose default is a
       Claude id still finds it in the list.

    Ids are de-duplicated with first-occurrence-wins, so a configured
    model that happens to equal an alias or a Claude stub is listed once
    instead of twice.

    Returns:
        A dict with ``object``, ``data``, ``first_id``, ``last_id`` and
        ``has_more`` keys. Each item in ``data`` carries both
        Anthropic-shape and OpenAI-shape fields.
    """
    # Imported lazily, as in the original handler.
    from config import LLM_MODEL
    from core.settings import Settings

    s = Settings.load()
    s1 = s.get("system1") or "(unset)"
    s2 = s.get("system2") or "(unset)"

    # Friendly cognos-* aliases — what Claude Code will display.
    # User-facing: ONE model. Per the original author's notes, other
    # cognos-* aliases are functional but deliberately unlisted so the
    # default catalog stays clean.
    cognos_aliases = [
        ("cognos", f"Cognos · Caudate-driven smart routing · S1={s1} + S2={s2}"),
    ]

    # Real model ids the user has — for transparency / picker support.
    # Falsy values (unset settings) are dropped.
    real_ids: set[str] = {
        v
        for v in (s.get("system1"), s.get("system2"), s.get("model"), LLM_MODEL)
        if v
    }

    # Claude id stubs so anyone with a Claude default in their
    # ~/.claude/settings.json still works.
    claude_stubs = [
        "claude-opus-4-7", "claude-sonnet-4-6", "claude-haiku-4-5",
    ]

    # Each model carries BOTH Anthropic-shape and OpenAI-shape fields so
    # Claude Code AND Open WebUI can both parse this list.
    # Anthropic clients read `type` + `display_name` + `created_at`;
    # OpenAI clients read `object` + `created` (Unix int) + `owned_by`.
    # NOTE(review): ts_iso is 2026-01-01 while ts_unix is 2025-01-01 —
    # the two "created" fields disagree by a year; confirm whether that
    # is intended before aligning them.
    ts_iso = "2026-01-01T00:00:00Z"
    ts_unix = 1735689600  # 2025-01-01T00:00:00Z, intentionally older
    #                       than session ids so picker sorts stable
    owner = "cognos"

    def _entry(model_id: str, display: str) -> dict[str, Any]:
        """Build one catalog item carrying both API shapes."""
        return {
            "id": model_id,
            "type": "model",    # Anthropic shape
            "object": "model",  # OpenAI shape
            "display_name": display,
            "created_at": ts_iso,
            "created": ts_unix,
            "owned_by": owner,
        }

    # Collect (id, display) pairs in presentation order, then drop repeated
    # ids. A dict keeps insertion order and `setdefault` keeps the first
    # display name seen for an id, so aliases beat real ids, which in turn
    # beat the Claude stubs.
    candidates: list[tuple[str, str]] = [
        *cognos_aliases,
        *((name, name) for name in sorted(real_ids)),
        *((name, f"{name} (alias → Cognos)") for name in claude_stubs),
    ]
    catalog: dict[str, str] = {}
    for model_id, display in candidates:
        catalog.setdefault(model_id, display)

    models: list[dict[str, Any]] = [
        _entry(model_id, display) for model_id, display in catalog.items()
    ]

    return {
        "object": "list",  # OpenAI clients require this
        "data": models,
        "first_id": models[0]["id"] if models else None,
        "last_id": models[-1]["id"] if models else None,
        "has_more": False,
    }
|
|
1517
|
+
|
|
1518
|
+
return router
|