sherlock-context 1.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58) hide show
  1. playground/__init__.py +3 -0
  2. playground/providers.py +652 -0
  3. playground/server.py +500 -0
  4. playground/session.py +148 -0
  5. playground/static/app.js +1013 -0
  6. playground/static/index.html +294 -0
  7. sherlock/__init__.py +60 -0
  8. sherlock/agent.py +6139 -0
  9. sherlock/bootstrap/__init__.py +8 -0
  10. sherlock/bootstrap/engine.py +100 -0
  11. sherlock/bootstrap/meta_context.py +193 -0
  12. sherlock/budget.py +252 -0
  13. sherlock/cli/__init__.py +5 -0
  14. sherlock/cli/main.py +269 -0
  15. sherlock/compress.py +120 -0
  16. sherlock/config.py +410 -0
  17. sherlock/evaluation/__init__.py +8 -0
  18. sherlock/evaluation/evaluator.py +213 -0
  19. sherlock/evaluation/output_format.py +686 -0
  20. sherlock/evaluation/replay.py +91 -0
  21. sherlock/evolution/__init__.py +5 -0
  22. sherlock/evolution/versioning.py +104 -0
  23. sherlock/inference/__init__.py +5 -0
  24. sherlock/inference/engine.py +969 -0
  25. sherlock/jsonish.py +169 -0
  26. sherlock/memory/__init__.py +31 -0
  27. sherlock/memory/decay.py +124 -0
  28. sherlock/memory/embeddings.py +214 -0
  29. sherlock/memory/entry.py +159 -0
  30. sherlock/memory/k_turn.py +26 -0
  31. sherlock/memory/store.py +826 -0
  32. sherlock/memory/summarizer.py +571 -0
  33. sherlock/perception/__init__.py +14 -0
  34. sherlock/perception/core.py +788 -0
  35. sherlock/perception/render.py +43 -0
  36. sherlock/providers/__init__.py +55 -0
  37. sherlock/providers/base.py +71 -0
  38. sherlock/providers/callable_provider.py +118 -0
  39. sherlock/providers/fake.py +42 -0
  40. sherlock/providers/litellm_provider.py +194 -0
  41. sherlock/providers/wrapper_provider.py +120 -0
  42. sherlock/rag/__init__.py +9 -0
  43. sherlock/rag/hybrid.py +323 -0
  44. sherlock/security/__init__.py +6 -0
  45. sherlock/security/redaction.py +82 -0
  46. sherlock/security/urlguard.py +83 -0
  47. sherlock/storage/__init__.py +9 -0
  48. sherlock/storage/db.py +199 -0
  49. sherlock/tools/__init__.py +321 -0
  50. sherlock/tools/builtin.py +250 -0
  51. sherlock/tools/memory_tool.py +312 -0
  52. sherlock/tools/web_search.py +601 -0
  53. sherlock_context-1.7.0.dist-info/METADATA +870 -0
  54. sherlock_context-1.7.0.dist-info/RECORD +58 -0
  55. sherlock_context-1.7.0.dist-info/WHEEL +5 -0
  56. sherlock_context-1.7.0.dist-info/entry_points.txt +2 -0
  57. sherlock_context-1.7.0.dist-info/licenses/LICENSE +21 -0
  58. sherlock_context-1.7.0.dist-info/top_level.txt +2 -0
playground/__init__.py ADDED
@@ -0,0 +1,3 @@
1
+ """Sherlock Live Inspector — a browser test platform that drives the Sherlock
2
+ system with a user-supplied Gemini key and visualizes the 4-LLM internals in
3
+ real time. See playground/README.md to run it."""
@@ -0,0 +1,652 @@
1
+ """Provider glue for the playground: live model listing + per-role chat callables.
2
+
3
+ Supported providers (all through litellm, so Sherlock itself stays BYO-LLM):
4
+
5
+ gemini Google AI Studio key -> litellm "gemini/<model>"
6
+ openai OpenAI API key -> litellm "openai/<model>"
7
+ anthropic Anthropic API key -> litellm "anthropic/<model>"
8
+ local any OpenAI-compatible server -> litellm "openai/<model>" + api_base
9
+ (Ollama, LM Studio, vLLM, llama.cpp server, ...)
10
+
11
+ Open-source-model aggregators (OpenAI-compatible, descriptor-driven via
12
+ ``OPENAI_COMPAT`` below — adding the next one is a one-row dict entry):
13
+ deepinfra DeepInfra key -> litellm "deepinfra/<org/model>"
14
+ together Together AI key -> litellm "together_ai/<org/model>"
15
+ openrouter OpenRouter key -> litellm "openrouter/<org/model>"
16
+
17
+ API keys live ONLY in the server-side Session — they are sent once from the
18
+ browser to /api/models and /api/session and never echoed back.
19
+ """
20
+
21
+ from __future__ import annotations
22
+
23
+ import re
24
+ import time
25
+
26
+ import httpx
27
+
28
+ from sherlock.providers.base import ChatMessage, ChatResponse, TokenUsage
29
+
30
+ _GEMINI_MODELS_URL = "https://generativelanguage.googleapis.com/v1beta/models"
31
+ _OPENAI_MODELS_URL = "https://api.openai.com/v1/models"
32
+ _ANTHROPIC_MODELS_URL = "https://api.anthropic.com/v1/models"
33
+ _ROLE_ACTOR = {"main": "llm1", "summary": "llm2", "inference": "llm3"}
34
+
35
+ # OpenAI /v1/models lists every modality; keep only chat-capable families.
36
+ _OPENAI_CHAT_RE = re.compile(r"^(gpt-[45o]|gpt-oss|o[134](-|$)|chatgpt-)")
37
+ _OPENAI_NON_CHAT_RE = re.compile(
38
+ r"(embed|whisper|tts|audio|realtime|image|dall-e|moderation|transcribe|search|davinci|babbage|instruct)"
39
+ )
40
+
41
+ # Open-source-model aggregators. All three are the SAME OpenAI-compatible API
42
+ # behind three base URLs — the only differences (base URL, litellm route prefix,
43
+ # whether /models needs the key, and the /models JSON shape) are DATA, not logic.
44
+ # A descriptor table keeps adding the next aggregator (Fireworks, Novita, ...) a
45
+ # one-row change instead of another if-ladder. Verified live 2026-06-19.
46
+ OPENAI_COMPAT = {
47
+ "deepinfra": {
48
+ "label": "DeepInfra",
49
+ "litellm_prefix": "deepinfra/",
50
+ "models_url": "https://api.deepinfra.com/v1/openai/models",
51
+ "models_need_key": False, # public; a NON-EMPTY invalid key 401s → never send it to list
52
+ "list_shape": "data", # {"data": [{id, metadata:{tags, context_length}}]}
53
+ "chat_filter": "deepinfra", # keep metadata.tags ∋ "chat"
54
+ "extra_headers": {},
55
+ },
56
+ "together": {
57
+ "label": "Together AI",
58
+ "litellm_prefix": "together_ai/",
59
+ "models_url": "https://api.together.ai/v1/models",
60
+ "models_need_key": True,
61
+ "list_shape": "bare_array", # TOP-LEVEL [ {...} ] — NO {"data": ...} envelope
62
+ "chat_filter": "together", # keep type == "chat"
63
+ "extra_headers": {},
64
+ },
65
+ "openrouter": {
66
+ "label": "OpenRouter",
67
+ "litellm_prefix": "openrouter/",
68
+ "models_url": "https://openrouter.ai/api/v1/models",
69
+ "models_need_key": False, # public list
70
+ "list_shape": "data", # {"data": [{id, architecture:{output_modalities}}]}
71
+ "chat_filter": "openrouter", # keep text-output models
72
+ # X-Title shows up in the user's OpenRouter dashboard; purely cosmetic,
73
+ # never required, no fake referrer URL shipped.
74
+ "extra_headers": {"X-Title": "Sherlock"},
75
+ },
76
+ }
77
+
78
+
79
+ def _normalize_local_base(base_url: str) -> str:
80
+ """'http://localhost:11434' -> 'http://localhost:11434/v1' (Ollama/LM Studio
81
+ both serve the OpenAI-compatible surface under /v1)."""
82
+ base = (base_url or "").strip().rstrip("/")
83
+ if not base:
84
+ raise ValueError("local provider needs a base URL (e.g. http://localhost:11434/v1)")
85
+ if not base.startswith("http"):
86
+ base = "http://" + base
87
+ if not base.endswith("/v1"):
88
+ base = base + "/v1"
89
+ return base
90
+
91
+
92
def list_models(provider: str, api_key: str = "", base_url: str = "") -> list[dict]:
    """Return the live model list for one provider as ``[{id, display, ...}]``.

    The provider name is lower-cased first; an empty/None name means
    ``"gemini"``. Aggregators described in ``OPENAI_COMPAT`` go through the
    generic lister, every other provider through its dedicated one. HTTP/auth
    failures from the listers propagate so the caller can surface the error.

    Raises:
        ValueError: if the provider name is not recognized.
    """
    name = (provider or "gemini").lower()
    if name in OPENAI_COMPAT:
        return _list_openai_compat(name, api_key)
    # Lambdas defer the lookup of each lister until it is actually invoked.
    listers = {
        "gemini": lambda: _list_gemini(api_key),
        "openai": lambda: _list_openai(api_key),
        "anthropic": lambda: _list_anthropic(api_key),
        "local": lambda: _list_local(base_url, api_key),
    }
    lister = listers.get(name)
    if lister is None:
        raise ValueError(f"unknown provider: {name}")
    return lister()
107
+
108
+
109
def _list_gemini(api_key: str) -> list[dict]:
    """List Gemini models that support ``generateContent``.

    Returns ``[{id, display, input_limit, output_limit}]`` sorted by id in
    descending order. The ``models/`` prefix is stripped from each name and
    entries without a name are skipped.

    The key is sent in the ``x-goog-api-key`` header rather than as a ``key``
    query parameter: a query-string key becomes part of the request URL, and
    the URL is embedded in the error raised by ``raise_for_status()`` — which
    callers surface — so it would leak the key into error messages.

    Raises:
        httpx.HTTPStatusError: on a non-2xx response (e.g. invalid key).
    """
    headers = {"x-goog-api-key": api_key} if api_key else {}
    r = httpx.get(
        _GEMINI_MODELS_URL,
        headers=headers,
        params={"pageSize": 1000},
        timeout=20.0,
    )
    r.raise_for_status()
    out: list[dict] = []
    for m in r.json().get("models", []):
        if "generateContent" not in (m.get("supportedGenerationMethods") or []):
            continue
        mid = (m.get("name") or "").removeprefix("models/")
        if not mid:
            continue
        out.append(
            {
                "id": mid,
                "display": m.get("displayName") or mid,
                "input_limit": m.get("inputTokenLimit"),
                "output_limit": m.get("outputTokenLimit"),
            }
        )
    out.sort(key=lambda x: x["id"], reverse=True)
    return out
129
+
130
+
131
def _list_openai(api_key: str) -> list[dict]:
    """List OpenAI chat models as ``[{id, display, created}]``.

    An id is kept only when it matches ``_OPENAI_CHAT_RE`` and does not match
    ``_OPENAI_NON_CHAT_RE``. The result is ordered by ``created`` descending,
    ties broken by id ascending.

    Raises:
        httpx.HTTPStatusError: on a non-2xx response.
    """
    response = httpx.get(
        _OPENAI_MODELS_URL,
        headers={"Authorization": f"Bearer {api_key}"},
        timeout=20.0,
    )
    response.raise_for_status()
    models: list[dict] = []
    for entry in response.json().get("data", []):
        model_id = entry.get("id") or ""
        looks_like_chat = _OPENAI_CHAT_RE.search(model_id)
        if looks_like_chat and not _OPENAI_NON_CHAT_RE.search(model_id):
            models.append(
                {"id": model_id, "display": model_id, "created": entry.get("created") or 0}
            )
    return sorted(models, key=lambda row: (-(row.get("created") or 0), row["id"]))
142
+
143
+
144
def _list_anthropic(api_key: str) -> list[dict]:
    """List Anthropic models as ``[{id, display}]`` in the order the API returns.

    Requests up to 100 entries in a single page; entries without an id are
    dropped and ``display`` falls back to the id when no display name is set.

    Raises:
        httpx.HTTPStatusError: on a non-2xx response.
    """
    request_headers = {"x-api-key": api_key, "anthropic-version": "2023-06-01"}
    response = httpx.get(
        _ANTHROPIC_MODELS_URL,
        headers=request_headers,
        params={"limit": 100},
        timeout=20.0,
    )
    response.raise_for_status()
    models = []
    for entry in response.json().get("data", []):
        model_id = entry.get("id")
        if not model_id:
            continue
        models.append({"id": model_id, "display": entry.get("display_name") or model_id})
    # No re-sorting: the original author notes the API already returns newest first.
    return models
158
+
159
+
160
def _list_local(base_url: str, api_key: str = "") -> list[dict]:
    """List models from a local OpenAI-compatible server, sorted by id.

    ``base_url`` is normalized with ``_normalize_local_base``; a bearer token
    is attached only when ``api_key`` is non-empty. Entries without an id are
    skipped.

    Raises:
        ValueError: if ``base_url`` is empty (from the normalizer).
        httpx.HTTPStatusError: on a non-2xx response.
    """
    root = _normalize_local_base(base_url)
    auth_headers = {}
    if api_key:
        auth_headers = {"Authorization": f"Bearer {api_key}"}
    response = httpx.get(f"{root}/models", headers=auth_headers, timeout=10.0)
    response.raise_for_status()
    models = []
    for entry in response.json().get("data", []):
        if entry.get("id"):
            models.append({"id": entry.get("id"), "display": entry.get("id")})
    return sorted(models, key=lambda row: row["id"])
169
+
170
+
171
+ def _chat_models_deepinfra(items: list[dict]) -> list[dict]:
172
+ """DeepInfra /models mixes chat with embed/image/tts/stt — keep tags ∋ 'chat'."""
173
+ out = []
174
+ for m in items:
175
+ meta = m.get("metadata") or {}
176
+ if "chat" not in (meta.get("tags") or []):
177
+ continue
178
+ mid = m.get("id") or ""
179
+ if mid:
180
+ out.append({"id": mid, "display": mid, "input_limit": meta.get("context_length")})
181
+ out.sort(key=lambda x: x["id"])
182
+ return out
183
+
184
+
185
+ def _chat_models_together(items: list[dict]) -> list[dict]:
186
+ """Together model objects carry a ``type`` enum (chat|language|code|image|
187
+ embedding|...) — keep only chat."""
188
+ out = []
189
+ for m in items:
190
+ if (m.get("type") or "") != "chat":
191
+ continue
192
+ mid = m.get("id") or ""
193
+ if mid:
194
+ out.append(
195
+ {
196
+ "id": mid,
197
+ "display": m.get("display_name") or mid,
198
+ "created": m.get("created") or 0,
199
+ }
200
+ )
201
+ out.sort(key=lambda x: (-(x.get("created") or 0), x["id"]))
202
+ return out
203
+
204
+
205
+ def _chat_models_openrouter(items: list[dict]) -> list[dict]:
206
+ """OpenRouter lists a few non-text-output models — drop anything whose
207
+ architecture can't emit text."""
208
+ out = []
209
+ for m in items:
210
+ outs = (m.get("architecture") or {}).get("output_modalities") or []
211
+ if outs and "text" not in outs:
212
+ continue
213
+ mid = m.get("id") or ""
214
+ if mid:
215
+ out.append(
216
+ {"id": mid, "display": m.get("name") or mid, "created": m.get("created") or 0}
217
+ )
218
+ out.sort(key=lambda x: (-(x.get("created") or 0), x["id"]))
219
+ return out
220
+
221
+
222
+ _OSS_CHAT_FILTERS = {
223
+ "deepinfra": _chat_models_deepinfra,
224
+ "together": _chat_models_together,
225
+ "openrouter": _chat_models_openrouter,
226
+ }
227
+
228
+
229
def _list_openai_compat(name: str, api_key: str = "") -> list[dict]:
    """List chat models for one aggregator described in ``OPENAI_COMPAT``.

    The descriptor supplies the URL, any extra headers, whether the listing
    needs the key, the response shape, and which chat filter to apply. The
    ``Authorization`` header is added only when the descriptor says the
    endpoint needs a key AND a key was given. The response body is unwrapped
    to a list of model objects and handed to the matching entry of
    ``_OSS_CHAT_FILTERS``.

    Raises:
        KeyError: if ``name`` has no descriptor.
        httpx.HTTPStatusError: on a non-2xx response.
    """
    descriptor = OPENAI_COMPAT[name]
    request_headers = dict(descriptor.get("extra_headers") or {})
    if descriptor["models_need_key"] and api_key:
        request_headers["Authorization"] = f"Bearer {api_key}"
    response = httpx.get(descriptor["models_url"], headers=request_headers, timeout=20.0)
    response.raise_for_status()
    payload = response.json()
    if descriptor["list_shape"] == "bare_array":
        # Top-level array expected; tolerate a {"data": [...]} envelope too.
        if isinstance(payload, list):
            entries = payload
        else:
            entries = payload.get("data", [])
    else:
        # {"data": [...]} expected; tolerate a bare array (or an empty body).
        if isinstance(payload, dict):
            entries = payload.get("data", [])
        else:
            entries = payload or []
    keep_chat = _OSS_CHAT_FILTERS[descriptor["chat_filter"]]
    return keep_chat(entries)
248
+
249
+
250
def resolve_model_spec(spec, providers: dict) -> tuple[str, dict]:
    """Turn a role's model spec into (litellm_model_id, extra litellm kwargs).

    ``spec`` is ``{"provider": ..., "model": ...}`` from the UI, or a bare
    string (legacy sessions / tests) which is treated as a Gemini model id.
    ``providers`` is the session's credential map {provider: {api_key, base_url}}.

    The provider name is normalized exactly like ``_spec_provider`` and
    ``list_models`` do (missing/None -> "gemini", then lower-cased), so the
    three functions always agree on which provider a spec refers to. A missing
    or ``None`` credentials entry is treated as empty credentials.

    Raises:
        ValueError: if the provider is unknown, or the ``local`` provider has
            no base URL configured.
    """
    if isinstance(spec, str):
        provider, model = "gemini", spec
    else:
        spec = spec or {}
        # `or "gemini"` (not a .get default) also covers an explicit None value.
        provider = (spec.get("provider") or "gemini").lower()
        model = spec.get("model", "")
    # `or {}` guards against a provider key that maps to None.
    creds = (providers or {}).get(provider) or {}
    key = creds.get("api_key", "")
    if provider in OPENAI_COMPAT:
        d = OPENAI_COMPAT[provider]
        # litellm knows each prefix's base URL + cost map natively; the key is
        # passed explicitly so no env-var mirroring is needed on this path.
        extra = {"api_key": key}
        if d.get("extra_headers"):
            # Copy so callers can't mutate the shared descriptor table.
            extra["extra_headers"] = dict(d["extra_headers"])
        return f"{d['litellm_prefix']}{model}", extra
    if provider == "gemini":
        return f"gemini/{model}", {"api_key": key}
    if provider == "openai":
        return f"openai/{model}", {"api_key": key}
    if provider == "anthropic":
        return f"anthropic/{model}", {"api_key": key}
    if provider == "local":
        base = _normalize_local_base(creds.get("base_url", ""))
        # litellm requires SOME api_key for the openai route; local servers ignore it.
        return f"openai/{model}", {"api_base": base, "api_key": key or "local"}
    raise ValueError(f"unknown provider: {provider}")
283
+
284
+
285
def _call_litellm(model: str, messages: list[dict], **extra):
    """Run one non-streaming ``litellm.completion`` call and return its result.

    This is the single litellm entry point for the playground's role callables
    and the A/B baseline. It lives at module level so tests can monkeypatch
    it. litellm is imported lazily, and its debug-info banner is switched off
    before every call.
    """
    import litellm as _litellm

    _litellm.suppress_debug_info = True
    call_kwargs = {"model": model, "messages": messages, **extra}
    return _litellm.completion(**call_kwargs)
292
+
293
+
294
+ def _stopped(session) -> bool:
295
+ """True when the user pressed Stop for this session's current turn. Safe if
296
+ the agent / stop event isn't wired yet (returns False)."""
297
+ ev = getattr(getattr(session, "agent", None), "_stop_event", None)
298
+ return bool(ev is not None and ev.is_set())
299
+
300
+
301
def _call_litellm_stream(model, messages, on_delta, should_stop, on_reasoning=None, **extra):
    """Streaming counterpart of ``_call_litellm`` for the user-visible main reply.

    For every chunk, non-empty ``delta.content`` is passed to ``on_delta`` and,
    when ``on_reasoning`` is given, non-empty ``delta.reasoning_content`` is
    passed to ``on_reasoning``. ``should_stop()`` is polled after each chunk
    and ends the loop early when it returns true.

    Returns the object rebuilt by ``litellm.stream_chunk_builder`` when that
    succeeds and carries non-None message content; otherwise a minimal
    stand-in exposing ``.choices[0].message.content`` (the concatenated answer
    pieces) and a zeroed ``.usage``, so callers read both shapes the same way.

    Error handling: the initial ``completion(stream=True)`` call is deliberately
    outside any ``try`` — a failure before the first token propagates so the
    caller can fall back to a non-streaming call. Any exception raised while
    iterating (including from the callbacks) is swallowed so the text already
    received is kept.
    """
    import litellm
    from types import SimpleNamespace

    litellm.suppress_debug_info = True
    stream = litellm.completion(model=model, messages=messages, stream=True, **extra)
    collected = []
    answer_parts = []
    try:
        for chunk in stream:
            collected.append(chunk)
            try:
                delta = chunk.choices[0].delta
            except Exception:
                delta = None  # chunk without choices/delta (e.g. usage-only)
            if delta is not None:
                answer_piece = getattr(delta, "content", None)
                if answer_piece:
                    answer_parts.append(answer_piece)
                    on_delta(answer_piece)
                if on_reasoning is not None:
                    thought = getattr(delta, "reasoning_content", None)
                    if thought:
                        on_reasoning(thought)
            if should_stop():
                break
    except Exception:
        pass  # mid-stream error → keep the text we already streamed
    try:
        rebuilt = litellm.stream_chunk_builder(collected, messages=messages)
        if rebuilt and getattr(rebuilt, "choices", None):
            if rebuilt.choices[0].message.content is not None:
                return rebuilt
    except Exception:
        pass  # builder could not reassemble the chunks → use the stand-in below
    # Stand-in with the same attribute paths the caller reads on a real response.
    stand_in_message = SimpleNamespace(content="".join(answer_parts))
    zero_usage = SimpleNamespace(prompt_tokens=0, completion_tokens=0, total_tokens=0)
    return SimpleNamespace(choices=[SimpleNamespace(message=stand_in_message)], usage=zero_usage)
354
+
355
+
356
+ def _spec_provider(spec) -> str:
357
+ """Provider name for a role's model spec (legacy bare string = gemini)."""
358
+ if isinstance(spec, str):
359
+ return "gemini"
360
+ return ((spec or {}).get("provider") or "gemini").lower()
361
+
362
+
363
+ def _apply_cache_hints(messages: list[dict], cache_hints, provider: str) -> list[dict]:
364
+ """v1.3: turn sherlock's CallableProvider ``cache_hints`` into Anthropic
365
+ prompt-cache blocks. Hints are ``{"stable_prefix_chars": {msg_idx: chars}}``;
366
+ for the anthropic provider we REUSE sherlock's converter (ChatMessage +
367
+ LiteLLMProvider._to_litellm_messages) so each hinted message becomes
368
+ OpenAI-format content blocks with ``cache_control`` on the stable prefix.
369
+ Other providers (gemini/openai/local) cache implicitly server-side, so the
370
+ hints are ignored and the payload stays byte-identical."""
371
+ prefixes = (cache_hints or {}).get("stable_prefix_chars") or {}
372
+ if not prefixes or provider != "anthropic":
373
+ return messages
374
+ try: # deferred, mirroring _call_litellm — litellm import is heavy/optional
375
+ from sherlock.providers.litellm_provider import LiteLLMProvider
376
+ except Exception:
377
+ return messages
378
+ converted: list[ChatMessage] = []
379
+ for i, m in enumerate(messages):
380
+ content = m.get("content")
381
+ split = prefixes.get(i, prefixes.get(str(i))) if isinstance(content, str) else None
382
+ converted.append(
383
+ ChatMessage(
384
+ role=m.get("role", "user"),
385
+ content=content if isinstance(content, str) else (content or ""),
386
+ cache_stable_prefix_chars=int(split) if split else None,
387
+ )
388
+ )
389
+ return LiteLLMProvider._to_litellm_messages(converted)
390
+
391
+
392
+ def _baseline_search_block(session, message: str) -> str:
393
+ """One NAIVE search pass with the raw user message — the typical
394
+ 'LLM + web search' wiring people actually use as a baseline. Same engine
395
+ the Sherlock side uses, so the A/B isolates CURATION, not tool access."""
396
+ engine_name = (session.settings or {}).get("search_engine", "duckduckgo")
397
+ if engine_name in (None, "", "off", "none"):
398
+ return ""
399
+ try:
400
+ from sherlock.tools.web_search import create_search
401
+
402
+ if session._baseline_engine is None:
403
+ session._baseline_engine = create_search(
404
+ engine_name, api_key=(session.settings or {}).get("search_api_key") or None
405
+ )
406
+ results = session._baseline_engine.search(message[:300], max_results=5) or []
407
+ except Exception:
408
+ return ""
409
+ lines = []
410
+ for r in results[:5]:
411
+ if not isinstance(r, dict) or r.get("error"):
412
+ continue
413
+ snippet = (r.get("content") or r.get("snippet") or "")[:300]
414
+ lines.append(f"- {r.get('title', '')} — {r.get('url', '')}: {snippet}")
415
+ if not lines:
416
+ return ""
417
+ return "Web search results for the user's message:\n" + "\n".join(lines)
418
+
419
+
420
def baseline_chat(session, message: str, *, use_search: bool = True) -> dict:
    """Run the single-LLM baseline turn used by A/B mode.

    The MAIN role's model is called directly through ``_call_litellm`` with:
    the session's system prompt plus a line stating today's date, the full
    ``session.baseline_history``, and the user message — followed by one naive
    web-search block when ``use_search`` is true and the search returned
    something.

    On success the PLAIN user message (without the search block) and the reply
    are appended to ``session.baseline_history`` and the token counts are added
    to ``session.baseline_tokens``. On failure nothing is recorded and the
    error is reported in the result instead of being raised.

    Returns:
        ``{"text", "latency_ms", "prompt_tokens", "completion_tokens",
        "error", "searched"}``.
    """
    from datetime import datetime

    started = time.time()
    text = ""
    pt = 0
    ct = 0
    err = None
    # The date is a trivial line any wrapper would add; leaving it out would
    # tilt date questions in Sherlock's favour.
    today = datetime.now().astimezone().strftime("%Y-%m-%d (%A)")
    sys_prompt = f"{session.system_prompt}\n(Today is {today}.)"
    search_block = ""
    if use_search:
        search_block = _baseline_search_block(session, message)
    if search_block:
        user_content = f"{message}\n\n{search_block}"
    else:
        user_content = message
    messages = [
        {"role": "system", "content": sys_prompt},
        *session.baseline_history,
        {"role": "user", "content": user_content},
    ]
    try:
        model_id, extra = resolve_model_spec(
            session.models.get("main"), getattr(session, "providers", {})
        )
        resp = _call_litellm(model_id, messages, **extra)
        if resp.choices:
            text = resp.choices[0].message.content or ""
        else:
            text = ""
        usage = getattr(resp, "usage", None)
        if usage is not None:
            pt = getattr(usage, "prompt_tokens", 0) or 0
            ct = getattr(usage, "completion_tokens", 0) or 0
    except Exception as exc:
        err = f"{type(exc).__name__}: {exc}"
        text = ""
    latency_ms = int((time.time() - started) * 1000)
    if err is None:
        # Search blocks are per-turn aids, not conversation content, so the
        # history stores only the plain user message.
        session.baseline_history.extend(
            [
                {"role": "user", "content": message},
                {"role": "assistant", "content": text},
            ]
        )
        session.baseline_tokens["in"] += pt
        session.baseline_tokens["out"] += ct
    result = {
        "text": text,
        "latency_ms": latency_ms,
        "prompt_tokens": pt,
        "completion_tokens": ct,
        "error": err,
        "searched": bool(search_block),
    }
    return result
472
+
473
+
474
+ # Markers that only appear in Sherlock's INTERNAL deep-research prompts
475
+ # (round Q&A, planning, review, synthesis) — never in a real user turn.
476
+ _INTERNAL_RESEARCH_MARKERS = (
477
+ "Answer these meta-questions",
478
+ "RESEARCH DOCUMENTS:",
479
+ "META-COGNITION QUESTIONS",
480
+ "MULTILINGUAL web-search sweep",
481
+ "reviewing ONE round",
482
+ )
483
+
484
+
485
+ def _is_internal_research_prompt(messages: list[dict]) -> bool:
486
+ """True when the last user message is an internal deep-research call
487
+ (round Q&A / synthesis / planning) rather than a real user turn."""
488
+ last = next((m for m in reversed(messages) if m.get("role") == "user"), None)
489
+ content = (last or {}).get("content") or ""
490
+ return any(marker in content for marker in _INTERNAL_RESEARCH_MARKERS)
491
+
492
+
493
def make_role_callable(role: str, session, emit):
    """Build the chat callable Sherlock uses for ``role`` (main/summary/inference).

    The returned callable:

    * reads the model selection from ``session.models`` on EVERY call (falling
      back to the ``main`` selection), so a mid-session change applies to the
      next call;
    * streams only the main role's real user turns — each answer piece is
      emitted as ``llm.delta`` and each reasoning piece as
      ``llm.reasoning_delta``; other roles and internal deep-research prompts
      use a plain non-streaming call;
    * emits one ``llm.call`` event with the prompt, response text, token
      counts, latency and error;
    * returns a ``ChatResponse`` carrying the token usage it extracted.

    The ``cache_hints`` parameter is forwarded to ``_apply_cache_hints``.
    Failures never propagate: they are returned as ``[wrapper-error: ...]``
    text and reported in the event's ``error`` field.
    """
    actor = _ROLE_ACTOR.get(role, role)

    def _push_chunk(event_type: str, piece: str) -> None:
        # Both delta events share one envelope; only the type differs.
        emit(
            {
                "type": event_type,
                "actor": actor,
                "turn": session.turn,
                "data": {"chunk": piece},
            }
        )

    def _call(messages: list[dict], cache_hints=None):
        selection = session.models.get(role) or session.models.get("main")
        started = time.time()
        text = ""
        pt = 0
        ct = 0
        tt = 0
        err = None
        cache_read = 0
        model_id = "?"
        try:
            model_id, extra = resolve_model_spec(selection, getattr(session, "providers", {}))
            outbound = _apply_cache_hints(messages, cache_hints, _spec_provider(selection))
            thoughts_seen = []

            def _emit_reasoning(piece: str) -> None:
                thoughts_seen.append(piece)
                _push_chunk("llm.reasoning_delta", piece)

            if role != "main" or _is_internal_research_prompt(messages):
                # Companions and internal research prompts run in the
                # background, so they are not streamed.
                resp = _call_litellm(model_id, outbound, **extra)
            else:
                answer_seen = []

                def _on_delta(piece: str) -> None:
                    answer_seen.append(piece)
                    _push_chunk("llm.delta", piece)

                try:
                    resp = _call_litellm_stream(
                        model_id,
                        outbound,
                        _on_delta,
                        lambda: _stopped(session),
                        on_reasoning=_emit_reasoning,
                        **extra,
                    )
                except Exception:
                    # The stream could not even be opened → plain call instead.
                    resp = _call_litellm(model_id, outbound, **extra)
                if getattr(resp, "choices", None):
                    streamed_text = resp.choices[0].message.content or ""
                else:
                    streamed_text = ""
                # Nothing streamed, nothing rebuilt, and the user did not press
                # Stop → retry without streaming so the reply is never blank.
                if not (answer_seen or streamed_text or _stopped(session)):
                    resp = _call_litellm(model_id, outbound, **extra)
            if resp.choices:
                text = resp.choices[0].message.content or ""
            else:
                text = ""
            # Some models expose reasoning only on the final message; emit it
            # once when no reasoning piece was streamed live.
            if role == "main" and not thoughts_seen and resp.choices:
                final_thoughts = getattr(resp.choices[0].message, "reasoning_content", None)
                if final_thoughts:
                    _emit_reasoning(final_thoughts)
            usage = getattr(resp, "usage", None)
            if usage is not None:
                pt = getattr(usage, "prompt_tokens", 0) or 0
                ct = getattr(usage, "completion_tokens", 0) or 0
                tt = getattr(usage, "total_tokens", 0) or (pt + ct)
                cache_read = int(getattr(usage, "cache_read_input_tokens", 0) or 0)
                if not cache_read:
                    # Second place a cached-token count may live:
                    # usage.prompt_tokens_details.cached_tokens.
                    if isinstance(usage, dict):
                        details = usage.get("prompt_tokens_details")
                    else:
                        details = getattr(usage, "prompt_tokens_details", None)
                    if isinstance(details, dict):
                        cache_read = int(details.get("cached_tokens") or 0)
                    else:
                        cache_read = int(getattr(details, "cached_tokens", 0) or 0)
        except Exception as exc:  # reported as a wrapper-error instead of raising
            err = f"{type(exc).__name__}: {exc}"
            text = f"[wrapper-error: {err}]"
        latency_ms = int((time.time() - started) * 1000)
        # Force-companions: when the session setting is on, append the
        # companions control tag to a successful MAIN reply that carries
        # neither that tag nor a tool tag. Internal deep-research prompts are
        # never tagged. The event below still reports the untagged text.
        wants_tag = (
            role == "main"
            and not err
            and (session.settings or {}).get("force_companions")
            and "<<sherlock-companions" not in text
            and "<<sherlock-tool" not in text
            and not _is_internal_research_prompt(messages)
        )
        if wants_tag:
            returned_text = text.rstrip() + "\n<<sherlock-companions: compact, infer>>"
        else:
            returned_text = text
        sys_prompt = next(
            (m.get("content", "") for m in messages if m.get("role") == "system"), ""
        )
        call_details = {
            "role": role,
            "model": model_id,
            "system_prompt": sys_prompt,
            "messages": messages,
            "response_text": text,
            "prompt_tokens": pt,
            "completion_tokens": ct,
            "total_tokens": tt,
            "cache_read_tokens": cache_read,
            "latency_ms": latency_ms,
            "error": err,
        }
        emit(
            {
                "type": "llm.call",
                "actor": actor,
                "turn": session.turn,
                "data": call_details,
            }
        )
        usage_record = TokenUsage(
            prompt_tokens=pt,
            completion_tokens=ct,
            total_tokens=tt,
            cache_read_tokens=cache_read,
        )
        return ChatResponse(text=returned_text, model=model_id, usage=usage_record)

    return _call