sliceagent 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (71) hide show
  1. sliceagent/__init__.py +3 -0
  2. sliceagent/__main__.py +6 -0
  3. sliceagent/access.py +93 -0
  4. sliceagent/agents.py +173 -0
  5. sliceagent/background_review.py +146 -0
  6. sliceagent/binsniff.py +89 -0
  7. sliceagent/cli.py +890 -0
  8. sliceagent/clock.py +32 -0
  9. sliceagent/code_grep.py +329 -0
  10. sliceagent/code_index.py +417 -0
  11. sliceagent/config.py +240 -0
  12. sliceagent/context_overflow.py +227 -0
  13. sliceagent/envspec.py +129 -0
  14. sliceagent/errors.py +167 -0
  15. sliceagent/events.py +96 -0
  16. sliceagent/finding_types.py +70 -0
  17. sliceagent/flags.py +63 -0
  18. sliceagent/fuzzy.py +135 -0
  19. sliceagent/guardrails.py +438 -0
  20. sliceagent/guidance.py +69 -0
  21. sliceagent/hippocampus.py +581 -0
  22. sliceagent/hooks.py +334 -0
  23. sliceagent/interfaces.py +144 -0
  24. sliceagent/llm.py +695 -0
  25. sliceagent/loop.py +548 -0
  26. sliceagent/mcp_client.py +255 -0
  27. sliceagent/mcp_security.py +77 -0
  28. sliceagent/memory.py +428 -0
  29. sliceagent/metrics.py +103 -0
  30. sliceagent/model_catalog.py +124 -0
  31. sliceagent/monitor.py +615 -0
  32. sliceagent/neocortex.py +436 -0
  33. sliceagent/onboarding.py +323 -0
  34. sliceagent/oracle.py +36 -0
  35. sliceagent/pagetable.py +255 -0
  36. sliceagent/pfc.py +449 -0
  37. sliceagent/plugins.py +127 -0
  38. sliceagent/policy.py +234 -0
  39. sliceagent/procman.py +187 -0
  40. sliceagent/prompt.py +239 -0
  41. sliceagent/records.py +108 -0
  42. sliceagent/recovery.py +119 -0
  43. sliceagent/regions.py +678 -0
  44. sliceagent/registry.py +128 -0
  45. sliceagent/retriever.py +19 -0
  46. sliceagent/safety.py +332 -0
  47. sliceagent/sandbox.py +143 -0
  48. sliceagent/scheduler.py +92 -0
  49. sliceagent/search_index.py +289 -0
  50. sliceagent/seed.py +465 -0
  51. sliceagent/sensory_cortex.py +500 -0
  52. sliceagent/session.py +222 -0
  53. sliceagent/skill_provenance.py +71 -0
  54. sliceagent/skill_usage.py +123 -0
  55. sliceagent/skills.py +209 -0
  56. sliceagent/subagent.py +332 -0
  57. sliceagent/subdir_hints.py +222 -0
  58. sliceagent/swap.py +182 -0
  59. sliceagent/taskstate.py +57 -0
  60. sliceagent/telemetry.py +59 -0
  61. sliceagent/terminal.py +240 -0
  62. sliceagent/text_utils.py +56 -0
  63. sliceagent/tool_summary.py +93 -0
  64. sliceagent/tools.py +1194 -0
  65. sliceagent/tui.py +1377 -0
  66. sliceagent/web.py +354 -0
  67. sliceagent-0.1.0.dist-info/METADATA +262 -0
  68. sliceagent-0.1.0.dist-info/RECORD +71 -0
  69. sliceagent-0.1.0.dist-info/WHEEL +4 -0
  70. sliceagent-0.1.0.dist-info/entry_points.txt +2 -0
  71. sliceagent-0.1.0.dist-info/licenses/LICENSE +21 -0
sliceagent/llm.py ADDED
@@ -0,0 +1,695 @@
1
+ """OpenAILLM — the default LLMClient over any OpenAI-COMPATIBLE endpoint (OpenAI, Moonshot,
2
+ DeepSeek, …). Configured by provider-AGNOSTIC env: LLM_API_KEY + LLM_BASE_URL (+ AGENT_MODEL);
3
+ OPENAI_*/MOONSHOT_* are accepted only as a back-compat fallback.
4
+
5
+ Connects directly by default; set AGENT_PROXY (or HTTPS_PROXY/HTTP_PROXY) to route through an HTTP
6
+ proxy, or AGENT_PROXY=none to force a direct connection. The only module that imports the openai SDK
7
+ — the core stays openai-free.
8
+ """
9
+ from __future__ import annotations
10
+
11
+ import json
12
+ import os
13
+ import threading
14
+
15
+ from .context_overflow import ContextOverflow, is_context_overflow
16
+ from .interfaces import AssistantMessage, ToolCall
17
+
18
+
19
+ def _import_api_timeout_error():
20
+ """APITimeoutError moved between openai SDK versions; import defensively."""
21
+ try:
22
+ from openai import APITimeoutError
23
+ return APITimeoutError
24
+ except ImportError:
25
+ pass
26
+ try:
27
+ from openai.error import Timeout as APITimeoutError
28
+ return APITimeoutError
29
+ except ImportError:
30
+ pass
31
+ # Fallback: a plain retryable timeout that is_retryable will still classify.
32
+ class _FallbackTimeoutError(Exception):
33
+ pass
34
+ return _FallbackTimeoutError
35
+
36
+
37
+ def _choose_proxy(resolved_base: str | None, explicit: str | None) -> str:
38
+ """Pick the HTTP proxy for the active provider. An EXPLICIT setting (arg or AGENT_PROXY/HTTPS_PROXY/
39
+ HTTP_PROXY) wins; otherwise connect DIRECTLY — no proxy by default. AGENT_PROXY=none/off forces a
40
+ direct connection. Returns a proxy URL or 'none'. (Environment quirk, isolated here.)"""
41
+ if explicit and explicit.strip(): # a whitespace-only env var is NOT an explicit setting
42
+ s = explicit.strip()
43
+ if s.lower() in ("off", "none", "direct", "no", "false", "0"):
44
+ return "none" # AGENT_PROXY=off → force a DIRECT connection (was treated as a URL → bug)
45
+ return s
46
+ return "none" # default: direct connection, no proxy
47
+
48
+
49
+ def _int(x) -> int:
50
+ """Coerce a provider-supplied token counter to int; non-numeric (str/object/None) → 0. Some providers
51
+ report counts as strings or odd objects, and `x or 0` keeps a truthy non-number → arithmetic TypeError."""
52
+ try:
53
+ return int(x)
54
+ except (TypeError, ValueError):
55
+ return 0
56
+
57
+
58
def _usage_dict(raw) -> dict | None:
    """Turn a provider usage object into a plain dict of token counters.

    The result carries the legacy ``prompt_tokens`` / ``completion_tokens``
    keys plus a typed split of the input side: ``input_cache_read``,
    ``input_cache_creation`` and ``input_other`` (what is left of the prompt
    after subtracting both cache figures, floored at 0), and ``output``.
    The legacy ``cached_tokens`` key is added only when the cache-read count is
    non-zero. Every counter is read with a default, so an absent field yields 0.

    Returns:
        The counter dict, or None when ``raw`` is falsy.
    """
    if not raw:
        return None

    prompt_total = _int(getattr(raw, "prompt_tokens", 0))
    completion = _int(getattr(raw, "completion_tokens", 0))

    # Cache reads may be nested under prompt_tokens_details or sit at the top level.
    # The choice is made on "is None", not truthiness: a nested 0 is a real answer
    # and must not fall through to the top-level attribute.
    nested = getattr(raw, "prompt_tokens_details", None)
    cached_raw = getattr(nested, "cached_tokens", None)
    if cached_raw is None:
        cached_raw = getattr(raw, "cached_tokens", None)
    cached = _int(cached_raw)

    # Cache-creation counter; defaults to 0 when the attribute is absent.
    created = _int(getattr(raw, "cache_creation_input_tokens", 0))

    remainder = prompt_total - cached - created
    if remainder < 0:
        remainder = 0

    breakdown = {
        "prompt_tokens": prompt_total,
        "completion_tokens": completion,
        "input_other": remainder,
        "output": completion,
        "input_cache_read": cached,
        "input_cache_creation": created,
    }
    if cached:
        breakdown["cached_tokens"] = cached  # legacy key, present only on a cache hit
    return breakdown
85
+
86
+
87
+ def _as_text(content) -> str:
88
+ """Flatten a chat `content` (str OR a multimodal parts list) to plain text."""
89
+ if isinstance(content, str):
90
+ return content
91
+ if isinstance(content, list):
92
+ return "".join(p.get("text", "") for p in content
93
+ if isinstance(p, dict) and p.get("type") in ("text", "input_text"))
94
+ return "" if content is None else str(content)
95
+
96
+
97
+ def _to_responses_content(content):
98
+ """Map a chat message `content` to the Responses-API content shape: a plain string passes through;
99
+ a multimodal parts list is converted (text→input_text, image_url→input_image)."""
100
+ if not isinstance(content, list):
101
+ return content if content is not None else ""
102
+ parts = []
103
+ for p in content:
104
+ if not isinstance(p, dict):
105
+ parts.append({"type": "input_text", "text": str(p)}); continue
106
+ t = p.get("type")
107
+ if t in ("text", "input_text"):
108
+ parts.append({"type": "input_text", "text": p.get("text", "")})
109
+ elif t in ("image_url", "input_image"):
110
+ u = p.get("image_url")
111
+ url = u.get("url") if isinstance(u, dict) else u
112
+ parts.append({"type": "input_image", "image_url": url})
113
+ else:
114
+ parts.append(p)
115
+ return parts
116
+
117
+
118
def _to_responses_input(messages: list[dict]) -> list[dict]:
    """Convert chat/completions ``messages`` into Responses-API ``input`` items.

    Mapping performed here:
      * a ``tool`` message becomes a ``function_call_output`` item;
      * an assistant message carrying ``tool_calls`` becomes an optional
        ``{role: assistant, content}`` item (only when it has text) followed by
        one flat ``function_call`` item per tool call;
      * every other message stays ``{role, content}`` (role defaults to
        ``"user"``), with its content run through ``_to_responses_content``.

    Tool calls may be plain dicts or attribute-style objects; both are read.
    A missing or None ``function`` entry is treated as empty instead of crashing.

    Args:
        messages: Chat-format message dicts.

    Returns:
        A new list of Responses-API input items; ``messages`` is not modified.
    """
    def _field(obj, key, default=None):
        # Read `key` from a dict or, failing that, as an attribute.
        if isinstance(obj, dict):
            return obj.get(key, default)
        return getattr(obj, key, default)

    out: list[dict] = []
    for m in messages:
        role = m.get("role")
        if role == "tool":  # tool result → function_call_output
            out.append({"type": "function_call_output",
                        "call_id": m.get("tool_call_id", ""),
                        "output": _as_text(m.get("content"))})
        elif role == "assistant" and m.get("tool_calls"):  # assistant turn that called tools
            txt = _as_text(m.get("content"))
            if txt:
                out.append({"role": "assistant", "content": txt})
            for tc in m["tool_calls"]:
                # `or {}` also covers an explicit `"function": None`.
                fn = _field(tc, "function") or {}
                out.append({"type": "function_call",
                            "call_id": _field(tc, "id", ""),
                            "name": _field(fn, "name", ""),
                            "arguments": _field(fn, "arguments") or "{}"})
        else:  # plain text (system/user/assistant)
            out.append({"role": role or "user",
                        "content": _to_responses_content(m.get("content"))})
    return out
140
+
141
+
142
+ def _to_responses_tools(tools: list[dict]) -> list[dict]:
143
+ """chat tool schema {type:function, function:{name,description,parameters}} → Responses flat
144
+ {type:function, name, description, parameters}."""
145
+ out = []
146
+ for t in (tools or []):
147
+ fn = t.get("function") if isinstance(t, dict) else None
148
+ if fn:
149
+ out.append({"type": "function", "name": fn.get("name", ""),
150
+ "description": fn.get("description", ""),
151
+ "parameters": fn.get("parameters") or {"type": "object", "properties": {}}})
152
+ elif isinstance(t, dict) and t.get("type") == "function" and "name" in t:
153
+ out.append(t) # already Responses-shaped
154
+ return out
155
+
156
+
157
+ def _responses_usage(u):
158
+ """Adapt a Responses `usage` (input_tokens / output_tokens / input_tokens_details.cached_tokens) to
159
+ the chat-usage attribute names `_usage_dict` reads, so token telemetry/cost is unchanged. None→None."""
160
+ if not u:
161
+ return None
162
+ from types import SimpleNamespace as NS
163
+ det = getattr(u, "input_tokens_details", None)
164
+ cached = (getattr(det, "cached_tokens", 0) if det else 0) or 0
165
+ return NS(prompt_tokens=getattr(u, "input_tokens", 0) or 0,
166
+ completion_tokens=getattr(u, "output_tokens", 0) or 0,
167
+ prompt_tokens_details=NS(cached_tokens=cached),
168
+ cached_tokens=cached, cache_creation_input_tokens=0)
169
+
170
+
171
+ class OpenAILLM:
172
+ def __init__(self, model: str | None = None, api_key: str | None = None,
173
+ base_url: str | None = None, proxy: str | None = None, timeout: float | None = None):
174
+ import httpx
175
+ from openai import OpenAI
176
+
177
+ # Request timeout is env-configurable (LLM_TIMEOUT_SEC) — large ACCUMULATED contexts produce a
178
+ # single long non-streaming completion that legitimately exceeds the 60s default over a high-
179
+ # latency proxy, and the hard watchdog would then false-kill a valid slow call (every retry
180
+ # timing out → the turn parks 'error'). Default stays 60 for snappy interactive use.
181
+ if timeout is None:
182
+ try:
183
+ timeout = float(os.environ.get("LLM_TIMEOUT_SEC") or os.environ.get("LLM_TIMEOUT") or 60.0)
184
+ except (TypeError, ValueError):
185
+ timeout = 60.0
186
+
187
+ # Provider-AGNOSTIC env: LLM_API_KEY / LLM_BASE_URL are canonical. OPENAI_*/MOONSHOT_* are
188
+ # kept ONLY as a back-compat fallback (the SDK is OpenAI-compatible and many shells already
189
+ # export OPENAI_API_KEY) — the surface the user configures says "LLM", not a provider name.
190
+ # Resolve the ENDPOINT first: the proxy choice below depends on which provider it is.
191
+ kwargs: dict = {"api_key": api_key or os.environ.get("LLM_API_KEY")
192
+ or os.environ.get("OPENAI_API_KEY") or os.environ.get("MOONSHOT_API_KEY")}
193
+ resolved_base = base_url or os.environ.get("LLM_BASE_URL") or os.environ.get("OPENAI_BASE_URL")
194
+ if not resolved_base and os.environ.get("MOONSHOT_API_KEY") and not (
195
+ os.environ.get("LLM_API_KEY") or os.environ.get("OPENAI_API_KEY")):
196
+ resolved_base = "https://api.moonshot.cn/v1"
197
+ if resolved_base:
198
+ kwargs["base_url"] = resolved_base
199
+
200
+ # Proxy: an EXPLICIT setting (the arg, or AGENT_PROXY/HTTPS_PROXY/HTTP_PROXY) wins; otherwise connect
201
+ # directly (no proxy by default). Isolated to this adapter (llm-agnostic) and fully overridable.
202
+ proxy = _choose_proxy(resolved_base, proxy or os.environ.get("AGENT_PROXY")
203
+ or os.environ.get("HTTPS_PROXY") or os.environ.get("HTTP_PROXY"))
204
+ use_proxy = bool(proxy) and proxy != "none"
205
+ self.proxy_used = proxy if use_proxy else "direct" # exposed so the CLI can announce the route (A4)
206
+ http_client = httpx.Client(proxy=proxy, timeout=timeout) if use_proxy else httpx.Client(timeout=timeout)
207
+
208
+ # Enforce the request timeout at the SDK layer too. The openai SDK applies its OWN per-request
209
+ # timeout (default ~600s) which OVERRIDES the httpx client's, so without passing it here a
210
+ # stalled/half-open connection hangs ~10 min before max_retries ever fires (observed: a wedged
211
+ # direct connection, timeout never tripping). Passing `timeout` makes a wedged call abort
212
+ # promptly so the retry recovers on a fresh connection — task-agnostic reliability (→ wall time).
213
+ self.client = OpenAI(http_client=http_client, timeout=timeout, max_retries=2, **kwargs)
214
+ # HARD wall-clock backstop for _create() (SIGALRM): a few seconds above the SDK read-timeout so
215
+ # the SDK's own (cleaner) timeout fires first when it can, and SIGALRM only catches the stalls
216
+ # the read-timeout misses (silent mid-response connections).
217
+ self._hard_timeout = max(int(timeout) + 15, 30)
218
+ # No built-in default model — the user picks (parallels the CLI's model gate; a silent
219
+ # fallback here would contradict it for library/embedding callers).
220
+ self.model = model or os.environ.get("AGENT_MODEL") or ""
221
+ if not self.model:
222
+ raise ValueError("No model configured. Pass model=... or set AGENT_MODEL "
223
+ "(interactive setup: `sliceagent init`).")
224
+ self._base_url = kwargs.get("base_url") or ""
225
+ # Provider-AGNOSTIC reasoning intent: "full" (default) keeps the model's reasoning; "fast"
226
+ # minimizes it (wall-clock tracks reasoning tokens, and the slice reconstructs ground-truth
227
+ # STATE each turn, which can substitute for per-step re-derivation). The core/agent never
228
+ # sees this — _reasoning_kwargs() maps it to each provider's own param, here in the adapter
229
+ # (the one place permitted to know provider specifics). AGENT_THINKING=off kept as an alias.
230
+ self.reasoning = (os.environ.get("AGENT_REASONING")
231
+ or ("fast" if (os.environ.get("AGENT_THINKING") or "").lower() == "off"
232
+ else "full")).lower()
233
+ # Cap the completion generously. Providers default low (deepseek ~4096); a response that
234
+ # exceeds it truncates mid-edit → the agent retries the broken edit → step/time blowup. A
235
+ # generous explicit cap avoids that. Standard param (provider-agnostic). 0 → leave default.
236
+ # per-REQUEST completion cap — its OWN env var, decoupled from AGENT_MAX_TOKENS (which is the
237
+ # per-turn BudgetHook budget; sharing the key made one value drive two quantities orders of
238
+ # magnitude apart). Guarded so a malformed value degrades to the default instead of crashing init.
239
+ try:
240
+ self.max_tokens = int(os.environ.get("AGENT_COMPLETION_TOKENS") or 8192)
241
+ except (TypeError, ValueError):
242
+ self.max_tokens = 8192
243
+ # Provider-AGNOSTIC prompt-cache routing key (OpenAI `prompt_cache_key`, accepted/ignored
244
+ # harmlessly elsewhere). A session-stable key keeps every turn's requests on the same cached
245
+ # prefix → higher cache-hit rate at ZERO added prompt tokens. Set via set_cache_key(); the
246
+ # quirk stays isolated to this adapter (llm-agnostic). None → omit the kwarg entirely.
247
+ self._cache_key: str | None = None
248
+ # Optional LIVE token sink for interactive streaming (set by the cli/TUI). When set, complete()
249
+ # STREAMS the completion and emits deltas (kind in {"content","reasoning"}) so a slow turn renders
250
+ # LIVE instead of freezing on one blocking call.
251
+ # None → the blocking non-streaming path (eval/headless unchanged; byte-identical assembled result).
252
+ self._on_delta = None
253
+ # Sticky: set True once this provider 400s on reasoning_effort+tools (gpt-5.5 chat/completions);
254
+ # thereafter reasoning_effort is dropped when tools are present (graceful degrade, no re-400).
255
+ self._drop_reasoning_effort = False
256
+
257
def switch(self, *, model: str | None = None, reasoning: str | None = None) -> None:
    """Change the model id and/or reasoning intent in place.

    The object is mutated, so whoever holds this instance sees the new values on
    its next use. Changing the model also clears the remembered
    "drop reasoning_effort" degrade flag. Falsy arguments leave the
    corresponding setting untouched. The endpoint and client are not changed.
    """
    if model:
        # Reset the degrade memory together with the model id.
        self.model, self._drop_reasoning_effort = model, False
    if reasoning:
        normalised = reasoning.strip()
        self.reasoning = normalised.lower()
267
+
268
def set_cache_key(self, key: str | None) -> None:
    """Store the session-scoped prompt-cache routing key.

    Any falsy value (None, empty string) clears the key. May be called any
    number of times; the latest value wins.
    """
    self._cache_key = key if key else None
272
+
273
def set_delta_sink(self, fn) -> None:
    """Install (or remove) the callback that receives live streaming deltas.

    ``fn`` is called as ``fn(kind, text)`` where ``kind`` is ``'content'`` or
    ``'reasoning'``. Passing None removes the sink. May be called repeatedly;
    the latest value replaces the previous one.
    """
    self._on_delta = fn
278
+
279
+ def _emit(self, kind: str, text: str) -> None:
280
+ sink = getattr(self, "_on_delta", None)
281
+ if sink and text:
282
+ try:
283
+ sink(kind, text)
284
+ except _import_api_timeout_error():
285
+ raise # the SIGALRM hard-deadline must not be swallowed by the sink wrapper
286
+ except Exception: # noqa: BLE001 — a render error must NEVER break the LLM call
287
+ pass
288
+
289
def is_retryable(self, error: Exception) -> bool:
    """Tell whether ``error`` is one of the failure types worth retrying.

    The retryable set is assembled from whatever is importable: the openai SDK's
    rate-limit / timeout / connection / internal-server errors, httpx's
    ``TransportError`` (covers a raw mid-stream drop), and the project's
    ``EmptyResponseError``.
    """
    from .errors import EmptyResponseError

    retryable = []
    try:
        from openai import APIConnectionError, APITimeoutError, InternalServerError, RateLimitError
    except ImportError:
        pass  # SDK unavailable: contributes no error types
    else:
        retryable.extend((RateLimitError, APITimeoutError, APIConnectionError, InternalServerError))
    try:
        import httpx
    except ImportError:
        pass  # httpx unavailable: contributes no error types
    else:
        retryable.append(httpx.TransportError)
    retryable.append(EmptyResponseError)
    return isinstance(error, tuple(retryable))
302
+
303
def _on_alarm(self, signum, frame):
    """Signal handler for the hard deadline: always raises the SDK timeout error.

    The error is first built with a synthetic ``request=`` (a POST to the
    configured base URL, or ``http://local`` when none is set); if the error
    class rejects that keyword with a TypeError, it is raised with a plain
    message instead.
    """
    timeout_cls = _import_api_timeout_error()
    try:
        import httpx
        endpoint = (self._base_url or "http://local") + "/chat/completions"
        raise timeout_cls(request=httpx.Request("POST", endpoint))
    except TypeError:
        # This error class does not take `request=` in its constructor.
        raise timeout_cls("sliceagent hard timeout reached")
312
+
313
+ def _create(self, kwargs: dict, caller=None):
314
+ """Call the SDK with a HARD wall-clock deadline that ALWAYS fires, on ANY thread. The httpx/SDK
315
+ read-timeout only bounds the gap BETWEEN bytes, so a connection that goes silent mid-response can
316
+ hang far past `timeout` (observed: a stalled read wedging the loop 10+ min). On the main thread a
317
+ SIGALRM deadline guarantees control returns to the retry path. OFF the main thread — e.g. a
318
+ Terminal-Bench / any host ThreadPoolExecutor worker, where SIGALRM cannot arm — a watchdog thread
319
+ enforces the SAME deadline (the abandoned SDK call is left to die on its socket while control
320
+ returns). Without this, a wedged connection in a worker thread hangs the turn FOREVER, since the
321
+ SDK timeout alone misses silent mid-response stalls. Task/provider-agnostic reliability."""
322
+ caller = caller or (lambda kw: self.client.chat.completions.create(**kw))
323
+ import signal as _signal
324
+ try:
325
+ prev = _signal.signal(_signal.SIGALRM, self._on_alarm)
326
+ _signal.alarm(self._hard_timeout)
327
+ except (ValueError, AttributeError, OSError):
328
+ return self._create_watchdog(kwargs, caller) # not the main thread → deadline via a thread
329
+ try:
330
+ return caller(kwargs)
331
+ finally:
332
+ _signal.alarm(0)
333
+ _signal.signal(_signal.SIGALRM, prev)
334
+
335
def _create_watchdog(self, kwargs: dict, caller=None):
    """Thread-based hard deadline for one SDK call.

    The call runs in a daemon thread named ``llm-watchdog`` while this thread
    waits at most ``self._hard_timeout`` seconds. If the worker finishes in
    time, its result is returned or its exception re-raised here. If it is
    still running, it is abandoned and the SDK timeout error is raised (built
    with a synthetic ``request=`` where the class accepts it, otherwise with a
    plain message).

    Args:
        kwargs: Keyword arguments forwarded to the call.
        caller: Callable taking the kwargs dict; defaults to the client's
            chat-completions ``create``.
    """
    import threading

    timeout_cls = _import_api_timeout_error()
    if not caller:
        def caller(kw):
            return self.client.chat.completions.create(**kw)
    outcome: dict = {}

    def _worker():
        try:
            outcome["resp"] = caller(kwargs)
        except BaseException as exc:  # noqa: BLE001 — handed over to the waiting thread
            outcome["err"] = exc

    worker = threading.Thread(target=_worker, name="llm-watchdog", daemon=True)
    worker.start()
    worker.join(self._hard_timeout)

    if not worker.is_alive():
        if "err" in outcome:
            raise outcome["err"]
        return outcome["resp"]

    # Deadline exceeded: leave the daemon thread behind and report a timeout.
    try:
        import httpx
        endpoint = (self._base_url or "http://local") + "/chat/completions"
        raise timeout_cls(request=httpx.Request("POST", endpoint))
    except TypeError:
        raise timeout_cls("sliceagent hard timeout reached")
365
+
366
+ def _create_streaming(self, kwargs: dict):
367
+ """Interactive STREAMING variant of _create: drain the SSE stream into an assembled response under
368
+ the SAME SIGALRM hard deadline (this path is always main-thread — set only by the cli — so SIGALRM
369
+ arms; if not, fall back to the httpx read-timeout). Returns the same response SHAPE as _create so
370
+ complete() is identical downstream. The deadline wraps the whole drain (the wait is in iteration,
371
+ not in create()), so a stalled stream still aborts instead of hanging."""
372
+ import signal as _signal
373
+ try:
374
+ prev = _signal.signal(_signal.SIGALRM, self._on_alarm)
375
+ _signal.alarm(self._hard_timeout)
376
+ except (ValueError, AttributeError, OSError):
377
+ return self._stream_assemble(kwargs) # not main thread → rely on the httpx read-timeout
378
+ try:
379
+ return self._stream_assemble(kwargs)
380
+ finally:
381
+ _signal.alarm(0)
382
+ _signal.signal(_signal.SIGALRM, prev)
383
+
384
def _stream_assemble(self, kwargs: dict):
    """Stream a chat completion and rebuild a non-streamed-looking response from the chunks.

    While iterating the stream, content and reasoning deltas are forwarded live through
    ``self._emit``. Content text and tool-call fragments are accumulated; reasoning text is
    only emitted, not stored in the result.

    Args:
        kwargs: Request keyword arguments; ``stream=True`` is added here, plus
            ``stream_options={"include_usage": True}`` when the model catalog says the
            provider supports it.

    Returns:
        A namespace shaped like a blocking response:
        ``choices[0].message.content`` (joined text, or None when empty),
        ``choices[0].message.tool_calls`` (each with ``id`` and
        ``function.name`` / ``function.arguments``), ``choices[0].finish_reason`` and ``usage``.

    Raises:
        Exception: Whatever the stream raised, but only when nothing at all (no text, no
            tool-call slot) had been assembled yet. With a partial result the error is
            absorbed and the finish reason defaults to ``"length"``.
    """
    from types import SimpleNamespace as NS

    from .model_catalog import capability
    skw = {**kwargs, "stream": True}
    # #49: stream_options is OpenAI-specific — some OpenAI-compatible providers 400 on it. Gate by the
    # catalog flag so the usage chunk is still requested where supported without breaking the others.
    if capability(self.model, self._base_url).supports_stream_options:
        skw["stream_options"] = {"include_usage": True}
    parts: list[str] = []  # content fragments, in arrival order
    calls: dict[int | str, dict] = {}  # slot key (stream index, or call id when no index) → {id, name, args:[fragments]}
    finish = None  # last non-empty finish_reason seen
    usage = None  # last non-empty usage object seen
    _timeout_err = _import_api_timeout_error()  # the SIGALRM hard-deadline exception (must not be swallowed per chunk)
    # E3 streaming resilience: a single MALFORMED chunk is skipped (never aborts the whole stream); a
    # mid-stream error re-raises ONLY when nothing was assembled — otherwise the partial is salvaged
    # as a truncated stop.
    try:
        for chunk in self.client.chat.completions.create(**skw):
            try:
                if getattr(chunk, "usage", None):
                    usage = chunk.usage  # final include_usage chunk (choices may be empty here)
                for ch in (getattr(chunk, "choices", None) or []):
                    if getattr(ch, "finish_reason", None):
                        finish = ch.finish_reason
                    d = getattr(ch, "delta", None)
                    if d is None:
                        continue
                    txt = getattr(d, "content", None)
                    if txt:
                        parts.append(txt); self._emit("content", txt)
                    # Reasoning text may arrive under either attribute name; it is emitted only.
                    rc = getattr(d, "reasoning_content", None) or getattr(d, "reasoning", None)
                    if rc:
                        self._emit("reasoning", rc)
                    for tcd in (getattr(d, "tool_calls", None) or []):
                        _ix = getattr(tcd, "index", None)
                        if _ix is None:  # provider omitted the streaming index
                            _tid = getattr(tcd, "id", None)
                            if _tid is not None:
                                _ix = _tid  # a NEW call, announced by its id → its own slot
                            elif calls:
                                _ix = next(reversed(calls))  # continuation fragment → the OPEN (last-inserted) slot,
                            else:  # NOT len(calls) (that split args into a dead slot)
                                _ix = 0  # first fragment before any id/index arrives
                        slot = calls.setdefault(_ix, {"id": None, "name": None, "args": []})
                        if getattr(tcd, "id", None):
                            slot["id"] = tcd.id
                        fn = getattr(tcd, "function", None)
                        if fn is not None:
                            if getattr(fn, "name", None):
                                slot["name"] = fn.name
                            if getattr(fn, "arguments", None):
                                slot["args"].append(fn.arguments)
            except _timeout_err:
                raise  # hard-deadline fired mid-chunk → leave the per-chunk guard; the outer handler decides salvage vs re-raise
            except Exception:  # noqa: BLE001 — one bad chunk must not kill the stream
                continue
    except Exception:  # noqa: BLE001 — stream broke mid-flight
        if not parts and not calls:
            raise  # nothing salvageable → propagate to the caller
        finish = finish or "length"  # partial assembly → treat as a truncated (incomplete) stop
    # Drop any INCOMPLETE tool call (missing id or name) — a break before a tool call's name/id
    # delta arrived would otherwise yield a call with name=None.
    # Slots are ordered by their integer key; a non-integer key sorts as 0.
    tool_calls = [NS(id=c["id"], function=NS(name=c["name"], arguments="".join(c["args"])))
                  for _, c in sorted(calls.items(), key=lambda kv: kv[0] if isinstance(kv[0], int) else 0)
                  if c["id"] and c["name"]]  # robust sort: a str slot key must not crash assembly
    message = NS(content=("".join(parts) or None), tool_calls=tool_calls)
    return NS(choices=[NS(message=message, finish_reason=finish)], usage=usage)
459
+
460
def _reasoning_kwargs(self) -> dict:
    """Translate ``self.reasoning`` into request kwargs for the active provider.

    * DeepSeek (matched on the model id or base URL): ``"fast"`` disables
      thinking through ``extra_body``; every other intent adds nothing.
    * Providers whose catalog entry supports ``reasoning_effort``:
      fast→low, high→high, max→xhigh; any other intent (including the default
      ``"full"``) adds nothing, leaving the provider's own default in force.
    * All other providers: nothing is added.

    Returns:
        A (possibly empty) dict of extra request kwargs.
    """
    from .model_catalog import capability

    intent = self.reasoning
    model_id, endpoint = self.model.lower(), self._base_url.lower()

    if "deepseek" in model_id or "deepseek" in endpoint:
        if intent == "fast":
            return {"extra_body": {"thinking": {"type": "disabled"}}}
        return {}

    if capability(self.model, self._base_url).supports_reasoning_effort:
        level = {"fast": "low", "high": "high", "max": "xhigh"}.get(intent)
        if level:
            return {"reasoning_effort": level}
    # No effort knob, or an intent without a mapping: send nothing.
    return {}
475
+
476
+ def _cache_kwargs(self, messages: list[dict]) -> dict:
477
+ """Map prompt-caching intent to the ACTIVE provider's knob; no-op for providers without
478
+ one. Modeled on `_reasoning_kwargs` — the quirk stays isolated to this adapter.
479
+
480
+ Only Claude/Anthropic-compatible endpoints support an explicit prompt-cache breakpoint;
481
+ every other provider (the default gpt-5.5 / OpenAI-compatible path) returns {} so the
482
+ request is byte-stable and untouched. For an Anthropic-compatible endpoint we return a
483
+ TODO-stubbed {} for now: the exact `extra_body` cache_control shape is DEFERRED until a
484
+ real Anthropic base_url is wired (the safe half — a byte-stable prefix + cached_tokens
485
+ read-back — is already in place and provider-agnostic).
486
+ """
487
+ model, base = self.model.lower(), self._base_url.lower()
488
+ if "claude" not in model and "anthropic" not in base:
489
+ return {} # non-Claude provider → no explicit cache breakpoint
490
+ # Anthropic-compatible endpoint: DEFER the real cache_control extra_body shape (see
491
+ # adopt_plan.md sec 6 defer). Stubbed {} keeps the request byte-stable until wired.
492
+ # TODO(anthropic): set extra_body cache_control on the system/stable prefix against a
493
+ # live Anthropic base_url; MERGE with _reasoning_kwargs' extra_body, do not overwrite.
494
+ return {}
495
+
496
+ def _cache_routing_kwargs(self) -> dict:
497
+ """Map the session cache-routing hint to the ACTIVE provider; gated like the sibling
498
+ quirk-mappers (`_reasoning_kwargs`/`_cache_kwargs`) so a provider-specific param never
499
+ reaches an endpoint that rejects it.
500
+
501
+ `prompt_cache_key` is an OpenAI Chat-Completions field (routes identical-prefix requests to
502
+ the same cache shard for a higher hit rate; 0 added tokens). OpenAI-compatible providers
503
+ (the default Moonshot path, DeepSeek) accept-and-ignore it harmlessly. It is INVALID on an
504
+ Anthropic-compatible endpoint (which caches via explicit cache_control breakpoints — see
505
+ `_cache_kwargs`), so we return {} there to keep that request byte-stable and untouched.
506
+ """
507
+ key = getattr(self, "_cache_key", None)
508
+ if not key:
509
+ return {}
510
+ model, base = self.model.lower(), self._base_url.lower()
511
+ if "claude" in model or "anthropic" in base:
512
+ return {} # Anthropic uses cache_control, not prompt_cache_key
513
+ return {"prompt_cache_key": key}
514
+
515
+ def _merge_kwargs(self, kwargs: dict, extra: dict) -> None:
516
+ """Fold `extra` into `kwargs`, MERGING `extra_body` instead of overwriting it.
517
+
518
+ Both `_reasoning_kwargs` and `_cache_kwargs` may set `extra_body`; a plain
519
+ `kwargs.update(...)` would clobber whichever ran first. Merge the nested dict so both
520
+ provider quirks survive.
521
+ """
522
+ for key, value in extra.items():
523
+ if key == "extra_body" and isinstance(kwargs.get("extra_body"), dict) and isinstance(value, dict):
524
+ kwargs["extra_body"] = {**kwargs["extra_body"], **value}
525
+ else:
526
+ kwargs[key] = value
527
+
528
def _effort(self) -> str | None:
    """Pick the reasoning effort to request for this call, or None for the provider default.

    The model's capability record is consulted first: a model without a reasoning-effort knob
    always yields None. Otherwise `self.reasoning` is translated — 'fast' → 'low',
    'high' → 'high', 'max' → 'xhigh' — and any other setting (such as the default 'full') yields
    None. `complete` treats a non-None result as its routing key: an explicit effort is sent
    through /v1/responses, because gpt-5.5 rejects reasoning_effort combined with function tools
    on /v1/chat/completions.
    """
    from .model_catalog import capability

    caps = capability(self.model, self._base_url)
    if caps.supports_reasoning_effort:
        effort_by_setting = {"fast": "low", "high": "high", "max": "xhigh"}
        return effort_by_setting.get(self.reasoning)
    return None
537
+
538
def _complete_responses(self, messages: list[dict], tools: list[dict], effort: str) -> AssistantMessage:
    """Run one completion through the /v1/responses endpoint at reasoning `effort`.

    This is the route for the gpt-5 family when an explicit effort must be combined with function
    tools (the pairing chat/completions answers with a 400). The request is built from the
    converted messages and tools, optionally capped with `max_output_tokens` and tagged with the
    session `prompt_cache_key`. The reply is handed to `_parse_responses`, so callers get the same
    AssistantMessage contract as on the chat path.

    Raises:
        ContextOverflow: the provider error is recognised by `is_context_overflow`; raising it
            here feeds the same slice-tighten recovery the chat path uses.
        Exception: any other failure propagates unchanged.
    """
    request: dict = {
        "model": self.model,
        "input": _to_responses_input(messages),
        "reasoning": {"effort": effort},
    }
    converted_tools = _to_responses_tools(tools)
    if converted_tools:
        request.update(tools=converted_tools, tool_choice="auto")
    if self.max_tokens:
        request["max_output_tokens"] = self.max_tokens
    # prompt_cache_key is valid on Responses too; absent/empty means "send nothing".
    cache_key = self._cache_routing_kwargs().get("prompt_cache_key")
    if cache_key:
        request["prompt_cache_key"] = cache_key
    # Stream only when a delta sink is wired AND we are on the main thread.
    live_stream = (getattr(self, "_on_delta", None) is not None
                   and threading.current_thread() is threading.main_thread())
    try:
        if live_stream:
            response = self._responses_stream(request)
        else:
            response = self._create(request, caller=lambda kw: self.client.responses.create(**kw))
    except Exception as exc:  # noqa: BLE001
        # Anything that is not a context overflow is not ours to translate.
        if not is_context_overflow(exc):
            raise
        raise ContextOverflow(exc, status_code=getattr(exc, "status_code", None)) from exc
    return self._parse_responses(response)
565
+
566
def _responses_stream(self, kwargs: dict):
    """Stream a Responses call, emit content/reasoning deltas live, return the final Response.

    The returned object is parsed downstream exactly like the result of the blocking path.

    Args:
        kwargs: the request keyword arguments, forwarded to `client.responses.stream(**kwargs)`
            and, on fallback, to `client.responses.create(**kwargs)`.

    Returns:
        The value of `stream.get_final_response()`, or the blocking call's result when the
        streaming attempt failed for a non-deterministic reason.

    Raises:
        The exception class returned by `_import_api_timeout_error()` and any error recognised by
        `is_context_overflow` are re-raised rather than retried as a blocking call.
    """
    # NOTE(review): `_create` is assumed to apply the hard deadline and then invoke
    # `caller(kwargs)` — confirm against its definition.
    def _drain(kw):
        # Consume the event stream, forwarding text deltas to the live sink as they arrive.
        with self.client.responses.stream(**kw) as stream:
            for ev in stream:
                try:
                    t = getattr(ev, "type", "")
                    if t == "response.output_text.delta":
                        self._emit("content", getattr(ev, "delta", "") or "")
                    elif t in ("response.reasoning_summary_text.delta", "response.reasoning_text.delta"):
                        self._emit("reasoning", getattr(ev, "delta", "") or "")
                # Order matters: the timeout clause must come before the catch-all below,
                # otherwise a deadline firing inside `_emit` would be swallowed as a "bad event".
                except _import_api_timeout_error():
                    raise  # SIGALRM hard-deadline fired mid-event → propagate (one-shot alarm won't re-arm), mirroring the chat path
                except Exception:  # noqa: BLE001 — one bad event must not abort the stream
                    continue
            return stream.get_final_response()
    try:
        return self._create(kwargs, caller=_drain)
    except Exception as e:  # noqa: BLE001 — streaming unavailable/broke → blocking call (identical result)
        # but NOT on a deterministic request-level failure (a hard-deadline timeout or a context
        # overflow): re-issuing the SAME request as a blocking call just doubles a guaranteed failure
        # (and overflow must reach _complete_responses' converter to drive recovery). Re-raise those;
        # only fall back for a genuine transport/streaming-unsupported hiccup.
        if isinstance(e, _import_api_timeout_error()) or is_context_overflow(e):
            raise
        return self._create(kwargs, caller=lambda kw: self.client.responses.create(**kw))
594
+
595
def _parse_responses(self, resp) -> AssistantMessage:
    """Map a Responses API response object → AssistantMessage (content / tool_calls / usage / finish_reason).

    Args:
        resp: the response object; every field is read defensively via getattr.

    Returns:
        AssistantMessage whose content is the stripped `output_text` (None when empty), whose
        tool_calls are the `function_call` output items, and whose finish_reason is derived from
        `status` / `incomplete_details.reason`:
        "length" for max_output_tokens, "content_filter" for a filtered completion, "tool_calls"
        when a completed response carries calls, otherwise "stop".

    Raises:
        EmptyResponseError: neither content nor tool calls are present and the completion was not
            stopped by the content filter.
    """
    content = (getattr(resp, "output_text", None) or "").strip() or None
    calls: list[ToolCall] = []
    for item in (getattr(resp, "output", None) or []):
        if getattr(item, "type", None) != "function_call":
            continue
        _name = getattr(item, "name", "") or ""
        if not _name:
            continue  # malformed function_call (no name) — skip, don't dispatch nameless
        try:
            args = json.loads(getattr(item, "arguments", "") or "{}")
        except Exception:  # noqa: BLE001 — unparseable arguments degrade to "no arguments"
            args = {}
        if not isinstance(args, dict):
            # Valid JSON that is not an object (null, a list, a bare scalar) is just as unusable
            # as invalid JSON: tool arguments are keyword-style, so degrade the same way.
            args = {}
        # Prefer call_id, fall back to id, and never hand downstream a None id (the chat path
        # normalises to "" as well).
        call_id = getattr(item, "call_id", "") or getattr(item, "id", "") or ""
        calls.append(ToolCall(id=call_id, name=_name, args=args))
    status = getattr(resp, "status", None)  # finish_reason from Responses status
    reason = ""
    if status == "incomplete":
        reason = getattr(getattr(resp, "incomplete_details", None), "reason", "")
        finish = "length" if reason == "max_output_tokens" else ("content_filter" if reason == "content_filter" else "stop")
    else:
        finish = "tool_calls" if calls else "stop"
    # content_filter is a TERMINAL provider stop, not an empty-response hiccup: exempt it from the raise
    # (mirrors the chat path) so the loop PARKS it instead of re-rolling forever on a filtered completion.
    if not content and not calls and finish != "content_filter":
        from .errors import EmptyResponseError
        raise EmptyResponseError(f"empty responses completion (status={status})")
    return AssistantMessage(content=content, tool_calls=calls,
                            usage=_usage_dict(_responses_usage(getattr(resp, "usage", None))),
                            finish_reason=finish)
625
+
626
def complete(self, messages: list[dict], tools: list[dict]) -> AssistantMessage:
    """Send one completion request and return it as an AssistantMessage.

    Routing: when `_effort()` yields an explicit effort and the client exposes a `responses`
    attribute, the call is delegated to `_complete_responses`. Otherwise the request goes through
    the chat-completions path assembled here.

    Args:
        messages: chat messages, passed to the provider unchanged.
        tools: function-tool definitions, passed unchanged together with tool_choice="auto".

    Returns:
        AssistantMessage carrying the message content as returned by the provider, the parsed
        tool calls, the usage dict and the provider's finish_reason.

    Raises:
        ContextOverflow: the provider error is recognised by `is_context_overflow` — on the first
            attempt or on the retry that follows dropping reasoning_effort.
        EmptyResponseError: the completion has no choices, no message, or neither content nor
            tool calls (unless finish_reason is "content_filter").
        Exception: any other provider error is re-raised unchanged.
    """
    effort = self._effort()
    # Explicit effort → /v1/responses (chat 400s on effort+tools). With no responses API (an old
    # SDK, or a provider that only has chat) fall through; the chat 400→drop below degrades it.
    if effort and hasattr(self.client, "responses"):
        return self._complete_responses(messages, tools, effort)
    kwargs: dict = dict(model=self.model, messages=messages, tools=tools, tool_choice="auto")
    if self.max_tokens:
        # Provider quirk (sourced from the model catalog — gpt-5/o-series renamed this param to
        # max_completion_tokens and REJECT max_tokens with a 400). One source of truth, not inline.
        from .model_catalog import capability
        kwargs[capability(self.model, self._base_url).tokens_param] = self.max_tokens
    # Merge order is significant: a later mapper wins on a clashing top-level key, while
    # extra_body dicts are combined by _merge_kwargs.
    self._merge_kwargs(kwargs, self._cache_routing_kwargs())  # session-stable cache routing (0 added tokens)
    self._merge_kwargs(kwargs, self._reasoning_kwargs())
    self._merge_kwargs(kwargs, self._cache_kwargs(messages))
    # Provider quirk (isolated here, llm-agnostic): some reasoning models (gpt-5.5) reject
    # reasoning_effort TOGETHER with function tools on /v1/chat/completions (400 — "use /v1/responses").
    # Once seen, drop reasoning_effort whenever tools are present so we degrade to default reasoning
    # instead of 400ing every tool-calling turn. (Sticky — set in the except below.)
    if getattr(self, "_drop_reasoning_effort", False) and kwargs.get("tools"):
        kwargs.pop("reasoning_effort", None)
    # STREAM only on the MAIN thread with a live sink wired (the interactive turn). OFF-main runs —
    # parallel subagents/explorers sharing this llm via run_scheduled threads — take the BLOCKING path
    # so they keep the off-main hard-deadline watchdog AND never racily drive the single TUI spinner from
    # N threads. getattr keeps the object-__new__ test stubs working. Same assembled result either way.
    _stream = (getattr(self, "_on_delta", None) is not None
               and threading.current_thread() is threading.main_thread())
    _creator = self._create_streaming if _stream else self._create
    try:
        resp = _creator(kwargs)
    except Exception as e:
        # Context overflow is NOT a backoff case (is_retryable stays unchanged): signal the
        # rebuild loop to TIGHTEN the slice rather than re-send the identical oversized request.
        if is_context_overflow(e):
            raise ContextOverflow(e, status_code=getattr(e, "status_code", None)) from e
        # reasoning_effort + tools rejected by this model → drop it, remember, retry ONCE (graceful
        # degrade to default reasoning instead of crashing the turn). General; no model name hardcoded.
        if "reasoning_effort" in str(e) and kwargs.pop("reasoning_effort", None) is not None:
            self._drop_reasoning_effort = True
            try:
                resp = _creator(kwargs)
            except Exception as retry_err:
                # The retry can overflow too (the parameter rejection may have pre-empted the
                # size check on the first attempt); give it the same translation so recovery
                # still runs instead of the raw provider error escaping.
                if is_context_overflow(retry_err):
                    raise ContextOverflow(
                        retry_err, status_code=getattr(retry_err, "status_code", None)
                    ) from retry_err
                raise
        else:
            raise
    if not resp.choices:  # some OpenAI-compatible proxies emit {"choices": []} on filter/transient errors
        from .errors import EmptyResponseError
        raise EmptyResponseError("empty completion (no choices)")  # RETRYABLE → with_retry re-rolls (not a raw IndexError)
    choice = resp.choices[0]
    msg = choice.message
    if msg is None:  # some proxies emit a choice with no message — retry, don't crash
        from .errors import EmptyResponseError
        raise EmptyResponseError(f"no message in completion (finish_reason={choice.finish_reason})")
    calls: list[ToolCall] = []
    for tc in (msg.tool_calls or []):
        fn = getattr(tc, "function", None)
        if fn is None or not getattr(fn, "name", None):
            continue  # malformed tool_call (no function/name) — skip, don't crash
        try:
            args = json.loads(fn.arguments)
        except Exception:  # unparseable (or missing) arguments degrade to "no arguments"
            args = {}
        if not isinstance(args, dict):
            # Valid JSON that is not an object (null, a list, a bare scalar) is just as unusable
            # as invalid JSON: tool arguments are keyword-style, so degrade the same way.
            args = {}
        calls.append(ToolCall(id=getattr(tc, "id", "") or "", name=fn.name, args=args))
    # Degenerate completion — no content AND no tool calls (and not a content-filter stop). Some
    # providers/proxies occasionally emit an empty body; returning it stalls the loop, so raise a
    # RETRYABLE error (empty-response) and let with_retry re-roll. content_filter is
    # excluded — re-rolling would just filter again.
    if not (msg.content or "").strip() and not calls and choice.finish_reason != "content_filter":
        from .errors import EmptyResponseError
        raise EmptyResponseError(f"empty completion (finish_reason={choice.finish_reason})")
    usage = _usage_dict(resp.usage)
    return AssistantMessage(
        content=msg.content, tool_calls=calls, usage=usage, finish_reason=choice.finish_reason
    )