power-loop 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53) hide show
  1. llm_client/__init__.py +0 -0
  2. llm_client/capabilities.py +162 -0
  3. llm_client/interface.py +470 -0
  4. llm_client/llm_factory.py +981 -0
  5. llm_client/llm_tooling.py +645 -0
  6. llm_client/llm_utils.py +205 -0
  7. llm_client/multimodal.py +237 -0
  8. llm_client/qwen_image.py +576 -0
  9. llm_client/web_search.py +149 -0
  10. power_loop/__init__.py +326 -0
  11. power_loop/agent/__init__.py +6 -0
  12. power_loop/agent/sink.py +247 -0
  13. power_loop/agent/stateful_loop.py +363 -0
  14. power_loop/agent/system_prompt.py +396 -0
  15. power_loop/agent/types.py +41 -0
  16. power_loop/contracts/__init__.py +132 -0
  17. power_loop/contracts/errors.py +140 -0
  18. power_loop/contracts/event_payloads.py +278 -0
  19. power_loop/contracts/events.py +86 -0
  20. power_loop/contracts/handlers.py +45 -0
  21. power_loop/contracts/hook_contexts.py +265 -0
  22. power_loop/contracts/hooks.py +64 -0
  23. power_loop/contracts/messages.py +90 -0
  24. power_loop/contracts/protocols.py +48 -0
  25. power_loop/contracts/tools.py +56 -0
  26. power_loop/core/agent_context.py +94 -0
  27. power_loop/core/events.py +124 -0
  28. power_loop/core/hooks.py +122 -0
  29. power_loop/core/phase.py +217 -0
  30. power_loop/core/pipeline.py +880 -0
  31. power_loop/core/runner.py +60 -0
  32. power_loop/core/state.py +208 -0
  33. power_loop/runtime/budget.py +179 -0
  34. power_loop/runtime/cancellation.py +127 -0
  35. power_loop/runtime/compact.py +300 -0
  36. power_loop/runtime/env.py +103 -0
  37. power_loop/runtime/memory.py +107 -0
  38. power_loop/runtime/provider.py +176 -0
  39. power_loop/runtime/retry.py +182 -0
  40. power_loop/runtime/session_store.py +636 -0
  41. power_loop/runtime/skills.py +201 -0
  42. power_loop/runtime/spec.py +233 -0
  43. power_loop/runtime/structured.py +225 -0
  44. power_loop/tools/__init__.py +51 -0
  45. power_loop/tools/default_manifest.py +244 -0
  46. power_loop/tools/default_tools.py +766 -0
  47. power_loop/tools/registry.py +162 -0
  48. power_loop/tools/spawn_agent.py +173 -0
  49. power_loop-0.2.0.dist-info/METADATA +632 -0
  50. power_loop-0.2.0.dist-info/RECORD +53 -0
  51. power_loop-0.2.0.dist-info/WHEEL +5 -0
  52. power_loop-0.2.0.dist-info/licenses/LICENSE +21 -0
  53. power_loop-0.2.0.dist-info/top_level.txt +2 -0
@@ -0,0 +1,981 @@
1
+ """
2
+ LLM factory utilities for zrag.
3
+
4
+ This module provides a small, explicit way to build an OpenAI-compatible `LLMService`
5
+ from environment / `src.config.settings`, similar in spirit to agent-psychology's
6
+ model utilities, but adapted to zrag's `LLMService` Protocol.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import asyncio
12
+ import inspect
13
+ import json
14
+ import logging
15
+ import os
16
+ import random
17
+ from collections.abc import AsyncIterator, Callable, Sequence
18
+ from typing import Any, cast
19
+
20
+ from openai import AsyncOpenAI
21
+
22
+ from .capabilities import ModelCapabilities, resolve_model_capabilities
23
+ from .interface import LLMRequest, LLMResponse, LLMService, LLMStreamChunk, LLMTokenUsage, OpenAICompatibleChatConfig
24
+ from .llm_utils import parse_json_from_model_output_detailed
25
+
26
try:
    from openai.types.chat import (
        ChatCompletion,
        ChatCompletionChunk,  # type: ignore[import-not-found]
        ChatCompletionMessage,
    )
except Exception:  # pragma: no cover
    # Runtime will still work because we treat these as typing-only. Keep minimal fallback.
    # Every name imported above gets a fallback so annotations that mention it
    # (including ChatCompletionChunk) still resolve when the import fails.
    ChatCompletion = Any  # type: ignore[assignment]
    ChatCompletionChunk = Any  # type: ignore[assignment]
    ChatCompletionMessage = Any  # type: ignore[assignment]

logger = logging.getLogger(__name__)


# Proxy environment variables (upper- and lower-case spellings) whose values
# are checked for the socks:// scheme before the client is created.
_PROXY_ENV_KEYS = (
    "ALL_PROXY",
    "all_proxy",
    "HTTPS_PROXY",
    "https_proxy",
    "HTTP_PROXY",
    "http_proxy",
)
48
+
49
+
50
+ def _normalize_proxy_url_scheme(url: str) -> str:
51
+ text = (url or "").strip()
52
+ if text.lower().startswith("socks://"):
53
+ # httpx/openai expects socks5://, not socks://.
54
+ return f"socks5://{text[len('socks://'):]}"
55
+ return text
56
+
57
+
58
def _normalize_proxy_env_inplace() -> None:
    """Rewrite proxy environment variables in ``os.environ`` to their normalized form.

    Each name in ``_PROXY_ENV_KEYS`` that is set to a non-empty value is passed
    through ``_normalize_proxy_url_scheme``; the variable is overwritten (and an
    info line logged) only when the normalized value differs from the current one.
    """
    for env_name in _PROXY_ENV_KEYS:
        current = os.environ.get(env_name)
        if current:
            rewritten = _normalize_proxy_url_scheme(current)
            if rewritten != current:
                os.environ[env_name] = rewritten
                logger.info("Normalized proxy env %s from socks:// to socks5://", env_name)
67
+
68
+
69
+ def _sanitize_debug_payload(value: Any) -> Any:
70
+ if isinstance(value, dict):
71
+ sanitized: dict[str, Any] = {}
72
+ for key, item in value.items():
73
+ if key == "url" and isinstance(item, str) and item.startswith("data:"):
74
+ prefix, _, payload = item.partition(",")
75
+ sanitized[key] = f"{prefix},<base64:{len(payload)} chars>"
76
+ else:
77
+ sanitized[key] = _sanitize_debug_payload(item)
78
+ return sanitized
79
+ if isinstance(value, list):
80
+ return [_sanitize_debug_payload(item) for item in value]
81
+ return value
82
+
83
+
84
def _format_messages_for_debug(messages: Sequence[dict[str, Any]]) -> str:
    """Render *messages* as indented JSON after passing them through ``_sanitize_debug_payload``.

    Non-ASCII characters are emitted verbatim rather than escaped.
    """
    redacted = _sanitize_debug_payload(list(messages))
    rendered = json.dumps(redacted, indent=2, ensure_ascii=False)
    return rendered
87
+
88
+
89
+
90
+
91
+ class OpenAICompatibleChatLLMService(LLMService):
92
+ """
93
+ Minimal OpenAI-compatible chat completion client.
94
+
95
+ - Uses `openai.AsyncOpenAI`
96
+ - Records token usage from response.usage (if present)
97
+ """
98
+
99
def __init__(self, cfg: OpenAICompatibleChatConfig):
    """Store *cfg*, resolve the model capabilities and log the effective settings.

    No client is created here: ``_client`` starts as ``None`` and the last
    recorded usage starts empty.
    """
    self._cfg = cfg
    self._client: Any = None
    self._last_usage: dict[str, Any] = {}
    self._capabilities: ModelCapabilities = resolve_model_capabilities(
        cfg.model, cfg.base_url, cfg.capability_overrides
    )

    # Only report whether a key is present, never the key itself.
    api_key_state = "set" if cfg.api_key else "missing"
    logger.info(
        "GraphExtractor LLM: base_url=%s model=%s timeout_s=%s max_tokens=%s temperature=%s api_key=%s",
        cfg.base_url,
        cfg.model,
        cfg.timeout_s,
        cfg.max_tokens,
        cfg.temperature,
        api_key_state,
    )
118
+
119
def get_last_token_usage(self) -> dict[str, Any]:
    """Return a shallow copy of the most recently recorded usage dict (empty when nothing is recorded)."""
    recorded = self._last_usage
    if not recorded:
        return {}
    return dict(recorded)
121
+
122
@property
def capabilities(self) -> ModelCapabilities:
    """Read-only access to the ``ModelCapabilities`` stored on this service."""
    resolved = self._capabilities
    return resolved
125
+
126
def _ensure_client(self) -> Any:
    """Return the cached ``AsyncOpenAI`` client, building it on first use.

    Before the client is constructed the proxy environment variables are
    normalized via ``_normalize_proxy_env_inplace``. The client is created
    from the config's ``base_url``, ``api_key`` and ``timeout_s``.
    """
    if self._client is None:
        from openai import AsyncOpenAI  # type: ignore

        _normalize_proxy_env_inplace()

        cfg = self._cfg
        self._client = AsyncOpenAI(
            base_url=cfg.base_url,
            api_key=cfg.api_key,
            timeout=cfg.timeout_s,
        )
    return self._client
139
+
140
async def close(self) -> None:
    """
    Release the underlying HTTP resources (httpx) so tests do not emit ResourceWarning.

    Does nothing when no client has been created. The cached client reference
    is cleared even if closing raises.
    """
    client = self._client
    if client is None:
        return
    try:
        # openai.AsyncOpenAI exposes `close()` (async).
        await client.close()
    finally:
        self._client = None
151
+
152
+ def _record_usage(self, usage: Any, *, method: str) -> None:
153
+ self._last_usage = self._usage_dict_from_any(usage)
154
+
155
+ total_tokens = self._last_usage.get("total_tokens")
156
+ if isinstance(total_tokens, int) and total_tokens > 0:
157
+ logger.info(
158
+ "[llm:%s] token_usage prompt=%s completion=%s total=%s",
159
+ method,
160
+ self._last_usage.get("prompt_tokens"),
161
+ self._last_usage.get("completion_tokens"),
162
+ self._last_usage.get("total_tokens"),
163
+ )
164
+
165
def _emit_debug_payload(self, *, method: str, messages: Any, kwargs: Any) -> None:
    """
    Log the outgoing request (messages and kwargs) at DEBUG level.

    Best-effort helper for the stream/complete paths; callers invoke it
    unconditionally, so it must never raise because of an unrenderable payload.
    Rendering is skipped unless ``POWER_LOOP_LLM_DEBUG`` is set to a truthy
    value ("1", "true", "yes", "y") or the module logger has DEBUG enabled.
    """
    # Avoid spamming logs unless explicitly requested.
    flag = (os.getenv("POWER_LOOP_LLM_DEBUG") or "").strip().lower()
    if flag not in {"1", "true", "yes", "y"} and not logger.isEnabledFor(logging.DEBUG):
        return

    try:
        if isinstance(messages, (list, tuple)):
            rendered_messages = _format_messages_for_debug(messages)
        else:
            rendered_messages = _sanitize_debug_payload(messages)
    except Exception:
        rendered_messages = "<unrenderable messages>"

    try:
        sanitized_kwargs = _sanitize_debug_payload(kwargs)
    except Exception:
        sanitized_kwargs = "<unrenderable kwargs>"

    logger.debug("[llm:%s] request messages=%s kwargs=%s", method, rendered_messages, sanitized_kwargs)
193
+
194
+ def _usage_dict_from_any(self, usage: Any) -> dict[str, Any]:
195
+ """
196
+ Normalize token usage from various provider shapes.
197
+ Supports:
198
+ - OpenAI usage: prompt_tokens/completion_tokens/total_tokens
199
+ - Some providers: input_tokens/output_tokens/total_tokens
200
+ - Dict payloads (from model_dump / raw json)
201
+ """
202
+ if usage is None:
203
+ return {
204
+ "prompt_tokens": None,
205
+ "completion_tokens": None,
206
+ "total_tokens": None,
207
+ }
208
+
209
+ raw: dict[str, Any] = {}
210
+
211
+ # dict-like
212
+ if isinstance(usage, dict):
213
+ raw = dict(usage)
214
+ else:
215
+ # object-like: prefer model_dump, then __dict__
216
+ if hasattr(usage, "model_dump"):
217
+ try:
218
+ dumped = usage.model_dump()
219
+ if isinstance(dumped, dict):
220
+ raw = dict(dumped)
221
+ except Exception:
222
+ raw = {}
223
+ if not raw:
224
+ try:
225
+ raw = dict(usage.__dict__)
226
+ except Exception:
227
+ raw = {}
228
+
229
+ def _safe_int_or_none(v: Any) -> int | None:
230
+ if v is None:
231
+ return None
232
+ if isinstance(v, bool):
233
+ return int(v)
234
+ if isinstance(v, (int, float)):
235
+ return int(v)
236
+ if isinstance(v, str):
237
+ s = v.strip()
238
+ if not s:
239
+ return None
240
+ try:
241
+ return int(float(s))
242
+ except Exception:
243
+ return None
244
+ return None
245
+
246
+ def _pick_int(payload: dict[str, Any], keys: list[str]) -> int | None:
247
+ for key in keys:
248
+ if key in payload:
249
+ iv = _safe_int_or_none(payload.get(key))
250
+ if iv is not None:
251
+ return iv
252
+ return None
253
+
254
+ def _as_dict(v: Any) -> dict[str, Any]:
255
+ if isinstance(v, dict):
256
+ return v
257
+ if hasattr(v, "model_dump"):
258
+ try:
259
+ dumped = v.model_dump()
260
+ if isinstance(dumped, dict):
261
+ return dumped
262
+ except Exception:
263
+ return {}
264
+ return {}
265
+
266
+ prompt = _pick_int(raw, [
267
+ "prompt_tokens",
268
+ "input_tokens",
269
+ "prompt_token_count",
270
+ "promptTokens",
271
+ "inputTokenCount",
272
+ "prompt_count",
273
+ ])
274
+ completion = _pick_int(raw, [
275
+ "completion_tokens",
276
+ "output_tokens",
277
+ "completion_token_count",
278
+ "completionTokens",
279
+ "outputTokenCount",
280
+ "generated_tokens",
281
+ "candidates_token_count",
282
+ ])
283
+ total = _pick_int(raw, [
284
+ "total_tokens",
285
+ "total_token_count",
286
+ "totalTokens",
287
+ "token_count",
288
+ "usage_tokens",
289
+ ])
290
+
291
+ if total is None and prompt is not None and completion is not None:
292
+ total = prompt + completion
293
+
294
+ prompt_details = _as_dict(raw.get("prompt_tokens_details"))
295
+ completion_details = _as_dict(raw.get("completion_tokens_details"))
296
+ output_details = _as_dict(raw.get("output_tokens_details"))
297
+ input_details = _as_dict(raw.get("input_tokens_details"))
298
+
299
+ prompt_audio_tokens = _pick_int(prompt_details, ["audio_tokens", "audioTokenCount"])
300
+ if prompt_audio_tokens is None:
301
+ prompt_audio_tokens = _pick_int(input_details, ["audio_tokens", "audioTokenCount"])
302
+ prompt_cached_tokens = _pick_int(
303
+ raw,
304
+ ["prompt_cache_hit_tokens", "cached_tokens", "cache_hit_tokens", "prompt_cached_tokens"],
305
+ )
306
+ if prompt_cached_tokens is None:
307
+ prompt_cached_tokens = _pick_int(prompt_details, ["cached_tokens", "cache_hit_tokens", "cacheHitTokens"])
308
+ prompt_cache_miss_tokens = _pick_int(raw, ["prompt_cache_miss_tokens", "cache_miss_tokens", "cacheMissTokens"])
309
+ if prompt_cache_miss_tokens is None:
310
+ prompt_cache_miss_tokens = _pick_int(prompt_details, ["cache_miss_tokens", "cacheMissTokens"])
311
+ prompt_text_tokens = _pick_int(prompt_details, ["text_tokens", "textTokenCount"])
312
+ if prompt_text_tokens is None:
313
+ prompt_text_tokens = _pick_int(input_details, ["text_tokens", "textTokenCount"])
314
+ prompt_image_tokens = _pick_int(prompt_details, ["image_tokens", "imageTokenCount"])
315
+ if prompt_image_tokens is None:
316
+ prompt_image_tokens = _pick_int(input_details, ["image_tokens", "imageTokenCount"])
317
+
318
+ completion_reasoning_tokens = _pick_int(
319
+ completion_details,
320
+ ["reasoning_tokens", "reasoningTokenCount", "reasoning_token", "reasoningTokens", "thinking_tokens"],
321
+ )
322
+ if completion_reasoning_tokens is None:
323
+ completion_reasoning_tokens = _pick_int(
324
+ output_details,
325
+ ["reasoning_tokens", "reasoningTokenCount", "reasoning_token", "reasoningTokens", "thinking_tokens"],
326
+ )
327
+ completion_audio_tokens = _pick_int(completion_details, ["audio_tokens", "audioTokenCount"])
328
+ if completion_audio_tokens is None:
329
+ completion_audio_tokens = _pick_int(output_details, ["audio_tokens", "audioTokenCount"])
330
+ completion_text_tokens = _pick_int(completion_details, ["text_tokens", "textTokenCount"])
331
+ if completion_text_tokens is None:
332
+ completion_text_tokens = _pick_int(output_details, ["text_tokens", "textTokenCount"])
333
+ completion_image_tokens = _pick_int(completion_details, ["image_tokens", "imageTokenCount"])
334
+ if completion_image_tokens is None:
335
+ completion_image_tokens = _pick_int(output_details, ["image_tokens", "imageTokenCount"])
336
+ accepted_prediction_tokens = _pick_int(
337
+ completion_details,
338
+ ["accepted_prediction_tokens", "acceptedPredictionTokens"],
339
+ )
340
+ if accepted_prediction_tokens is None:
341
+ accepted_prediction_tokens = _pick_int(
342
+ output_details,
343
+ ["accepted_prediction_tokens", "acceptedPredictionTokens"],
344
+ )
345
+ rejected_prediction_tokens = _pick_int(
346
+ completion_details,
347
+ ["rejected_prediction_tokens", "rejectedPredictionTokens"],
348
+ )
349
+ if rejected_prediction_tokens is None:
350
+ rejected_prediction_tokens = _pick_int(
351
+ output_details,
352
+ ["rejected_prediction_tokens", "rejectedPredictionTokens"],
353
+ )
354
+
355
+ # Provider-agnostic aliases (keep None when unknown)
356
+ cached_tokens = _pick_int(raw, ["cached_tokens", "cache_hit_tokens", "prompt_cache_hit_tokens"])
357
+ if cached_tokens is None:
358
+ cached_tokens = prompt_cached_tokens
359
+ cache_hit_tokens = _pick_int(raw, ["cache_hit_tokens", "prompt_cache_hit_tokens", "cached_tokens"])
360
+ if cache_hit_tokens is None:
361
+ cache_hit_tokens = prompt_cached_tokens
362
+ cache_miss_tokens = _pick_int(raw, ["cache_miss_tokens", "prompt_cache_miss_tokens"])
363
+ if cache_miss_tokens is None:
364
+ cache_miss_tokens = prompt_cache_miss_tokens
365
+
366
+ reasoning_tokens = _pick_int(
367
+ raw,
368
+ ["reasoning_tokens", "reasoning_token", "reasoningTokens", "thinking_tokens"],
369
+ )
370
+ if reasoning_tokens is None:
371
+ reasoning_tokens = completion_reasoning_tokens
372
+
373
+ accepted_tokens = _pick_int(raw, ["accepted_prediction_tokens"])
374
+ if accepted_tokens is None:
375
+ accepted_tokens = accepted_prediction_tokens
376
+ rejected_tokens = _pick_int(raw, ["rejected_prediction_tokens"])
377
+ if rejected_tokens is None:
378
+ rejected_tokens = rejected_prediction_tokens
379
+
380
+ # Keep provider-native fields, and ensure normalized aliases exist.
381
+ raw["prompt_tokens"] = prompt
382
+ raw["completion_tokens"] = completion
383
+ raw["total_tokens"] = total
384
+ raw["prompt_audio_tokens"] = prompt_audio_tokens
385
+ raw["prompt_cached_tokens"] = prompt_cached_tokens
386
+ raw["prompt_cache_miss_tokens"] = prompt_cache_miss_tokens
387
+ raw["prompt_text_tokens"] = prompt_text_tokens
388
+ raw["prompt_image_tokens"] = prompt_image_tokens
389
+ raw["completion_reasoning_tokens"] = completion_reasoning_tokens
390
+ raw["completion_audio_tokens"] = completion_audio_tokens
391
+ raw["completion_text_tokens"] = completion_text_tokens
392
+ raw["completion_image_tokens"] = completion_image_tokens
393
+ raw["accepted_prediction_tokens"] = accepted_prediction_tokens
394
+ raw["rejected_prediction_tokens"] = rejected_prediction_tokens
395
+ raw["cached_tokens"] = cached_tokens
396
+ raw["cache_hit_tokens"] = cache_hit_tokens
397
+ raw["cache_miss_tokens"] = cache_miss_tokens
398
+ raw["reasoning_tokens"] = reasoning_tokens
399
+ raw["accepted_tokens"] = accepted_tokens
400
+ raw["rejected_tokens"] = rejected_tokens
401
+ return raw
402
+
403
def _usage_obj(self, usage: Any = None) -> LLMTokenUsage:
    """Build an ``LLMTokenUsage`` from *usage*, or from the last recorded usage when *usage* is None.

    Every field is coerced with ``int()``; values that are ``None`` or fail the
    conversion become ``None``.
    """
    source = self._last_usage if usage is None else self._usage_dict_from_any(usage)

    def _coerce(value: Any) -> int | None:
        if value is None:
            return None
        try:
            return int(value)
        except Exception:
            return None

    field_names = (
        "prompt_tokens",
        "completion_tokens",
        "total_tokens",
        "prompt_audio_tokens",
        "prompt_cached_tokens",
        "prompt_cache_miss_tokens",
        "prompt_text_tokens",
        "prompt_image_tokens",
        "completion_reasoning_tokens",
        "completion_audio_tokens",
        "completion_text_tokens",
        "completion_image_tokens",
        "accepted_prediction_tokens",
        "rejected_prediction_tokens",
        "cached_tokens",
        "cache_hit_tokens",
        "cache_miss_tokens",
        "reasoning_tokens",
        "accepted_tokens",
        "rejected_tokens",
    )
    # Each LLMTokenUsage field is filled from the dict key of the same name.
    return LLMTokenUsage(**{name: _coerce(source.get(name)) for name in field_names})
434
+
435
+ async def _with_retries(self, fn, *, method: str) -> Any:
436
+ """
437
+ Lightweight retry wrapper inspired by LangChain's ergonomics.
438
+ """
439
+ last_err: Exception | None = None
440
+ attempts = max(1, int(self._cfg.max_retries) + 1)
441
+ for i in range(attempts):
442
+ try:
443
+ return await fn()
444
+ except Exception as e:
445
+ last_err = e
446
+ if i >= attempts - 1:
447
+ break
448
+ # exponential backoff + jitter
449
+ base = float(self._cfg.retry_base_delay_s)
450
+ delay = base * (2 ** i) * (0.75 + 0.5 * random.random())
451
+ logger.warning("[llm:%s] call failed (attempt %s/%s), retrying in %.2fs: %s", method, i + 1, attempts,
452
+ delay, e)
453
+ await asyncio.sleep(delay)
454
+ raise cast(Exception, last_err)
455
+
456
+ def _request_kwargs(self, request: LLMRequest) -> dict[str, Any]:
457
+ """
458
+ Map `LLMRequest` into kwargs for OpenAI-compatible chat.completions.create.
459
+ Falls back to self._cfg for default settings.
460
+ """
461
+ out: dict[str, Any] = dict(request.extra or {})
462
+ out["model"] = request.model if request.model else self._cfg.model
463
+
464
+ req_temp = request.temperature if request.temperature is not None else self._cfg.temperature
465
+ if req_temp is not None:
466
+ out["temperature"] = float(req_temp)
467
+
468
+ req_max_tokens = request.max_tokens if request.max_tokens is not None else self._cfg.max_tokens
469
+ if req_max_tokens is not None:
470
+ try:
471
+ max_tokens = int(req_max_tokens)
472
+ except Exception:
473
+ max_tokens = None
474
+ if max_tokens is not None and max_tokens > 0:
475
+ out["max_tokens"] = max_tokens
476
+
477
+ reason_value: bool | None = None
478
+ if "reason" in out:
479
+ reason_value = bool(out.pop("reason"))
480
+ elif request.reason is not None:
481
+ reason_value = bool(request.reason)
482
+
483
+ if reason_value is not None:
484
+ extra_body = out.get("extra_body")
485
+ if not isinstance(extra_body, dict):
486
+ extra_body = {}
487
+ if "reason" not in extra_body:
488
+ extra_body["reason"] = reason_value
489
+ out["extra_body"] = extra_body
490
+
491
+ if request.tools is not None:
492
+ out["tools"] = request.tools
493
+ if request.tool_choice is not None:
494
+ out["tool_choice"] = request.tool_choice
495
+ if request.response_format is not None:
496
+ out["response_format"] = request.response_format
497
+ return out
498
+
499
def _build_resume_request(self, request: LLMRequest, partial_text: str) -> LLMRequest:
    """Return a new ``LLMRequest`` that extends *request* with a resume prompt.

    The new message list is a copy of the original messages, followed by an
    assistant message holding the stripped *partial_text* (omitted when blank),
    followed by a user message carrying the configured
    ``stream_resume_instruction``. All other request fields are carried over;
    ``extra`` is shallow-copied.
    """
    history = list(request.messages or [])

    already_emitted = (partial_text or "").strip()
    if already_emitted:
        history.append({"role": "assistant", "content": already_emitted})

    resume_prompt = {"role": "user", "content": self._cfg.stream_resume_instruction}
    history.append(resume_prompt)

    extra_copy = dict(request.extra or {})
    return LLMRequest(
        messages=history,
        system_prompt=request.system_prompt,
        model=request.model,
        temperature=request.temperature,
        max_tokens=request.max_tokens,
        parse_json=request.parse_json,
        reason=request.reason,
        tools=request.tools,
        tool_choice=request.tool_choice,
        response_format=request.response_format,
        extra=extra_copy,
    )
518
+
519
async def complete(
    self,
    request: LLMRequest,
    *,
    on_chunk_delta_text: Callable[[str], Any] | None = None,
    on_chunk_think: Callable[[str], Any] | None = None,
    on_stream_end: Callable[[LLMResponse], Any] | None = None,
) -> LLMResponse:
    """
    Canonical non-streaming API (preferred).

    Consumes ``self.stream(request)`` to the end and assembles a single
    ``LLMResponse``: the concatenated text (parsed via
    ``parse_json_from_model_output_detailed`` when ``request.parse_json`` is
    set), the concatenated think text, the most recent non-empty tool-call
    list, the last reported token usage (falling back to ``self._usage_obj()``),
    every chunk seen, and the last raw event.

    The optional callbacks are invoked per text delta, per think delta, and
    once with the final response; awaitable results are awaited.
    """

    async def _notify(callback: Any, payload: Any) -> None:
        # Callbacks may be sync or async; await only when needed.
        if callback:
            outcome = callback(payload)
            if inspect.isawaitable(outcome):
                await outcome

    collected_chunks: list[LLMStreamChunk] = []
    text_pieces: list[str] = []
    think_pieces: list[str] = []
    latest_tool_calls: list[dict[str, Any]] = []
    latest_usage: LLMTokenUsage | None = None
    latest_raw_event: Any = None

    async for piece in self.stream(request):
        collected_chunks.append(piece)

        if piece.delta_text:
            text_pieces.append(piece.delta_text)
            await _notify(on_chunk_delta_text, piece.delta_text)

        if piece.think:
            think_pieces.append(piece.think)
            await _notify(on_chunk_think, piece.think)

        if piece.tool_calls:
            latest_tool_calls = list(piece.tool_calls)
        if piece.token_usage is not None:
            latest_usage = piece.token_usage
        if piece.raw_event is not None:
            latest_raw_event = piece.raw_event

    full_text = "".join(text_pieces)
    if request.parse_json:
        response = parse_json_from_model_output_detailed(full_text)
    else:
        response = LLMResponse(raw_text=full_text, content_text=full_text)

    if latest_usage is None:
        latest_usage = self._usage_obj()
    response.token_usage = latest_usage
    response.tool_calls = latest_tool_calls
    response.stream_chunks = collected_chunks
    response.raw_completion = latest_raw_event
    response.think = "".join(think_pieces)

    await _notify(on_stream_end, response)
    return response
577
+
578
+ async def stream(self, request: LLMRequest) -> AsyncIterator[LLMStreamChunk]:
579
+ """
580
+ Streaming API.
581
+
582
+ Best-effort real streaming for OpenAI-compatible servers.
583
+ If provider does not support streaming, callers can switch to `complete()`.
584
+ """
585
+ client: AsyncOpenAI = self._ensure_client()
586
+ kwargs = self._request_kwargs(request)
587
+
588
+ stream_kwargs = dict(kwargs)
589
+ stream_kwargs["stream"] = True
590
+ stream_kwargs.setdefault("stream_options", {"include_usage": True})
591
+
592
+ last_event: ChatCompletionChunk | None = None
593
+ final_usage: LLMTokenUsage | None = None
594
+ tool_call_store: dict[str, dict[str, Any]] = {}
595
+ tool_call_order: list[str] = []
596
+ tool_call_index_to_key: dict[int, str] = {}
597
+ aggregated_text: str = ""
598
+ current_request = request
599
+ restart_count = 0
600
+ max_restarts = max(0, int(self._cfg.stream_max_restarts or 0))
601
+ resume_enabled = bool(self._cfg.stream_resume_on_error)
602
+
603
+ def _as_dict(obj: Any) -> Any:
604
+ if obj is None:
605
+ return None
606
+ if isinstance(obj, dict):
607
+ return obj
608
+ if hasattr(obj, "model_dump"):
609
+ try:
610
+ return obj.model_dump()
611
+ except Exception:
612
+ return obj
613
+ # best-effort: fallback to __dict__
614
+ try:
615
+ return dict(obj.__dict__)
616
+ except Exception:
617
+ return obj
618
+
619
+ def _merge_tool_calls_delta(delta_tool_calls_obj: Any) -> list[dict[str, Any]]:
620
+ """
621
+ Merge OpenAI-compatible streaming `delta.tool_calls` into an aggregated list.
622
+
623
+ Delta items commonly look like:
624
+ {"index":0,"id":"...","type":"function","function":{"name":"x","arguments":"{...partial..."}}
625
+ """
626
+ if not delta_tool_calls_obj:
627
+ # return current snapshot
628
+ return [tool_call_store[k] for k in tool_call_order]
629
+
630
+ items = delta_tool_calls_obj
631
+ # sometimes it's a single object
632
+ if not isinstance(items, list):
633
+ items = [items]
634
+
635
+ for pos, it in enumerate(items):
636
+ d = _as_dict(it)
637
+ if not isinstance(d, dict):
638
+ continue
639
+ explicit_id = d.get("id")
640
+ delta_index = d.get("index")
641
+
642
+ call_key: str
643
+ if explicit_id and f"id_{explicit_id}" in tool_call_store:
644
+ call_key = f"id_{explicit_id}"
645
+ elif isinstance(delta_index, int) and delta_index in tool_call_index_to_key:
646
+ call_key = tool_call_index_to_key[delta_index]
647
+ elif explicit_id:
648
+ call_key = f"id_{explicit_id}"
649
+ elif delta_index is not None:
650
+ call_key = f"index_{delta_index}"
651
+ else:
652
+ # fall back to position inside this delta event
653
+ call_key = f"event_pos_{pos}"
654
+
655
+ if call_key not in tool_call_store:
656
+ tool_call_store[call_key] = {
657
+ "id": explicit_id or call_key,
658
+ "type": d.get("type") or "function",
659
+ "function": {"name": "", "arguments": ""},
660
+ }
661
+ tool_call_order.append(call_key)
662
+
663
+ entry = tool_call_store[call_key]
664
+ if explicit_id and entry.get("id") != explicit_id:
665
+ entry["id"] = explicit_id
666
+ if isinstance(delta_index, int):
667
+ tool_call_index_to_key[delta_index] = call_key
668
+ fn = d.get("function") or {}
669
+ if not isinstance(fn, dict):
670
+ fn = _as_dict(fn) or {}
671
+ if isinstance(fn, dict):
672
+ name = fn.get("name")
673
+ if name:
674
+ entry["function"]["name"] = name
675
+ args = fn.get("arguments")
676
+ if args:
677
+ entry["function"]["arguments"] = (entry["function"].get("arguments") or "") + str(args)
678
+
679
+ # allow other keys to be updated if present
680
+ if d.get("type"):
681
+ entry["type"] = d.get("type")
682
+
683
+ return [tool_call_store[k] for k in tool_call_order]
684
+
685
+ def _extract_text_from_value(v: Any) -> str:
686
+ if isinstance(v, str):
687
+ return v
688
+ if isinstance(v, list):
689
+ parts: list[str] = []
690
+ for item in v:
691
+ if isinstance(item, str):
692
+ parts.append(item)
693
+ elif isinstance(item, dict):
694
+ t = item.get("text") or item.get("content")
695
+ if isinstance(t, str):
696
+ parts.append(t)
697
+ else:
698
+ d = _as_dict(item)
699
+ if isinstance(d, dict):
700
+ t2 = d.get("text") or d.get("content")
701
+ if isinstance(t2, str):
702
+ parts.append(t2)
703
+ return "".join(parts)
704
+ return ""
705
+
706
+ def _extract_text_and_think_from_value(v: Any) -> tuple[str, str]:
707
+ """Split provider content blocks into user-visible text vs reasoning text.
708
+
709
+ Some OpenAI-compatible providers stream `delta.content` as a list of
710
+ typed blocks (for example type=reasoning). Those reasoning blocks
711
+ should feed the think panel, not the main chat text.
712
+ """
713
+ if isinstance(v, str):
714
+ return v, ""
715
+
716
+ if isinstance(v, list):
717
+ text_parts: list[str] = []
718
+ think_parts: list[str] = []
719
+ for item in v:
720
+ if isinstance(item, str):
721
+ text_parts.append(item)
722
+ continue
723
+
724
+ data = item if isinstance(item, dict) else _as_dict(item)
725
+ if not isinstance(data, dict):
726
+ continue
727
+
728
+ part = _extract_text_from_value(data.get("text") or data.get("content"))
729
+ if not part:
730
+ continue
731
+
732
+ item_type = str(data.get("type") or data.get("role") or "").lower()
733
+ if any(tag in item_type for tag in ("reason", "think", "thought")):
734
+ think_parts.append(part)
735
+ else:
736
+ text_parts.append(part)
737
+
738
+ return "".join(text_parts), "".join(think_parts)
739
+
740
+ data = _as_dict(v)
741
+ if isinstance(data, dict):
742
+ part = _extract_text_from_value(data.get("text") or data.get("content"))
743
+ if not part:
744
+ return "", ""
745
+ item_type = str(data.get("type") or data.get("role") or "").lower()
746
+ if any(tag in item_type for tag in ("reason", "think", "thought")):
747
+ return "", part
748
+ return part, ""
749
+
750
+ return "", ""
751
+
752
+ def _extract_delta_text_and_think(delta: Any) -> tuple[str, str]:
753
+ if delta is None:
754
+ return "", ""
755
+
756
+ delta_dict = _as_dict(delta)
757
+ if isinstance(delta_dict, dict):
758
+ text, think = _extract_text_and_think_from_value(delta_dict.get("content"))
759
+ if not text:
760
+ text = _extract_text_from_value(delta_dict.get("text"))
761
+ for key in [
762
+ "reasoning_content",
763
+ "reasoning",
764
+ "thinking",
765
+ "thinking_content",
766
+ "reasoning_text",
767
+ "thought",
768
+ ]:
769
+ extra_think = _extract_text_from_value(delta_dict.get(key))
770
+ if extra_think:
771
+ think += extra_think
772
+ break
773
+ return text, think
774
+
775
+ text, think = _extract_text_and_think_from_value(getattr(delta, "content", None))
776
+ if not text:
777
+ text = _extract_text_from_value(getattr(delta, "text", None))
778
+ for key in [
779
+ "reasoning_content",
780
+ "reasoning",
781
+ "thinking",
782
+ "thinking_content",
783
+ "reasoning_text",
784
+ "thought",
785
+ ]:
786
+ extra_think = _extract_text_from_value(getattr(delta, key, None))
787
+ if extra_think:
788
+ think += extra_think
789
+ break
790
+ return text, think
791
+
792
+ def _extract_delta_tool_calls(delta: Any) -> Any:
793
+ if delta is None:
794
+ return None
795
+
796
+ delta_dict = _as_dict(delta)
797
+ if isinstance(delta_dict, dict):
798
+ tc = delta_dict.get("tool_calls")
799
+ if tc:
800
+ return tc
801
+ fc = delta_dict.get("function_call")
802
+ if fc:
803
+ return [{"index": 0, "type": "function", "function": fc}]
804
+ return None
805
+
806
+ tc = getattr(delta, "tool_calls", None)
807
+ if tc:
808
+ return tc
809
+ fc = getattr(delta, "function_call", None)
810
+ if fc:
811
+ return [{"index": 0, "type": "function", "function": fc}]
812
+ return None
813
+
814
+ rendered_messages = current_request.to_messages(self._capabilities)
815
+ self._emit_debug_payload(method="stream", messages=rendered_messages, kwargs=stream_kwargs)
816
+
817
+ while True:
818
+ # Bind loop variables explicitly to avoid late-binding in the closure (B023).
819
+ async def _open_stream(_msgs=rendered_messages, _kw=stream_kwargs) -> Any:
820
+ return await client.chat.completions.create(messages=_msgs, **_kw)
821
+
822
+ stream: Any = await self._with_retries(
823
+ _open_stream,
824
+ method="stream_open",
825
+ )
826
+
827
+ try:
828
+ async for event in stream:
829
+ delta_text = ""
830
+ delta_think = ""
831
+ event: Any = event
832
+ last_event = event
833
+ delta_tool_calls: Any = None
834
+ aggregated_tool_calls: list[dict[str, Any]] = []
835
+ delta = None
836
+ choices = getattr(event, "choices", None)
837
+ if isinstance(choices, list) and choices:
838
+ choice0 = choices[0]
839
+ delta = getattr(choice0, "delta", None)
840
+ elif isinstance(choices, tuple) and choices:
841
+ choice0 = choices[0]
842
+ delta = getattr(choice0, "delta", None)
843
+ else:
844
+ # Some providers emit non-content events with empty choices.
845
+ delta = getattr(event, "delta", None)
846
+ if delta is None and hasattr(event, "model_dump"):
847
+ try:
848
+ dumped = event.model_dump()
849
+ if isinstance(dumped, dict):
850
+ dumped_choices = dumped.get("choices")
851
+ if isinstance(dumped_choices, list) and dumped_choices:
852
+ choice0 = dumped_choices[0]
853
+ if isinstance(choice0, dict):
854
+ delta = choice0.get("delta")
855
+ except Exception as exc:
856
+ logger.error("[llm:stream] failed to extract delta from model_dump: %s", exc)
857
+
858
+ try:
859
+ delta_text, delta_think = _extract_delta_text_and_think(delta)
860
+ delta_tool_calls = _extract_delta_tool_calls(delta)
861
+ aggregated_tool_calls = _merge_tool_calls_delta(delta_tool_calls)
862
+ except Exception as e:
863
+ delta_text = ""
864
+ delta_think = ""
865
+ delta_tool_calls = None
866
+ aggregated_tool_calls = [tool_call_store[k] for k in tool_call_order]
867
+ logger.error("[llm:stream] failed to extract delta: %s", e)
868
+ # Some providers send usage only on the final event when stream_options.include_usage is enabled.
869
+ try:
870
+ usage_obj = getattr(event, "usage", None)
871
+ # Fallback: some SDKs/providers only expose usage via model_dump / extra fields.
872
+ if usage_obj is None and hasattr(event, "model_dump"):
873
+ try:
874
+ dumped = event.model_dump()
875
+ if isinstance(dumped, dict):
876
+ usage_obj = dumped.get("usage") or dumped.get("x_openai_usage") or dumped.get("x_usage")
877
+ except Exception:
878
+ pass
879
+ if usage_obj is not None:
880
+ # also record into last_usage so callers relying on _usage_obj() can still work
881
+ self._record_usage(usage_obj, method="stream")
882
+ final_usage = self._usage_obj(usage_obj)
883
+ except Exception:
884
+ # Don't fail streaming if usage parsing fails.
885
+ pass
886
+
887
+ if delta_text or delta_think or delta_tool_calls:
888
+ if delta_text:
889
+ aggregated_text += str(delta_text)
890
+ yield LLMStreamChunk(
891
+ delta_text=delta_text,
892
+ think=delta_think,
893
+ delta_tool_calls=delta_tool_calls,
894
+ tool_calls=aggregated_tool_calls,
895
+ token_usage=None,
896
+ raw_event=event,
897
+ is_final=False,
898
+ )
899
+ break
900
+ except Exception as e:
901
+ if not resume_enabled or restart_count >= max_restarts:
902
+ raise
903
+ restart_count += 1
904
+ logger.warning(
905
+ "[llm:stream] stream interrupted, attempting resume (%s/%s): %s",
906
+ restart_count,
907
+ max_restarts,
908
+ e,
909
+ )
910
+ current_request = self._build_resume_request(request, aggregated_text)
911
+ rendered_messages = current_request.to_messages(self._capabilities)
912
+ self._emit_debug_payload(method="stream-resume", messages=rendered_messages, kwargs=stream_kwargs)
913
+ continue
914
+
915
+ # Final chunk carries best-effort usage + last raw event so callers can inspect finish_reason, tool_calls, etc.
916
+ yield LLMStreamChunk(
917
+ delta_text="",
918
+ think="",
919
+ delta_tool_calls=None,
920
+ tool_calls=[tool_call_store[k] for k in tool_call_order],
921
+ token_usage=final_usage,
922
+ raw_event=last_event,
923
+ is_final=True,
924
+ )
925
+
926
async def chat_completion(
    self,
    *,
    messages: Sequence[dict[str, str]],
    **kwargs: Any,
) -> Any:
    """
    Return the raw ChatCompletion object.

    ``model``, ``temperature`` and ``max_tokens`` are taken from ``kwargs``
    when supplied and from ``self._cfg`` otherwise. An explicit ``None`` for
    any of the three is treated as "not provided", so callers may forward
    optional settings without filtering them first. Every other keyword
    argument is passed to ``client.chat.completions.create`` unchanged.

    Args:
        messages: Chat messages; each one is shallow-copied into a plain dict
            before being sent.
        **kwargs: Per-call overrides and extra request parameters.

    Returns:
        The response object produced by ``client.chat.completions.create``,
        obtained through ``self._with_retries``.
    """
    client = self._ensure_client()

    def _setting(name: str, fallback: Any) -> Any:
        # `kwargs.get(name, fallback)` would hand back an explicit None and
        # make the float()/int() coercions below raise TypeError.
        value = kwargs.get(name)
        return fallback if value is None else value

    model = str(kwargs.get("model") or self._cfg.model)
    temperature = float(_setting("temperature", self._cfg.temperature))
    max_tokens = int(_setting("max_tokens", self._cfg.max_tokens))
    message_list = [dict(message) for message in messages]

    debug_kwargs = dict(kwargs)
    debug_kwargs.update({"model": model, "temperature": temperature, "max_tokens": max_tokens})
    self._emit_debug_payload(method="chat_completion", messages=message_list, kwargs=debug_kwargs)

    # The three resolved settings are passed explicitly; drop them from the
    # pass-through kwargs so they are not supplied twice.
    resolved_names = {"model", "temperature", "max_tokens"}
    passthrough = {k: v for k, v in kwargs.items() if k not in resolved_names}

    async def _call() -> Any:
        return await client.chat.completions.create(
            model=model,
            messages=message_list,
            temperature=temperature,
            max_tokens=max_tokens,
            **passthrough,
        )

    resp: Any = await self._with_retries(_call, method="chat_completion")

    usage_obj = getattr(resp, "usage", None)
    if usage_obj is None and hasattr(resp, "model_dump"):
        # Fallback: look for usage under alternative keys of the dumped response.
        try:
            dumped = resp.model_dump()
            if isinstance(dumped, dict):
                usage_obj = dumped.get("usage") or dumped.get("x_openai_usage") or dumped.get("x_usage")
        except Exception:
            # Usage accounting is best-effort; never fail the call because of it.
            pass
    self._record_usage(usage_obj, method="chat_completion")
    return resp
964
+
965
+ def _message_text(self, msg: Any) -> str:
966
+ """
967
+ Convert a ChatCompletionMessage content to plain text.
968
+ Some providers may return list content; we best-effort stringify.
969
+ """
970
+ content = getattr(msg, "content", None)
971
+ if content is None:
972
+ return ""
973
+ if isinstance(content, str):
974
+ return content
975
+ # List/parts fallback
976
+ try:
977
+ return "".join(str(part) for part in content).strip()
978
+ except Exception:
979
+ return str(content).strip()
980
+
981
+ # NOTE: `predict/chat/predict_stream` wrappers are provided as default methods on the Protocol.