power-loop 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- llm_client/__init__.py +0 -0
- llm_client/capabilities.py +162 -0
- llm_client/interface.py +470 -0
- llm_client/llm_factory.py +981 -0
- llm_client/llm_tooling.py +645 -0
- llm_client/llm_utils.py +205 -0
- llm_client/multimodal.py +237 -0
- llm_client/qwen_image.py +576 -0
- llm_client/web_search.py +149 -0
- power_loop/__init__.py +326 -0
- power_loop/agent/__init__.py +6 -0
- power_loop/agent/sink.py +247 -0
- power_loop/agent/stateful_loop.py +363 -0
- power_loop/agent/system_prompt.py +396 -0
- power_loop/agent/types.py +41 -0
- power_loop/contracts/__init__.py +132 -0
- power_loop/contracts/errors.py +140 -0
- power_loop/contracts/event_payloads.py +278 -0
- power_loop/contracts/events.py +86 -0
- power_loop/contracts/handlers.py +45 -0
- power_loop/contracts/hook_contexts.py +265 -0
- power_loop/contracts/hooks.py +64 -0
- power_loop/contracts/messages.py +90 -0
- power_loop/contracts/protocols.py +48 -0
- power_loop/contracts/tools.py +56 -0
- power_loop/core/agent_context.py +94 -0
- power_loop/core/events.py +124 -0
- power_loop/core/hooks.py +122 -0
- power_loop/core/phase.py +217 -0
- power_loop/core/pipeline.py +880 -0
- power_loop/core/runner.py +60 -0
- power_loop/core/state.py +208 -0
- power_loop/runtime/budget.py +179 -0
- power_loop/runtime/cancellation.py +127 -0
- power_loop/runtime/compact.py +300 -0
- power_loop/runtime/env.py +103 -0
- power_loop/runtime/memory.py +107 -0
- power_loop/runtime/provider.py +176 -0
- power_loop/runtime/retry.py +182 -0
- power_loop/runtime/session_store.py +636 -0
- power_loop/runtime/skills.py +201 -0
- power_loop/runtime/spec.py +233 -0
- power_loop/runtime/structured.py +225 -0
- power_loop/tools/__init__.py +51 -0
- power_loop/tools/default_manifest.py +244 -0
- power_loop/tools/default_tools.py +766 -0
- power_loop/tools/registry.py +162 -0
- power_loop/tools/spawn_agent.py +173 -0
- power_loop-0.2.0.dist-info/METADATA +632 -0
- power_loop-0.2.0.dist-info/RECORD +53 -0
- power_loop-0.2.0.dist-info/WHEEL +5 -0
- power_loop-0.2.0.dist-info/licenses/LICENSE +21 -0
- power_loop-0.2.0.dist-info/top_level.txt +2 -0
|
@@ -0,0 +1,981 @@
|
|
|
1
|
+
"""
|
|
2
|
+
LLM factory utilities for zrag.
|
|
3
|
+
|
|
4
|
+
This module provides a small, explicit way to build an OpenAI-compatible `LLMService`
|
|
5
|
+
from environment / `src.config.settings`, similar in spirit to agent-psychology's
|
|
6
|
+
model utilities, but adapted to zrag's `LLMService` Protocol.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
import asyncio
|
|
12
|
+
import inspect
|
|
13
|
+
import json
|
|
14
|
+
import logging
|
|
15
|
+
import os
|
|
16
|
+
import random
|
|
17
|
+
from collections.abc import AsyncIterator, Callable, Sequence
|
|
18
|
+
from typing import Any, cast
|
|
19
|
+
|
|
20
|
+
from openai import AsyncOpenAI
|
|
21
|
+
|
|
22
|
+
from .capabilities import ModelCapabilities, resolve_model_capabilities
|
|
23
|
+
from .interface import LLMRequest, LLMResponse, LLMService, LLMStreamChunk, LLMTokenUsage, OpenAICompatibleChatConfig
|
|
24
|
+
from .llm_utils import parse_json_from_model_output_detailed
|
|
25
|
+
|
|
26
|
+
try:
    from openai.types.chat import (
        ChatCompletion,
        ChatCompletionChunk,  # type: ignore[import-not-found]
        ChatCompletionMessage,
    )
except Exception:  # pragma: no cover
    # Runtime will still work because we treat these as typing-only. Keep minimal fallback.
    # Every name imported above needs a fallback here; otherwise it would be left
    # undefined whenever the guarded import fails.
    ChatCompletion = Any  # type: ignore[assignment]
    ChatCompletionChunk = Any  # type: ignore[assignment]
    ChatCompletionMessage = Any  # type: ignore[assignment]
|
|
36
|
+
|
|
37
|
+
logger = logging.getLogger(__name__)
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
_PROXY_ENV_KEYS = (
|
|
41
|
+
"ALL_PROXY",
|
|
42
|
+
"all_proxy",
|
|
43
|
+
"HTTPS_PROXY",
|
|
44
|
+
"https_proxy",
|
|
45
|
+
"HTTP_PROXY",
|
|
46
|
+
"http_proxy",
|
|
47
|
+
)
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def _normalize_proxy_url_scheme(url: str) -> str:
|
|
51
|
+
text = (url or "").strip()
|
|
52
|
+
if text.lower().startswith("socks://"):
|
|
53
|
+
# httpx/openai expects socks5://, not socks://.
|
|
54
|
+
return f"socks5://{text[len('socks://'):]}"
|
|
55
|
+
return text
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def _normalize_proxy_env_inplace() -> None:
    """Rewrite, in ``os.environ``, every proxy variable whose normalized value differs.

    Unset or empty variables are skipped. Each rewrite is reported at INFO level.
    """
    for env_name in _PROXY_ENV_KEYS:
        current = os.environ.get(env_name)
        if current:
            fixed = _normalize_proxy_url_scheme(current)
            if fixed != current:
                os.environ[env_name] = fixed
                logger.info("Normalized proxy env %s from socks:// to socks5://", env_name)
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def _sanitize_debug_payload(value: Any) -> Any:
|
|
70
|
+
if isinstance(value, dict):
|
|
71
|
+
sanitized: dict[str, Any] = {}
|
|
72
|
+
for key, item in value.items():
|
|
73
|
+
if key == "url" and isinstance(item, str) and item.startswith("data:"):
|
|
74
|
+
prefix, _, payload = item.partition(",")
|
|
75
|
+
sanitized[key] = f"{prefix},<base64:{len(payload)} chars>"
|
|
76
|
+
else:
|
|
77
|
+
sanitized[key] = _sanitize_debug_payload(item)
|
|
78
|
+
return sanitized
|
|
79
|
+
if isinstance(value, list):
|
|
80
|
+
return [_sanitize_debug_payload(item) for item in value]
|
|
81
|
+
return value
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def _format_messages_for_debug(messages: Sequence[dict[str, Any]]) -> str:
    """Render *messages* as indented, non-ASCII-escaped JSON after sanitizing them."""
    return json.dumps(
        _sanitize_debug_payload(list(messages)),
        ensure_ascii=False,
        indent=2,
    )
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
class OpenAICompatibleChatLLMService(LLMService):
|
|
92
|
+
"""
|
|
93
|
+
Minimal OpenAI-compatible chat completion client.
|
|
94
|
+
|
|
95
|
+
- Uses `openai.AsyncOpenAI`
|
|
96
|
+
- Records token usage from response.usage (if present)
|
|
97
|
+
"""
|
|
98
|
+
|
|
99
|
+
def __init__(self, cfg: OpenAICompatibleChatConfig):
    """Store *cfg*, resolve the model capabilities and log the effective settings.

    No HTTP client is created here: ``self._client`` starts out as ``None``.
    """
    self._cfg = cfg
    self._client: Any = None
    self._last_usage: dict[str, Any] = {}
    self._capabilities: ModelCapabilities = resolve_model_capabilities(
        cfg.model, cfg.base_url, cfg.capability_overrides
    )

    log_args = (
        cfg.base_url,
        cfg.model,
        cfg.timeout_s,
        cfg.max_tokens,
        cfg.temperature,
        # Only report whether a key is configured, never the key itself.
        "set" if bool(cfg.api_key) else "missing",
    )
    logger.info(
        "GraphExtractor LLM: base_url=%s model=%s timeout_s=%s max_tokens=%s temperature=%s api_key=%s",
        *log_args,
    )
|
|
118
|
+
|
|
119
|
+
def get_last_token_usage(self) -> dict[str, Any]:
    """Return a shallow copy of the last recorded usage dict (empty when nothing is recorded)."""
    recorded = self._last_usage
    if not recorded:
        return {}
    return dict(recorded)
|
|
121
|
+
|
|
122
|
+
@property
def capabilities(self) -> ModelCapabilities:
    """Capabilities resolved for the configured model when the service was constructed."""
    resolved = self._capabilities
    return resolved
|
|
125
|
+
|
|
126
|
+
def _ensure_client(self) -> Any:
    """Return the cached ``AsyncOpenAI`` client, building it on first use.

    Proxy environment variables are normalized before the client is constructed.
    """
    if self._client is None:
        from openai import AsyncOpenAI  # type: ignore

        _normalize_proxy_env_inplace()

        cfg = self._cfg
        self._client = AsyncOpenAI(
            base_url=cfg.base_url,
            api_key=cfg.api_key,
            timeout=cfg.timeout_s,
        )
    return self._client
|
|
139
|
+
|
|
140
|
+
async def close(self) -> None:
    """
    Release the underlying HTTP resources (httpx) to avoid ResourceWarning in tests.

    Does nothing when no client exists. The cached client reference is
    cleared even if closing raises.
    """
    client = self._client
    if client is None:
        return
    try:
        # openai.AsyncOpenAI exposes `close()` (async).
        await client.close()
    finally:
        self._client = None
|
|
151
|
+
|
|
152
|
+
def _record_usage(self, usage: Any, *, method: str) -> None:
|
|
153
|
+
self._last_usage = self._usage_dict_from_any(usage)
|
|
154
|
+
|
|
155
|
+
total_tokens = self._last_usage.get("total_tokens")
|
|
156
|
+
if isinstance(total_tokens, int) and total_tokens > 0:
|
|
157
|
+
logger.info(
|
|
158
|
+
"[llm:%s] token_usage prompt=%s completion=%s total=%s",
|
|
159
|
+
method,
|
|
160
|
+
self._last_usage.get("prompt_tokens"),
|
|
161
|
+
self._last_usage.get("completion_tokens"),
|
|
162
|
+
self._last_usage.get("total_tokens"),
|
|
163
|
+
)
|
|
164
|
+
|
|
165
|
+
def _emit_debug_payload(self, *, method: str, messages: Any, kwargs: Any) -> None:
    """
    Emit request debug payload.

    This is a best-effort helper used by stream/complete paths. It must exist
    because some code paths call it unconditionally.

    The payload is logged only when the ``POWER_LOOP_LLM_DEBUG`` environment
    variable is one of ``1/true/yes/y`` (case-insensitive) or when this
    module's logger is enabled for DEBUG. It is logged at DEBUG when DEBUG is
    enabled; when only the environment flag asks for it, it is logged at INFO.
    Otherwise the record would be dropped by the level check and the flag
    would have no effect.

    Args:
        method: Label used in the log line.
        messages: Request messages; lists/tuples are rendered as sanitized JSON,
            anything else is passed through the sanitizer as-is.
        kwargs: Request keyword arguments, sanitized before logging.
    """
    # Avoid spamming logs unless explicitly requested.
    env_flag = (os.getenv("POWER_LOOP_LLM_DEBUG") or "").strip().lower()
    forced_by_env = env_flag in {"1", "true", "yes", "y"}
    debug_logging = logger.isEnabledFor(logging.DEBUG)
    if not (forced_by_env or debug_logging):
        return

    try:
        rendered_messages = (
            _format_messages_for_debug(messages)
            if isinstance(messages, (list, tuple))
            else _sanitize_debug_payload(messages)
        )
    except Exception:
        # Best effort: rendering problems must never break the request path.
        rendered_messages = "<unrenderable messages>"

    try:
        sanitized_kwargs = _sanitize_debug_payload(kwargs)
    except Exception:
        sanitized_kwargs = "<unrenderable kwargs>"

    # A DEBUG record is discarded unless DEBUG is enabled, so an env-only request uses INFO.
    level = logging.DEBUG if debug_logging else logging.INFO
    logger.log(level, "[llm:%s] request messages=%s kwargs=%s", method, rendered_messages, sanitized_kwargs)
|
|
193
|
+
|
|
194
|
+
def _usage_dict_from_any(self, usage: Any) -> dict[str, Any]:
|
|
195
|
+
"""
|
|
196
|
+
Normalize token usage from various provider shapes.
|
|
197
|
+
Supports:
|
|
198
|
+
- OpenAI usage: prompt_tokens/completion_tokens/total_tokens
|
|
199
|
+
- Some providers: input_tokens/output_tokens/total_tokens
|
|
200
|
+
- Dict payloads (from model_dump / raw json)
|
|
201
|
+
"""
|
|
202
|
+
if usage is None:
|
|
203
|
+
return {
|
|
204
|
+
"prompt_tokens": None,
|
|
205
|
+
"completion_tokens": None,
|
|
206
|
+
"total_tokens": None,
|
|
207
|
+
}
|
|
208
|
+
|
|
209
|
+
raw: dict[str, Any] = {}
|
|
210
|
+
|
|
211
|
+
# dict-like
|
|
212
|
+
if isinstance(usage, dict):
|
|
213
|
+
raw = dict(usage)
|
|
214
|
+
else:
|
|
215
|
+
# object-like: prefer model_dump, then __dict__
|
|
216
|
+
if hasattr(usage, "model_dump"):
|
|
217
|
+
try:
|
|
218
|
+
dumped = usage.model_dump()
|
|
219
|
+
if isinstance(dumped, dict):
|
|
220
|
+
raw = dict(dumped)
|
|
221
|
+
except Exception:
|
|
222
|
+
raw = {}
|
|
223
|
+
if not raw:
|
|
224
|
+
try:
|
|
225
|
+
raw = dict(usage.__dict__)
|
|
226
|
+
except Exception:
|
|
227
|
+
raw = {}
|
|
228
|
+
|
|
229
|
+
def _safe_int_or_none(v: Any) -> int | None:
|
|
230
|
+
if v is None:
|
|
231
|
+
return None
|
|
232
|
+
if isinstance(v, bool):
|
|
233
|
+
return int(v)
|
|
234
|
+
if isinstance(v, (int, float)):
|
|
235
|
+
return int(v)
|
|
236
|
+
if isinstance(v, str):
|
|
237
|
+
s = v.strip()
|
|
238
|
+
if not s:
|
|
239
|
+
return None
|
|
240
|
+
try:
|
|
241
|
+
return int(float(s))
|
|
242
|
+
except Exception:
|
|
243
|
+
return None
|
|
244
|
+
return None
|
|
245
|
+
|
|
246
|
+
def _pick_int(payload: dict[str, Any], keys: list[str]) -> int | None:
|
|
247
|
+
for key in keys:
|
|
248
|
+
if key in payload:
|
|
249
|
+
iv = _safe_int_or_none(payload.get(key))
|
|
250
|
+
if iv is not None:
|
|
251
|
+
return iv
|
|
252
|
+
return None
|
|
253
|
+
|
|
254
|
+
def _as_dict(v: Any) -> dict[str, Any]:
|
|
255
|
+
if isinstance(v, dict):
|
|
256
|
+
return v
|
|
257
|
+
if hasattr(v, "model_dump"):
|
|
258
|
+
try:
|
|
259
|
+
dumped = v.model_dump()
|
|
260
|
+
if isinstance(dumped, dict):
|
|
261
|
+
return dumped
|
|
262
|
+
except Exception:
|
|
263
|
+
return {}
|
|
264
|
+
return {}
|
|
265
|
+
|
|
266
|
+
prompt = _pick_int(raw, [
|
|
267
|
+
"prompt_tokens",
|
|
268
|
+
"input_tokens",
|
|
269
|
+
"prompt_token_count",
|
|
270
|
+
"promptTokens",
|
|
271
|
+
"inputTokenCount",
|
|
272
|
+
"prompt_count",
|
|
273
|
+
])
|
|
274
|
+
completion = _pick_int(raw, [
|
|
275
|
+
"completion_tokens",
|
|
276
|
+
"output_tokens",
|
|
277
|
+
"completion_token_count",
|
|
278
|
+
"completionTokens",
|
|
279
|
+
"outputTokenCount",
|
|
280
|
+
"generated_tokens",
|
|
281
|
+
"candidates_token_count",
|
|
282
|
+
])
|
|
283
|
+
total = _pick_int(raw, [
|
|
284
|
+
"total_tokens",
|
|
285
|
+
"total_token_count",
|
|
286
|
+
"totalTokens",
|
|
287
|
+
"token_count",
|
|
288
|
+
"usage_tokens",
|
|
289
|
+
])
|
|
290
|
+
|
|
291
|
+
if total is None and prompt is not None and completion is not None:
|
|
292
|
+
total = prompt + completion
|
|
293
|
+
|
|
294
|
+
prompt_details = _as_dict(raw.get("prompt_tokens_details"))
|
|
295
|
+
completion_details = _as_dict(raw.get("completion_tokens_details"))
|
|
296
|
+
output_details = _as_dict(raw.get("output_tokens_details"))
|
|
297
|
+
input_details = _as_dict(raw.get("input_tokens_details"))
|
|
298
|
+
|
|
299
|
+
prompt_audio_tokens = _pick_int(prompt_details, ["audio_tokens", "audioTokenCount"])
|
|
300
|
+
if prompt_audio_tokens is None:
|
|
301
|
+
prompt_audio_tokens = _pick_int(input_details, ["audio_tokens", "audioTokenCount"])
|
|
302
|
+
prompt_cached_tokens = _pick_int(
|
|
303
|
+
raw,
|
|
304
|
+
["prompt_cache_hit_tokens", "cached_tokens", "cache_hit_tokens", "prompt_cached_tokens"],
|
|
305
|
+
)
|
|
306
|
+
if prompt_cached_tokens is None:
|
|
307
|
+
prompt_cached_tokens = _pick_int(prompt_details, ["cached_tokens", "cache_hit_tokens", "cacheHitTokens"])
|
|
308
|
+
prompt_cache_miss_tokens = _pick_int(raw, ["prompt_cache_miss_tokens", "cache_miss_tokens", "cacheMissTokens"])
|
|
309
|
+
if prompt_cache_miss_tokens is None:
|
|
310
|
+
prompt_cache_miss_tokens = _pick_int(prompt_details, ["cache_miss_tokens", "cacheMissTokens"])
|
|
311
|
+
prompt_text_tokens = _pick_int(prompt_details, ["text_tokens", "textTokenCount"])
|
|
312
|
+
if prompt_text_tokens is None:
|
|
313
|
+
prompt_text_tokens = _pick_int(input_details, ["text_tokens", "textTokenCount"])
|
|
314
|
+
prompt_image_tokens = _pick_int(prompt_details, ["image_tokens", "imageTokenCount"])
|
|
315
|
+
if prompt_image_tokens is None:
|
|
316
|
+
prompt_image_tokens = _pick_int(input_details, ["image_tokens", "imageTokenCount"])
|
|
317
|
+
|
|
318
|
+
completion_reasoning_tokens = _pick_int(
|
|
319
|
+
completion_details,
|
|
320
|
+
["reasoning_tokens", "reasoningTokenCount", "reasoning_token", "reasoningTokens", "thinking_tokens"],
|
|
321
|
+
)
|
|
322
|
+
if completion_reasoning_tokens is None:
|
|
323
|
+
completion_reasoning_tokens = _pick_int(
|
|
324
|
+
output_details,
|
|
325
|
+
["reasoning_tokens", "reasoningTokenCount", "reasoning_token", "reasoningTokens", "thinking_tokens"],
|
|
326
|
+
)
|
|
327
|
+
completion_audio_tokens = _pick_int(completion_details, ["audio_tokens", "audioTokenCount"])
|
|
328
|
+
if completion_audio_tokens is None:
|
|
329
|
+
completion_audio_tokens = _pick_int(output_details, ["audio_tokens", "audioTokenCount"])
|
|
330
|
+
completion_text_tokens = _pick_int(completion_details, ["text_tokens", "textTokenCount"])
|
|
331
|
+
if completion_text_tokens is None:
|
|
332
|
+
completion_text_tokens = _pick_int(output_details, ["text_tokens", "textTokenCount"])
|
|
333
|
+
completion_image_tokens = _pick_int(completion_details, ["image_tokens", "imageTokenCount"])
|
|
334
|
+
if completion_image_tokens is None:
|
|
335
|
+
completion_image_tokens = _pick_int(output_details, ["image_tokens", "imageTokenCount"])
|
|
336
|
+
accepted_prediction_tokens = _pick_int(
|
|
337
|
+
completion_details,
|
|
338
|
+
["accepted_prediction_tokens", "acceptedPredictionTokens"],
|
|
339
|
+
)
|
|
340
|
+
if accepted_prediction_tokens is None:
|
|
341
|
+
accepted_prediction_tokens = _pick_int(
|
|
342
|
+
output_details,
|
|
343
|
+
["accepted_prediction_tokens", "acceptedPredictionTokens"],
|
|
344
|
+
)
|
|
345
|
+
rejected_prediction_tokens = _pick_int(
|
|
346
|
+
completion_details,
|
|
347
|
+
["rejected_prediction_tokens", "rejectedPredictionTokens"],
|
|
348
|
+
)
|
|
349
|
+
if rejected_prediction_tokens is None:
|
|
350
|
+
rejected_prediction_tokens = _pick_int(
|
|
351
|
+
output_details,
|
|
352
|
+
["rejected_prediction_tokens", "rejectedPredictionTokens"],
|
|
353
|
+
)
|
|
354
|
+
|
|
355
|
+
# Provider-agnostic aliases (keep None when unknown)
|
|
356
|
+
cached_tokens = _pick_int(raw, ["cached_tokens", "cache_hit_tokens", "prompt_cache_hit_tokens"])
|
|
357
|
+
if cached_tokens is None:
|
|
358
|
+
cached_tokens = prompt_cached_tokens
|
|
359
|
+
cache_hit_tokens = _pick_int(raw, ["cache_hit_tokens", "prompt_cache_hit_tokens", "cached_tokens"])
|
|
360
|
+
if cache_hit_tokens is None:
|
|
361
|
+
cache_hit_tokens = prompt_cached_tokens
|
|
362
|
+
cache_miss_tokens = _pick_int(raw, ["cache_miss_tokens", "prompt_cache_miss_tokens"])
|
|
363
|
+
if cache_miss_tokens is None:
|
|
364
|
+
cache_miss_tokens = prompt_cache_miss_tokens
|
|
365
|
+
|
|
366
|
+
reasoning_tokens = _pick_int(
|
|
367
|
+
raw,
|
|
368
|
+
["reasoning_tokens", "reasoning_token", "reasoningTokens", "thinking_tokens"],
|
|
369
|
+
)
|
|
370
|
+
if reasoning_tokens is None:
|
|
371
|
+
reasoning_tokens = completion_reasoning_tokens
|
|
372
|
+
|
|
373
|
+
accepted_tokens = _pick_int(raw, ["accepted_prediction_tokens"])
|
|
374
|
+
if accepted_tokens is None:
|
|
375
|
+
accepted_tokens = accepted_prediction_tokens
|
|
376
|
+
rejected_tokens = _pick_int(raw, ["rejected_prediction_tokens"])
|
|
377
|
+
if rejected_tokens is None:
|
|
378
|
+
rejected_tokens = rejected_prediction_tokens
|
|
379
|
+
|
|
380
|
+
# Keep provider-native fields, and ensure normalized aliases exist.
|
|
381
|
+
raw["prompt_tokens"] = prompt
|
|
382
|
+
raw["completion_tokens"] = completion
|
|
383
|
+
raw["total_tokens"] = total
|
|
384
|
+
raw["prompt_audio_tokens"] = prompt_audio_tokens
|
|
385
|
+
raw["prompt_cached_tokens"] = prompt_cached_tokens
|
|
386
|
+
raw["prompt_cache_miss_tokens"] = prompt_cache_miss_tokens
|
|
387
|
+
raw["prompt_text_tokens"] = prompt_text_tokens
|
|
388
|
+
raw["prompt_image_tokens"] = prompt_image_tokens
|
|
389
|
+
raw["completion_reasoning_tokens"] = completion_reasoning_tokens
|
|
390
|
+
raw["completion_audio_tokens"] = completion_audio_tokens
|
|
391
|
+
raw["completion_text_tokens"] = completion_text_tokens
|
|
392
|
+
raw["completion_image_tokens"] = completion_image_tokens
|
|
393
|
+
raw["accepted_prediction_tokens"] = accepted_prediction_tokens
|
|
394
|
+
raw["rejected_prediction_tokens"] = rejected_prediction_tokens
|
|
395
|
+
raw["cached_tokens"] = cached_tokens
|
|
396
|
+
raw["cache_hit_tokens"] = cache_hit_tokens
|
|
397
|
+
raw["cache_miss_tokens"] = cache_miss_tokens
|
|
398
|
+
raw["reasoning_tokens"] = reasoning_tokens
|
|
399
|
+
raw["accepted_tokens"] = accepted_tokens
|
|
400
|
+
raw["rejected_tokens"] = rejected_tokens
|
|
401
|
+
return raw
|
|
402
|
+
|
|
403
|
+
def _usage_obj(self, usage: Any = None) -> LLMTokenUsage:
    """Build an ``LLMTokenUsage`` from *usage*, or from the last recorded usage when *usage* is None.

    Each field is coerced with ``int()``; missing or non-convertible values become ``None``.
    """
    if usage is None:
        source = self._last_usage
    else:
        source = self._usage_dict_from_any(usage)

    def _coerce(value: Any) -> int | None:
        if value is None:
            return None
        try:
            return int(value)
        except Exception:
            return None

    field_names = (
        "prompt_tokens",
        "completion_tokens",
        "total_tokens",
        "prompt_audio_tokens",
        "prompt_cached_tokens",
        "prompt_cache_miss_tokens",
        "prompt_text_tokens",
        "prompt_image_tokens",
        "completion_reasoning_tokens",
        "completion_audio_tokens",
        "completion_text_tokens",
        "completion_image_tokens",
        "accepted_prediction_tokens",
        "rejected_prediction_tokens",
        "cached_tokens",
        "cache_hit_tokens",
        "cache_miss_tokens",
        "reasoning_tokens",
        "accepted_tokens",
        "rejected_tokens",
    )
    # The dict keys and the keyword names of LLMTokenUsage are the same strings.
    return LLMTokenUsage(**{name: _coerce(source.get(name)) for name in field_names})
|
|
434
|
+
|
|
435
|
+
async def _with_retries(self, fn, *, method: str) -> Any:
|
|
436
|
+
"""
|
|
437
|
+
Lightweight retry wrapper inspired by LangChain's ergonomics.
|
|
438
|
+
"""
|
|
439
|
+
last_err: Exception | None = None
|
|
440
|
+
attempts = max(1, int(self._cfg.max_retries) + 1)
|
|
441
|
+
for i in range(attempts):
|
|
442
|
+
try:
|
|
443
|
+
return await fn()
|
|
444
|
+
except Exception as e:
|
|
445
|
+
last_err = e
|
|
446
|
+
if i >= attempts - 1:
|
|
447
|
+
break
|
|
448
|
+
# exponential backoff + jitter
|
|
449
|
+
base = float(self._cfg.retry_base_delay_s)
|
|
450
|
+
delay = base * (2 ** i) * (0.75 + 0.5 * random.random())
|
|
451
|
+
logger.warning("[llm:%s] call failed (attempt %s/%s), retrying in %.2fs: %s", method, i + 1, attempts,
|
|
452
|
+
delay, e)
|
|
453
|
+
await asyncio.sleep(delay)
|
|
454
|
+
raise cast(Exception, last_err)
|
|
455
|
+
|
|
456
|
+
def _request_kwargs(self, request: LLMRequest) -> dict[str, Any]:
|
|
457
|
+
"""
|
|
458
|
+
Map `LLMRequest` into kwargs for OpenAI-compatible chat.completions.create.
|
|
459
|
+
Falls back to self._cfg for default settings.
|
|
460
|
+
"""
|
|
461
|
+
out: dict[str, Any] = dict(request.extra or {})
|
|
462
|
+
out["model"] = request.model if request.model else self._cfg.model
|
|
463
|
+
|
|
464
|
+
req_temp = request.temperature if request.temperature is not None else self._cfg.temperature
|
|
465
|
+
if req_temp is not None:
|
|
466
|
+
out["temperature"] = float(req_temp)
|
|
467
|
+
|
|
468
|
+
req_max_tokens = request.max_tokens if request.max_tokens is not None else self._cfg.max_tokens
|
|
469
|
+
if req_max_tokens is not None:
|
|
470
|
+
try:
|
|
471
|
+
max_tokens = int(req_max_tokens)
|
|
472
|
+
except Exception:
|
|
473
|
+
max_tokens = None
|
|
474
|
+
if max_tokens is not None and max_tokens > 0:
|
|
475
|
+
out["max_tokens"] = max_tokens
|
|
476
|
+
|
|
477
|
+
reason_value: bool | None = None
|
|
478
|
+
if "reason" in out:
|
|
479
|
+
reason_value = bool(out.pop("reason"))
|
|
480
|
+
elif request.reason is not None:
|
|
481
|
+
reason_value = bool(request.reason)
|
|
482
|
+
|
|
483
|
+
if reason_value is not None:
|
|
484
|
+
extra_body = out.get("extra_body")
|
|
485
|
+
if not isinstance(extra_body, dict):
|
|
486
|
+
extra_body = {}
|
|
487
|
+
if "reason" not in extra_body:
|
|
488
|
+
extra_body["reason"] = reason_value
|
|
489
|
+
out["extra_body"] = extra_body
|
|
490
|
+
|
|
491
|
+
if request.tools is not None:
|
|
492
|
+
out["tools"] = request.tools
|
|
493
|
+
if request.tool_choice is not None:
|
|
494
|
+
out["tool_choice"] = request.tool_choice
|
|
495
|
+
if request.response_format is not None:
|
|
496
|
+
out["response_format"] = request.response_format
|
|
497
|
+
return out
|
|
498
|
+
|
|
499
|
+
def _build_resume_request(self, request: LLMRequest, partial_text: str) -> LLMRequest:
    """Return a copy of *request* extended so the model can continue after *partial_text*.

    The stripped partial text (when non-empty) is appended as an assistant
    message, followed by a user message carrying the configured
    ``stream_resume_instruction``. All other request fields are carried over;
    ``extra`` is shallow-copied.
    """
    already_generated = (partial_text or "").strip()
    assistant_turn = (
        [{"role": "assistant", "content": already_generated}] if already_generated else []
    )
    resume_turn = {"role": "user", "content": self._cfg.stream_resume_instruction}
    messages = [*(request.messages or []), *assistant_turn, resume_turn]

    return LLMRequest(
        messages=messages,
        system_prompt=request.system_prompt,
        model=request.model,
        temperature=request.temperature,
        max_tokens=request.max_tokens,
        parse_json=request.parse_json,
        reason=request.reason,
        tools=request.tools,
        tool_choice=request.tool_choice,
        response_format=request.response_format,
        extra=dict(request.extra or {}),
    )
|
|
518
|
+
|
|
519
|
+
async def complete(
    self,
    request: LLMRequest,
    *,
    on_chunk_delta_text: Callable[[str], Any] | None = None,
    on_chunk_think: Callable[[str], Any] | None = None,
    on_stream_end: Callable[[LLMResponse], Any] | None = None,
) -> LLMResponse:
    """
    Canonical non-streaming API (preferred).

    Consumes ``self.stream(request)`` to the end and aggregates the chunks
    into one ``LLMResponse``. Text and think deltas are concatenated; the
    last non-empty tool-call list, the last token usage and the last raw
    event win. When ``request.parse_json`` is set, the response comes from
    ``parse_json_from_model_output_detailed``.

    The optional callbacks may be sync or async; an awaitable result is awaited.
    """

    async def _notify(callback: Any, payload: Any) -> None:
        # Shared callback dispatch: skip when unset, await when the result is awaitable.
        if not callback:
            return
        outcome = callback(payload)
        if inspect.isawaitable(outcome):
            await outcome

    collected: list[LLMStreamChunk] = []
    visible_parts: list[str] = []
    reasoning_parts: list[str] = []
    tool_calls: list[dict[str, Any]] = []
    usage: LLMTokenUsage | None = None
    raw_event: Any = None

    async for chunk in self.stream(request):
        collected.append(chunk)
        if chunk.delta_text:
            visible_parts.append(chunk.delta_text)
            await _notify(on_chunk_delta_text, chunk.delta_text)
        if chunk.think:
            reasoning_parts.append(chunk.think)
            await _notify(on_chunk_think, chunk.think)
        if chunk.tool_calls:
            tool_calls = list(chunk.tool_calls)
        if chunk.token_usage is not None:
            usage = chunk.token_usage
        if chunk.raw_event is not None:
            raw_event = chunk.raw_event

    text = "".join(visible_parts)
    if request.parse_json:
        res = parse_json_from_model_output_detailed(text)
    else:
        res = LLMResponse(raw_text=text, content_text=text)

    # Fall back to the last recorded usage when the stream reported none.
    res.token_usage = usage if usage is not None else self._usage_obj()
    res.tool_calls = tool_calls
    res.stream_chunks = collected
    res.raw_completion = raw_event
    res.think = "".join(reasoning_parts)

    await _notify(on_stream_end, res)
    return res
|
|
577
|
+
|
|
578
|
+
async def stream(self, request: LLMRequest) -> AsyncIterator[LLMStreamChunk]:
|
|
579
|
+
"""
|
|
580
|
+
Streaming API.
|
|
581
|
+
|
|
582
|
+
Best-effort real streaming for OpenAI-compatible servers.
|
|
583
|
+
If provider does not support streaming, callers can switch to `complete()`.
|
|
584
|
+
"""
|
|
585
|
+
client: AsyncOpenAI = self._ensure_client()
|
|
586
|
+
kwargs = self._request_kwargs(request)
|
|
587
|
+
|
|
588
|
+
stream_kwargs = dict(kwargs)
|
|
589
|
+
stream_kwargs["stream"] = True
|
|
590
|
+
stream_kwargs.setdefault("stream_options", {"include_usage": True})
|
|
591
|
+
|
|
592
|
+
last_event: ChatCompletionChunk | None = None
|
|
593
|
+
final_usage: LLMTokenUsage | None = None
|
|
594
|
+
tool_call_store: dict[str, dict[str, Any]] = {}
|
|
595
|
+
tool_call_order: list[str] = []
|
|
596
|
+
tool_call_index_to_key: dict[int, str] = {}
|
|
597
|
+
aggregated_text: str = ""
|
|
598
|
+
current_request = request
|
|
599
|
+
restart_count = 0
|
|
600
|
+
max_restarts = max(0, int(self._cfg.stream_max_restarts or 0))
|
|
601
|
+
resume_enabled = bool(self._cfg.stream_resume_on_error)
|
|
602
|
+
|
|
603
|
+
def _as_dict(obj: Any) -> Any:
|
|
604
|
+
if obj is None:
|
|
605
|
+
return None
|
|
606
|
+
if isinstance(obj, dict):
|
|
607
|
+
return obj
|
|
608
|
+
if hasattr(obj, "model_dump"):
|
|
609
|
+
try:
|
|
610
|
+
return obj.model_dump()
|
|
611
|
+
except Exception:
|
|
612
|
+
return obj
|
|
613
|
+
# best-effort: fallback to __dict__
|
|
614
|
+
try:
|
|
615
|
+
return dict(obj.__dict__)
|
|
616
|
+
except Exception:
|
|
617
|
+
return obj
|
|
618
|
+
|
|
619
|
+
def _merge_tool_calls_delta(delta_tool_calls_obj: Any) -> list[dict[str, Any]]:
|
|
620
|
+
"""
|
|
621
|
+
Merge OpenAI-compatible streaming `delta.tool_calls` into an aggregated list.
|
|
622
|
+
|
|
623
|
+
Delta items commonly look like:
|
|
624
|
+
{"index":0,"id":"...","type":"function","function":{"name":"x","arguments":"{...partial..."}}
|
|
625
|
+
"""
|
|
626
|
+
if not delta_tool_calls_obj:
|
|
627
|
+
# return current snapshot
|
|
628
|
+
return [tool_call_store[k] for k in tool_call_order]
|
|
629
|
+
|
|
630
|
+
items = delta_tool_calls_obj
|
|
631
|
+
# sometimes it's a single object
|
|
632
|
+
if not isinstance(items, list):
|
|
633
|
+
items = [items]
|
|
634
|
+
|
|
635
|
+
for pos, it in enumerate(items):
|
|
636
|
+
d = _as_dict(it)
|
|
637
|
+
if not isinstance(d, dict):
|
|
638
|
+
continue
|
|
639
|
+
explicit_id = d.get("id")
|
|
640
|
+
delta_index = d.get("index")
|
|
641
|
+
|
|
642
|
+
call_key: str
|
|
643
|
+
if explicit_id and f"id_{explicit_id}" in tool_call_store:
|
|
644
|
+
call_key = f"id_{explicit_id}"
|
|
645
|
+
elif isinstance(delta_index, int) and delta_index in tool_call_index_to_key:
|
|
646
|
+
call_key = tool_call_index_to_key[delta_index]
|
|
647
|
+
elif explicit_id:
|
|
648
|
+
call_key = f"id_{explicit_id}"
|
|
649
|
+
elif delta_index is not None:
|
|
650
|
+
call_key = f"index_{delta_index}"
|
|
651
|
+
else:
|
|
652
|
+
# fall back to position inside this delta event
|
|
653
|
+
call_key = f"event_pos_{pos}"
|
|
654
|
+
|
|
655
|
+
if call_key not in tool_call_store:
|
|
656
|
+
tool_call_store[call_key] = {
|
|
657
|
+
"id": explicit_id or call_key,
|
|
658
|
+
"type": d.get("type") or "function",
|
|
659
|
+
"function": {"name": "", "arguments": ""},
|
|
660
|
+
}
|
|
661
|
+
tool_call_order.append(call_key)
|
|
662
|
+
|
|
663
|
+
entry = tool_call_store[call_key]
|
|
664
|
+
if explicit_id and entry.get("id") != explicit_id:
|
|
665
|
+
entry["id"] = explicit_id
|
|
666
|
+
if isinstance(delta_index, int):
|
|
667
|
+
tool_call_index_to_key[delta_index] = call_key
|
|
668
|
+
fn = d.get("function") or {}
|
|
669
|
+
if not isinstance(fn, dict):
|
|
670
|
+
fn = _as_dict(fn) or {}
|
|
671
|
+
if isinstance(fn, dict):
|
|
672
|
+
name = fn.get("name")
|
|
673
|
+
if name:
|
|
674
|
+
entry["function"]["name"] = name
|
|
675
|
+
args = fn.get("arguments")
|
|
676
|
+
if args:
|
|
677
|
+
entry["function"]["arguments"] = (entry["function"].get("arguments") or "") + str(args)
|
|
678
|
+
|
|
679
|
+
# allow other keys to be updated if present
|
|
680
|
+
if d.get("type"):
|
|
681
|
+
entry["type"] = d.get("type")
|
|
682
|
+
|
|
683
|
+
return [tool_call_store[k] for k in tool_call_order]
|
|
684
|
+
|
|
685
|
+
def _extract_text_from_value(v: Any) -> str:
|
|
686
|
+
if isinstance(v, str):
|
|
687
|
+
return v
|
|
688
|
+
if isinstance(v, list):
|
|
689
|
+
parts: list[str] = []
|
|
690
|
+
for item in v:
|
|
691
|
+
if isinstance(item, str):
|
|
692
|
+
parts.append(item)
|
|
693
|
+
elif isinstance(item, dict):
|
|
694
|
+
t = item.get("text") or item.get("content")
|
|
695
|
+
if isinstance(t, str):
|
|
696
|
+
parts.append(t)
|
|
697
|
+
else:
|
|
698
|
+
d = _as_dict(item)
|
|
699
|
+
if isinstance(d, dict):
|
|
700
|
+
t2 = d.get("text") or d.get("content")
|
|
701
|
+
if isinstance(t2, str):
|
|
702
|
+
parts.append(t2)
|
|
703
|
+
return "".join(parts)
|
|
704
|
+
return ""
|
|
705
|
+
|
|
706
|
+
def _extract_text_and_think_from_value(v: Any) -> tuple[str, str]:
|
|
707
|
+
"""Split provider content blocks into user-visible text vs reasoning text.
|
|
708
|
+
|
|
709
|
+
Some OpenAI-compatible providers stream `delta.content` as a list of
|
|
710
|
+
typed blocks (for example type=reasoning). Those reasoning blocks
|
|
711
|
+
should feed the think panel, not the main chat text.
|
|
712
|
+
"""
|
|
713
|
+
if isinstance(v, str):
|
|
714
|
+
return v, ""
|
|
715
|
+
|
|
716
|
+
if isinstance(v, list):
|
|
717
|
+
text_parts: list[str] = []
|
|
718
|
+
think_parts: list[str] = []
|
|
719
|
+
for item in v:
|
|
720
|
+
if isinstance(item, str):
|
|
721
|
+
text_parts.append(item)
|
|
722
|
+
continue
|
|
723
|
+
|
|
724
|
+
data = item if isinstance(item, dict) else _as_dict(item)
|
|
725
|
+
if not isinstance(data, dict):
|
|
726
|
+
continue
|
|
727
|
+
|
|
728
|
+
part = _extract_text_from_value(data.get("text") or data.get("content"))
|
|
729
|
+
if not part:
|
|
730
|
+
continue
|
|
731
|
+
|
|
732
|
+
item_type = str(data.get("type") or data.get("role") or "").lower()
|
|
733
|
+
if any(tag in item_type for tag in ("reason", "think", "thought")):
|
|
734
|
+
think_parts.append(part)
|
|
735
|
+
else:
|
|
736
|
+
text_parts.append(part)
|
|
737
|
+
|
|
738
|
+
return "".join(text_parts), "".join(think_parts)
|
|
739
|
+
|
|
740
|
+
data = _as_dict(v)
|
|
741
|
+
if isinstance(data, dict):
|
|
742
|
+
part = _extract_text_from_value(data.get("text") or data.get("content"))
|
|
743
|
+
if not part:
|
|
744
|
+
return "", ""
|
|
745
|
+
item_type = str(data.get("type") or data.get("role") or "").lower()
|
|
746
|
+
if any(tag in item_type for tag in ("reason", "think", "thought")):
|
|
747
|
+
return "", part
|
|
748
|
+
return part, ""
|
|
749
|
+
|
|
750
|
+
return "", ""
|
|
751
|
+
|
|
752
|
+
def _extract_delta_text_and_think(delta: Any) -> tuple[str, str]:
|
|
753
|
+
if delta is None:
|
|
754
|
+
return "", ""
|
|
755
|
+
|
|
756
|
+
delta_dict = _as_dict(delta)
|
|
757
|
+
if isinstance(delta_dict, dict):
|
|
758
|
+
text, think = _extract_text_and_think_from_value(delta_dict.get("content"))
|
|
759
|
+
if not text:
|
|
760
|
+
text = _extract_text_from_value(delta_dict.get("text"))
|
|
761
|
+
for key in [
|
|
762
|
+
"reasoning_content",
|
|
763
|
+
"reasoning",
|
|
764
|
+
"thinking",
|
|
765
|
+
"thinking_content",
|
|
766
|
+
"reasoning_text",
|
|
767
|
+
"thought",
|
|
768
|
+
]:
|
|
769
|
+
extra_think = _extract_text_from_value(delta_dict.get(key))
|
|
770
|
+
if extra_think:
|
|
771
|
+
think += extra_think
|
|
772
|
+
break
|
|
773
|
+
return text, think
|
|
774
|
+
|
|
775
|
+
text, think = _extract_text_and_think_from_value(getattr(delta, "content", None))
|
|
776
|
+
if not text:
|
|
777
|
+
text = _extract_text_from_value(getattr(delta, "text", None))
|
|
778
|
+
for key in [
|
|
779
|
+
"reasoning_content",
|
|
780
|
+
"reasoning",
|
|
781
|
+
"thinking",
|
|
782
|
+
"thinking_content",
|
|
783
|
+
"reasoning_text",
|
|
784
|
+
"thought",
|
|
785
|
+
]:
|
|
786
|
+
extra_think = _extract_text_from_value(getattr(delta, key, None))
|
|
787
|
+
if extra_think:
|
|
788
|
+
think += extra_think
|
|
789
|
+
break
|
|
790
|
+
return text, think
|
|
791
|
+
|
|
792
|
+
def _extract_delta_tool_calls(delta: Any) -> Any:
|
|
793
|
+
if delta is None:
|
|
794
|
+
return None
|
|
795
|
+
|
|
796
|
+
delta_dict = _as_dict(delta)
|
|
797
|
+
if isinstance(delta_dict, dict):
|
|
798
|
+
tc = delta_dict.get("tool_calls")
|
|
799
|
+
if tc:
|
|
800
|
+
return tc
|
|
801
|
+
fc = delta_dict.get("function_call")
|
|
802
|
+
if fc:
|
|
803
|
+
return [{"index": 0, "type": "function", "function": fc}]
|
|
804
|
+
return None
|
|
805
|
+
|
|
806
|
+
tc = getattr(delta, "tool_calls", None)
|
|
807
|
+
if tc:
|
|
808
|
+
return tc
|
|
809
|
+
fc = getattr(delta, "function_call", None)
|
|
810
|
+
if fc:
|
|
811
|
+
return [{"index": 0, "type": "function", "function": fc}]
|
|
812
|
+
return None
|
|
813
|
+
|
|
814
|
+
rendered_messages = current_request.to_messages(self._capabilities)
|
|
815
|
+
self._emit_debug_payload(method="stream", messages=rendered_messages, kwargs=stream_kwargs)
|
|
816
|
+
|
|
817
|
+
while True:
|
|
818
|
+
# Bind loop variables explicitly to avoid late-binding in the closure (B023).
|
|
819
|
+
async def _open_stream(_msgs=rendered_messages, _kw=stream_kwargs) -> Any:
|
|
820
|
+
return await client.chat.completions.create(messages=_msgs, **_kw)
|
|
821
|
+
|
|
822
|
+
stream: Any = await self._with_retries(
|
|
823
|
+
_open_stream,
|
|
824
|
+
method="stream_open",
|
|
825
|
+
)
|
|
826
|
+
|
|
827
|
+
try:
|
|
828
|
+
async for event in stream:
|
|
829
|
+
delta_text = ""
|
|
830
|
+
delta_think = ""
|
|
831
|
+
event: Any = event
|
|
832
|
+
last_event = event
|
|
833
|
+
delta_tool_calls: Any = None
|
|
834
|
+
aggregated_tool_calls: list[dict[str, Any]] = []
|
|
835
|
+
delta = None
|
|
836
|
+
choices = getattr(event, "choices", None)
|
|
837
|
+
if isinstance(choices, list) and choices:
|
|
838
|
+
choice0 = choices[0]
|
|
839
|
+
delta = getattr(choice0, "delta", None)
|
|
840
|
+
elif isinstance(choices, tuple) and choices:
|
|
841
|
+
choice0 = choices[0]
|
|
842
|
+
delta = getattr(choice0, "delta", None)
|
|
843
|
+
else:
|
|
844
|
+
# Some providers emit non-content events with empty choices.
|
|
845
|
+
delta = getattr(event, "delta", None)
|
|
846
|
+
if delta is None and hasattr(event, "model_dump"):
|
|
847
|
+
try:
|
|
848
|
+
dumped = event.model_dump()
|
|
849
|
+
if isinstance(dumped, dict):
|
|
850
|
+
dumped_choices = dumped.get("choices")
|
|
851
|
+
if isinstance(dumped_choices, list) and dumped_choices:
|
|
852
|
+
choice0 = dumped_choices[0]
|
|
853
|
+
if isinstance(choice0, dict):
|
|
854
|
+
delta = choice0.get("delta")
|
|
855
|
+
except Exception as exc:
|
|
856
|
+
logger.error("[llm:stream] failed to extract delta from model_dump: %s", exc)
|
|
857
|
+
|
|
858
|
+
try:
|
|
859
|
+
delta_text, delta_think = _extract_delta_text_and_think(delta)
|
|
860
|
+
delta_tool_calls = _extract_delta_tool_calls(delta)
|
|
861
|
+
aggregated_tool_calls = _merge_tool_calls_delta(delta_tool_calls)
|
|
862
|
+
except Exception as e:
|
|
863
|
+
delta_text = ""
|
|
864
|
+
delta_think = ""
|
|
865
|
+
delta_tool_calls = None
|
|
866
|
+
aggregated_tool_calls = [tool_call_store[k] for k in tool_call_order]
|
|
867
|
+
logger.error("[llm:stream] failed to extract delta: %s", e)
|
|
868
|
+
# Some providers send usage only on the final event when stream_options.include_usage is enabled.
|
|
869
|
+
try:
|
|
870
|
+
usage_obj = getattr(event, "usage", None)
|
|
871
|
+
# Fallback: some SDKs/providers only expose usage via model_dump / extra fields.
|
|
872
|
+
if usage_obj is None and hasattr(event, "model_dump"):
|
|
873
|
+
try:
|
|
874
|
+
dumped = event.model_dump()
|
|
875
|
+
if isinstance(dumped, dict):
|
|
876
|
+
usage_obj = dumped.get("usage") or dumped.get("x_openai_usage") or dumped.get("x_usage")
|
|
877
|
+
except Exception:
|
|
878
|
+
pass
|
|
879
|
+
if usage_obj is not None:
|
|
880
|
+
# also record into last_usage so callers relying on _usage_obj() can still work
|
|
881
|
+
self._record_usage(usage_obj, method="stream")
|
|
882
|
+
final_usage = self._usage_obj(usage_obj)
|
|
883
|
+
except Exception:
|
|
884
|
+
# Don't fail streaming if usage parsing fails.
|
|
885
|
+
pass
|
|
886
|
+
|
|
887
|
+
if delta_text or delta_think or delta_tool_calls:
|
|
888
|
+
if delta_text:
|
|
889
|
+
aggregated_text += str(delta_text)
|
|
890
|
+
yield LLMStreamChunk(
|
|
891
|
+
delta_text=delta_text,
|
|
892
|
+
think=delta_think,
|
|
893
|
+
delta_tool_calls=delta_tool_calls,
|
|
894
|
+
tool_calls=aggregated_tool_calls,
|
|
895
|
+
token_usage=None,
|
|
896
|
+
raw_event=event,
|
|
897
|
+
is_final=False,
|
|
898
|
+
)
|
|
899
|
+
break
|
|
900
|
+
except Exception as e:
|
|
901
|
+
if not resume_enabled or restart_count >= max_restarts:
|
|
902
|
+
raise
|
|
903
|
+
restart_count += 1
|
|
904
|
+
logger.warning(
|
|
905
|
+
"[llm:stream] stream interrupted, attempting resume (%s/%s): %s",
|
|
906
|
+
restart_count,
|
|
907
|
+
max_restarts,
|
|
908
|
+
e,
|
|
909
|
+
)
|
|
910
|
+
current_request = self._build_resume_request(request, aggregated_text)
|
|
911
|
+
rendered_messages = current_request.to_messages(self._capabilities)
|
|
912
|
+
self._emit_debug_payload(method="stream-resume", messages=rendered_messages, kwargs=stream_kwargs)
|
|
913
|
+
continue
|
|
914
|
+
|
|
915
|
+
# Final chunk carries best-effort usage + last raw event so callers can inspect finish_reason, tool_calls, etc.
|
|
916
|
+
yield LLMStreamChunk(
|
|
917
|
+
delta_text="",
|
|
918
|
+
think="",
|
|
919
|
+
delta_tool_calls=None,
|
|
920
|
+
tool_calls=[tool_call_store[k] for k in tool_call_order],
|
|
921
|
+
token_usage=final_usage,
|
|
922
|
+
raw_event=last_event,
|
|
923
|
+
is_final=True,
|
|
924
|
+
)
|
|
925
|
+
|
|
926
|
+
async def chat_completion(
    self,
    *,
    messages: Sequence[dict[str, str]],
    **kwargs: Any,
) -> Any:
    """
    Return the raw ChatCompletion object.

    ``model``, ``temperature`` and ``max_tokens`` may be supplied through
    ``kwargs``; when absent (or explicitly ``None``) the values from
    ``self._cfg`` are used. All remaining keyword arguments are forwarded
    unchanged to ``client.chat.completions.create``.

    The call is executed through ``self._with_retries`` and the response's
    usage (read from ``resp.usage`` or, failing that, from the ``usage`` /
    ``x_openai_usage`` / ``x_usage`` keys of ``resp.model_dump()``) is
    handed to ``self._record_usage`` before the response is returned.
    """
    client = self._ensure_client()

    model = str(kwargs.get("model") or self._cfg.model)

    # An explicit None means "use the configured default"; previously it
    # reached float()/int() and raised TypeError. A real 0 is preserved.
    temperature_arg = kwargs.get("temperature")
    temperature = float(self._cfg.temperature if temperature_arg is None else temperature_arg)
    max_tokens_arg = kwargs.get("max_tokens")
    max_tokens = int(self._cfg.max_tokens if max_tokens_arg is None else max_tokens_arg)

    # Copy each message so the caller's dicts are not shared with the request.
    message_list = [dict(message) for message in messages]

    debug_kwargs = dict(kwargs)
    debug_kwargs.update({"model": model, "temperature": temperature, "max_tokens": max_tokens})
    self._emit_debug_payload(method="chat_completion", messages=message_list, kwargs=debug_kwargs)

    # The three normalised settings are passed explicitly below, so strip
    # them from the pass-through kwargs to avoid duplicate keyword errors.
    normalised = {"model", "temperature", "max_tokens"}
    passthrough = {k: v for k, v in kwargs.items() if k not in normalised}

    async def _call() -> Any:
        return await client.chat.completions.create(
            model=model,
            messages=message_list,
            temperature=temperature,
            max_tokens=max_tokens,
            **passthrough,
        )

    resp: Any = await self._with_retries(_call, method="chat_completion")

    usage_obj = getattr(resp, "usage", None)
    if usage_obj is None and hasattr(resp, "model_dump"):
        try:
            dumped = resp.model_dump()
            if isinstance(dumped, dict):
                usage_obj = dumped.get("usage") or dumped.get("x_openai_usage") or dumped.get("x_usage")
        except Exception:
            # Usage is best-effort: a failing model_dump() must not turn a
            # successful completion into an error.
            pass
    self._record_usage(usage_obj, method="chat_completion")
    return resp
|
|
964
|
+
|
|
965
|
+
def _message_text(self, msg: Any) -> str:
|
|
966
|
+
"""
|
|
967
|
+
Convert a ChatCompletionMessage content to plain text.
|
|
968
|
+
Some providers may return list content; we best-effort stringify.
|
|
969
|
+
"""
|
|
970
|
+
content = getattr(msg, "content", None)
|
|
971
|
+
if content is None:
|
|
972
|
+
return ""
|
|
973
|
+
if isinstance(content, str):
|
|
974
|
+
return content
|
|
975
|
+
# List/parts fallback
|
|
976
|
+
try:
|
|
977
|
+
return "".join(str(part) for part in content).strip()
|
|
978
|
+
except Exception:
|
|
979
|
+
return str(content).strip()
|
|
980
|
+
|
|
981
|
+
# NOTE: `predict/chat/predict_stream` wrappers are provided as default methods on the Protocol.
|