ai-pipeline-core 0.2.6__py3-none-any.whl → 0.4.1__py3-none-any.whl
This diff compares the contents of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
- ai_pipeline_core/__init__.py +78 -125
- ai_pipeline_core/deployment/__init__.py +34 -0
- ai_pipeline_core/deployment/base.py +861 -0
- ai_pipeline_core/deployment/contract.py +80 -0
- ai_pipeline_core/deployment/deploy.py +561 -0
- ai_pipeline_core/deployment/helpers.py +97 -0
- ai_pipeline_core/deployment/progress.py +126 -0
- ai_pipeline_core/deployment/remote.py +116 -0
- ai_pipeline_core/docs_generator/__init__.py +54 -0
- ai_pipeline_core/docs_generator/__main__.py +5 -0
- ai_pipeline_core/docs_generator/cli.py +196 -0
- ai_pipeline_core/docs_generator/extractor.py +324 -0
- ai_pipeline_core/docs_generator/guide_builder.py +644 -0
- ai_pipeline_core/docs_generator/trimmer.py +35 -0
- ai_pipeline_core/docs_generator/validator.py +114 -0
- ai_pipeline_core/document_store/__init__.py +13 -0
- ai_pipeline_core/document_store/_summary.py +9 -0
- ai_pipeline_core/document_store/_summary_worker.py +170 -0
- ai_pipeline_core/document_store/clickhouse.py +492 -0
- ai_pipeline_core/document_store/factory.py +38 -0
- ai_pipeline_core/document_store/local.py +312 -0
- ai_pipeline_core/document_store/memory.py +85 -0
- ai_pipeline_core/document_store/protocol.py +68 -0
- ai_pipeline_core/documents/__init__.py +12 -14
- ai_pipeline_core/documents/_context_vars.py +85 -0
- ai_pipeline_core/documents/_hashing.py +52 -0
- ai_pipeline_core/documents/attachment.py +85 -0
- ai_pipeline_core/documents/context.py +128 -0
- ai_pipeline_core/documents/document.py +318 -1434
- ai_pipeline_core/documents/mime_type.py +37 -82
- ai_pipeline_core/documents/utils.py +4 -12
- ai_pipeline_core/exceptions.py +10 -62
- ai_pipeline_core/images/__init__.py +309 -0
- ai_pipeline_core/images/_processing.py +151 -0
- ai_pipeline_core/llm/__init__.py +6 -4
- ai_pipeline_core/llm/ai_messages.py +130 -81
- ai_pipeline_core/llm/client.py +327 -193
- ai_pipeline_core/llm/model_options.py +14 -86
- ai_pipeline_core/llm/model_response.py +60 -103
- ai_pipeline_core/llm/model_types.py +16 -34
- ai_pipeline_core/logging/__init__.py +2 -7
- ai_pipeline_core/logging/logging.yml +1 -1
- ai_pipeline_core/logging/logging_config.py +27 -37
- ai_pipeline_core/logging/logging_mixin.py +15 -41
- ai_pipeline_core/observability/__init__.py +32 -0
- ai_pipeline_core/observability/_debug/__init__.py +30 -0
- ai_pipeline_core/observability/_debug/_auto_summary.py +94 -0
- ai_pipeline_core/observability/_debug/_config.py +95 -0
- ai_pipeline_core/observability/_debug/_content.py +764 -0
- ai_pipeline_core/observability/_debug/_processor.py +98 -0
- ai_pipeline_core/observability/_debug/_summary.py +312 -0
- ai_pipeline_core/observability/_debug/_types.py +75 -0
- ai_pipeline_core/observability/_debug/_writer.py +843 -0
- ai_pipeline_core/observability/_document_tracking.py +146 -0
- ai_pipeline_core/observability/_initialization.py +194 -0
- ai_pipeline_core/observability/_logging_bridge.py +57 -0
- ai_pipeline_core/observability/_summary.py +81 -0
- ai_pipeline_core/observability/_tracking/__init__.py +6 -0
- ai_pipeline_core/observability/_tracking/_client.py +178 -0
- ai_pipeline_core/observability/_tracking/_internal.py +28 -0
- ai_pipeline_core/observability/_tracking/_models.py +138 -0
- ai_pipeline_core/observability/_tracking/_processor.py +158 -0
- ai_pipeline_core/observability/_tracking/_service.py +311 -0
- ai_pipeline_core/observability/_tracking/_writer.py +229 -0
- ai_pipeline_core/{tracing.py → observability/tracing.py} +139 -283
- ai_pipeline_core/pipeline/__init__.py +10 -0
- ai_pipeline_core/pipeline/decorators.py +915 -0
- ai_pipeline_core/pipeline/options.py +16 -0
- ai_pipeline_core/prompt_manager.py +16 -102
- ai_pipeline_core/settings.py +26 -31
- ai_pipeline_core/testing.py +9 -0
- ai_pipeline_core-0.4.1.dist-info/METADATA +807 -0
- ai_pipeline_core-0.4.1.dist-info/RECORD +76 -0
- {ai_pipeline_core-0.2.6.dist-info → ai_pipeline_core-0.4.1.dist-info}/WHEEL +1 -1
- ai_pipeline_core/documents/document_list.py +0 -420
- ai_pipeline_core/documents/flow_document.py +0 -112
- ai_pipeline_core/documents/task_document.py +0 -117
- ai_pipeline_core/documents/temporary_document.py +0 -74
- ai_pipeline_core/flow/__init__.py +0 -9
- ai_pipeline_core/flow/config.py +0 -483
- ai_pipeline_core/flow/options.py +0 -75
- ai_pipeline_core/pipeline.py +0 -718
- ai_pipeline_core/prefect.py +0 -63
- ai_pipeline_core/simple_runner/__init__.py +0 -14
- ai_pipeline_core/simple_runner/cli.py +0 -254
- ai_pipeline_core/simple_runner/simple_runner.py +0 -247
- ai_pipeline_core/storage/__init__.py +0 -8
- ai_pipeline_core/storage/storage.py +0 -628
- ai_pipeline_core/utils/__init__.py +0 -8
- ai_pipeline_core/utils/deploy.py +0 -373
- ai_pipeline_core/utils/remote_deployment.py +0 -269
- ai_pipeline_core-0.2.6.dist-info/METADATA +0 -500
- ai_pipeline_core-0.2.6.dist-info/RECORD +0 -41
- {ai_pipeline_core-0.2.6.dist-info → ai_pipeline_core-0.4.1.dist-info}/licenses/LICENSE +0 -0
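The list above shows a substantial reorganization: the `flow`, `simple_runner`, `storage`, and `utils` modules are removed in favor of new `deployment`, `document_store`, `images`, `observability`, and `pipeline` packages, and the LLM client gains automatic image tiling plus optional `purpose` and `expected_cost` tracing parameters (see the `client.py` diff below). A minimal calling sketch against the new `generate()` signature, assuming `generate` and `AIMessages` are importable from `ai_pipeline_core.llm` as the docstrings below suggest; the model name, prompts, and cost value are illustrative:

```python
import asyncio

from ai_pipeline_core import llm
from ai_pipeline_core.llm import AIMessages  # assumed re-export; ai_messages.py ships in the wheel


async def main() -> None:
    # Static context is cached (default TTL 300s in this release); the message stays dynamic.
    context = AIMessages(["<large reference document text>"])
    response = await llm.generate(
        "gpt-5.1",
        context=context,
        messages="Summarize the key points.",
        purpose="summary",       # becomes the tracing span name instead of the model name
        expected_cost=0.05,      # stored as a span attribute for cost comparison
    )
    print(response.content)


asyncio.run(main())
```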
ai_pipeline_core/llm/client.py
CHANGED
```diff
@@ -1,99 +1,216 @@
 """LLM client implementation for AI model interactions.
 
-@public
-
 This module provides the core functionality for interacting with language models
 through a unified interface. It handles retries, caching, structured outputs,
 and integration with various LLM providers via LiteLLM.
 
-
-
-
+Automatic image auto-tiling splits oversized images in attachments to meet
+model-specific constraints (e.g., 3000x3000 for Gemini, 1000x1000 default).
+Context caching separates static content from dynamic messages for 50-90% token savings.
+Optional purpose and expected_cost parameters enable tracing and cost-tracking.
 """
 
 import asyncio
+import contextlib
 import time
+from io import BytesIO
 from typing import Any, TypeVar
 
 from lmnr import Laminar
 from openai import AsyncOpenAI
 from openai.lib.streaming.chat import ChunkEvent, ContentDeltaEvent, ContentDoneEvent
 from openai.types.chat import (
+    ChatCompletion,
     ChatCompletionMessageParam,
 )
-from
+from PIL import Image
 from pydantic import BaseModel, ValidationError
 
+from ai_pipeline_core.documents import Document
+from ai_pipeline_core.documents.attachment import Attachment
 from ai_pipeline_core.exceptions import LLMError
+from ai_pipeline_core.images import ImageProcessingConfig, process_image, process_image_to_documents
+from ai_pipeline_core.logging import get_pipeline_logger
+from ai_pipeline_core.observability._document_tracking import track_llm_documents
 from ai_pipeline_core.settings import settings
 
-from .ai_messages import AIMessages
+from .ai_messages import AIMessages, AIMessageType
 from .model_options import ModelOptions
 from .model_response import ModelResponse, StructuredModelResponse
 from .model_types import ModelName
 
-logger =
+logger = get_pipeline_logger(__name__)
+
+# Image splitting configs for automatic large-image handling at the LLM boundary.
+# Gemini supports up to 3000x3000; all other models use a conservative 1000x1000 default.
+_GEMINI_IMAGE_CONFIG = ImageProcessingConfig(max_dimension=3000, max_pixels=9_000_000, jpeg_quality=75)
+_DEFAULT_IMAGE_CONFIG = ImageProcessingConfig(max_dimension=1000, max_pixels=1_000_000, jpeg_quality=75)
+
+
+def _get_image_config(model: str) -> ImageProcessingConfig:
+    """Return the image splitting config for a model."""
+    if "gemini" in model.lower():
+        return _GEMINI_IMAGE_CONFIG
+    return _DEFAULT_IMAGE_CONFIG
+
+
+def _prepare_images_for_model(messages: AIMessages, model: str) -> AIMessages:  # noqa: C901, PLR0912, PLR0915, PLR0914
+    """Split image documents and image attachments that exceed model constraints.
+
+    Returns a new AIMessages with oversized images replaced by tiles.
+    Returns the original instance unchanged if no splitting is needed.
+    """
+    if not any(isinstance(m, Document) and (m.is_image or any(att.is_image for att in m.attachments)) for m in messages):
+        return messages
+
+    config = _get_image_config(model)
+    result: list[AIMessageType] = []
+    changed = False
+
+    for msg in messages:
+        if not isinstance(msg, Document):
+            result.append(msg)
+            continue
+
+        # 1. Handle top-level image Documents (existing logic)
+        if msg.is_image:
+            try:
+                with Image.open(BytesIO(msg.content)) as img:
+                    w, h = img.size
+            except Exception:
+                result.append(msg)
+                continue
+
+            within_limits = w <= config.max_dimension and h <= config.max_dimension and w * h <= config.max_pixels
+            if within_limits:
+                pass  # Falls through to attachment handling
+            else:
+                name_prefix = msg.name.rsplit(".", 1)[0] if "." in msg.name else msg.name
+                tiles = process_image_to_documents(msg, config=config, name_prefix=name_prefix)
+                if msg.attachments and tiles:
+                    tiles[0] = tiles[0].model_copy(update={"attachments": msg.attachments})
+                result.extend(tiles)
+                changed = True
+                continue
+
+        # 2. Handle image attachments
+        if msg.attachments:
+            new_attachments: list[Attachment] = []
+            attachments_changed = False
+
+            for att in msg.attachments:
+                if not att.is_image:
+                    new_attachments.append(att)
+                    continue
+
+                try:
+                    with Image.open(BytesIO(att.content)) as img:
+                        w, h = img.size
+                except Exception:
+                    new_attachments.append(att)
+                    continue
+
+                att_within_limits = w <= config.max_dimension and h <= config.max_dimension and w * h <= config.max_pixels
+                if att_within_limits:
+                    new_attachments.append(att)
+                    continue
+
+                # Tile the oversized attachment image
+                processed = process_image(att.content, config=config)
+                att_prefix = att.name.rsplit(".", 1)[0] if "." in att.name else att.name
+
+                for part in processed.parts:
+                    if part.total == 1:
+                        tile_name = f"{att_prefix}.jpg"
+                        tile_desc = att.description
+                    else:
+                        tile_name = f"{att_prefix}_{part.index + 1:02d}_of_{part.total:02d}.jpg"
+                        tile_desc = f"{att.description} ({part.label})" if att.description else part.label
+
+                    new_attachments.append(
+                        Attachment(
+                            name=tile_name,
+                            content=part.data,
+                            description=tile_desc,
+                        )
+                    )
+                    attachments_changed = True
+
+            if attachments_changed:
+                msg = msg.model_copy(update={"attachments": tuple(new_attachments)})  # noqa: PLW2901
+                changed = True
+
+        result.append(msg)
+
+    if not changed:
+        return messages
+    return AIMessages(result)
 
 
 def _process_messages(
     context: AIMessages,
     messages: AIMessages,
     system_prompt: str | None = None,
-    cache_ttl: str | None = "
+    cache_ttl: str | None = "300s",
 ) -> list[ChatCompletionMessageParam]:
     """Process and format messages for LLM API consumption.
 
     Internal function that combines context and messages into a single
     list of API-compatible messages. Applies caching directives to
-    context messages for efficiency.
+    system prompt and context messages for efficiency.
 
     Args:
         context: Messages to be cached (typically expensive/static content).
         messages: Regular messages without caching (dynamic queries).
         system_prompt: Optional system instructions for the model.
-        cache_ttl: Cache TTL for context messages (e.g. "120s", "
+        cache_ttl: Cache TTL for system and context messages (e.g. "120s", "300s", "1h").
             Set to None or empty string to disable caching.
 
     Returns:
         List of formatted messages ready for API calls, with:
-        - System prompt at the beginning (if provided)
-        - Context messages with cache_control on
+        - System prompt at the beginning with cache_control (if provided and cache_ttl set)
+        - Context messages with cache_control on all messages (if cache_ttl set)
         - Regular messages without caching
 
     System Prompt Location:
         The system prompt parameter is always injected as the FIRST message
-        with role="system". It is
-        system prompts without breaking cache efficiency.
+        with role="system". It is cached along with context when cache_ttl is set.
 
     Cache behavior:
-
+        All system and context messages get ephemeral caching with specified TTL
         to reduce token usage on repeated calls with same context.
         If cache_ttl is None or empty string (falsy), no caching is applied.
-
+        All system and context messages receive cache_control to maximize cache efficiency.
 
-
-
-    The context/messages split enables efficient token usage.
+    This is an internal function used by _generate_with_retry().
+    The context/messages split enables efficient token usage.
     """
     processed_messages: list[ChatCompletionMessageParam] = []
 
     # Add system prompt if provided
     if system_prompt:
-        processed_messages.append({
+        processed_messages.append({
+            "role": "system",
+            "content": [{"type": "text", "text": system_prompt}],
+        })
 
     # Process context messages with caching if provided
     if context:
         # Use AIMessages.to_prompt() for context
         context_messages = context.to_prompt()
+        processed_messages.extend(context_messages)
 
-
-
-
+    if cache_ttl:
+        for message in processed_messages:
+            message["cache_control"] = {  # type: ignore
                 "type": "ephemeral",
                 "ttl": cache_ttl,
             }
-
-
+            if isinstance(message["content"], list):  # type: ignore
+                message["content"][-1]["cache_control"] = {  # type: ignore
+                    "type": "ephemeral",
+                    "ttl": cache_ttl,
+                }
 
     # Process regular messages without caching
     if messages:
@@ -103,6 +220,35 @@ def _process_messages(
     return processed_messages
 
 
+def _remove_cache_control(
+    messages: list[ChatCompletionMessageParam],
+) -> list[ChatCompletionMessageParam]:
+    """Remove cache control directives from messages.
+
+    Internal utility that strips cache_control fields from both message-level
+    and content-level entries. Used in retry logic when cache-related errors
+    occur during LLM API calls.
+
+    Args:
+        messages: List of messages that may contain cache_control directives.
+
+    Returns:
+        The same message list (modified in-place) with all cache_control
+        fields removed from both messages and their content items.
+
+    Modifies the input list in-place but also returns it for convenience.
+    Handles both list-based content (multipart) and string content (simple messages).
+    """
+    for message in messages:
+        if (content := message.get("content")) and isinstance(content, list):
+            for item in content:
+                if "cache_control" in item:
+                    del item["cache_control"]
+        if "cache_control" in message:
+            del message["cache_control"]
+    return messages
+
+
 def _model_name_to_openrouter_model(model: ModelName) -> str:
     """Convert a model name to an OpenRouter model name.
 
@@ -112,14 +258,10 @@ def _model_name_to_openrouter_model(model: ModelName) -> str:
     Returns:
         OpenRouter model name.
     """
-    if model == "
-        return "
-    if model == "gemini-2.5-flash-search":
-        return "google/gemini-2.5-flash:online"
-    if model == "grok-4-fast-search":
-        return "x-ai/grok-4-fast:online"
+    if model == "gemini-3-flash-search":
+        return "google/gemini-3-flash:online"
     if model == "sonar-pro-search":
-        return "perplexity/sonar-
+        return "perplexity/sonar-pro-search"
     if model.startswith("gemini"):
         return f"google/{model}"
     elif model.startswith("gpt"):
@@ -139,30 +281,76 @@ def _model_name_to_openrouter_model(model: ModelName) -> str:
     return model
 
 
-async def
-
+async def _generate_streaming(client: AsyncOpenAI, model: str, messages: list[ChatCompletionMessageParam], completion_kwargs: dict[str, Any]) -> ModelResponse:
+    """Execute a streaming LLM API call."""
+    start_time = time.time()
+    first_token_time = None
+    usage = None
+    async with client.chat.completions.stream(
+        model=model,
+        messages=messages,
+        **completion_kwargs,
+    ) as s:
+        async for event in s:
+            if isinstance(event, ContentDeltaEvent):
+                if not first_token_time:
+                    first_token_time = time.time()
+            elif isinstance(event, ContentDoneEvent):
+                pass
+            elif isinstance(event, ChunkEvent) and event.chunk.usage:
+                usage = event.chunk.usage
+        if not first_token_time:
+            first_token_time = time.time()
+        raw_response = await s.get_final_completion()
+
+    metadata = {
+        "time_taken": round(time.time() - start_time, 2),
+        "first_token_time": round(first_token_time - start_time, 2),
+    }
+    return ModelResponse(raw_response, model_options=completion_kwargs, metadata=metadata, usage=usage)
+
+
+async def _generate_non_streaming(
+    client: AsyncOpenAI, model: str, messages: list[ChatCompletionMessageParam], completion_kwargs: dict[str, Any]
 ) -> ModelResponse:
-    """Execute a
+    """Execute a non-streaming LLM API call.
+
+    Avoids OpenAI SDK delta accumulation — some providers (e.g. Grok) send
+    streaming annotation deltas that crash the SDK's accumulate_delta().
+    """
+    start_time = time.time()
+    kwargs = {k: v for k, v in completion_kwargs.items() if k != "stream_options"}
+    response_format = kwargs.get("response_format")
+    if isinstance(response_format, type) and issubclass(response_format, BaseModel):
+        raw_response: ChatCompletion = await client.chat.completions.parse(
+            model=model,
+            messages=messages,
+            **kwargs,
+        )
+    else:
+        raw_response = await client.chat.completions.create(
+            model=model,
+            messages=messages,
+            stream=False,
+            **kwargs,
+        )
+    elapsed = round(time.time() - start_time, 2)
+    metadata = {"time_taken": elapsed, "first_token_time": elapsed}
+    return ModelResponse(raw_response, model_options=completion_kwargs, metadata=metadata)
+
 
-
-
+async def _generate(model: str, messages: list[ChatCompletionMessageParam], completion_kwargs: dict[str, Any], *, stream: bool = True) -> ModelResponse:
+    """Execute a single LLM API call.
 
     Args:
-        model: Model identifier (e.g., "gpt-5", "gemini-
+        model: Model identifier (e.g., "gpt-5.1", "gemini-3-pro").
         messages: Formatted messages for the API.
         completion_kwargs: Additional parameters for the completion API.
+        stream: Whether to use streaming mode (default True). Non-streaming
+            avoids OpenAI SDK delta accumulation issues with some providers.
 
     Returns:
         ModelResponse with generated content and metadata.
-
-    API selection:
-    - Uses client.chat.completions.parse() for structured output
-    - Uses client.chat.completions.create() for regular text
-
-    Note:
-    - Uses AsyncOpenAI client configured via settings
-    - Captures response headers for cost tracking
-    - Response includes model options for debugging
     """
     if "openrouter" in settings.openai_base_url.lower():
         model = _model_name_to_openrouter_model(model)
@@ -171,45 +359,18 @@ async def _generate(
         api_key=settings.openai_api_key,
         base_url=settings.openai_base_url,
     ) as client:
-
-
-
-        async with client.chat.completions.stream(
-            model=model,
-            messages=messages,
-            **completion_kwargs,
-        ) as stream:
-            async for event in stream:
-                if isinstance(event, ContentDeltaEvent):
-                    if not first_token_time:
-                        first_token_time = time.time()
-                elif isinstance(event, ContentDoneEvent):
-                    pass
-                elif isinstance(event, ChunkEvent):
-                    if event.chunk.usage:  # used to fix a bug with missing usage data
-                        usage = event.chunk.usage
-            if not first_token_time:
-                first_token_time = time.time()
-            raw_response = await stream.get_final_completion()
-
-        metadata = {
-            "time_taken": round(time.time() - start_time, 2),
-            "first_token_time": round(first_token_time - start_time, 2),
-        }
-        response = ModelResponse(
-            raw_response,
-            model_options=completion_kwargs,
-            metadata=metadata,
-            usage=usage,
-        )
-        return response
+        if stream:
+            return await _generate_streaming(client, model, messages, completion_kwargs)
+        return await _generate_non_streaming(client, model, messages, completion_kwargs)
 
 
-async def _generate_with_retry(
+async def _generate_with_retry(  # noqa: PLR0917
     model: str,
     context: AIMessages,
     messages: AIMessages,
     options: ModelOptions,
+    purpose: str | None = None,
+    expected_cost: float | None = None,
 ) -> ModelResponse:
     """Core LLM generation with automatic retry logic.
 
@@ -221,6 +382,8 @@ async def _generate_with_retry(
         context: Cached context messages (can be empty).
         messages: Dynamic query messages.
         options: Configuration including retries, timeout, temperature.
+        purpose: Optional semantic label for the LLM span name.
+        expected_cost: Optional expected cost for cost-tracking attributes.
 
     Returns:
         ModelResponse with generated content.
@@ -229,17 +392,22 @@ async def _generate_with_retry(
         ValueError: If model is not provided or both context and messages are empty.
         LLMError: If all retry attempts are exhausted.
 
-
-        Empty responses trigger a retry as they indicate API issues.
+    Empty responses trigger a retry as they indicate API issues.
     """
     if not model:
         raise ValueError("Model must be provided")
     if not context and not messages:
         raise ValueError("Either context or messages must be provided")
 
-
-
-    )
+    # Auto-split large images based on model-specific constraints
+    context = _prepare_images_for_model(context, model)
+    messages = _prepare_images_for_model(messages, model)
+
+    if "gemini" in model.lower() and context.approximate_tokens_count < 10000:
+        # Bug fix for minimum explicit context size for Gemini models
+        options.cache_ttl = None
+
+    processed_messages = _process_messages(context, messages, options.system_prompt, options.cache_ttl)
     completion_kwargs: dict[str, Any] = {
         **options.to_openai_completion_kwargs(),
     }
@@ -249,20 +417,23 @@ async def _generate_with_retry(
 
     for attempt in range(options.retries):
         try:
-            with Laminar.start_as_current_span(
-                model,
-
-
-
-
-
-            ]
+            with Laminar.start_as_current_span(purpose or model, span_type="LLM", input=processed_messages) as span:
+                response = await _generate(model, processed_messages, completion_kwargs, stream=options.stream)
+                laminar_metadata = response.get_laminar_metadata()
+                if purpose:
+                    laminar_metadata["purpose"] = purpose
+                if expected_cost is not None:
+                    laminar_metadata["expected_cost"] = expected_cost
+                span.set_attributes(laminar_metadata)  # pyright: ignore[reportArgumentType]
+                Laminar.set_span_output([r for r in (response.reasoning_content, response.content) if r])
                 response.validate_output()
                 return response
-        except (
+        except (TimeoutError, ValueError, ValidationError, Exception) as e:
             if not isinstance(e, asyncio.TimeoutError):
                 # disable cache if it's not a timeout because it may cause an error
                 completion_kwargs["extra_body"]["cache"] = {"no-cache": True}
+                # sometimes there are issues with cache so cache is removed in case of failure
+                processed_messages = _remove_cache_control(processed_messages)
 
             logger.warning(
                 f"LLM generation failed (attempt {attempt + 1}/{options.retries}): {e}",
@@ -281,11 +452,11 @@ async def generate(
     context: AIMessages | None = None,
     messages: AIMessages | str,
     options: ModelOptions | None = None,
+    purpose: str | None = None,
+    expected_cost: float | None = None,
 ) -> ModelResponse:
     """Generate text response from a language model.
 
-    @public
-
     Main entry point for LLM text generation with smart context caching.
     The context/messages split enables efficient token usage by caching
     expensive static content separately from dynamic queries.
@@ -297,18 +468,21 @@ async def generate(
     4. CONFIGURATION: Configure model behavior via LiteLLM proxy or environment variables
 
     Args:
-        model: Model to use (e.g., "gpt-5", "gemini-
+        model: Model to use (e.g., "gpt-5.1", "gemini-3-pro", "grok-4.1-fast").
             Accepts predefined models or any string for custom models.
         context: Static context to cache (documents, examples, instructions).
             Defaults to None (empty context). Cached for 5 minutes by default.
         messages: Dynamic messages/queries. AIMessages or str ONLY.
-            Do not pass Document or
+            Do not pass Document or list[Document] directly.
             If string, converted to AIMessages internally.
-        options:
-
-
-
-
+        options: Internal framework parameter. Framework defaults are production-optimized
+            (3 retries, 20s delay, 600s timeout). Configure model behavior centrally via
+            LiteLLM proxy settings or environment variables, not per API call.
+            Provider-specific settings should be configured at the proxy level.
+        purpose: Optional semantic label used as the tracing span name
+            instead of model name. Stored as a span attribute.
+        expected_cost: Optional expected cost stored as a span attribute
+            for cost-tracking and comparison with actual cost.
 
     Returns:
         ModelResponse containing:
@@ -325,17 +499,17 @@ async def generate(
     Wrap Documents in AIMessages - DO NOT pass directly or convert to .text:
 
         # CORRECT - wrap Document in AIMessages
-        response = await llm.generate("gpt-5", messages=AIMessages([my_document]))
+        response = await llm.generate("gpt-5.1", messages=AIMessages([my_document]))
 
         # WRONG - don't pass Document directly
-        response = await llm.generate("gpt-5", messages=my_document)  # NO!
+        response = await llm.generate("gpt-5.1", messages=my_document)  # NO!
 
         # WRONG - don't convert to string yourself
-        response = await llm.generate("gpt-5", messages=my_document.text)  # NO!
+        response = await llm.generate("gpt-5.1", messages=my_document.text)  # NO!
 
     VISION/PDF MODEL COMPATIBILITY:
     When using Documents containing images or PDFs, ensure your model supports these formats:
-    - Images require vision-capable models (gpt-
+    - Images require vision-capable models (gpt-5.1, gemini-3-flash, gemini-3-pro)
     - PDFs require document processing support (varies by provider)
     - Non-compatible models will raise ValueError or fall back to text extraction
     - Check model capabilities before including visual/PDF content
@@ -351,50 +525,12 @@ async def generate(
         - Changes with each API call
         - Never cached, always processed fresh
 
-    Example:
-        >>> # CORRECT - No options parameter (this is the recommended pattern)
-        >>> response = await llm.generate("gpt-5", messages="Explain quantum computing")
-        >>> print(response.content)  # In production, use get_pipeline_logger instead of print
-
-        >>> # With context caching for efficiency
-        >>> # Context and messages are both AIMessages or str; wrap any Documents
-        >>> static_doc = AIMessages([large_document, "few-shot example: ..."])
-        >>>
-        >>> # First call: caches context
-        >>> r1 = await llm.generate("gpt-5", context=static_doc, messages="Summarize")
-        >>>
-        >>> # Second call: reuses cache, saves tokens!
-        >>> r2 = await llm.generate("gpt-5", context=static_doc, messages="Key points?")
-
-        >>> # Multi-turn conversation
-        >>> messages = AIMessages([
-        ...     "What is Python?",
-        ...     previous_response,
-        ...     "Can you give an example?"
-        ... ])
-        >>> response = await llm.generate("gpt-5", messages=messages)
-
-    Configuration via LiteLLM Proxy:
-        >>> # Configure temperature in litellm_config.yaml:
-        >>> # model_list:
-        >>> #   - model_name: gpt-5
-        >>> #     litellm_params:
-        >>> #       model: openai/gpt-4o
-        >>> #       temperature: 0.3
-        >>> #       max_tokens: 1000
-        >>>
-        >>> # Configure retry logic in proxy:
-        >>> # general_settings:
-        >>> #   master_key: sk-1234
-        >>> #   max_retries: 5
-        >>> #   retry_delay: 15
-
     Performance:
         - Context caching saves ~50-90% tokens on repeated calls
         - First call: full token cost
         - Subsequent calls (within cache TTL): only messages tokens
-        - Default cache TTL is
-        - Default retry logic: 3 attempts with
+        - Default cache TTL is 300s/5 minutes (production-optimized)
+        - Default retry logic: 3 attempts with 20s delay (production-optimized)
 
     Caching:
         When enabled in your LiteLLM proxy and supported by the upstream provider,
@@ -412,10 +548,8 @@ async def generate(
 
     This centralizes configuration and ensures consistency across all API calls.
 
-
-
-    - Automatic retry with configurable delay between attempts
-    - Cost tracking via response headers
+    All models are accessed via LiteLLM proxy with automatic retry and
+    cost tracking via response headers.
     """
     if isinstance(messages, str):
         messages = AIMessages([messages])
@@ -424,9 +558,22 @@ async def generate(
         context = AIMessages()
     if options is None:
         options = ModelOptions()
+    else:
+        # Create a copy to avoid mutating the caller's options object
+        options = options.model_copy()
+
+    with contextlib.suppress(Exception):
+        track_llm_documents(context, messages)
 
     try:
-        return await _generate_with_retry(
+        return await _generate_with_retry(
+            model,
+            context,
+            messages,
+            options,
+            purpose=purpose,
+            expected_cost=expected_cost,
+        )
     except (ValueError, LLMError):
         raise  # Explicitly re-raise to satisfy DOC502
 
@@ -435,18 +582,18 @@ T = TypeVar("T", bound=BaseModel)
 """Type variable for Pydantic model types in structured generation."""
 
 
-async def generate_structured(
+async def generate_structured(  # noqa: UP047
     model: ModelName,
     response_format: type[T],
     *,
     context: AIMessages | None = None,
     messages: AIMessages | str,
     options: ModelOptions | None = None,
+    purpose: str | None = None,
+    expected_cost: float | None = None,
 ) -> StructuredModelResponse[T]:
     """Generate structured output conforming to a Pydantic model.
 
-    @public
-
     Type-safe generation that returns validated Pydantic model instances.
     Uses OpenAI's structured output feature for guaranteed schema compliance.
 
@@ -482,7 +629,7 @@ async def generate_structured(
 
         >>> # Step 1: Research/analysis with generate() - no options parameter
         >>> research = await llm.generate(
-        ...     "gpt-5",
+        ...     "gpt-5.1",
         ...     messages="Research and analyze this complex topic..."
         ... )
         >>>
@@ -501,21 +648,21 @@ async def generate_structured(
         context: Static context to cache (documents, schemas, examples).
             Defaults to None (empty AIMessages).
         messages: Dynamic prompts/queries. AIMessages or str ONLY.
-            Do not pass Document or
+            Do not pass Document or list[Document] directly.
         options: Optional ModelOptions for configuring temperature, retries, etc.
            If provided, it will NOT be mutated (a copy is created internally).
            The response_format field is set automatically from the response_format parameter.
            In most cases, leave as None to use framework defaults.
            Configure model behavior centrally via LiteLLM proxy settings when possible.
+        purpose: Optional semantic label used as the tracing span name
+            instead of model name. Stored as a span attribute.
+        expected_cost: Optional expected cost stored as a span attribute
+            for cost-tracking and comparison with actual cost.
 
-
-
-
-
-        - Many models support either vision OR structured output, but not both
-        - Test your specific model+document combination before production use
-        - Consider two-step approach: generate() for analysis, then generate_structured()
-          for formatting
+    Vision/PDF model compatibility: Images require vision-capable models that also support
+    structured output. PDFs require models with both document processing AND structured output
+    support. Consider two-step approach: generate() for analysis, then generate_structured()
+    for formatting.
 
     Returns:
         StructuredModelResponse[T] containing:
@@ -529,26 +676,6 @@ async def generate_structured(
         LLMError: If generation fails after retries.
         ValidationError: If response cannot be parsed into response_format.
 
-    Example:
-        >>> from pydantic import BaseModel, Field
-        >>>
-        >>> class Analysis(BaseModel):
-        ...     summary: str = Field(description="Brief summary")
-        ...     sentiment: float = Field(ge=-1, le=1)
-        ...     key_points: list[str] = Field(max_length=5)
-        >>>
-        >>> # CORRECT - No options parameter
-        >>> response = await llm.generate_structured(
-        ...     "gpt-5",
-        ...     response_format=Analysis,
-        ...     messages="Analyze this product review: ..."
-        ... )
-        >>>
-        >>> analysis = response.parsed  # Type: Analysis
-        >>> print(f"Sentiment: {analysis.sentiment}")
-        >>> for point in analysis.key_points:
-        ...     print(f"- {point}")
-
     Supported models:
         Structured output support varies by provider and model. Generally includes:
         - OpenAI: GPT-4 and newer models
@@ -563,12 +690,9 @@ async def generate_structured(
         - Complex schemas increase generation time
         - Validation overhead is minimal (Pydantic is fast)
 
-
-
-
-        - Validation happens automatically via Pydantic
-        - Use Field() descriptions to guide generation
-        - Search models (models with '-search' suffix) do not support structured output
+    Pydantic model is converted to JSON Schema for the API. Validation happens
+    automatically via Pydantic. Search models (models with '-search' suffix) do
+    not support structured output.
     """
     if context is None:
         context = AIMessages()
@@ -585,9 +709,19 @@ async def generate_structured(
 
     assert isinstance(messages, AIMessages)
 
+    with contextlib.suppress(Exception):
+        track_llm_documents(context, messages)
+
     # Call the internal generate function with structured output enabled
     try:
-        response = await _generate_with_retry(
+        response = await _generate_with_retry(
+            model,
+            context,
+            messages,
+            options,
+            purpose=purpose,
+            expected_cost=expected_cost,
+        )
     except (ValueError, LLMError):
         raise  # Explicitly re-raise to satisfy DOC502
 
```
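The structured-output docstring example removed in this diff maps onto the new signature roughly as follows. This is a sketch based on the docstrings above; the Analysis model, prompt, and the purpose/expected_cost values are illustrative rather than taken from the package:

```python
from pydantic import BaseModel, Field

from ai_pipeline_core import llm


class Analysis(BaseModel):
    summary: str = Field(description="Brief summary")
    sentiment: float = Field(ge=-1, le=1)
    key_points: list[str] = Field(max_length=5)


async def analyze(review: str) -> Analysis:
    # generate_structured() returns a StructuredModelResponse whose .parsed
    # attribute is a validated instance of the requested Pydantic model.
    response = await llm.generate_structured(
        "gpt-5.1",
        response_format=Analysis,
        messages=f"Analyze this product review: {review}",
        purpose="review-analysis",  # optional tracing span name
        expected_cost=0.01,         # optional expected cost, recorded on the span
    )
    return response.parsed
```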