ai-pipeline-core 0.1.12__py3-none-any.whl → 0.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ai_pipeline_core/__init__.py +83 -119
- ai_pipeline_core/deployment/__init__.py +34 -0
- ai_pipeline_core/deployment/base.py +861 -0
- ai_pipeline_core/deployment/contract.py +80 -0
- ai_pipeline_core/deployment/deploy.py +561 -0
- ai_pipeline_core/deployment/helpers.py +97 -0
- ai_pipeline_core/deployment/progress.py +126 -0
- ai_pipeline_core/deployment/remote.py +116 -0
- ai_pipeline_core/docs_generator/__init__.py +54 -0
- ai_pipeline_core/docs_generator/__main__.py +5 -0
- ai_pipeline_core/docs_generator/cli.py +196 -0
- ai_pipeline_core/docs_generator/extractor.py +324 -0
- ai_pipeline_core/docs_generator/guide_builder.py +644 -0
- ai_pipeline_core/docs_generator/trimmer.py +35 -0
- ai_pipeline_core/docs_generator/validator.py +114 -0
- ai_pipeline_core/document_store/__init__.py +13 -0
- ai_pipeline_core/document_store/_summary.py +9 -0
- ai_pipeline_core/document_store/_summary_worker.py +170 -0
- ai_pipeline_core/document_store/clickhouse.py +492 -0
- ai_pipeline_core/document_store/factory.py +38 -0
- ai_pipeline_core/document_store/local.py +312 -0
- ai_pipeline_core/document_store/memory.py +85 -0
- ai_pipeline_core/document_store/protocol.py +68 -0
- ai_pipeline_core/documents/__init__.py +14 -15
- ai_pipeline_core/documents/_context_vars.py +85 -0
- ai_pipeline_core/documents/_hashing.py +52 -0
- ai_pipeline_core/documents/attachment.py +85 -0
- ai_pipeline_core/documents/context.py +128 -0
- ai_pipeline_core/documents/document.py +349 -1062
- ai_pipeline_core/documents/mime_type.py +40 -85
- ai_pipeline_core/documents/utils.py +62 -7
- ai_pipeline_core/exceptions.py +10 -62
- ai_pipeline_core/images/__init__.py +309 -0
- ai_pipeline_core/images/_processing.py +151 -0
- ai_pipeline_core/llm/__init__.py +5 -3
- ai_pipeline_core/llm/ai_messages.py +284 -73
- ai_pipeline_core/llm/client.py +462 -209
- ai_pipeline_core/llm/model_options.py +86 -53
- ai_pipeline_core/llm/model_response.py +187 -241
- ai_pipeline_core/llm/model_types.py +34 -54
- ai_pipeline_core/logging/__init__.py +2 -9
- ai_pipeline_core/logging/logging.yml +1 -1
- ai_pipeline_core/logging/logging_config.py +27 -43
- ai_pipeline_core/logging/logging_mixin.py +17 -51
- ai_pipeline_core/observability/__init__.py +32 -0
- ai_pipeline_core/observability/_debug/__init__.py +30 -0
- ai_pipeline_core/observability/_debug/_auto_summary.py +94 -0
- ai_pipeline_core/observability/_debug/_config.py +95 -0
- ai_pipeline_core/observability/_debug/_content.py +764 -0
- ai_pipeline_core/observability/_debug/_processor.py +98 -0
- ai_pipeline_core/observability/_debug/_summary.py +312 -0
- ai_pipeline_core/observability/_debug/_types.py +75 -0
- ai_pipeline_core/observability/_debug/_writer.py +843 -0
- ai_pipeline_core/observability/_document_tracking.py +146 -0
- ai_pipeline_core/observability/_initialization.py +194 -0
- ai_pipeline_core/observability/_logging_bridge.py +57 -0
- ai_pipeline_core/observability/_summary.py +81 -0
- ai_pipeline_core/observability/_tracking/__init__.py +6 -0
- ai_pipeline_core/observability/_tracking/_client.py +178 -0
- ai_pipeline_core/observability/_tracking/_internal.py +28 -0
- ai_pipeline_core/observability/_tracking/_models.py +138 -0
- ai_pipeline_core/observability/_tracking/_processor.py +158 -0
- ai_pipeline_core/observability/_tracking/_service.py +311 -0
- ai_pipeline_core/observability/_tracking/_writer.py +229 -0
- ai_pipeline_core/observability/tracing.py +640 -0
- ai_pipeline_core/pipeline/__init__.py +10 -0
- ai_pipeline_core/pipeline/decorators.py +915 -0
- ai_pipeline_core/pipeline/options.py +16 -0
- ai_pipeline_core/prompt_manager.py +26 -105
- ai_pipeline_core/settings.py +41 -32
- ai_pipeline_core/testing.py +9 -0
- ai_pipeline_core-0.4.1.dist-info/METADATA +807 -0
- ai_pipeline_core-0.4.1.dist-info/RECORD +76 -0
- {ai_pipeline_core-0.1.12.dist-info → ai_pipeline_core-0.4.1.dist-info}/WHEEL +1 -1
- ai_pipeline_core/documents/document_list.py +0 -240
- ai_pipeline_core/documents/flow_document.py +0 -128
- ai_pipeline_core/documents/task_document.py +0 -133
- ai_pipeline_core/documents/temporary_document.py +0 -95
- ai_pipeline_core/flow/__init__.py +0 -9
- ai_pipeline_core/flow/config.py +0 -314
- ai_pipeline_core/flow/options.py +0 -75
- ai_pipeline_core/pipeline.py +0 -717
- ai_pipeline_core/prefect.py +0 -54
- ai_pipeline_core/simple_runner/__init__.py +0 -24
- ai_pipeline_core/simple_runner/cli.py +0 -255
- ai_pipeline_core/simple_runner/simple_runner.py +0 -385
- ai_pipeline_core/tracing.py +0 -475
- ai_pipeline_core-0.1.12.dist-info/METADATA +0 -450
- ai_pipeline_core-0.1.12.dist-info/RECORD +0 -36
- {ai_pipeline_core-0.1.12.dist-info → ai_pipeline_core-0.4.1.dist-info}/licenses/LICENSE +0 -0
ai_pipeline_core/llm/client.py
CHANGED
@@ -1,93 +1,217 @@
 """LLM client implementation for AI model interactions.

-@public
-
 This module provides the core functionality for interacting with language models
 through a unified interface. It handles retries, caching, structured outputs,
 and integration with various LLM providers via LiteLLM.

-
--
-
+Automatic image auto-tiling splits oversized images in attachments to meet
+model-specific constraints (e.g., 3000x3000 for Gemini, 1000x1000 default).
+Context caching separates static content from dynamic messages for 50-90% token savings.
+Optional purpose and expected_cost parameters enable tracing and cost-tracking.
 """

 import asyncio
+import contextlib
+import time
+from io import BytesIO
 from typing import Any, TypeVar

 from lmnr import Laminar
 from openai import AsyncOpenAI
+from openai.lib.streaming.chat import ChunkEvent, ContentDeltaEvent, ContentDoneEvent
 from openai.types.chat import (
+    ChatCompletion,
     ChatCompletionMessageParam,
 )
-from
-from pydantic import BaseModel
+from PIL import Image
+from pydantic import BaseModel, ValidationError

+from ai_pipeline_core.documents import Document
+from ai_pipeline_core.documents.attachment import Attachment
 from ai_pipeline_core.exceptions import LLMError
+from ai_pipeline_core.images import ImageProcessingConfig, process_image, process_image_to_documents
+from ai_pipeline_core.logging import get_pipeline_logger
+from ai_pipeline_core.observability._document_tracking import track_llm_documents
 from ai_pipeline_core.settings import settings
-from ai_pipeline_core.tracing import trace

-from .ai_messages import AIMessages
+from .ai_messages import AIMessages, AIMessageType
 from .model_options import ModelOptions
 from .model_response import ModelResponse, StructuredModelResponse
 from .model_types import ModelName

-logger =
+logger = get_pipeline_logger(__name__)
+
+# Image splitting configs for automatic large-image handling at the LLM boundary.
+# Gemini supports up to 3000x3000; all other models use a conservative 1000x1000 default.
+_GEMINI_IMAGE_CONFIG = ImageProcessingConfig(max_dimension=3000, max_pixels=9_000_000, jpeg_quality=75)
+_DEFAULT_IMAGE_CONFIG = ImageProcessingConfig(max_dimension=1000, max_pixels=1_000_000, jpeg_quality=75)
+
+
+def _get_image_config(model: str) -> ImageProcessingConfig:
+    """Return the image splitting config for a model."""
+    if "gemini" in model.lower():
+        return _GEMINI_IMAGE_CONFIG
+    return _DEFAULT_IMAGE_CONFIG
+
+
+def _prepare_images_for_model(messages: AIMessages, model: str) -> AIMessages:  # noqa: C901, PLR0912, PLR0915, PLR0914
+    """Split image documents and image attachments that exceed model constraints.
+
+    Returns a new AIMessages with oversized images replaced by tiles.
+    Returns the original instance unchanged if no splitting is needed.
+    """
+    if not any(isinstance(m, Document) and (m.is_image or any(att.is_image for att in m.attachments)) for m in messages):
+        return messages
+
+    config = _get_image_config(model)
+    result: list[AIMessageType] = []
+    changed = False
+
+    for msg in messages:
+        if not isinstance(msg, Document):
+            result.append(msg)
+            continue
+
+        # 1. Handle top-level image Documents (existing logic)
+        if msg.is_image:
+            try:
+                with Image.open(BytesIO(msg.content)) as img:
+                    w, h = img.size
+            except Exception:
+                result.append(msg)
+                continue
+
+            within_limits = w <= config.max_dimension and h <= config.max_dimension and w * h <= config.max_pixels
+            if within_limits:
+                pass  # Falls through to attachment handling
+            else:
+                name_prefix = msg.name.rsplit(".", 1)[0] if "." in msg.name else msg.name
+                tiles = process_image_to_documents(msg, config=config, name_prefix=name_prefix)
+                if msg.attachments and tiles:
+                    tiles[0] = tiles[0].model_copy(update={"attachments": msg.attachments})
+                result.extend(tiles)
+                changed = True
+                continue
+
+        # 2. Handle image attachments
+        if msg.attachments:
+            new_attachments: list[Attachment] = []
+            attachments_changed = False
+
+            for att in msg.attachments:
+                if not att.is_image:
+                    new_attachments.append(att)
+                    continue
+
+                try:
+                    with Image.open(BytesIO(att.content)) as img:
+                        w, h = img.size
+                except Exception:
+                    new_attachments.append(att)
+                    continue
+
+                att_within_limits = w <= config.max_dimension and h <= config.max_dimension and w * h <= config.max_pixels
+                if att_within_limits:
+                    new_attachments.append(att)
+                    continue
+
+                # Tile the oversized attachment image
+                processed = process_image(att.content, config=config)
+                att_prefix = att.name.rsplit(".", 1)[0] if "." in att.name else att.name
+
+                for part in processed.parts:
+                    if part.total == 1:
+                        tile_name = f"{att_prefix}.jpg"
+                        tile_desc = att.description
+                    else:
+                        tile_name = f"{att_prefix}_{part.index + 1:02d}_of_{part.total:02d}.jpg"
+                        tile_desc = f"{att.description} ({part.label})" if att.description else part.label
+
+                    new_attachments.append(
+                        Attachment(
+                            name=tile_name,
+                            content=part.data,
+                            description=tile_desc,
+                        )
+                    )
+                    attachments_changed = True
+
+            if attachments_changed:
+                msg = msg.model_copy(update={"attachments": tuple(new_attachments)})  # noqa: PLW2901
+                changed = True
+
+        result.append(msg)
+
+    if not changed:
+        return messages
+    return AIMessages(result)


 def _process_messages(
     context: AIMessages,
     messages: AIMessages,
     system_prompt: str | None = None,
+    cache_ttl: str | None = "300s",
 ) -> list[ChatCompletionMessageParam]:
     """Process and format messages for LLM API consumption.

     Internal function that combines context and messages into a single
     list of API-compatible messages. Applies caching directives to
-    context messages for efficiency.
+    system prompt and context messages for efficiency.

     Args:
         context: Messages to be cached (typically expensive/static content).
         messages: Regular messages without caching (dynamic queries).
         system_prompt: Optional system instructions for the model.
+        cache_ttl: Cache TTL for system and context messages (e.g. "120s", "300s", "1h").
+            Set to None or empty string to disable caching.

     Returns:
         List of formatted messages ready for API calls, with:
-        - System prompt at the beginning (if provided)
-        - Context messages with cache_control on
+        - System prompt at the beginning with cache_control (if provided and cache_ttl set)
+        - Context messages with cache_control on all messages (if cache_ttl set)
         - Regular messages without caching

     System Prompt Location:
-        The system prompt
-
-        allowing dynamic system prompts without breaking cache efficiency.
+        The system prompt parameter is always injected as the FIRST message
+        with role="system". It is cached along with context when cache_ttl is set.

     Cache behavior:
-
+        All system and context messages get ephemeral caching with specified TTL
         to reduce token usage on repeated calls with same context.
+        If cache_ttl is None or empty string (falsy), no caching is applied.
+        All system and context messages receive cache_control to maximize cache efficiency.

-
-
-    The context/messages split enables efficient token usage.
+    This is an internal function used by _generate_with_retry().
+    The context/messages split enables efficient token usage.
     """
     processed_messages: list[ChatCompletionMessageParam] = []

     # Add system prompt if provided
     if system_prompt:
-        processed_messages.append({
+        processed_messages.append({
+            "role": "system",
+            "content": [{"type": "text", "text": system_prompt}],
+        })

     # Process context messages with caching if provided
     if context:
         # Use AIMessages.to_prompt() for context
         context_messages = context.to_prompt()
-
-        # Apply caching to last context message
-        context_messages[-1]["cache_control"] = {  # type: ignore
-            "type": "ephemeral",
-            "ttl": "120s",  # Cache for 2m
-        }
-
         processed_messages.extend(context_messages)

+    if cache_ttl:
+        for message in processed_messages:
+            message["cache_control"] = {  # type: ignore
+                "type": "ephemeral",
+                "ttl": cache_ttl,
+            }
+            if isinstance(message["content"], list):  # type: ignore
+                message["content"][-1]["cache_control"] = {  # type: ignore
+                    "type": "ephemeral",
+                    "ttl": cache_ttl,
+                }
+
     # Process regular messages without caching
     if messages:
         regular_messages = messages.to_prompt()
@@ -96,56 +220,157 @@ def _process_messages(
     return processed_messages


-
-
+def _remove_cache_control(
+    messages: list[ChatCompletionMessageParam],
+) -> list[ChatCompletionMessageParam]:
+    """Remove cache control directives from messages.
+
+    Internal utility that strips cache_control fields from both message-level
+    and content-level entries. Used in retry logic when cache-related errors
+    occur during LLM API calls.
+
+    Args:
+        messages: List of messages that may contain cache_control directives.
+
+    Returns:
+        The same message list (modified in-place) with all cache_control
+        fields removed from both messages and their content items.
+
+    Modifies the input list in-place but also returns it for convenience.
+    Handles both list-based content (multipart) and string content (simple messages).
+    """
+    for message in messages:
+        if (content := message.get("content")) and isinstance(content, list):
+            for item in content:
+                if "cache_control" in item:
+                    del item["cache_control"]
+        if "cache_control" in message:
+            del message["cache_control"]
+    return messages
+
+
+def _model_name_to_openrouter_model(model: ModelName) -> str:
+    """Convert a model name to an OpenRouter model name.
+
+    Args:
+        model: Model name to convert.
+
+    Returns:
+        OpenRouter model name.
+    """
+    if model == "gemini-3-flash-search":
+        return "google/gemini-3-flash:online"
+    if model == "sonar-pro-search":
+        return "perplexity/sonar-pro-search"
+    if model.startswith("gemini"):
+        return f"google/{model}"
+    elif model.startswith("gpt"):
+        return f"openai/{model}"
+    elif model.startswith("grok"):
+        return f"x-ai/{model}"
+    elif model.startswith("claude"):
+        return f"anthropic/{model}"
+    elif model.startswith("qwen3"):
+        return f"qwen/{model}"
+    elif model.startswith("deepseek-"):
+        return f"deepseek/{model}"
+    elif model.startswith("glm-"):
+        return f"z-ai/{model}"
+    elif model.startswith("kimi-"):
+        return f"moonshotai/{model}"
+    return model
+
+
+async def _generate_streaming(client: AsyncOpenAI, model: str, messages: list[ChatCompletionMessageParam], completion_kwargs: dict[str, Any]) -> ModelResponse:
+    """Execute a streaming LLM API call."""
+    start_time = time.time()
+    first_token_time = None
+    usage = None
+    async with client.chat.completions.stream(
+        model=model,
+        messages=messages,
+        **completion_kwargs,
+    ) as s:
+        async for event in s:
+            if isinstance(event, ContentDeltaEvent):
+                if not first_token_time:
+                    first_token_time = time.time()
+            elif isinstance(event, ContentDoneEvent):
+                pass
+            elif isinstance(event, ChunkEvent) and event.chunk.usage:
+                usage = event.chunk.usage
+        if not first_token_time:
+            first_token_time = time.time()
+        raw_response = await s.get_final_completion()
+
+    metadata = {
+        "time_taken": round(time.time() - start_time, 2),
+        "first_token_time": round(first_token_time - start_time, 2),
+    }
+    return ModelResponse(raw_response, model_options=completion_kwargs, metadata=metadata, usage=usage)
+
+
+async def _generate_non_streaming(
+    client: AsyncOpenAI, model: str, messages: list[ChatCompletionMessageParam], completion_kwargs: dict[str, Any]
 ) -> ModelResponse:
-    """Execute a
+    """Execute a non-streaming LLM API call.

-
-
+    Avoids OpenAI SDK delta accumulation — some providers (e.g. Grok) send
+    streaming annotation deltas that crash the SDK's accumulate_delta().
+    """
+    start_time = time.time()
+    kwargs = {k: v for k, v in completion_kwargs.items() if k != "stream_options"}
+    response_format = kwargs.get("response_format")
+    if isinstance(response_format, type) and issubclass(response_format, BaseModel):
+        raw_response: ChatCompletion = await client.chat.completions.parse(
+            model=model,
+            messages=messages,
+            **kwargs,
+        )
+    else:
+        raw_response = await client.chat.completions.create(
+            model=model,
+            messages=messages,
+            stream=False,
+            **kwargs,
+        )
+    elapsed = round(time.time() - start_time, 2)
+    metadata = {"time_taken": elapsed, "first_token_time": elapsed}
+    return ModelResponse(raw_response, model_options=completion_kwargs, metadata=metadata)
+
+
+async def _generate(model: str, messages: list[ChatCompletionMessageParam], completion_kwargs: dict[str, Any], *, stream: bool = True) -> ModelResponse:
+    """Execute a single LLM API call.

     Args:
-        model: Model identifier (e.g., "gpt-5", "gemini-
+        model: Model identifier (e.g., "gpt-5.1", "gemini-3-pro").
         messages: Formatted messages for the API.
         completion_kwargs: Additional parameters for the completion API.
+        stream: Whether to use streaming mode (default True). Non-streaming
+            avoids OpenAI SDK delta accumulation issues with some providers.

     Returns:
         ModelResponse with generated content and metadata.
-
-    API selection:
-        - Uses client.chat.completions.parse() for structured output
-        - Uses client.chat.completions.create() for regular text
-
-    Note:
-        - Uses AsyncOpenAI client configured via settings
-        - Captures response headers for cost tracking
-        - Response includes model options for debugging
     """
+    if "openrouter" in settings.openai_base_url.lower():
+        model = _model_name_to_openrouter_model(model)
+
     async with AsyncOpenAI(
         api_key=settings.openai_api_key,
         base_url=settings.openai_base_url,
     ) as client:
-
-
-
-                **completion_kwargs,
-            )
-        else:
-            raw_response = await client.chat.completions.with_raw_response.create(  # type: ignore[var-annotated]
-                **completion_kwargs
-            )
-
-        response = ModelResponse(raw_response.parse())  # type: ignore[arg-type]
-        response.set_model_options(completion_kwargs)
-        response.set_headers(dict(raw_response.headers.items()))  # type: ignore[arg-type]
-        return response
+        if stream:
+            return await _generate_streaming(client, model, messages, completion_kwargs)
+        return await _generate_non_streaming(client, model, messages, completion_kwargs)


-async def _generate_with_retry(
+async def _generate_with_retry(  # noqa: PLR0917
     model: str,
     context: AIMessages,
     messages: AIMessages,
     options: ModelOptions,
+    purpose: str | None = None,
+    expected_cost: float | None = None,
 ) -> ModelResponse:
     """Core LLM generation with automatic retry logic.

@@ -157,6 +382,8 @@ async def _generate_with_retry(
         context: Cached context messages (can be empty).
         messages: Dynamic query messages.
         options: Configuration including retries, timeout, temperature.
+        purpose: Optional semantic label for the LLM span name.
+        expected_cost: Optional expected cost for cost-tracking attributes.

     Returns:
         ModelResponse with generated content.
@@ -165,45 +392,51 @@
         ValueError: If model is not provided or both context and messages are empty.
         LLMError: If all retry attempts are exhausted.

-
-    Empty responses trigger a retry as they indicate API issues.
+    Empty responses trigger a retry as they indicate API issues.
     """
     if not model:
         raise ValueError("Model must be provided")
     if not context and not messages:
         raise ValueError("Either context or messages must be provided")

-
+    # Auto-split large images based on model-specific constraints
+    context = _prepare_images_for_model(context, model)
+    messages = _prepare_images_for_model(messages, model)
+
+    if "gemini" in model.lower() and context.approximate_tokens_count < 10000:
+        # Bug fix for minimum explicit context size for Gemini models
+        options.cache_ttl = None
+
+    processed_messages = _process_messages(context, messages, options.system_prompt, options.cache_ttl)
     completion_kwargs: dict[str, Any] = {
-        "model": model,
-        "messages": processed_messages,
         **options.to_openai_completion_kwargs(),
     }

-    if context:
+    if context and options.cache_ttl:
         completion_kwargs["prompt_cache_key"] = context.get_prompt_cache_key(options.system_prompt)

     for attempt in range(options.retries):
         try:
-            with Laminar.start_as_current_span(
-                model,
-
-
-
-
-
-
+            with Laminar.start_as_current_span(purpose or model, span_type="LLM", input=processed_messages) as span:
+                response = await _generate(model, processed_messages, completion_kwargs, stream=options.stream)
+                laminar_metadata = response.get_laminar_metadata()
+                if purpose:
+                    laminar_metadata["purpose"] = purpose
+                if expected_cost is not None:
+                    laminar_metadata["expected_cost"] = expected_cost
+                span.set_attributes(laminar_metadata)  # pyright: ignore[reportArgumentType]
+                Laminar.set_span_output([r for r in (response.reasoning_content, response.content) if r])
+                response.validate_output()
                 return response
-        except (
+        except (TimeoutError, ValueError, ValidationError, Exception) as e:
             if not isinstance(e, asyncio.TimeoutError):
                 # disable cache if it's not a timeout because it may cause an error
                 completion_kwargs["extra_body"]["cache"] = {"no-cache": True}
+                # sometimes there are issues with cache so cache is removed in case of failure
+                processed_messages = _remove_cache_control(processed_messages)

             logger.warning(
-                "LLM generation failed (attempt
-                attempt + 1,
-                options.retries,
-                e,
+                f"LLM generation failed (attempt {attempt + 1}/{options.retries}): {e}",
             )
             if attempt == options.retries - 1:
                 raise LLMError("Exhausted all retry attempts for LLM generation.") from e
@@ -213,37 +446,43 @@ async def _generate_with_retry(
     raise LLMError("Unknown error occurred during LLM generation.")


-@trace(ignore_inputs=["context"])
 async def generate(
-    model: ModelName
+    model: ModelName,
     *,
     context: AIMessages | None = None,
     messages: AIMessages | str,
     options: ModelOptions | None = None,
+    purpose: str | None = None,
+    expected_cost: float | None = None,
 ) -> ModelResponse:
     """Generate text response from a language model.

-    @public
-
     Main entry point for LLM text generation with smart context caching.
     The context/messages split enables efficient token usage by caching
     expensive static content separately from dynamic queries.

     Best Practices:
-    1. OPTIONS:
+    1. OPTIONS: DO NOT use the options parameter - omit it entirely for production use
     2. MESSAGES: Use AIMessages or str - wrap Documents in AIMessages
     3. CONTEXT vs MESSAGES: Use context for static/cacheable, messages for dynamic
+    4. CONFIGURATION: Configure model behavior via LiteLLM proxy or environment variables

     Args:
-        model: Model to use (e.g., "gpt-5", "gemini-
-
+        model: Model to use (e.g., "gpt-5.1", "gemini-3-pro", "grok-4.1-fast").
+            Accepts predefined models or any string for custom models.
         context: Static context to cache (documents, examples, instructions).
-            Defaults to None (empty context). Cached for
+            Defaults to None (empty context). Cached for 5 minutes by default.
         messages: Dynamic messages/queries. AIMessages or str ONLY.
-            Do not pass Document or
+            Do not pass Document or list[Document] directly.
             If string, converted to AIMessages internally.
-        options:
-
+        options: Internal framework parameter. Framework defaults are production-optimized
+            (3 retries, 20s delay, 600s timeout). Configure model behavior centrally via
+            LiteLLM proxy settings or environment variables, not per API call.
+            Provider-specific settings should be configured at the proxy level.
+        purpose: Optional semantic label used as the tracing span name
+            instead of model name. Stored as a span attribute.
+        expected_cost: Optional expected cost stored as a span attribute
+            for cost-tracking and comparison with actual cost.

     Returns:
         ModelResponse containing:
@@ -260,75 +499,57 @@ async def generate(
     Wrap Documents in AIMessages - DO NOT pass directly or convert to .text:

         # CORRECT - wrap Document in AIMessages
-        response = await llm.generate("gpt-5", messages=AIMessages([my_document]))
+        response = await llm.generate("gpt-5.1", messages=AIMessages([my_document]))

         # WRONG - don't pass Document directly
-        response = await llm.generate("gpt-5", messages=my_document)  # NO!
+        response = await llm.generate("gpt-5.1", messages=my_document)  # NO!

         # WRONG - don't convert to string yourself
-        response = await llm.generate("gpt-5", messages=my_document.text)  # NO!
+        response = await llm.generate("gpt-5.1", messages=my_document.text)  # NO!
+
+    VISION/PDF MODEL COMPATIBILITY:
+    When using Documents containing images or PDFs, ensure your model supports these formats:
+    - Images require vision-capable models (gpt-5.1, gemini-3-flash, gemini-3-pro)
+    - PDFs require document processing support (varies by provider)
+    - Non-compatible models will raise ValueError or fall back to text extraction
+    - Check model capabilities before including visual/PDF content

     Context vs Messages Strategy:
-        context: Static, reusable content
+        context: Static, reusable content for caching efficiency
         - Large documents, instructions, examples
-        -
+        - Remains constant across multiple calls
+        - Cached when supported by provider/proxy configuration

-        messages: Dynamic,
+        messages: Dynamic, per-call specific content
         - User questions, current conversation turn
-        - Changes
-
-    Example:
-        >>> # Simple case - no options needed (90% of cases)
-        >>> response = await llm.generate("gpt-5", messages="Explain quantum computing")
-        >>> print(response.content)  # In production, use get_pipeline_logger instead of print
-
-        >>> # With context caching for efficiency
-        >>> # Context and messages are both AIMessages or str; wrap any Documents
-        >>> static_doc = AIMessages([large_document, "few-shot example: ..."])
-        >>>
-        >>> # First call: caches context
-        >>> r1 = await llm.generate("gpt-5", context=static_doc, messages="Summarize")
-        >>>
-        >>> # Second call: reuses cache, saves tokens!
-        >>> r2 = await llm.generate("gpt-5", context=static_doc, messages="Key points?")
-
-        >>> # AVOID unnecessary options (defaults are optimal)
-        >>> response = await llm.generate(
-        ...     "gpt-5",
-        ...     messages="Hello",
-        ...     options=ModelOptions(temperature=0.7)  # Default is probably fine!
-        ... )
-
-        >>> # Multi-turn conversation
-        >>> messages = AIMessages([
-        ...     "What is Python?",
-        ...     previous_response,
-        ...     "Can you give an example?"
-        ... ])
-        >>> response = await llm.generate("gpt-5", messages=messages)
+        - Changes with each API call
+        - Never cached, always processed fresh

     Performance:
         - Context caching saves ~50-90% tokens on repeated calls
         - First call: full token cost
-        - Subsequent calls (within
-        - Default
+        - Subsequent calls (within cache TTL): only messages tokens
+        - Default cache TTL is 300s/5 minutes (production-optimized)
+        - Default retry logic: 3 attempts with 20s delay (production-optimized)

     Caching:
         When enabled in your LiteLLM proxy and supported by the upstream provider,
-        context messages may be cached
-
-
-
-
-
-
-        -
-        -
-
-
-
-
-
+        context messages may be cached to reduce token usage on repeated calls.
+        Default TTL is 5m (optimized for production workloads). Configure caching
+        behavior centrally via your LiteLLM proxy settings, not per API call.
+        Savings depend on provider and payload; treat this as an optimization, not a guarantee.
+
+    Configuration:
+        All model behavior should be configured at the LiteLLM proxy level:
+        - Temperature, max_tokens: Set in litellm_config.yaml model_list
+        - Retry logic: Configure in proxy general_settings
+        - Timeouts: Set via proxy configuration
+        - Caching: Enable/configure in proxy cache settings
+
+        This centralizes configuration and ensures consistency across all API calls.
+
+    All models are accessed via LiteLLM proxy with automatic retry and
+    cost tracking via response headers.
     """
     if isinstance(messages, str):
         messages = AIMessages([messages])
@@ -337,9 +558,22 @@ async def generate(
         context = AIMessages()
     if options is None:
         options = ModelOptions()
+    else:
+        # Create a copy to avoid mutating the caller's options object
+        options = options.model_copy()
+
+    with contextlib.suppress(Exception):
+        track_llm_documents(context, messages)

     try:
-        return await _generate_with_retry(
+        return await _generate_with_retry(
+            model,
+            context,
+            messages,
+            options,
+            purpose=purpose,
+            expected_cost=expected_cost,
+        )
     except (ValueError, LLMError):
         raise  # Explicitly re-raise to satisfy DOC502

@@ -348,36 +582,87 @@ T = TypeVar("T", bound=BaseModel)
 """Type variable for Pydantic model types in structured generation."""


-
-
-    model: ModelName | str,
+async def generate_structured(  # noqa: UP047
+    model: ModelName,
     response_format: type[T],
     *,
     context: AIMessages | None = None,
     messages: AIMessages | str,
     options: ModelOptions | None = None,
+    purpose: str | None = None,
+    expected_cost: float | None = None,
 ) -> StructuredModelResponse[T]:
     """Generate structured output conforming to a Pydantic model.

-    @public
-
     Type-safe generation that returns validated Pydantic model instances.
     Uses OpenAI's structured output feature for guaranteed schema compliance.

-
-
+    IMPORTANT: Search models (models with '-search' suffix) do not support
+    structured output. Use generate() instead for search models.
+
+    Best Practices:
+    1. OPTIONS: DO NOT use the options parameter - omit it entirely for production use
     2. MESSAGES: Use AIMessages or str - wrap Documents in AIMessages
-    3.
+    3. CONFIGURATION: Configure model behavior via LiteLLM proxy or environment variables
+    4. See generate() documentation for more details
+
+    Context vs Messages Strategy:
+        context: Static, reusable content for caching efficiency
+        - Schemas, examples, instructions
+        - Remains constant across multiple calls
+        - Cached when supported by provider/proxy configuration
+
+        messages: Dynamic, per-call specific content
+        - Data to be structured, user queries
+        - Changes with each API call
+        - Never cached, always processed fresh
+
+    Complex Task Pattern:
+        For complex tasks like research or deep analysis, it's recommended to use
+        a two-step approach:
+        1. First use generate() with a capable model to perform the analysis
+        2. Then use generate_structured() with a smaller model to convert the
+           response into structured output
+
+        This pattern is more reliable than trying to force complex reasoning
+        directly into structured format:
+
+        >>> # Step 1: Research/analysis with generate() - no options parameter
+        >>> research = await llm.generate(
+        ...     "gpt-5.1",
+        ...     messages="Research and analyze this complex topic..."
+        ... )
+        >>>
+        >>> # Step 2: Structure the results with generate_structured()
+        >>> structured = await llm.generate_structured(
+        ...     "gpt-5-mini",  # Smaller model is fine for structuring
+        ...     response_format=ResearchSummary,
+        ...     messages=f"Extract key information: {research.content}"
+        ... )

     Args:
         model: Model to use (must support structured output).
+            Search models (models with '-search' suffix) do not support structured output.
         response_format: Pydantic model class defining the output schema.
             The model will generate JSON matching this schema.
         context: Static context to cache (documents, schemas, examples).
             Defaults to None (empty AIMessages).
         messages: Dynamic prompts/queries. AIMessages or str ONLY.
-            Do not pass Document or
-        options:
+            Do not pass Document or list[Document] directly.
+        options: Optional ModelOptions for configuring temperature, retries, etc.
+            If provided, it will NOT be mutated (a copy is created internally).
+            The response_format field is set automatically from the response_format parameter.
+            In most cases, leave as None to use framework defaults.
+            Configure model behavior centrally via LiteLLM proxy settings when possible.
+        purpose: Optional semantic label used as the tracing span name
+            instead of model name. Stored as a span attribute.
+        expected_cost: Optional expected cost stored as a span attribute
+            for cost-tracking and comparison with actual cost.
+
+    Vision/PDF model compatibility: Images require vision-capable models that also support
+    structured output. PDFs require models with both document processing AND structured output
+    support. Consider two-step approach: generate() for analysis, then generate_structured()
+    for formatting.

     Returns:
         StructuredModelResponse[T] containing:
@@ -387,89 +672,57 @@ async def generate_structured(
     Raises:
         TypeError: If response_format is not a Pydantic model class.
         ValueError: If model doesn't support structured output or no parsed content returned.
+            Structured output support varies by provider and model.
         LLMError: If generation fails after retries.
         ValidationError: If response cannot be parsed into response_format.

-    Example:
-        >>> from pydantic import BaseModel, Field
-        >>>
-        >>> class Analysis(BaseModel):
-        ...     summary: str = Field(description="Brief summary")
-        ...     sentiment: float = Field(ge=-1, le=1)
-        ...     key_points: list[str] = Field(max_length=5)
-        >>>
-        >>> response = await llm.generate_structured(
-        ...     model="gpt-5",
-        ...     response_format=Analysis,
-        ...     messages="Analyze this product review: ..."
-        ... )
-        >>>
-        >>> analysis = response.parsed  # Type: Analysis
-        >>> print(f"Sentiment: {analysis.sentiment}")
-        >>> for point in analysis.key_points:
-        ...     print(f"- {point}")
-
     Supported models:
-
+        Structured output support varies by provider and model. Generally includes:
         - OpenAI: GPT-4 and newer models
         - Anthropic: Claude 3+ models
         - Google: Gemini Pro models
-
+
+        Search models (models with '-search' suffix) do not support structured output.
+        Check provider documentation for specific support.

     Performance:
         - Structured output may use more tokens than free text
         - Complex schemas increase generation time
         - Validation overhead is minimal (Pydantic is fast)

-
-
-
-        - Validation happens automatically via Pydantic
-        - Use Field() descriptions to guide generation
-
-    See Also:
-        - generate: For unstructured text generation
-        - ModelOptions: Configuration including response_format
-        - StructuredModelResponse: Response wrapper with .parsed property
+    Pydantic model is converted to JSON Schema for the API. Validation happens
+    automatically via Pydantic. Search models (models with '-search' suffix) do
+    not support structured output.
     """
     if context is None:
         context = AIMessages()
     if options is None:
         options = ModelOptions()
+    else:
+        # Create a copy to avoid mutating the caller's options object
+        options = options.model_copy()

     options.response_format = response_format

     if isinstance(messages, str):
         messages = AIMessages([messages])

+    assert isinstance(messages, AIMessages)
+
+    with contextlib.suppress(Exception):
+        track_llm_documents(context, messages)
+
     # Call the internal generate function with structured output enabled
     try:
-        response = await _generate_with_retry(
+        response = await _generate_with_retry(
+            model,
+            context,
+            messages,
+            options,
+            purpose=purpose,
+            expected_cost=expected_cost,
+        )
     except (ValueError, LLMError):
         raise  # Explicitly re-raise to satisfy DOC502

-
-    parsed_value: T | None = None
-
-    # Check if response has choices and parsed content
-    if response.choices and hasattr(response.choices[0].message, "parsed"):
-        parsed: Any = response.choices[0].message.parsed  # type: ignore[attr-defined]
-
-        # If parsed is a dict, instantiate it as the response format class
-        if isinstance(parsed, dict):
-            parsed_value = response_format(**parsed)
-        # If it's already the right type, use it
-        elif isinstance(parsed, response_format):
-            parsed_value = parsed
-        else:
-            # Otherwise try to convert it
-            raise TypeError(
-                f"Unable to convert parsed response to {response_format.__name__}: "
-                f"got type {type(parsed).__name__}"  # type: ignore[reportUnknownArgumentType]
-            )
-
-    if parsed_value is None:
-        raise ValueError("No parsed content available from the model response")
-
-    # Create a StructuredModelResponse with the parsed value
-    return StructuredModelResponse[T](chat_completion=response, parsed_value=parsed_value)
+    return StructuredModelResponse[T].from_model_response(response)
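The diff above shows generate() and generate_structured() gaining purpose and expected_cost parameters for tracing/cost tracking, plus the documented two-step analyze-then-structure pattern. The sketch below is a minimal usage illustration assembled from those docstrings; it is not taken from the package's own examples. Assumptions: the functions are importable as shown (the exact public import path is not confirmed by this diff), `response.content` and `structured.parsed` behave as the docstrings describe, and the ResearchSummary schema is purely illustrative.

```python
# Hypothetical usage sketch based on the client.py docstrings in this diff.
import asyncio

from pydantic import BaseModel, Field

# Assumed import paths; the diff only shows intra-package imports.
from ai_pipeline_core import llm
from ai_pipeline_core.llm import AIMessages


class ResearchSummary(BaseModel):
    """Illustrative schema for the structuring step (not from the package)."""
    summary: str = Field(description="Brief summary")
    key_points: list[str] = Field(max_length=5)


async def main() -> None:
    # Static, reusable material goes in `context` so the proxy can cache it;
    # only the short per-call question is sent as `messages`.
    context = AIMessages(["<large reference text, or Documents wrapped in AIMessages>"])

    # Step 1: free-form analysis. `purpose` names the tracing span and
    # `expected_cost` is stored as a span attribute for cost comparison.
    research = await llm.generate(
        "gpt-5.1",
        context=context,
        messages="Summarize the key findings.",
        purpose="research-summary",
        expected_cost=0.05,
    )

    # Step 2: convert the free-form answer into structured output with a
    # smaller model, as the generate_structured() docstring recommends.
    structured = await llm.generate_structured(
        "gpt-5-mini",
        response_format=ResearchSummary,
        messages=f"Extract key information: {research.content}",
    )
    # .parsed is the validated Pydantic instance per the StructuredModelResponse docs.
    print(structured.parsed.key_points)


if __name__ == "__main__":
    asyncio.run(main())
```

Repeating the first call with the same `context` within the cache TTL is what the docstrings describe as the 50-90% token saving; treat that as a proxy/provider-dependent optimization rather than a guarantee.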