ai-pipeline-core 0.3.4__py3-none-any.whl → 0.4.0__py3-none-any.whl
This diff compares publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the changes between those versions as they appear in the public registry.
- ai_pipeline_core/__init__.py +64 -158
- ai_pipeline_core/deployment/__init__.py +6 -18
- ai_pipeline_core/deployment/base.py +392 -212
- ai_pipeline_core/deployment/contract.py +6 -10
- ai_pipeline_core/{utils → deployment}/deploy.py +50 -69
- ai_pipeline_core/deployment/helpers.py +16 -17
- ai_pipeline_core/{progress.py → deployment/progress.py} +23 -24
- ai_pipeline_core/{utils/remote_deployment.py → deployment/remote.py} +11 -14
- ai_pipeline_core/docs_generator/__init__.py +54 -0
- ai_pipeline_core/docs_generator/__main__.py +5 -0
- ai_pipeline_core/docs_generator/cli.py +196 -0
- ai_pipeline_core/docs_generator/extractor.py +324 -0
- ai_pipeline_core/docs_generator/guide_builder.py +644 -0
- ai_pipeline_core/docs_generator/trimmer.py +35 -0
- ai_pipeline_core/docs_generator/validator.py +114 -0
- ai_pipeline_core/document_store/__init__.py +13 -0
- ai_pipeline_core/document_store/_summary.py +9 -0
- ai_pipeline_core/document_store/_summary_worker.py +170 -0
- ai_pipeline_core/document_store/clickhouse.py +492 -0
- ai_pipeline_core/document_store/factory.py +38 -0
- ai_pipeline_core/document_store/local.py +312 -0
- ai_pipeline_core/document_store/memory.py +85 -0
- ai_pipeline_core/document_store/protocol.py +68 -0
- ai_pipeline_core/documents/__init__.py +12 -14
- ai_pipeline_core/documents/_context_vars.py +85 -0
- ai_pipeline_core/documents/_hashing.py +52 -0
- ai_pipeline_core/documents/attachment.py +85 -0
- ai_pipeline_core/documents/context.py +128 -0
- ai_pipeline_core/documents/document.py +318 -1434
- ai_pipeline_core/documents/mime_type.py +11 -84
- ai_pipeline_core/documents/utils.py +4 -12
- ai_pipeline_core/exceptions.py +10 -62
- ai_pipeline_core/images/__init__.py +32 -85
- ai_pipeline_core/images/_processing.py +5 -11
- ai_pipeline_core/llm/__init__.py +6 -4
- ai_pipeline_core/llm/ai_messages.py +102 -90
- ai_pipeline_core/llm/client.py +229 -183
- ai_pipeline_core/llm/model_options.py +12 -84
- ai_pipeline_core/llm/model_response.py +53 -99
- ai_pipeline_core/llm/model_types.py +8 -23
- ai_pipeline_core/logging/__init__.py +2 -7
- ai_pipeline_core/logging/logging.yml +1 -1
- ai_pipeline_core/logging/logging_config.py +27 -37
- ai_pipeline_core/logging/logging_mixin.py +15 -41
- ai_pipeline_core/observability/__init__.py +32 -0
- ai_pipeline_core/observability/_debug/__init__.py +30 -0
- ai_pipeline_core/observability/_debug/_auto_summary.py +94 -0
- ai_pipeline_core/{debug/config.py → observability/_debug/_config.py} +11 -7
- ai_pipeline_core/{debug/content.py → observability/_debug/_content.py} +133 -75
- ai_pipeline_core/{debug/processor.py → observability/_debug/_processor.py} +16 -17
- ai_pipeline_core/{debug/summary.py → observability/_debug/_summary.py} +113 -37
- ai_pipeline_core/observability/_debug/_types.py +75 -0
- ai_pipeline_core/{debug/writer.py → observability/_debug/_writer.py} +126 -196
- ai_pipeline_core/observability/_document_tracking.py +146 -0
- ai_pipeline_core/observability/_initialization.py +194 -0
- ai_pipeline_core/observability/_logging_bridge.py +57 -0
- ai_pipeline_core/observability/_summary.py +81 -0
- ai_pipeline_core/observability/_tracking/__init__.py +6 -0
- ai_pipeline_core/observability/_tracking/_client.py +178 -0
- ai_pipeline_core/observability/_tracking/_internal.py +28 -0
- ai_pipeline_core/observability/_tracking/_models.py +138 -0
- ai_pipeline_core/observability/_tracking/_processor.py +158 -0
- ai_pipeline_core/observability/_tracking/_service.py +311 -0
- ai_pipeline_core/observability/_tracking/_writer.py +229 -0
- ai_pipeline_core/{tracing.py → observability/tracing.py} +139 -335
- ai_pipeline_core/pipeline/__init__.py +10 -0
- ai_pipeline_core/pipeline/decorators.py +915 -0
- ai_pipeline_core/pipeline/options.py +16 -0
- ai_pipeline_core/prompt_manager.py +16 -102
- ai_pipeline_core/settings.py +26 -31
- ai_pipeline_core/testing.py +9 -0
- ai_pipeline_core-0.4.0.dist-info/METADATA +807 -0
- ai_pipeline_core-0.4.0.dist-info/RECORD +76 -0
- ai_pipeline_core/debug/__init__.py +0 -26
- ai_pipeline_core/documents/document_list.py +0 -420
- ai_pipeline_core/documents/flow_document.py +0 -112
- ai_pipeline_core/documents/task_document.py +0 -117
- ai_pipeline_core/documents/temporary_document.py +0 -74
- ai_pipeline_core/flow/__init__.py +0 -9
- ai_pipeline_core/flow/config.py +0 -494
- ai_pipeline_core/flow/options.py +0 -75
- ai_pipeline_core/pipeline.py +0 -718
- ai_pipeline_core/prefect.py +0 -63
- ai_pipeline_core/prompt_builder/__init__.py +0 -5
- ai_pipeline_core/prompt_builder/documents_prompt.jinja2 +0 -23
- ai_pipeline_core/prompt_builder/global_cache.py +0 -78
- ai_pipeline_core/prompt_builder/new_core_documents_prompt.jinja2 +0 -6
- ai_pipeline_core/prompt_builder/prompt_builder.py +0 -253
- ai_pipeline_core/prompt_builder/system_prompt.jinja2 +0 -41
- ai_pipeline_core/storage/__init__.py +0 -8
- ai_pipeline_core/storage/storage.py +0 -628
- ai_pipeline_core/utils/__init__.py +0 -8
- ai_pipeline_core-0.3.4.dist-info/METADATA +0 -569
- ai_pipeline_core-0.3.4.dist-info/RECORD +0 -57
- {ai_pipeline_core-0.3.4.dist-info → ai_pipeline_core-0.4.0.dist-info}/WHEEL +0 -0
- {ai_pipeline_core-0.3.4.dist-info → ai_pipeline_core-0.4.0.dist-info}/licenses/LICENSE +0 -0
ai_pipeline_core/llm/client.py
CHANGED
```diff
@@ -1,17 +1,17 @@
 """LLM client implementation for AI model interactions.
 
-@public
-
 This module provides the core functionality for interacting with language models
 through a unified interface. It handles retries, caching, structured outputs,
 and integration with various LLM providers via LiteLLM.
 
-
-
-
+Automatic image auto-tiling splits oversized images in attachments to meet
+model-specific constraints (e.g., 3000x3000 for Gemini, 1000x1000 default).
+Context caching separates static content from dynamic messages for 50-90% token savings.
+Optional purpose and expected_cost parameters enable tracing and cost-tracking.
 """
 
 import asyncio
+import contextlib
 import time
 from io import BytesIO
 from typing import Any, TypeVar
@@ -20,15 +20,18 @@ from lmnr import Laminar
 from openai import AsyncOpenAI
 from openai.lib.streaming.chat import ChunkEvent, ContentDeltaEvent, ContentDoneEvent
 from openai.types.chat import (
+    ChatCompletion,
     ChatCompletionMessageParam,
 )
 from PIL import Image
-from prefect.logging import get_logger
 from pydantic import BaseModel, ValidationError
 
 from ai_pipeline_core.documents import Document
+from ai_pipeline_core.documents.attachment import Attachment
 from ai_pipeline_core.exceptions import LLMError
-from ai_pipeline_core.images import ImageProcessingConfig, process_image_to_documents
+from ai_pipeline_core.images import ImageProcessingConfig, process_image, process_image_to_documents
+from ai_pipeline_core.logging import get_pipeline_logger
+from ai_pipeline_core.observability._document_tracking import track_llm_documents
 from ai_pipeline_core.settings import settings
 
 from .ai_messages import AIMessages, AIMessageType
@@ -36,16 +39,12 @@ from .model_options import ModelOptions
 from .model_response import ModelResponse, StructuredModelResponse
 from .model_types import ModelName
 
-logger =
+logger = get_pipeline_logger(__name__)
 
 # Image splitting configs for automatic large-image handling at the LLM boundary.
 # Gemini supports up to 3000x3000; all other models use a conservative 1000x1000 default.
-_GEMINI_IMAGE_CONFIG = ImageProcessingConfig(
-
-)
-_DEFAULT_IMAGE_CONFIG = ImageProcessingConfig(
-    max_dimension=1000, max_pixels=1_000_000, jpeg_quality=75
-)
+_GEMINI_IMAGE_CONFIG = ImageProcessingConfig(max_dimension=3000, max_pixels=9_000_000, jpeg_quality=75)
+_DEFAULT_IMAGE_CONFIG = ImageProcessingConfig(max_dimension=1000, max_pixels=1_000_000, jpeg_quality=75)
 
 
 def _get_image_config(model: str) -> ImageProcessingConfig:
@@ -55,13 +54,13 @@ def _get_image_config(model: str) -> ImageProcessingConfig:
     return _DEFAULT_IMAGE_CONFIG
 
 
-def _prepare_images_for_model(messages: AIMessages, model: str) -> AIMessages:
-    """Split image documents that exceed model constraints.
+def _prepare_images_for_model(messages: AIMessages, model: str) -> AIMessages: # noqa: C901, PLR0912, PLR0915, PLR0914
+    """Split image documents and image attachments that exceed model constraints.
 
     Returns a new AIMessages with oversized images replaced by tiles.
     Returns the original instance unchanged if no splitting is needed.
     """
-    if not any(isinstance(m, Document) and m.is_image for m in messages):
+    if not any(isinstance(m, Document) and (m.is_image or any(att.is_image for att in m.attachments)) for m in messages):
         return messages
 
     config = _get_image_config(model)
@@ -69,25 +68,79 @@ def _prepare_images_for_model(messages: AIMessages, model: str) -> AIMessages:
     changed = False
 
     for msg in messages:
-        if not
+        if not isinstance(msg, Document):
             result.append(msg)
             continue
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        # 1. Handle top-level image Documents (existing logic)
+        if msg.is_image:
+            try:
+                with Image.open(BytesIO(msg.content)) as img:
+                    w, h = img.size
+            except Exception:
+                result.append(msg)
+                continue
+
+            within_limits = w <= config.max_dimension and h <= config.max_dimension and w * h <= config.max_pixels
+            if within_limits:
+                pass # Falls through to attachment handling
+            else:
+                name_prefix = msg.name.rsplit(".", 1)[0] if "." in msg.name else msg.name
+                tiles = process_image_to_documents(msg, config=config, name_prefix=name_prefix)
+                if msg.attachments and tiles:
+                    tiles[0] = tiles[0].model_copy(update={"attachments": msg.attachments})
+                result.extend(tiles)
+                changed = True
+                continue
+
+        # 2. Handle image attachments
+        if msg.attachments:
+            new_attachments: list[Attachment] = []
+            attachments_changed = False
+
+            for att in msg.attachments:
+                if not att.is_image:
+                    new_attachments.append(att)
+                    continue
+
+                try:
+                    with Image.open(BytesIO(att.content)) as img:
+                        w, h = img.size
+                except Exception:
+                    new_attachments.append(att)
+                    continue
+
+                att_within_limits = w <= config.max_dimension and h <= config.max_dimension and w * h <= config.max_pixels
+                if att_within_limits:
+                    new_attachments.append(att)
+                    continue
+
+                # Tile the oversized attachment image
+                processed = process_image(att.content, config=config)
+                att_prefix = att.name.rsplit(".", 1)[0] if "." in att.name else att.name
+
+                for part in processed.parts:
+                    if part.total == 1:
+                        tile_name = f"{att_prefix}.jpg"
+                        tile_desc = att.description
+                    else:
+                        tile_name = f"{att_prefix}_{part.index + 1:02d}_of_{part.total:02d}.jpg"
+                        tile_desc = f"{att.description} ({part.label})" if att.description else part.label
+
+                    new_attachments.append(
+                        Attachment(
+                            name=tile_name,
+                            content=part.data,
+                            description=tile_desc,
+                        )
+                    )
+                    attachments_changed = True
+
+            if attachments_changed:
+                msg = msg.model_copy(update={"attachments": tuple(new_attachments)}) # noqa: PLW2901
+                changed = True
+
+        result.append(msg)
 
     if not changed:
         return messages
```
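For quick reference, the size gate and the tile-naming scheme used by the new tiling logic above reduce to the following standalone sketch. The helper names here are illustrative only and are not part of the package API; the default thresholds mirror `_DEFAULT_IMAGE_CONFIG` from earlier in this diff.

```python
from io import BytesIO

from PIL import Image


def fits_model_limits(data: bytes, max_dimension: int = 1000, max_pixels: int = 1_000_000) -> bool:
    """Same check as within_limits / att_within_limits in _prepare_images_for_model."""
    with Image.open(BytesIO(data)) as img:
        w, h = img.size
    return w <= max_dimension and h <= max_dimension and w * h <= max_pixels


def tile_name(prefix: str, index: int, total: int) -> str:
    """Naming used for split tiles, e.g. 'scan_01_of_04.jpg'; a single tile keeps 'scan.jpg'."""
    return f"{prefix}.jpg" if total == 1 else f"{prefix}_{index + 1:02d}_of_{total:02d}.jpg"
```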
```diff
@@ -129,9 +182,8 @@ def _process_messages(
     If cache_ttl is None or empty string (falsy), no caching is applied.
     All system and context messages receive cache_control to maximize cache efficiency.
 
-
-
-    The context/messages split enables efficient token usage.
+    This is an internal function used by _generate_with_retry().
+    The context/messages split enables efficient token usage.
     """
     processed_messages: list[ChatCompletionMessageParam] = []
 
@@ -184,20 +236,17 @@ def _remove_cache_control(
         The same message list (modified in-place) with all cache_control
         fields removed from both messages and their content items.
 
-
-
-        for convenience. Handles both list-based content (multipart) and
-        string content (simple messages).
+    Modifies the input list in-place but also returns it for convenience.
+    Handles both list-based content (multipart) and string content (simple messages).
     """
     for message in messages:
-        if content := message.get("content"):
-
-
-
-                del item["cache_control"]
+        if (content := message.get("content")) and isinstance(content, list):
+            for item in content:
+                if "cache_control" in item:
+                    del item["cache_control"]
         if "cache_control" in message:
             del message["cache_control"]
-    return messages
+    return messages
 
 
 def _model_name_to_openrouter_model(model: ModelName) -> str:
```
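To make the tightened `_remove_cache_control` guard concrete, here is a minimal sketch of the two message shapes it handles. The `cache_control` payloads are placeholder examples of provider-specific cache markers; their exact contents are not part of this diff.

```python
from typing import Any

# Multipart message: cache_control may sit on the message and on each content item.
multipart: dict[str, Any] = {
    "role": "system",
    "content": [{"type": "text", "text": "static context", "cache_control": {"type": "ephemeral"}}],
    "cache_control": {"type": "ephemeral"},
}
# Simple message: content is a plain string, so there is nothing to iterate.
simple: dict[str, Any] = {"role": "user", "content": "dynamic question"}

for message in (multipart, simple):
    # The isinstance(content, list) check added in 0.4.0 means string content is
    # skipped entirely, while list content has per-item markers stripped.
    if (content := message.get("content")) and isinstance(content, list):
        for item in content:
            item.pop("cache_control", None)
    message.pop("cache_control", None)
```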
```diff
@@ -232,30 +281,76 @@ def _model_name_to_openrouter_model(model: ModelName) -> str:
     return model
 
 
-async def
-
+async def _generate_streaming(client: AsyncOpenAI, model: str, messages: list[ChatCompletionMessageParam], completion_kwargs: dict[str, Any]) -> ModelResponse:
+    """Execute a streaming LLM API call."""
+    start_time = time.time()
+    first_token_time = None
+    usage = None
+    async with client.chat.completions.stream(
+        model=model,
+        messages=messages,
+        **completion_kwargs,
+    ) as s:
+        async for event in s:
+            if isinstance(event, ContentDeltaEvent):
+                if not first_token_time:
+                    first_token_time = time.time()
+            elif isinstance(event, ContentDoneEvent):
+                pass
+            elif isinstance(event, ChunkEvent) and event.chunk.usage:
+                usage = event.chunk.usage
+        if not first_token_time:
+            first_token_time = time.time()
+        raw_response = await s.get_final_completion()
+
+    metadata = {
+        "time_taken": round(time.time() - start_time, 2),
+        "first_token_time": round(first_token_time - start_time, 2),
+    }
+    return ModelResponse(raw_response, model_options=completion_kwargs, metadata=metadata, usage=usage)
+
+
+async def _generate_non_streaming(
+    client: AsyncOpenAI, model: str, messages: list[ChatCompletionMessageParam], completion_kwargs: dict[str, Any]
 ) -> ModelResponse:
-    """Execute a
+    """Execute a non-streaming LLM API call.
+
+    Avoids OpenAI SDK delta accumulation — some providers (e.g. Grok) send
+    streaming annotation deltas that crash the SDK's accumulate_delta().
+    """
+    start_time = time.time()
+    kwargs = {k: v for k, v in completion_kwargs.items() if k != "stream_options"}
+    response_format = kwargs.get("response_format")
+    if isinstance(response_format, type) and issubclass(response_format, BaseModel):
+        raw_response: ChatCompletion = await client.chat.completions.parse(
+            model=model,
+            messages=messages,
+            **kwargs,
+        )
+    else:
+        raw_response = await client.chat.completions.create(
+            model=model,
+            messages=messages,
+            stream=False,
+            **kwargs,
+        )
+    elapsed = round(time.time() - start_time, 2)
+    metadata = {"time_taken": elapsed, "first_token_time": elapsed}
+    return ModelResponse(raw_response, model_options=completion_kwargs, metadata=metadata)
+
 
-
-
+async def _generate(model: str, messages: list[ChatCompletionMessageParam], completion_kwargs: dict[str, Any], *, stream: bool = True) -> ModelResponse:
+    """Execute a single LLM API call.
 
     Args:
         model: Model identifier (e.g., "gpt-5.1", "gemini-3-pro").
         messages: Formatted messages for the API.
         completion_kwargs: Additional parameters for the completion API.
+        stream: Whether to use streaming mode (default True). Non-streaming
+            avoids OpenAI SDK delta accumulation issues with some providers.
 
     Returns:
         ModelResponse with generated content and metadata.
-
-    API selection:
-    - Uses client.chat.completions.parse() for structured output
-    - Uses client.chat.completions.create() for regular text
-
-    Note:
-    - Uses AsyncOpenAI client configured via settings
-    - Captures response headers for cost tracking
-    - Response includes model options for debugging
     """
     if "openrouter" in settings.openai_base_url.lower():
         model = _model_name_to_openrouter_model(model)
@@ -264,45 +359,18 @@ async def _generate(
         api_key=settings.openai_api_key,
         base_url=settings.openai_base_url,
     ) as client:
-
-
-
-        async with client.chat.completions.stream(
-            model=model,
-            messages=messages,
-            **completion_kwargs,
-        ) as stream:
-            async for event in stream:
-                if isinstance(event, ContentDeltaEvent):
-                    if not first_token_time:
-                        first_token_time = time.time()
-                elif isinstance(event, ContentDoneEvent):
-                    pass
-                elif isinstance(event, ChunkEvent):
-                    if event.chunk.usage: # used to fix a bug with missing usage data
-                        usage = event.chunk.usage
-                    if not first_token_time:
-                        first_token_time = time.time()
-            raw_response = await stream.get_final_completion()
-
-        metadata = {
-            "time_taken": round(time.time() - start_time, 2),
-            "first_token_time": round(first_token_time - start_time, 2),
-        }
-        response = ModelResponse(
-            raw_response,
-            model_options=completion_kwargs,
-            metadata=metadata,
-            usage=usage,
-        )
-        return response
+        if stream:
+            return await _generate_streaming(client, model, messages, completion_kwargs)
+        return await _generate_non_streaming(client, model, messages, completion_kwargs)
 
 
-async def _generate_with_retry(
+async def _generate_with_retry( # noqa: PLR0917
     model: str,
     context: AIMessages,
     messages: AIMessages,
     options: ModelOptions,
+    purpose: str | None = None,
+    expected_cost: float | None = None,
 ) -> ModelResponse:
     """Core LLM generation with automatic retry logic.
 
@@ -314,6 +382,8 @@ async def _generate_with_retry(
         context: Cached context messages (can be empty).
         messages: Dynamic query messages.
         options: Configuration including retries, timeout, temperature.
+        purpose: Optional semantic label for the LLM span name.
+        expected_cost: Optional expected cost for cost-tracking attributes.
 
     Returns:
         ModelResponse with generated content.
@@ -322,8 +392,7 @@
         ValueError: If model is not provided or both context and messages are empty.
         LLMError: If all retry attempts are exhausted.
 
-
-        Empty responses trigger a retry as they indicate API issues.
+    Empty responses trigger a retry as they indicate API issues.
     """
     if not model:
         raise ValueError("Model must be provided")
@@ -338,9 +407,7 @@
         # Bug fix for minimum explicit context size for Gemini models
         options.cache_ttl = None
 
-    processed_messages = _process_messages(
-        context, messages, options.system_prompt, options.cache_ttl
-    )
+    processed_messages = _process_messages(context, messages, options.system_prompt, options.cache_ttl)
     completion_kwargs: dict[str, Any] = {
         **options.to_openai_completion_kwargs(),
     }
@@ -350,17 +417,18 @@
 
     for attempt in range(options.retries):
         try:
-            with Laminar.start_as_current_span(
-                model,
-
-
-
-
-
-            ]
+            with Laminar.start_as_current_span(purpose or model, span_type="LLM", input=processed_messages) as span:
+                response = await _generate(model, processed_messages, completion_kwargs, stream=options.stream)
+                laminar_metadata = response.get_laminar_metadata()
+                if purpose:
+                    laminar_metadata["purpose"] = purpose
+                if expected_cost is not None:
+                    laminar_metadata["expected_cost"] = expected_cost
+                span.set_attributes(laminar_metadata) # pyright: ignore[reportArgumentType]
+                Laminar.set_span_output([r for r in (response.reasoning_content, response.content) if r])
                 response.validate_output()
                 return response
-        except (
+        except (TimeoutError, ValueError, ValidationError, Exception) as e:
             if not isinstance(e, asyncio.TimeoutError):
                 # disable cache if it's not a timeout because it may cause an error
                 completion_kwargs["extra_body"]["cache"] = {"no-cache": True}
```
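The retry loop above is where the new parameters take effect: the span is named after `purpose` when provided, `purpose` and `expected_cost` are attached as span attributes, and `stream=options.stream` picks the streaming or non-streaming path. A minimal usage sketch, assuming `ModelOptions` accepts `stream` as a constructor argument (this diff only shows the field being read) and that `llm` is imported as in the package docstrings:

```python
import asyncio

from ai_pipeline_core import llm
from ai_pipeline_core.llm.model_options import ModelOptions


async def main() -> None:
    # Force the non-streaming path for a provider whose streaming deltas
    # are known to break the OpenAI SDK's delta accumulation.
    response = await llm.generate(
        "gpt-5.1",
        messages="Summarize the latest run in three bullet points.",
        options=ModelOptions(stream=False),  # assumption: stream is a settable field
        purpose="run-summary",               # becomes the span name and a span attribute
        expected_cost=0.02,                  # stored alongside actual cost for comparison
    )
    print(response.content)


asyncio.run(main())
```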
```diff
@@ -384,11 +452,11 @@ async def generate(
     context: AIMessages | None = None,
     messages: AIMessages | str,
     options: ModelOptions | None = None,
+    purpose: str | None = None,
+    expected_cost: float | None = None,
 ) -> ModelResponse:
     """Generate text response from a language model.
 
-    @public
-
     Main entry point for LLM text generation with smart context caching.
     The context/messages split enables efficient token usage by caching
     expensive static content separately from dynamic queries.
@@ -405,13 +473,16 @@
         context: Static context to cache (documents, examples, instructions).
             Defaults to None (empty context). Cached for 5 minutes by default.
         messages: Dynamic messages/queries. AIMessages or str ONLY.
-            Do not pass Document or
+            Do not pass Document or list[Document] directly.
             If string, converted to AIMessages internally.
-        options:
-
-
-
-
+        options: Internal framework parameter. Framework defaults are production-optimized
+            (3 retries, 20s delay, 600s timeout). Configure model behavior centrally via
+            LiteLLM proxy settings or environment variables, not per API call.
+            Provider-specific settings should be configured at the proxy level.
+        purpose: Optional semantic label used as the tracing span name
+            instead of model name. Stored as a span attribute.
+        expected_cost: Optional expected cost stored as a span attribute
+            for cost-tracking and comparison with actual cost.
 
     Returns:
         ModelResponse containing:
@@ -454,35 +525,12 @@
         - Changes with each API call
         - Never cached, always processed fresh
 
-    Example:
-        >>> # CORRECT - No options parameter (this is the recommended pattern)
-        >>> response = await llm.generate("gpt-5.1", messages="Explain quantum computing")
-        >>> print(response.content) # In production, use get_pipeline_logger instead of print
-
-        >>> # With context caching for efficiency
-        >>> # Context and messages are both AIMessages or str; wrap any Documents
-        >>> static_doc = AIMessages([large_document, "few-shot example: ..."])
-        >>>
-        >>> # First call: caches context
-        >>> r1 = await llm.generate("gpt-5.1", context=static_doc, messages="Summarize")
-        >>>
-        >>> # Second call: reuses cache, saves tokens!
-        >>> r2 = await llm.generate("gpt-5.1", context=static_doc, messages="Key points?")
-
-        >>> # Multi-turn conversation
-        >>> messages = AIMessages([
-        ... "What is Python?",
-        ... previous_response,
-        ... "Can you give an example?"
-        ... ])
-        >>> response = await llm.generate("gpt-5.1", messages=messages)
-
     Performance:
         - Context caching saves ~50-90% tokens on repeated calls
         - First call: full token cost
         - Subsequent calls (within cache TTL): only messages tokens
         - Default cache TTL is 300s/5 minutes (production-optimized)
-        - Default retry logic: 3 attempts with
+        - Default retry logic: 3 attempts with 20s delay (production-optimized)
 
     Caching:
         When enabled in your LiteLLM proxy and supported by the upstream provider,
@@ -500,10 +548,8 @@
 
     This centralizes configuration and ensures consistency across all API calls.
 
-
-
-    - Automatic retry with configurable delay between attempts
-    - Cost tracking via response headers
+    All models are accessed via LiteLLM proxy with automatic retry and
+    cost tracking via response headers.
     """
     if isinstance(messages, str):
         messages = AIMessages([messages])
@@ -512,9 +558,22 @@
         context = AIMessages()
     if options is None:
         options = ModelOptions()
+    else:
+        # Create a copy to avoid mutating the caller's options object
+        options = options.model_copy()
+
+    with contextlib.suppress(Exception):
+        track_llm_documents(context, messages)
 
     try:
-        return await _generate_with_retry(
+        return await _generate_with_retry(
+            model,
+            context,
+            messages,
+            options,
+            purpose=purpose,
+            expected_cost=expected_cost,
+        )
     except (ValueError, LLMError):
         raise # Explicitly re-raise to satisfy DOC502
 
```
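The inline doctest examples were removed from the `generate()` docstring in 0.4.0. For orientation, here is an equivalent standalone sketch of the documented calling pattern, updated for the new `purpose` and `expected_cost` parameters; the context strings are placeholders and the cost figure is arbitrary.

```python
import asyncio

from ai_pipeline_core import llm
from ai_pipeline_core.llm.ai_messages import AIMessages


async def main() -> None:
    # Static, cacheable material goes in context; the per-call question goes in messages.
    context = AIMessages(["<large reference text>", "few-shot example: ..."])

    # First call pays the full token cost and primes the provider-side cache.
    summary = await llm.generate(
        "gpt-5.1",
        context=context,
        messages="Summarize the reference text.",
        purpose="reference-summary",
        expected_cost=0.05,
    )

    # A follow-up within the cache TTL reuses the cached context tokens.
    key_points = await llm.generate(
        "gpt-5.1",
        context=context,
        messages="List the key points.",
        purpose="reference-key-points",
    )
    print(summary.content, key_points.content, sep="\n\n")


asyncio.run(main())
```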
```diff
@@ -523,18 +582,18 @@ T = TypeVar("T", bound=BaseModel)
 """Type variable for Pydantic model types in structured generation."""
 
 
-async def generate_structured(
+async def generate_structured( # noqa: UP047
     model: ModelName,
     response_format: type[T],
     *,
     context: AIMessages | None = None,
     messages: AIMessages | str,
     options: ModelOptions | None = None,
+    purpose: str | None = None,
+    expected_cost: float | None = None,
 ) -> StructuredModelResponse[T]:
     """Generate structured output conforming to a Pydantic model.
 
-    @public
-
     Type-safe generation that returns validated Pydantic model instances.
     Uses OpenAI's structured output feature for guaranteed schema compliance.
 
@@ -589,21 +648,21 @@
         context: Static context to cache (documents, schemas, examples).
             Defaults to None (empty AIMessages).
         messages: Dynamic prompts/queries. AIMessages or str ONLY.
-            Do not pass Document or
+            Do not pass Document or list[Document] directly.
         options: Optional ModelOptions for configuring temperature, retries, etc.
             If provided, it will NOT be mutated (a copy is created internally).
             The response_format field is set automatically from the response_format parameter.
             In most cases, leave as None to use framework defaults.
             Configure model behavior centrally via LiteLLM proxy settings when possible.
+        purpose: Optional semantic label used as the tracing span name
+            instead of model name. Stored as a span attribute.
+        expected_cost: Optional expected cost stored as a span attribute
+            for cost-tracking and comparison with actual cost.
 
-
-
-
-
-    - Many models support either vision OR structured output, but not both
-    - Test your specific model+document combination before production use
-    - Consider two-step approach: generate() for analysis, then generate_structured()
-      for formatting
+    Vision/PDF model compatibility: Images require vision-capable models that also support
+    structured output. PDFs require models with both document processing AND structured output
+    support. Consider two-step approach: generate() for analysis, then generate_structured()
+    for formatting.
 
     Returns:
         StructuredModelResponse[T] containing:
@@ -617,26 +676,6 @@
         LLMError: If generation fails after retries.
         ValidationError: If response cannot be parsed into response_format.
 
-    Example:
-        >>> from pydantic import BaseModel, Field
-        >>>
-        >>> class Analysis(BaseModel):
-        ... summary: str = Field(description="Brief summary")
-        ... sentiment: float = Field(ge=-1, le=1)
-        ... key_points: list[str] = Field(max_length=5)
-        >>>
-        >>> # CORRECT - No options parameter
-        >>> response = await llm.generate_structured(
-        ... "gpt-5.1",
-        ... response_format=Analysis,
-        ... messages="Analyze this product review: ..."
-        ... )
-        >>>
-        >>> analysis = response.parsed # Type: Analysis
-        >>> print(f"Sentiment: {analysis.sentiment}")
-        >>> for point in analysis.key_points:
-        ... print(f"- {point}")
-
     Supported models:
         Structured output support varies by provider and model. Generally includes:
         - OpenAI: GPT-4 and newer models
@@ -651,12 +690,9 @@
         - Complex schemas increase generation time
         - Validation overhead is minimal (Pydantic is fast)
 
-
-
-
-    - Validation happens automatically via Pydantic
-    - Use Field() descriptions to guide generation
-    - Search models (models with '-search' suffix) do not support structured output
+    Pydantic model is converted to JSON Schema for the API. Validation happens
+    automatically via Pydantic. Search models (models with '-search' suffix) do
+    not support structured output.
     """
     if context is None:
         context = AIMessages()
@@ -673,9 +709,19 @@
 
     assert isinstance(messages, AIMessages)
 
+    with contextlib.suppress(Exception):
+        track_llm_documents(context, messages)
+
     # Call the internal generate function with structured output enabled
     try:
-        response = await _generate_with_retry(
+        response = await _generate_with_retry(
+            model,
+            context,
+            messages,
+            options,
+            purpose=purpose,
+            expected_cost=expected_cost,
+        )
     except (ValueError, LLMError):
         raise # Explicitly re-raise to satisfy DOC502
 
```
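The doctest removed from the `generate_structured()` docstring maps onto the following standalone sketch, extended with the new optional tracing parameters. The `Analysis` model mirrors the removed example; the cost figure is arbitrary.

```python
import asyncio

from pydantic import BaseModel, Field

from ai_pipeline_core import llm


class Analysis(BaseModel):
    summary: str = Field(description="Brief summary")
    sentiment: float = Field(ge=-1, le=1)
    key_points: list[str] = Field(max_length=5)


async def main() -> None:
    response = await llm.generate_structured(
        "gpt-5.1",
        response_format=Analysis,
        messages="Analyze this product review: ...",
        purpose="review-analysis",  # optional span name, new in 0.4.0
        expected_cost=0.01,         # optional cost-tracking attribute, new in 0.4.0
    )
    analysis = response.parsed  # typed as Analysis
    print(analysis.summary, analysis.key_points)


asyncio.run(main())
```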