ai-pipeline-core 0.2.6__py3-none-any.whl → 0.4.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (94)
  1. ai_pipeline_core/__init__.py +78 -125
  2. ai_pipeline_core/deployment/__init__.py +34 -0
  3. ai_pipeline_core/deployment/base.py +861 -0
  4. ai_pipeline_core/deployment/contract.py +80 -0
  5. ai_pipeline_core/deployment/deploy.py +561 -0
  6. ai_pipeline_core/deployment/helpers.py +97 -0
  7. ai_pipeline_core/deployment/progress.py +126 -0
  8. ai_pipeline_core/deployment/remote.py +116 -0
  9. ai_pipeline_core/docs_generator/__init__.py +54 -0
  10. ai_pipeline_core/docs_generator/__main__.py +5 -0
  11. ai_pipeline_core/docs_generator/cli.py +196 -0
  12. ai_pipeline_core/docs_generator/extractor.py +324 -0
  13. ai_pipeline_core/docs_generator/guide_builder.py +644 -0
  14. ai_pipeline_core/docs_generator/trimmer.py +35 -0
  15. ai_pipeline_core/docs_generator/validator.py +114 -0
  16. ai_pipeline_core/document_store/__init__.py +13 -0
  17. ai_pipeline_core/document_store/_summary.py +9 -0
  18. ai_pipeline_core/document_store/_summary_worker.py +170 -0
  19. ai_pipeline_core/document_store/clickhouse.py +492 -0
  20. ai_pipeline_core/document_store/factory.py +38 -0
  21. ai_pipeline_core/document_store/local.py +312 -0
  22. ai_pipeline_core/document_store/memory.py +85 -0
  23. ai_pipeline_core/document_store/protocol.py +68 -0
  24. ai_pipeline_core/documents/__init__.py +12 -14
  25. ai_pipeline_core/documents/_context_vars.py +85 -0
  26. ai_pipeline_core/documents/_hashing.py +52 -0
  27. ai_pipeline_core/documents/attachment.py +85 -0
  28. ai_pipeline_core/documents/context.py +128 -0
  29. ai_pipeline_core/documents/document.py +318 -1434
  30. ai_pipeline_core/documents/mime_type.py +37 -82
  31. ai_pipeline_core/documents/utils.py +4 -12
  32. ai_pipeline_core/exceptions.py +10 -62
  33. ai_pipeline_core/images/__init__.py +309 -0
  34. ai_pipeline_core/images/_processing.py +151 -0
  35. ai_pipeline_core/llm/__init__.py +6 -4
  36. ai_pipeline_core/llm/ai_messages.py +130 -81
  37. ai_pipeline_core/llm/client.py +327 -193
  38. ai_pipeline_core/llm/model_options.py +14 -86
  39. ai_pipeline_core/llm/model_response.py +60 -103
  40. ai_pipeline_core/llm/model_types.py +16 -34
  41. ai_pipeline_core/logging/__init__.py +2 -7
  42. ai_pipeline_core/logging/logging.yml +1 -1
  43. ai_pipeline_core/logging/logging_config.py +27 -37
  44. ai_pipeline_core/logging/logging_mixin.py +15 -41
  45. ai_pipeline_core/observability/__init__.py +32 -0
  46. ai_pipeline_core/observability/_debug/__init__.py +30 -0
  47. ai_pipeline_core/observability/_debug/_auto_summary.py +94 -0
  48. ai_pipeline_core/observability/_debug/_config.py +95 -0
  49. ai_pipeline_core/observability/_debug/_content.py +764 -0
  50. ai_pipeline_core/observability/_debug/_processor.py +98 -0
  51. ai_pipeline_core/observability/_debug/_summary.py +312 -0
  52. ai_pipeline_core/observability/_debug/_types.py +75 -0
  53. ai_pipeline_core/observability/_debug/_writer.py +843 -0
  54. ai_pipeline_core/observability/_document_tracking.py +146 -0
  55. ai_pipeline_core/observability/_initialization.py +194 -0
  56. ai_pipeline_core/observability/_logging_bridge.py +57 -0
  57. ai_pipeline_core/observability/_summary.py +81 -0
  58. ai_pipeline_core/observability/_tracking/__init__.py +6 -0
  59. ai_pipeline_core/observability/_tracking/_client.py +178 -0
  60. ai_pipeline_core/observability/_tracking/_internal.py +28 -0
  61. ai_pipeline_core/observability/_tracking/_models.py +138 -0
  62. ai_pipeline_core/observability/_tracking/_processor.py +158 -0
  63. ai_pipeline_core/observability/_tracking/_service.py +311 -0
  64. ai_pipeline_core/observability/_tracking/_writer.py +229 -0
  65. ai_pipeline_core/{tracing.py → observability/tracing.py} +139 -283
  66. ai_pipeline_core/pipeline/__init__.py +10 -0
  67. ai_pipeline_core/pipeline/decorators.py +915 -0
  68. ai_pipeline_core/pipeline/options.py +16 -0
  69. ai_pipeline_core/prompt_manager.py +16 -102
  70. ai_pipeline_core/settings.py +26 -31
  71. ai_pipeline_core/testing.py +9 -0
  72. ai_pipeline_core-0.4.1.dist-info/METADATA +807 -0
  73. ai_pipeline_core-0.4.1.dist-info/RECORD +76 -0
  74. {ai_pipeline_core-0.2.6.dist-info → ai_pipeline_core-0.4.1.dist-info}/WHEEL +1 -1
  75. ai_pipeline_core/documents/document_list.py +0 -420
  76. ai_pipeline_core/documents/flow_document.py +0 -112
  77. ai_pipeline_core/documents/task_document.py +0 -117
  78. ai_pipeline_core/documents/temporary_document.py +0 -74
  79. ai_pipeline_core/flow/__init__.py +0 -9
  80. ai_pipeline_core/flow/config.py +0 -483
  81. ai_pipeline_core/flow/options.py +0 -75
  82. ai_pipeline_core/pipeline.py +0 -718
  83. ai_pipeline_core/prefect.py +0 -63
  84. ai_pipeline_core/simple_runner/__init__.py +0 -14
  85. ai_pipeline_core/simple_runner/cli.py +0 -254
  86. ai_pipeline_core/simple_runner/simple_runner.py +0 -247
  87. ai_pipeline_core/storage/__init__.py +0 -8
  88. ai_pipeline_core/storage/storage.py +0 -628
  89. ai_pipeline_core/utils/__init__.py +0 -8
  90. ai_pipeline_core/utils/deploy.py +0 -373
  91. ai_pipeline_core/utils/remote_deployment.py +0 -269
  92. ai_pipeline_core-0.2.6.dist-info/METADATA +0 -500
  93. ai_pipeline_core-0.2.6.dist-info/RECORD +0 -41
  94. {ai_pipeline_core-0.2.6.dist-info → ai_pipeline_core-0.4.1.dist-info}/licenses/LICENSE +0 -0
ai_pipeline_core/llm/client.py
@@ -1,99 +1,216 @@
  """LLM client implementation for AI model interactions.

- @public
-
  This module provides the core functionality for interacting with language models
  through a unified interface. It handles retries, caching, structured outputs,
  and integration with various LLM providers via LiteLLM.

- Key functions:
- - generate(): Text generation with optional context caching
- - generate_structured(): Type-safe structured output generation
+ Automatic image auto-tiling splits oversized images in attachments to meet
+ model-specific constraints (e.g., 3000x3000 for Gemini, 1000x1000 default).
+ Context caching separates static content from dynamic messages for 50-90% token savings.
+ Optional purpose and expected_cost parameters enable tracing and cost-tracking.
  """

  import asyncio
+ import contextlib
  import time
+ from io import BytesIO
  from typing import Any, TypeVar

  from lmnr import Laminar
  from openai import AsyncOpenAI
  from openai.lib.streaming.chat import ChunkEvent, ContentDeltaEvent, ContentDoneEvent
  from openai.types.chat import (
+     ChatCompletion,
      ChatCompletionMessageParam,
  )
- from prefect.logging import get_logger
+ from PIL import Image
  from pydantic import BaseModel, ValidationError

+ from ai_pipeline_core.documents import Document
+ from ai_pipeline_core.documents.attachment import Attachment
  from ai_pipeline_core.exceptions import LLMError
+ from ai_pipeline_core.images import ImageProcessingConfig, process_image, process_image_to_documents
+ from ai_pipeline_core.logging import get_pipeline_logger
+ from ai_pipeline_core.observability._document_tracking import track_llm_documents
  from ai_pipeline_core.settings import settings

- from .ai_messages import AIMessages
+ from .ai_messages import AIMessages, AIMessageType
  from .model_options import ModelOptions
  from .model_response import ModelResponse, StructuredModelResponse
  from .model_types import ModelName

- logger = get_logger()
+ logger = get_pipeline_logger(__name__)
+
+ # Image splitting configs for automatic large-image handling at the LLM boundary.
+ # Gemini supports up to 3000x3000; all other models use a conservative 1000x1000 default.
+ _GEMINI_IMAGE_CONFIG = ImageProcessingConfig(max_dimension=3000, max_pixels=9_000_000, jpeg_quality=75)
+ _DEFAULT_IMAGE_CONFIG = ImageProcessingConfig(max_dimension=1000, max_pixels=1_000_000, jpeg_quality=75)
+
+
+ def _get_image_config(model: str) -> ImageProcessingConfig:
+     """Return the image splitting config for a model."""
+     if "gemini" in model.lower():
+         return _GEMINI_IMAGE_CONFIG
+     return _DEFAULT_IMAGE_CONFIG
+
+
+ def _prepare_images_for_model(messages: AIMessages, model: str) -> AIMessages:  # noqa: C901, PLR0912, PLR0915, PLR0914
+     """Split image documents and image attachments that exceed model constraints.
+
+     Returns a new AIMessages with oversized images replaced by tiles.
+     Returns the original instance unchanged if no splitting is needed.
+     """
+     if not any(isinstance(m, Document) and (m.is_image or any(att.is_image for att in m.attachments)) for m in messages):
+         return messages
+
+     config = _get_image_config(model)
+     result: list[AIMessageType] = []
+     changed = False
+
+     for msg in messages:
+         if not isinstance(msg, Document):
+             result.append(msg)
+             continue
+
+         # 1. Handle top-level image Documents (existing logic)
+         if msg.is_image:
+             try:
+                 with Image.open(BytesIO(msg.content)) as img:
+                     w, h = img.size
+             except Exception:
+                 result.append(msg)
+                 continue
+
+             within_limits = w <= config.max_dimension and h <= config.max_dimension and w * h <= config.max_pixels
+             if within_limits:
+                 pass  # Falls through to attachment handling
+             else:
+                 name_prefix = msg.name.rsplit(".", 1)[0] if "." in msg.name else msg.name
+                 tiles = process_image_to_documents(msg, config=config, name_prefix=name_prefix)
+                 if msg.attachments and tiles:
+                     tiles[0] = tiles[0].model_copy(update={"attachments": msg.attachments})
+                 result.extend(tiles)
+                 changed = True
+                 continue
+
+         # 2. Handle image attachments
+         if msg.attachments:
+             new_attachments: list[Attachment] = []
+             attachments_changed = False
+
+             for att in msg.attachments:
+                 if not att.is_image:
+                     new_attachments.append(att)
+                     continue
+
+                 try:
+                     with Image.open(BytesIO(att.content)) as img:
+                         w, h = img.size
+                 except Exception:
+                     new_attachments.append(att)
+                     continue
+
+                 att_within_limits = w <= config.max_dimension and h <= config.max_dimension and w * h <= config.max_pixels
+                 if att_within_limits:
+                     new_attachments.append(att)
+                     continue
+
+                 # Tile the oversized attachment image
+                 processed = process_image(att.content, config=config)
+                 att_prefix = att.name.rsplit(".", 1)[0] if "." in att.name else att.name
+
+                 for part in processed.parts:
+                     if part.total == 1:
+                         tile_name = f"{att_prefix}.jpg"
+                         tile_desc = att.description
+                     else:
+                         tile_name = f"{att_prefix}_{part.index + 1:02d}_of_{part.total:02d}.jpg"
+                         tile_desc = f"{att.description} ({part.label})" if att.description else part.label
+
+                     new_attachments.append(
+                         Attachment(
+                             name=tile_name,
+                             content=part.data,
+                             description=tile_desc,
+                         )
+                     )
+                     attachments_changed = True
+
+             if attachments_changed:
+                 msg = msg.model_copy(update={"attachments": tuple(new_attachments)})  # noqa: PLW2901
+                 changed = True
+
+         result.append(msg)
+
+     if not changed:
+         return messages
+     return AIMessages(result)


  def _process_messages(
      context: AIMessages,
      messages: AIMessages,
      system_prompt: str | None = None,
-     cache_ttl: str | None = "5m",
+     cache_ttl: str | None = "300s",
  ) -> list[ChatCompletionMessageParam]:
      """Process and format messages for LLM API consumption.

      Internal function that combines context and messages into a single
      list of API-compatible messages. Applies caching directives to
-     context messages for efficiency.
+     system prompt and context messages for efficiency.

      Args:
          context: Messages to be cached (typically expensive/static content).
          messages: Regular messages without caching (dynamic queries).
          system_prompt: Optional system instructions for the model.
-         cache_ttl: Cache TTL for context messages (e.g. "120s", "5m", "1h").
+         cache_ttl: Cache TTL for system and context messages (e.g. "120s", "300s", "1h").
              Set to None or empty string to disable caching.

      Returns:
          List of formatted messages ready for API calls, with:
-         - System prompt at the beginning (if provided)
-         - Context messages with cache_control on the last one (if cache_ttl)
+         - System prompt at the beginning with cache_control (if provided and cache_ttl set)
+         - Context messages with cache_control on all messages (if cache_ttl set)
          - Regular messages without caching

      System Prompt Location:
          The system prompt parameter is always injected as the FIRST message
-         with role="system". It is NOT cached with context, allowing dynamic
-         system prompts without breaking cache efficiency.
+         with role="system". It is cached along with context when cache_ttl is set.

      Cache behavior:
-         The last context message gets ephemeral caching with specified TTL
+         All system and context messages get ephemeral caching with specified TTL
          to reduce token usage on repeated calls with same context.
          If cache_ttl is None or empty string (falsy), no caching is applied.
-         Only the last context message receives cache_control to maximize efficiency.
+         All system and context messages receive cache_control to maximize cache efficiency.

-     Note:
-         This is an internal function used by _generate_with_retry().
-         The context/messages split enables efficient token usage.
+     This is an internal function used by _generate_with_retry().
+     The context/messages split enables efficient token usage.
      """
      processed_messages: list[ChatCompletionMessageParam] = []

      # Add system prompt if provided
      if system_prompt:
-         processed_messages.append({"role": "system", "content": system_prompt})
+         processed_messages.append({
+             "role": "system",
+             "content": [{"type": "text", "text": system_prompt}],
+         })

      # Process context messages with caching if provided
      if context:
          # Use AIMessages.to_prompt() for context
          context_messages = context.to_prompt()
+         processed_messages.extend(context_messages)

-         # Apply caching to last context message if cache_ttl is set
-         if cache_ttl:
-             context_messages[-1]["cache_control"] = {  # type: ignore
+         if cache_ttl:
+             for message in processed_messages:
+                 message["cache_control"] = {  # type: ignore
                      "type": "ephemeral",
                      "ttl": cache_ttl,
                  }
-
-         processed_messages.extend(context_messages)
+                 if isinstance(message["content"], list):  # type: ignore
+                     message["content"][-1]["cache_control"] = {  # type: ignore
+                         "type": "ephemeral",
+                         "ttl": cache_ttl,
+                     }

      # Process regular messages without caching
      if messages:
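Editor's illustration (not part of the diff): with the new caching scheme above, every system and context entry carries a cache_control marker, including the last content part of multipart entries, while dynamic messages stay uncached. A minimal standalone sketch of the resulting shape, with a single hypothetical user entry standing in for whatever context.to_prompt() returns:

    # Illustrative shape only; real entries come from AIMessages.to_prompt().
    cache_ttl = "300s"
    processed_messages = [
        {
            "role": "system",
            "content": [{"type": "text", "text": "You are a careful analyst."}],
        },
        {
            "role": "user",
            "content": [{"type": "text", "text": "<large cached document>"}],
        },
    ]
    if cache_ttl:
        for message in processed_messages:
            # Message-level marker plus one on the last content part.
            message["cache_control"] = {"type": "ephemeral", "ttl": cache_ttl}
            if isinstance(message["content"], list):
                message["content"][-1]["cache_control"] = {"type": "ephemeral", "ttl": cache_ttl}
    # Dynamic messages are appended afterwards without any cache_control.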
@@ -103,6 +220,35 @@ def _process_messages(
      return processed_messages


+ def _remove_cache_control(
+     messages: list[ChatCompletionMessageParam],
+ ) -> list[ChatCompletionMessageParam]:
+     """Remove cache control directives from messages.
+
+     Internal utility that strips cache_control fields from both message-level
+     and content-level entries. Used in retry logic when cache-related errors
+     occur during LLM API calls.
+
+     Args:
+         messages: List of messages that may contain cache_control directives.
+
+     Returns:
+         The same message list (modified in-place) with all cache_control
+         fields removed from both messages and their content items.
+
+     Modifies the input list in-place but also returns it for convenience.
+     Handles both list-based content (multipart) and string content (simple messages).
+     """
+     for message in messages:
+         if (content := message.get("content")) and isinstance(content, list):
+             for item in content:
+                 if "cache_control" in item:
+                     del item["cache_control"]
+         if "cache_control" in message:
+             del message["cache_control"]
+     return messages
+
+
  def _model_name_to_openrouter_model(model: ModelName) -> str:
      """Convert a model name to an OpenRouter model name.
@@ -112,14 +258,10 @@ def _model_name_to_openrouter_model(model: ModelName) -> str:
      Returns:
          OpenRouter model name.
      """
-     if model == "gpt-4o-search":
-         return "openai/gpt-4o-search-preview"
-     if model == "gemini-2.5-flash-search":
-         return "google/gemini-2.5-flash:online"
-     if model == "grok-4-fast-search":
-         return "x-ai/grok-4-fast:online"
+     if model == "gemini-3-flash-search":
+         return "google/gemini-3-flash:online"
      if model == "sonar-pro-search":
-         return "perplexity/sonar-reasoning-pro"
+         return "perplexity/sonar-pro-search"
      if model.startswith("gemini"):
          return f"google/{model}"
      elif model.startswith("gpt"):
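Editor's illustration (not part of the diff): the mappings below follow only from the branches visible in this hunk; the gpt branch is cut off here, so it is not exercised. It assumes the private helper is importable from ai_pipeline_core.llm.client.

    from ai_pipeline_core.llm.client import _model_name_to_openrouter_model

    # Search variants map to explicit OpenRouter names; other gemini names get the google/ prefix.
    assert _model_name_to_openrouter_model("gemini-3-flash-search") == "google/gemini-3-flash:online"
    assert _model_name_to_openrouter_model("sonar-pro-search") == "perplexity/sonar-pro-search"
    assert _model_name_to_openrouter_model("gemini-3-pro") == "google/gemini-3-pro"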
@@ -139,30 +281,76 @@ def _model_name_to_openrouter_model(model: ModelName) -> str:
      return model


- async def _generate(
-     model: str, messages: list[ChatCompletionMessageParam], completion_kwargs: dict[str, Any]
+ async def _generate_streaming(client: AsyncOpenAI, model: str, messages: list[ChatCompletionMessageParam], completion_kwargs: dict[str, Any]) -> ModelResponse:
+     """Execute a streaming LLM API call."""
+     start_time = time.time()
+     first_token_time = None
+     usage = None
+     async with client.chat.completions.stream(
+         model=model,
+         messages=messages,
+         **completion_kwargs,
+     ) as s:
+         async for event in s:
+             if isinstance(event, ContentDeltaEvent):
+                 if not first_token_time:
+                     first_token_time = time.time()
+             elif isinstance(event, ContentDoneEvent):
+                 pass
+             elif isinstance(event, ChunkEvent) and event.chunk.usage:
+                 usage = event.chunk.usage
+                 if not first_token_time:
+                     first_token_time = time.time()
+         raw_response = await s.get_final_completion()
+
+     metadata = {
+         "time_taken": round(time.time() - start_time, 2),
+         "first_token_time": round(first_token_time - start_time, 2),
+     }
+     return ModelResponse(raw_response, model_options=completion_kwargs, metadata=metadata, usage=usage)
+
+
+ async def _generate_non_streaming(
+     client: AsyncOpenAI, model: str, messages: list[ChatCompletionMessageParam], completion_kwargs: dict[str, Any]
  ) -> ModelResponse:
-     """Execute a single LLM API call.
+     """Execute a non-streaming LLM API call.
+
+     Avoids OpenAI SDK delta accumulation — some providers (e.g. Grok) send
+     streaming annotation deltas that crash the SDK's accumulate_delta().
+     """
+     start_time = time.time()
+     kwargs = {k: v for k, v in completion_kwargs.items() if k != "stream_options"}
+     response_format = kwargs.get("response_format")
+     if isinstance(response_format, type) and issubclass(response_format, BaseModel):
+         raw_response: ChatCompletion = await client.chat.completions.parse(
+             model=model,
+             messages=messages,
+             **kwargs,
+         )
+     else:
+         raw_response = await client.chat.completions.create(
+             model=model,
+             messages=messages,
+             stream=False,
+             **kwargs,
+         )
+     elapsed = round(time.time() - start_time, 2)
+     metadata = {"time_taken": elapsed, "first_token_time": elapsed}
+     return ModelResponse(raw_response, model_options=completion_kwargs, metadata=metadata)
+

-     Internal function that makes the actual API request to the LLM provider.
-     Handles both regular and structured output generation.
+ async def _generate(model: str, messages: list[ChatCompletionMessageParam], completion_kwargs: dict[str, Any], *, stream: bool = True) -> ModelResponse:
+     """Execute a single LLM API call.

      Args:
-         model: Model identifier (e.g., "gpt-5", "gemini-2.5-pro").
+         model: Model identifier (e.g., "gpt-5.1", "gemini-3-pro").
          messages: Formatted messages for the API.
          completion_kwargs: Additional parameters for the completion API.
+         stream: Whether to use streaming mode (default True). Non-streaming
+             avoids OpenAI SDK delta accumulation issues with some providers.

      Returns:
          ModelResponse with generated content and metadata.
-
-     API selection:
-     - Uses client.chat.completions.parse() for structured output
-     - Uses client.chat.completions.create() for regular text
-
-     Note:
-     - Uses AsyncOpenAI client configured via settings
-     - Captures response headers for cost tracking
-     - Response includes model options for debugging
      """
      if "openrouter" in settings.openai_base_url.lower():
          model = _model_name_to_openrouter_model(model)
@@ -171,45 +359,18 @@ async def _generate(
          api_key=settings.openai_api_key,
          base_url=settings.openai_base_url,
      ) as client:
-         start_time = time.time()
-         first_token_time = None
-         usage = None
-         async with client.chat.completions.stream(
-             model=model,
-             messages=messages,
-             **completion_kwargs,
-         ) as stream:
-             async for event in stream:
-                 if isinstance(event, ContentDeltaEvent):
-                     if not first_token_time:
-                         first_token_time = time.time()
-                 elif isinstance(event, ContentDoneEvent):
-                     pass
-                 elif isinstance(event, ChunkEvent):
-                     if event.chunk.usage:  # used to fix a bug with missing usage data
-                         usage = event.chunk.usage
-                     if not first_token_time:
-                         first_token_time = time.time()
-             raw_response = await stream.get_final_completion()
-
-         metadata = {
-             "time_taken": round(time.time() - start_time, 2),
-             "first_token_time": round(first_token_time - start_time, 2),
-         }
-         response = ModelResponse(
-             raw_response,
-             model_options=completion_kwargs,
-             metadata=metadata,
-             usage=usage,
-         )
-         return response
+         if stream:
+             return await _generate_streaming(client, model, messages, completion_kwargs)
+         return await _generate_non_streaming(client, model, messages, completion_kwargs)


- async def _generate_with_retry(
+ async def _generate_with_retry(  # noqa: PLR0917
      model: str,
      context: AIMessages,
      messages: AIMessages,
      options: ModelOptions,
+     purpose: str | None = None,
+     expected_cost: float | None = None,
  ) -> ModelResponse:
      """Core LLM generation with automatic retry logic.

@@ -221,6 +382,8 @@ async def _generate_with_retry(
          context: Cached context messages (can be empty).
          messages: Dynamic query messages.
          options: Configuration including retries, timeout, temperature.
+         purpose: Optional semantic label for the LLM span name.
+         expected_cost: Optional expected cost for cost-tracking attributes.

      Returns:
          ModelResponse with generated content.
@@ -229,17 +392,22 @@ async def _generate_with_retry(
          ValueError: If model is not provided or both context and messages are empty.
          LLMError: If all retry attempts are exhausted.

-     Note:
-         Empty responses trigger a retry as they indicate API issues.
+     Empty responses trigger a retry as they indicate API issues.
      """
      if not model:
          raise ValueError("Model must be provided")
      if not context and not messages:
          raise ValueError("Either context or messages must be provided")

-     processed_messages = _process_messages(
-         context, messages, options.system_prompt, options.cache_ttl
-     )
+     # Auto-split large images based on model-specific constraints
+     context = _prepare_images_for_model(context, model)
+     messages = _prepare_images_for_model(messages, model)
+
+     if "gemini" in model.lower() and context.approximate_tokens_count < 10000:
+         # Bug fix for minimum explicit context size for Gemini models
+         options.cache_ttl = None
+
+     processed_messages = _process_messages(context, messages, options.system_prompt, options.cache_ttl)
      completion_kwargs: dict[str, Any] = {
          **options.to_openai_completion_kwargs(),
      }
@@ -249,20 +417,23 @@ async def _generate_with_retry(

      for attempt in range(options.retries):
          try:
-             with Laminar.start_as_current_span(
-                 model, span_type="LLM", input=processed_messages
-             ) as span:
-                 response = await _generate(model, processed_messages, completion_kwargs)
-                 span.set_attributes(response.get_laminar_metadata())
-                 Laminar.set_span_output([
-                     r for r in (response.reasoning_content, response.content) if r
-                 ])
+             with Laminar.start_as_current_span(purpose or model, span_type="LLM", input=processed_messages) as span:
+                 response = await _generate(model, processed_messages, completion_kwargs, stream=options.stream)
+                 laminar_metadata = response.get_laminar_metadata()
+                 if purpose:
+                     laminar_metadata["purpose"] = purpose
+                 if expected_cost is not None:
+                     laminar_metadata["expected_cost"] = expected_cost
+                 span.set_attributes(laminar_metadata)  # pyright: ignore[reportArgumentType]
+                 Laminar.set_span_output([r for r in (response.reasoning_content, response.content) if r])
                  response.validate_output()
                  return response
-         except (asyncio.TimeoutError, ValueError, ValidationError, Exception) as e:
+         except (TimeoutError, ValueError, ValidationError, Exception) as e:
              if not isinstance(e, asyncio.TimeoutError):
                  # disable cache if it's not a timeout because it may cause an error
                  completion_kwargs["extra_body"]["cache"] = {"no-cache": True}
+                 # sometimes there are issues with cache so cache is removed in case of failure
+                 processed_messages = _remove_cache_control(processed_messages)

              logger.warning(
                  f"LLM generation failed (attempt {attempt + 1}/{options.retries}): {e}",
@@ -281,11 +452,11 @@ async def generate(
      context: AIMessages | None = None,
      messages: AIMessages | str,
      options: ModelOptions | None = None,
+     purpose: str | None = None,
+     expected_cost: float | None = None,
  ) -> ModelResponse:
      """Generate text response from a language model.

-     @public
-
      Main entry point for LLM text generation with smart context caching.
      The context/messages split enables efficient token usage by caching
      expensive static content separately from dynamic queries.
@@ -297,18 +468,21 @@ async def generate(
      4. CONFIGURATION: Configure model behavior via LiteLLM proxy or environment variables

      Args:
-         model: Model to use (e.g., "gpt-5", "gemini-2.5-pro", "grok-4").
+         model: Model to use (e.g., "gpt-5.1", "gemini-3-pro", "grok-4.1-fast").
              Accepts predefined models or any string for custom models.
          context: Static context to cache (documents, examples, instructions).
              Defaults to None (empty context). Cached for 5 minutes by default.
          messages: Dynamic messages/queries. AIMessages or str ONLY.
-             Do not pass Document or DocumentList directly.
+             Do not pass Document or list[Document] directly.
              If string, converted to AIMessages internally.
-         options: DEPRECATED - DO NOT USE. Reserved for internal framework usage only.
-             Framework defaults are production-optimized (3 retries, 10s delay, 300s timeout).
-             Configure model behavior centrally via LiteLLM proxy settings or environment
-             variables, not per API call. Provider-specific settings should be configured
-             at the proxy level.
+         options: Internal framework parameter. Framework defaults are production-optimized
+             (3 retries, 20s delay, 600s timeout). Configure model behavior centrally via
+             LiteLLM proxy settings or environment variables, not per API call.
+             Provider-specific settings should be configured at the proxy level.
+         purpose: Optional semantic label used as the tracing span name
+             instead of model name. Stored as a span attribute.
+         expected_cost: Optional expected cost stored as a span attribute
+             for cost-tracking and comparison with actual cost.

      Returns:
          ModelResponse containing:
@@ -325,17 +499,17 @@ async def generate(
      Wrap Documents in AIMessages - DO NOT pass directly or convert to .text:

          # CORRECT - wrap Document in AIMessages
-         response = await llm.generate("gpt-5", messages=AIMessages([my_document]))
+         response = await llm.generate("gpt-5.1", messages=AIMessages([my_document]))

          # WRONG - don't pass Document directly
-         response = await llm.generate("gpt-5", messages=my_document)  # NO!
+         response = await llm.generate("gpt-5.1", messages=my_document)  # NO!

          # WRONG - don't convert to string yourself
-         response = await llm.generate("gpt-5", messages=my_document.text)  # NO!
+         response = await llm.generate("gpt-5.1", messages=my_document.text)  # NO!

      VISION/PDF MODEL COMPATIBILITY:
      When using Documents containing images or PDFs, ensure your model supports these formats:
-     - Images require vision-capable models (gpt-4o, gemini-pro-vision, claude-3-sonnet)
+     - Images require vision-capable models (gpt-5.1, gemini-3-flash, gemini-3-pro)
      - PDFs require document processing support (varies by provider)
      - Non-compatible models will raise ValueError or fall back to text extraction
      - Check model capabilities before including visual/PDF content
@@ -351,50 +525,12 @@ async def generate(
      - Changes with each API call
      - Never cached, always processed fresh

-     Example:
-         >>> # CORRECT - No options parameter (this is the recommended pattern)
-         >>> response = await llm.generate("gpt-5", messages="Explain quantum computing")
-         >>> print(response.content)  # In production, use get_pipeline_logger instead of print
-
-         >>> # With context caching for efficiency
-         >>> # Context and messages are both AIMessages or str; wrap any Documents
-         >>> static_doc = AIMessages([large_document, "few-shot example: ..."])
-         >>>
-         >>> # First call: caches context
-         >>> r1 = await llm.generate("gpt-5", context=static_doc, messages="Summarize")
-         >>>
-         >>> # Second call: reuses cache, saves tokens!
-         >>> r2 = await llm.generate("gpt-5", context=static_doc, messages="Key points?")
-
-         >>> # Multi-turn conversation
-         >>> messages = AIMessages([
-         ...     "What is Python?",
-         ...     previous_response,
-         ...     "Can you give an example?"
-         ... ])
-         >>> response = await llm.generate("gpt-5", messages=messages)
-
-     Configuration via LiteLLM Proxy:
-         >>> # Configure temperature in litellm_config.yaml:
-         >>> # model_list:
-         >>> #   - model_name: gpt-5
-         >>> #     litellm_params:
-         >>> #       model: openai/gpt-4o
-         >>> #       temperature: 0.3
-         >>> #       max_tokens: 1000
-         >>>
-         >>> # Configure retry logic in proxy:
-         >>> # general_settings:
-         >>> #   master_key: sk-1234
-         >>> #   max_retries: 5
-         >>> #   retry_delay: 15
-
      Performance:
      - Context caching saves ~50-90% tokens on repeated calls
      - First call: full token cost
      - Subsequent calls (within cache TTL): only messages tokens
-     - Default cache TTL is 5m (production-optimized)
-     - Default retry logic: 3 attempts with 10s delay (production-optimized)
+     - Default cache TTL is 300s/5 minutes (production-optimized)
+     - Default retry logic: 3 attempts with 20s delay (production-optimized)

      Caching:
      When enabled in your LiteLLM proxy and supported by the upstream provider,
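Editor's illustration (not part of the diff): the worked examples were dropped from this docstring in 0.4.1, so for readers of the diff here is a minimal usage sketch of the new call shape, based on the signature and the retained guidance above. It assumes generate and AIMessages are re-exported from ai_pipeline_core.llm, as the llm.generate(...) snippets in the docstring suggest; the document strings are placeholders.

    import asyncio

    from ai_pipeline_core.llm import AIMessages, generate  # assumed re-exports


    async def main() -> None:
        # Static, cacheable material goes in context; the dynamic question goes in messages.
        context = AIMessages(["<large reference document>", "few-shot example: ..."])

        # First call pays full token cost and caches the context (default TTL 300s).
        summary = await generate(
            "gpt-5.1",
            context=context,
            messages="Summarize the document",
            purpose="doc-summary",  # optional tracing span name
            expected_cost=0.05,     # optional cost-tracking attribute
        )

        # A second call within the TTL reuses the cached context and only pays for messages.
        key_points = await generate("gpt-5.1", context=context, messages="List the key points")
        print(summary.content, key_points.content)


    asyncio.run(main())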
@@ -412,10 +548,8 @@ async def generate(

      This centralizes configuration and ensures consistency across all API calls.

-     Note:
-         - All models are accessed via LiteLLM proxy
-         - Automatic retry with configurable delay between attempts
-         - Cost tracking via response headers
+     All models are accessed via LiteLLM proxy with automatic retry and
+     cost tracking via response headers.
      """
      if isinstance(messages, str):
          messages = AIMessages([messages])
@@ -424,9 +558,22 @@ async def generate(
          context = AIMessages()
      if options is None:
          options = ModelOptions()
+     else:
+         # Create a copy to avoid mutating the caller's options object
+         options = options.model_copy()
+
+     with contextlib.suppress(Exception):
+         track_llm_documents(context, messages)

      try:
-         return await _generate_with_retry(model, context, messages, options)
+         return await _generate_with_retry(
+             model,
+             context,
+             messages,
+             options,
+             purpose=purpose,
+             expected_cost=expected_cost,
+         )
      except (ValueError, LLMError):
          raise  # Explicitly re-raise to satisfy DOC502
@@ -435,18 +582,18 @@ T = TypeVar("T", bound=BaseModel)
  """Type variable for Pydantic model types in structured generation."""


- async def generate_structured(
+ async def generate_structured(  # noqa: UP047
      model: ModelName,
      response_format: type[T],
      *,
      context: AIMessages | None = None,
      messages: AIMessages | str,
      options: ModelOptions | None = None,
+     purpose: str | None = None,
+     expected_cost: float | None = None,
  ) -> StructuredModelResponse[T]:
      """Generate structured output conforming to a Pydantic model.

-     @public
-
      Type-safe generation that returns validated Pydantic model instances.
      Uses OpenAI's structured output feature for guaranteed schema compliance.

@@ -482,7 +629,7 @@ async def generate_structured(

          >>> # Step 1: Research/analysis with generate() - no options parameter
          >>> research = await llm.generate(
-         ...     "gpt-5",
+         ...     "gpt-5.1",
          ...     messages="Research and analyze this complex topic..."
          ... )
          >>>
@@ -501,21 +648,21 @@ async def generate_structured(
          context: Static context to cache (documents, schemas, examples).
              Defaults to None (empty AIMessages).
          messages: Dynamic prompts/queries. AIMessages or str ONLY.
-             Do not pass Document or DocumentList directly.
+             Do not pass Document or list[Document] directly.
          options: Optional ModelOptions for configuring temperature, retries, etc.
              If provided, it will NOT be mutated (a copy is created internally).
              The response_format field is set automatically from the response_format parameter.
              In most cases, leave as None to use framework defaults.
              Configure model behavior centrally via LiteLLM proxy settings when possible.
+         purpose: Optional semantic label used as the tracing span name
+             instead of model name. Stored as a span attribute.
+         expected_cost: Optional expected cost stored as a span attribute
+             for cost-tracking and comparison with actual cost.

-     Note:
-         Vision/PDF model compatibility considerations:
-         - Images require vision-capable models that also support structured output
-         - PDFs require models with both document processing AND structured output support
-         - Many models support either vision OR structured output, but not both
-         - Test your specific model+document combination before production use
-         - Consider two-step approach: generate() for analysis, then generate_structured()
-           for formatting
+     Vision/PDF model compatibility: Images require vision-capable models that also support
+     structured output. PDFs require models with both document processing AND structured output
+     support. Consider two-step approach: generate() for analysis, then generate_structured()
+     for formatting.

      Returns:
          StructuredModelResponse[T] containing:
@@ -529,26 +676,6 @@ async def generate_structured(
          LLMError: If generation fails after retries.
          ValidationError: If response cannot be parsed into response_format.

-     Example:
-         >>> from pydantic import BaseModel, Field
-         >>>
-         >>> class Analysis(BaseModel):
-         ...     summary: str = Field(description="Brief summary")
-         ...     sentiment: float = Field(ge=-1, le=1)
-         ...     key_points: list[str] = Field(max_length=5)
-         >>>
-         >>> # CORRECT - No options parameter
-         >>> response = await llm.generate_structured(
-         ...     "gpt-5",
-         ...     response_format=Analysis,
-         ...     messages="Analyze this product review: ..."
-         ... )
-         >>>
-         >>> analysis = response.parsed  # Type: Analysis
-         >>> print(f"Sentiment: {analysis.sentiment}")
-         >>> for point in analysis.key_points:
-         ...     print(f"- {point}")
-
      Supported models:
          Structured output support varies by provider and model. Generally includes:
          - OpenAI: GPT-4 and newer models
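Editor's illustration (not part of the diff): the inline doctest was removed here as well, so below is a short sketch of structured generation against 0.4.1, adapted from the deleted example and updated to the new signature. It assumes the same ai_pipeline_core.llm re-export as in the earlier sketch; the model name and purpose label are illustrative.

    import asyncio

    from pydantic import BaseModel, Field

    from ai_pipeline_core.llm import generate_structured  # assumed re-export


    class Analysis(BaseModel):
        summary: str = Field(description="Brief summary")
        sentiment: float = Field(ge=-1, le=1)
        key_points: list[str] = Field(max_length=5)


    async def main() -> None:
        response = await generate_structured(
            "gpt-5.1",
            response_format=Analysis,
            messages="Analyze this product review: ...",
            purpose="review-analysis",  # optional tracing label
        )
        analysis = response.parsed  # typed as Analysis, validated by Pydantic
        print(analysis.sentiment, analysis.key_points)


    asyncio.run(main())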
@@ -563,12 +690,9 @@ async def generate_structured(
      - Complex schemas increase generation time
      - Validation overhead is minimal (Pydantic is fast)

-     Note:
-         - Pydantic model is converted to JSON Schema for the API
-         - The model generates JSON matching the schema
-         - Validation happens automatically via Pydantic
-         - Use Field() descriptions to guide generation
-         - Search models (models with '-search' suffix) do not support structured output
+     Pydantic model is converted to JSON Schema for the API. Validation happens
+     automatically via Pydantic. Search models (models with '-search' suffix) do
+     not support structured output.
      """
      if context is None:
          context = AIMessages()
@@ -585,9 +709,19 @@ async def generate_structured(

      assert isinstance(messages, AIMessages)

+     with contextlib.suppress(Exception):
+         track_llm_documents(context, messages)
+
      # Call the internal generate function with structured output enabled
      try:
-         response = await _generate_with_retry(model, context, messages, options)
+         response = await _generate_with_retry(
+             model,
+             context,
+             messages,
+             options,
+             purpose=purpose,
+             expected_cost=expected_cost,
+         )
      except (ValueError, LLMError):
          raise  # Explicitly re-raise to satisfy DOC502