ai-pipeline-core 0.1.12__py3-none-any.whl → 0.4.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (90)
  1. ai_pipeline_core/__init__.py +83 -119
  2. ai_pipeline_core/deployment/__init__.py +34 -0
  3. ai_pipeline_core/deployment/base.py +861 -0
  4. ai_pipeline_core/deployment/contract.py +80 -0
  5. ai_pipeline_core/deployment/deploy.py +561 -0
  6. ai_pipeline_core/deployment/helpers.py +97 -0
  7. ai_pipeline_core/deployment/progress.py +126 -0
  8. ai_pipeline_core/deployment/remote.py +116 -0
  9. ai_pipeline_core/docs_generator/__init__.py +54 -0
  10. ai_pipeline_core/docs_generator/__main__.py +5 -0
  11. ai_pipeline_core/docs_generator/cli.py +196 -0
  12. ai_pipeline_core/docs_generator/extractor.py +324 -0
  13. ai_pipeline_core/docs_generator/guide_builder.py +644 -0
  14. ai_pipeline_core/docs_generator/trimmer.py +35 -0
  15. ai_pipeline_core/docs_generator/validator.py +114 -0
  16. ai_pipeline_core/document_store/__init__.py +13 -0
  17. ai_pipeline_core/document_store/_summary.py +9 -0
  18. ai_pipeline_core/document_store/_summary_worker.py +170 -0
  19. ai_pipeline_core/document_store/clickhouse.py +492 -0
  20. ai_pipeline_core/document_store/factory.py +38 -0
  21. ai_pipeline_core/document_store/local.py +312 -0
  22. ai_pipeline_core/document_store/memory.py +85 -0
  23. ai_pipeline_core/document_store/protocol.py +68 -0
  24. ai_pipeline_core/documents/__init__.py +14 -15
  25. ai_pipeline_core/documents/_context_vars.py +85 -0
  26. ai_pipeline_core/documents/_hashing.py +52 -0
  27. ai_pipeline_core/documents/attachment.py +85 -0
  28. ai_pipeline_core/documents/context.py +128 -0
  29. ai_pipeline_core/documents/document.py +349 -1062
  30. ai_pipeline_core/documents/mime_type.py +40 -85
  31. ai_pipeline_core/documents/utils.py +62 -7
  32. ai_pipeline_core/exceptions.py +10 -62
  33. ai_pipeline_core/images/__init__.py +309 -0
  34. ai_pipeline_core/images/_processing.py +151 -0
  35. ai_pipeline_core/llm/__init__.py +5 -3
  36. ai_pipeline_core/llm/ai_messages.py +284 -73
  37. ai_pipeline_core/llm/client.py +462 -209
  38. ai_pipeline_core/llm/model_options.py +86 -53
  39. ai_pipeline_core/llm/model_response.py +187 -241
  40. ai_pipeline_core/llm/model_types.py +34 -54
  41. ai_pipeline_core/logging/__init__.py +2 -9
  42. ai_pipeline_core/logging/logging.yml +1 -1
  43. ai_pipeline_core/logging/logging_config.py +27 -43
  44. ai_pipeline_core/logging/logging_mixin.py +17 -51
  45. ai_pipeline_core/observability/__init__.py +32 -0
  46. ai_pipeline_core/observability/_debug/__init__.py +30 -0
  47. ai_pipeline_core/observability/_debug/_auto_summary.py +94 -0
  48. ai_pipeline_core/observability/_debug/_config.py +95 -0
  49. ai_pipeline_core/observability/_debug/_content.py +764 -0
  50. ai_pipeline_core/observability/_debug/_processor.py +98 -0
  51. ai_pipeline_core/observability/_debug/_summary.py +312 -0
  52. ai_pipeline_core/observability/_debug/_types.py +75 -0
  53. ai_pipeline_core/observability/_debug/_writer.py +843 -0
  54. ai_pipeline_core/observability/_document_tracking.py +146 -0
  55. ai_pipeline_core/observability/_initialization.py +194 -0
  56. ai_pipeline_core/observability/_logging_bridge.py +57 -0
  57. ai_pipeline_core/observability/_summary.py +81 -0
  58. ai_pipeline_core/observability/_tracking/__init__.py +6 -0
  59. ai_pipeline_core/observability/_tracking/_client.py +178 -0
  60. ai_pipeline_core/observability/_tracking/_internal.py +28 -0
  61. ai_pipeline_core/observability/_tracking/_models.py +138 -0
  62. ai_pipeline_core/observability/_tracking/_processor.py +158 -0
  63. ai_pipeline_core/observability/_tracking/_service.py +311 -0
  64. ai_pipeline_core/observability/_tracking/_writer.py +229 -0
  65. ai_pipeline_core/observability/tracing.py +640 -0
  66. ai_pipeline_core/pipeline/__init__.py +10 -0
  67. ai_pipeline_core/pipeline/decorators.py +915 -0
  68. ai_pipeline_core/pipeline/options.py +16 -0
  69. ai_pipeline_core/prompt_manager.py +26 -105
  70. ai_pipeline_core/settings.py +41 -32
  71. ai_pipeline_core/testing.py +9 -0
  72. ai_pipeline_core-0.4.1.dist-info/METADATA +807 -0
  73. ai_pipeline_core-0.4.1.dist-info/RECORD +76 -0
  74. {ai_pipeline_core-0.1.12.dist-info → ai_pipeline_core-0.4.1.dist-info}/WHEEL +1 -1
  75. ai_pipeline_core/documents/document_list.py +0 -240
  76. ai_pipeline_core/documents/flow_document.py +0 -128
  77. ai_pipeline_core/documents/task_document.py +0 -133
  78. ai_pipeline_core/documents/temporary_document.py +0 -95
  79. ai_pipeline_core/flow/__init__.py +0 -9
  80. ai_pipeline_core/flow/config.py +0 -314
  81. ai_pipeline_core/flow/options.py +0 -75
  82. ai_pipeline_core/pipeline.py +0 -717
  83. ai_pipeline_core/prefect.py +0 -54
  84. ai_pipeline_core/simple_runner/__init__.py +0 -24
  85. ai_pipeline_core/simple_runner/cli.py +0 -255
  86. ai_pipeline_core/simple_runner/simple_runner.py +0 -385
  87. ai_pipeline_core/tracing.py +0 -475
  88. ai_pipeline_core-0.1.12.dist-info/METADATA +0 -450
  89. ai_pipeline_core-0.1.12.dist-info/RECORD +0 -36
  90. {ai_pipeline_core-0.1.12.dist-info → ai_pipeline_core-0.4.1.dist-info}/licenses/LICENSE +0 -0
ai_pipeline_core/llm/client.py
@@ -1,93 +1,217 @@
  """LLM client implementation for AI model interactions.

- @public
-
  This module provides the core functionality for interacting with language models
  through a unified interface. It handles retries, caching, structured outputs,
  and integration with various LLM providers via LiteLLM.

- Key functions:
- - generate(): Text generation with optional context caching
- - generate_structured(): Type-safe structured output generation
+ Automatic image auto-tiling splits oversized images in attachments to meet
+ model-specific constraints (e.g., 3000x3000 for Gemini, 1000x1000 default).
+ Context caching separates static content from dynamic messages for 50-90% token savings.
+ Optional purpose and expected_cost parameters enable tracing and cost-tracking.
  """

  import asyncio
+ import contextlib
+ import time
+ from io import BytesIO
  from typing import Any, TypeVar

  from lmnr import Laminar
  from openai import AsyncOpenAI
+ from openai.lib.streaming.chat import ChunkEvent, ContentDeltaEvent, ContentDoneEvent
  from openai.types.chat import (
+     ChatCompletion,
      ChatCompletionMessageParam,
  )
- from prefect.logging import get_logger
- from pydantic import BaseModel
+ from PIL import Image
+ from pydantic import BaseModel, ValidationError

+ from ai_pipeline_core.documents import Document
+ from ai_pipeline_core.documents.attachment import Attachment
  from ai_pipeline_core.exceptions import LLMError
+ from ai_pipeline_core.images import ImageProcessingConfig, process_image, process_image_to_documents
+ from ai_pipeline_core.logging import get_pipeline_logger
+ from ai_pipeline_core.observability._document_tracking import track_llm_documents
  from ai_pipeline_core.settings import settings
- from ai_pipeline_core.tracing import trace

- from .ai_messages import AIMessages
+ from .ai_messages import AIMessages, AIMessageType
  from .model_options import ModelOptions
  from .model_response import ModelResponse, StructuredModelResponse
  from .model_types import ModelName

- logger = get_logger()
+ logger = get_pipeline_logger(__name__)
+
+ # Image splitting configs for automatic large-image handling at the LLM boundary.
+ # Gemini supports up to 3000x3000; all other models use a conservative 1000x1000 default.
+ _GEMINI_IMAGE_CONFIG = ImageProcessingConfig(max_dimension=3000, max_pixels=9_000_000, jpeg_quality=75)
+ _DEFAULT_IMAGE_CONFIG = ImageProcessingConfig(max_dimension=1000, max_pixels=1_000_000, jpeg_quality=75)
+
+
+ def _get_image_config(model: str) -> ImageProcessingConfig:
+     """Return the image splitting config for a model."""
+     if "gemini" in model.lower():
+         return _GEMINI_IMAGE_CONFIG
+     return _DEFAULT_IMAGE_CONFIG
+
+
+ def _prepare_images_for_model(messages: AIMessages, model: str) -> AIMessages:  # noqa: C901, PLR0912, PLR0915, PLR0914
+     """Split image documents and image attachments that exceed model constraints.
+
+     Returns a new AIMessages with oversized images replaced by tiles.
+     Returns the original instance unchanged if no splitting is needed.
+     """
+     if not any(isinstance(m, Document) and (m.is_image or any(att.is_image for att in m.attachments)) for m in messages):
+         return messages
+
+     config = _get_image_config(model)
+     result: list[AIMessageType] = []
+     changed = False
+
+     for msg in messages:
+         if not isinstance(msg, Document):
+             result.append(msg)
+             continue
+
+         # 1. Handle top-level image Documents (existing logic)
+         if msg.is_image:
+             try:
+                 with Image.open(BytesIO(msg.content)) as img:
+                     w, h = img.size
+             except Exception:
+                 result.append(msg)
+                 continue
+
+             within_limits = w <= config.max_dimension and h <= config.max_dimension and w * h <= config.max_pixels
+             if within_limits:
+                 pass  # Falls through to attachment handling
+             else:
+                 name_prefix = msg.name.rsplit(".", 1)[0] if "." in msg.name else msg.name
+                 tiles = process_image_to_documents(msg, config=config, name_prefix=name_prefix)
+                 if msg.attachments and tiles:
+                     tiles[0] = tiles[0].model_copy(update={"attachments": msg.attachments})
+                 result.extend(tiles)
+                 changed = True
+                 continue
+
+         # 2. Handle image attachments
+         if msg.attachments:
+             new_attachments: list[Attachment] = []
+             attachments_changed = False
+
+             for att in msg.attachments:
+                 if not att.is_image:
+                     new_attachments.append(att)
+                     continue
+
+                 try:
+                     with Image.open(BytesIO(att.content)) as img:
+                         w, h = img.size
+                 except Exception:
+                     new_attachments.append(att)
+                     continue
+
+                 att_within_limits = w <= config.max_dimension and h <= config.max_dimension and w * h <= config.max_pixels
+                 if att_within_limits:
+                     new_attachments.append(att)
+                     continue
+
+                 # Tile the oversized attachment image
+                 processed = process_image(att.content, config=config)
+                 att_prefix = att.name.rsplit(".", 1)[0] if "." in att.name else att.name
+
+                 for part in processed.parts:
+                     if part.total == 1:
+                         tile_name = f"{att_prefix}.jpg"
+                         tile_desc = att.description
+                     else:
+                         tile_name = f"{att_prefix}_{part.index + 1:02d}_of_{part.total:02d}.jpg"
+                         tile_desc = f"{att.description} ({part.label})" if att.description else part.label
+
+                     new_attachments.append(
+                         Attachment(
+                             name=tile_name,
+                             content=part.data,
+                             description=tile_desc,
+                         )
+                     )
+                     attachments_changed = True
+
+             if attachments_changed:
+                 msg = msg.model_copy(update={"attachments": tuple(new_attachments)})  # noqa: PLW2901
+                 changed = True
+
+         result.append(msg)
+
+     if not changed:
+         return messages
+     return AIMessages(result)


  def _process_messages(
      context: AIMessages,
      messages: AIMessages,
      system_prompt: str | None = None,
+     cache_ttl: str | None = "300s",
  ) -> list[ChatCompletionMessageParam]:
      """Process and format messages for LLM API consumption.

      Internal function that combines context and messages into a single
      list of API-compatible messages. Applies caching directives to
-     context messages for efficiency.
+     system prompt and context messages for efficiency.

      Args:
          context: Messages to be cached (typically expensive/static content).
          messages: Regular messages without caching (dynamic queries).
          system_prompt: Optional system instructions for the model.
+         cache_ttl: Cache TTL for system and context messages (e.g. "120s", "300s", "1h").
+             Set to None or empty string to disable caching.

      Returns:
          List of formatted messages ready for API calls, with:
-         - System prompt at the beginning (if provided)
-         - Context messages with cache_control on the last one
+         - System prompt at the beginning with cache_control (if provided and cache_ttl set)
+         - Context messages with cache_control on all messages (if cache_ttl set)
          - Regular messages without caching

      System Prompt Location:
-         The system prompt from ModelOptions.system_prompt is always injected
-         as the FIRST message with role="system". It is NOT cached with context,
-         allowing dynamic system prompts without breaking cache efficiency.
+         The system prompt parameter is always injected as the FIRST message
+         with role="system". It is cached along with context when cache_ttl is set.

      Cache behavior:
-         The last context message gets ephemeral caching (120s TTL)
+         All system and context messages get ephemeral caching with specified TTL
          to reduce token usage on repeated calls with same context.
+         If cache_ttl is None or empty string (falsy), no caching is applied.
+         All system and context messages receive cache_control to maximize cache efficiency.

-     Note:
-         This is an internal function used by _generate_with_retry().
-         The context/messages split enables efficient token usage.
+     This is an internal function used by _generate_with_retry().
+     The context/messages split enables efficient token usage.
      """
      processed_messages: list[ChatCompletionMessageParam] = []

      # Add system prompt if provided
      if system_prompt:
-         processed_messages.append({"role": "system", "content": system_prompt})
+         processed_messages.append({
+             "role": "system",
+             "content": [{"type": "text", "text": system_prompt}],
+         })

      # Process context messages with caching if provided
      if context:
          # Use AIMessages.to_prompt() for context
          context_messages = context.to_prompt()
-
-         # Apply caching to last context message
-         context_messages[-1]["cache_control"] = {  # type: ignore
-             "type": "ephemeral",
-             "ttl": "120s",  # Cache for 2m
-         }
-
          processed_messages.extend(context_messages)

+     if cache_ttl:
+         for message in processed_messages:
+             message["cache_control"] = {  # type: ignore
+                 "type": "ephemeral",
+                 "ttl": cache_ttl,
+             }
+             if isinstance(message["content"], list):  # type: ignore
+                 message["content"][-1]["cache_control"] = {  # type: ignore
+                     "type": "ephemeral",
+                     "ttl": cache_ttl,
+                 }
+
      # Process regular messages without caching
      if messages:
          regular_messages = messages.to_prompt()
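Illustration (editorial, not part of the package): with cache_ttl left at its "300s" default, the loop at the end of _process_messages above stamps an ephemeral cache_control directive on every system/context message and on its last content part. A system message therefore ends up shaped roughly like this; the prompt text is a placeholder:

    cached_system_message = {
        "role": "system",
        "content": [
            {
                "type": "text",
                "text": "You are a careful analyst.",  # placeholder system prompt
                "cache_control": {"type": "ephemeral", "ttl": "300s"},
            }
        ],
        "cache_control": {"type": "ephemeral", "ttl": "300s"},
    }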
@@ -96,56 +220,157 @@ def _process_messages(
      return processed_messages


- async def _generate(
-     model: str, messages: list[ChatCompletionMessageParam], completion_kwargs: dict[str, Any]
+ def _remove_cache_control(
+     messages: list[ChatCompletionMessageParam],
+ ) -> list[ChatCompletionMessageParam]:
+     """Remove cache control directives from messages.
+
+     Internal utility that strips cache_control fields from both message-level
+     and content-level entries. Used in retry logic when cache-related errors
+     occur during LLM API calls.
+
+     Args:
+         messages: List of messages that may contain cache_control directives.
+
+     Returns:
+         The same message list (modified in-place) with all cache_control
+         fields removed from both messages and their content items.
+
+     Modifies the input list in-place but also returns it for convenience.
+     Handles both list-based content (multipart) and string content (simple messages).
+     """
+     for message in messages:
+         if (content := message.get("content")) and isinstance(content, list):
+             for item in content:
+                 if "cache_control" in item:
+                     del item["cache_control"]
+         if "cache_control" in message:
+             del message["cache_control"]
+     return messages
+
+
+ def _model_name_to_openrouter_model(model: ModelName) -> str:
+     """Convert a model name to an OpenRouter model name.
+
+     Args:
+         model: Model name to convert.
+
+     Returns:
+         OpenRouter model name.
+     """
+     if model == "gemini-3-flash-search":
+         return "google/gemini-3-flash:online"
+     if model == "sonar-pro-search":
+         return "perplexity/sonar-pro-search"
+     if model.startswith("gemini"):
+         return f"google/{model}"
+     elif model.startswith("gpt"):
+         return f"openai/{model}"
+     elif model.startswith("grok"):
+         return f"x-ai/{model}"
+     elif model.startswith("claude"):
+         return f"anthropic/{model}"
+     elif model.startswith("qwen3"):
+         return f"qwen/{model}"
+     elif model.startswith("deepseek-"):
+         return f"deepseek/{model}"
+     elif model.startswith("glm-"):
+         return f"z-ai/{model}"
+     elif model.startswith("kimi-"):
+         return f"moonshotai/{model}"
+     return model
+
+
+ async def _generate_streaming(client: AsyncOpenAI, model: str, messages: list[ChatCompletionMessageParam], completion_kwargs: dict[str, Any]) -> ModelResponse:
+     """Execute a streaming LLM API call."""
+     start_time = time.time()
+     first_token_time = None
+     usage = None
+     async with client.chat.completions.stream(
+         model=model,
+         messages=messages,
+         **completion_kwargs,
+     ) as s:
+         async for event in s:
+             if isinstance(event, ContentDeltaEvent):
+                 if not first_token_time:
+                     first_token_time = time.time()
+             elif isinstance(event, ContentDoneEvent):
+                 pass
+             elif isinstance(event, ChunkEvent) and event.chunk.usage:
+                 usage = event.chunk.usage
+                 if not first_token_time:
+                     first_token_time = time.time()
+         raw_response = await s.get_final_completion()
+
+     metadata = {
+         "time_taken": round(time.time() - start_time, 2),
+         "first_token_time": round(first_token_time - start_time, 2),
+     }
+     return ModelResponse(raw_response, model_options=completion_kwargs, metadata=metadata, usage=usage)
+
+
+ async def _generate_non_streaming(
+     client: AsyncOpenAI, model: str, messages: list[ChatCompletionMessageParam], completion_kwargs: dict[str, Any]
  ) -> ModelResponse:
-     """Execute a single LLM API call.
+     """Execute a non-streaming LLM API call.

-     Internal function that makes the actual API request to the LLM provider.
-     Handles both regular and structured output generation.
+     Avoids OpenAI SDK delta accumulation some providers (e.g. Grok) send
+     streaming annotation deltas that crash the SDK's accumulate_delta().
+     """
+     start_time = time.time()
+     kwargs = {k: v for k, v in completion_kwargs.items() if k != "stream_options"}
+     response_format = kwargs.get("response_format")
+     if isinstance(response_format, type) and issubclass(response_format, BaseModel):
+         raw_response: ChatCompletion = await client.chat.completions.parse(
+             model=model,
+             messages=messages,
+             **kwargs,
+         )
+     else:
+         raw_response = await client.chat.completions.create(
+             model=model,
+             messages=messages,
+             stream=False,
+             **kwargs,
+         )
+     elapsed = round(time.time() - start_time, 2)
+     metadata = {"time_taken": elapsed, "first_token_time": elapsed}
+     return ModelResponse(raw_response, model_options=completion_kwargs, metadata=metadata)
+
+
+ async def _generate(model: str, messages: list[ChatCompletionMessageParam], completion_kwargs: dict[str, Any], *, stream: bool = True) -> ModelResponse:
+     """Execute a single LLM API call.

      Args:
-         model: Model identifier (e.g., "gpt-5", "gemini-2.5-pro").
+         model: Model identifier (e.g., "gpt-5.1", "gemini-3-pro").
          messages: Formatted messages for the API.
          completion_kwargs: Additional parameters for the completion API.
+         stream: Whether to use streaming mode (default True). Non-streaming
+             avoids OpenAI SDK delta accumulation issues with some providers.

      Returns:
          ModelResponse with generated content and metadata.
-
-     API selection:
-         - Uses client.chat.completions.parse() for structured output
-         - Uses client.chat.completions.create() for regular text
-
-     Note:
-         - Uses AsyncOpenAI client configured via settings
-         - Captures response headers for cost tracking
-         - Response includes model options for debugging
      """
+     if "openrouter" in settings.openai_base_url.lower():
+         model = _model_name_to_openrouter_model(model)
+
      async with AsyncOpenAI(
          api_key=settings.openai_api_key,
          base_url=settings.openai_base_url,
      ) as client:
-         # Use parse for structured output, create for regular
-         if completion_kwargs.get("response_format"):
-             raw_response = await client.chat.completions.with_raw_response.parse(  # type: ignore[var-annotated]
-                 **completion_kwargs,
-             )
-         else:
-             raw_response = await client.chat.completions.with_raw_response.create(  # type: ignore[var-annotated]
-                 **completion_kwargs
-             )
-
-         response = ModelResponse(raw_response.parse())  # type: ignore[arg-type]
-         response.set_model_options(completion_kwargs)
-         response.set_headers(dict(raw_response.headers.items()))  # type: ignore[arg-type]
-         return response
+         if stream:
+             return await _generate_streaming(client, model, messages, completion_kwargs)
+         return await _generate_non_streaming(client, model, messages, completion_kwargs)


- async def _generate_with_retry(
+ async def _generate_with_retry(  # noqa: PLR0917
      model: str,
      context: AIMessages,
      messages: AIMessages,
      options: ModelOptions,
+     purpose: str | None = None,
+     expected_cost: float | None = None,
  ) -> ModelResponse:
      """Core LLM generation with automatic retry logic.

@@ -157,6 +382,8 @@ async def _generate_with_retry(
          context: Cached context messages (can be empty).
          messages: Dynamic query messages.
          options: Configuration including retries, timeout, temperature.
+         purpose: Optional semantic label for the LLM span name.
+         expected_cost: Optional expected cost for cost-tracking attributes.

      Returns:
          ModelResponse with generated content.
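Illustration (editorial, not part of the package): when settings.openai_base_url points at OpenRouter, the _model_name_to_openrouter_model helper added earlier in this diff rewrites bare model ids before the request is sent. Reading the branches shown above gives, for example:

    expected_openrouter_ids = {
        "gemini-3-flash-search": "google/gemini-3-flash:online",
        "sonar-pro-search": "perplexity/sonar-pro-search",
        "gemini-3-pro": "google/gemini-3-pro",
        "gpt-5.1": "openai/gpt-5.1",
        "grok-4.1-fast": "x-ai/grok-4.1-fast",
    }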
@@ -165,45 +392,51 @@ async def _generate_with_retry(
          ValueError: If model is not provided or both context and messages are empty.
          LLMError: If all retry attempts are exhausted.

-     Note:
-         Empty responses trigger a retry as they indicate API issues.
+     Empty responses trigger a retry as they indicate API issues.
      """
      if not model:
          raise ValueError("Model must be provided")
      if not context and not messages:
          raise ValueError("Either context or messages must be provided")

-     processed_messages = _process_messages(context, messages, options.system_prompt)
+     # Auto-split large images based on model-specific constraints
+     context = _prepare_images_for_model(context, model)
+     messages = _prepare_images_for_model(messages, model)
+
+     if "gemini" in model.lower() and context.approximate_tokens_count < 10000:
+         # Bug fix for minimum explicit context size for Gemini models
+         options.cache_ttl = None
+
+     processed_messages = _process_messages(context, messages, options.system_prompt, options.cache_ttl)
      completion_kwargs: dict[str, Any] = {
-         "model": model,
-         "messages": processed_messages,
          **options.to_openai_completion_kwargs(),
      }

-     if context:
+     if context and options.cache_ttl:
          completion_kwargs["prompt_cache_key"] = context.get_prompt_cache_key(options.system_prompt)

      for attempt in range(options.retries):
          try:
-             with Laminar.start_as_current_span(
-                 model, span_type="LLM", input=processed_messages
-             ) as span:
-                 response = await _generate(model, processed_messages, completion_kwargs)
-                 span.set_attributes(response.get_laminar_metadata())
-                 Laminar.set_span_output(response.content)
-                 if not response.content:
-                     raise ValueError(f"Model {model} returned an empty response.")
+             with Laminar.start_as_current_span(purpose or model, span_type="LLM", input=processed_messages) as span:
+                 response = await _generate(model, processed_messages, completion_kwargs, stream=options.stream)
+                 laminar_metadata = response.get_laminar_metadata()
+                 if purpose:
+                     laminar_metadata["purpose"] = purpose
+                 if expected_cost is not None:
+                     laminar_metadata["expected_cost"] = expected_cost
+                 span.set_attributes(laminar_metadata)  # pyright: ignore[reportArgumentType]
+                 Laminar.set_span_output([r for r in (response.reasoning_content, response.content) if r])
+                 response.validate_output()
                  return response
-         except (asyncio.TimeoutError, ValueError, Exception) as e:
+         except (TimeoutError, ValueError, ValidationError, Exception) as e:
              if not isinstance(e, asyncio.TimeoutError):
                  # disable cache if it's not a timeout because it may cause an error
                  completion_kwargs["extra_body"]["cache"] = {"no-cache": True}
+                 # sometimes there are issues with cache so cache is removed in case of failure
+                 processed_messages = _remove_cache_control(processed_messages)

              logger.warning(
-                 "LLM generation failed (attempt %d/%d): %s",
-                 attempt + 1,
-                 options.retries,
-                 e,
+                 f"LLM generation failed (attempt {attempt + 1}/{options.retries}): {e}",
              )
              if attempt == options.retries - 1:
                  raise LLMError("Exhausted all retry attempts for LLM generation.") from e
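Caller-side sketch (editorial; assumes generate is importable from ai_pipeline_core.llm as the docstring examples imply, and the function below is hypothetical): once the retry loop above exhausts options.retries attempts it raises LLMError, so a caller that wants to degrade gracefully can catch it:

    from ai_pipeline_core.exceptions import LLMError
    from ai_pipeline_core.llm import generate  # assumed export; the docstrings call it llm.generate()

    async def summarize(text: str) -> str:
        try:
            response = await generate("gpt-5.1", messages=f"Summarize:\n{text}")
        except LLMError:
            return ""  # all retry attempts exhausted; degrade gracefully
        return response.content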
@@ -213,37 +446,43 @@ async def _generate_with_retry(
      raise LLMError("Unknown error occurred during LLM generation.")


- @trace(ignore_inputs=["context"])
  async def generate(
-     model: ModelName | str,
+     model: ModelName,
      *,
      context: AIMessages | None = None,
      messages: AIMessages | str,
      options: ModelOptions | None = None,
+     purpose: str | None = None,
+     expected_cost: float | None = None,
  ) -> ModelResponse:
      """Generate text response from a language model.

-     @public
-
      Main entry point for LLM text generation with smart context caching.
      The context/messages split enables efficient token usage by caching
      expensive static content separately from dynamic queries.

      Best Practices:
-         1. OPTIONS: Omit in 90% of cases - defaults are optimized
+         1. OPTIONS: DO NOT use the options parameter - omit it entirely for production use
          2. MESSAGES: Use AIMessages or str - wrap Documents in AIMessages
          3. CONTEXT vs MESSAGES: Use context for static/cacheable, messages for dynamic
+         4. CONFIGURATION: Configure model behavior via LiteLLM proxy or environment variables

      Args:
-         model: Model to use (e.g., "gpt-5", "gemini-2.5-pro", "grok-4").
-             Can be ModelName literal or any string for custom models.
+         model: Model to use (e.g., "gpt-5.1", "gemini-3-pro", "grok-4.1-fast").
+             Accepts predefined models or any string for custom models.
          context: Static context to cache (documents, examples, instructions).
-             Defaults to None (empty context). Cached for 120 seconds.
+             Defaults to None (empty context). Cached for 5 minutes by default.
          messages: Dynamic messages/queries. AIMessages or str ONLY.
-             Do not pass Document or DocumentList directly.
+             Do not pass Document or list[Document] directly.
              If string, converted to AIMessages internally.
-         options: Model configuration (temperature, retries, timeout, etc.).
-             Defaults to None (uses ModelOptions() with standard settings).
+         options: Internal framework parameter. Framework defaults are production-optimized
+             (3 retries, 20s delay, 600s timeout). Configure model behavior centrally via
+             LiteLLM proxy settings or environment variables, not per API call.
+             Provider-specific settings should be configured at the proxy level.
+         purpose: Optional semantic label used as the tracing span name
+             instead of model name. Stored as a span attribute.
+         expected_cost: Optional expected cost stored as a span attribute
+             for cost-tracking and comparison with actual cost.

      Returns:
          ModelResponse containing:
@@ -260,75 +499,57 @@ async def generate(
          Wrap Documents in AIMessages - DO NOT pass directly or convert to .text:

          # CORRECT - wrap Document in AIMessages
-         response = await llm.generate("gpt-5", messages=AIMessages([my_document]))
+         response = await llm.generate("gpt-5.1", messages=AIMessages([my_document]))

          # WRONG - don't pass Document directly
-         response = await llm.generate("gpt-5", messages=my_document)  # NO!
+         response = await llm.generate("gpt-5.1", messages=my_document)  # NO!

          # WRONG - don't convert to string yourself
-         response = await llm.generate("gpt-5", messages=my_document.text)  # NO!
+         response = await llm.generate("gpt-5.1", messages=my_document.text)  # NO!
+
+     VISION/PDF MODEL COMPATIBILITY:
+         When using Documents containing images or PDFs, ensure your model supports these formats:
+         - Images require vision-capable models (gpt-5.1, gemini-3-flash, gemini-3-pro)
+         - PDFs require document processing support (varies by provider)
+         - Non-compatible models will raise ValueError or fall back to text extraction
+         - Check model capabilities before including visual/PDF content

      Context vs Messages Strategy:
-         context: Static, reusable content (cached 120 seconds)
+         context: Static, reusable content for caching efficiency
          - Large documents, instructions, examples
-         - Same across multiple calls
+         - Remains constant across multiple calls
+         - Cached when supported by provider/proxy configuration

-         messages: Dynamic, query-specific content
+         messages: Dynamic, per-call specific content
          - User questions, current conversation turn
-         - Changes every call
-
-     Example:
-         >>> # Simple case - no options needed (90% of cases)
-         >>> response = await llm.generate("gpt-5", messages="Explain quantum computing")
-         >>> print(response.content)  # In production, use get_pipeline_logger instead of print
-
-         >>> # With context caching for efficiency
-         >>> # Context and messages are both AIMessages or str; wrap any Documents
-         >>> static_doc = AIMessages([large_document, "few-shot example: ..."])
-         >>>
-         >>> # First call: caches context
-         >>> r1 = await llm.generate("gpt-5", context=static_doc, messages="Summarize")
-         >>>
-         >>> # Second call: reuses cache, saves tokens!
-         >>> r2 = await llm.generate("gpt-5", context=static_doc, messages="Key points?")
-
-         >>> # AVOID unnecessary options (defaults are optimal)
-         >>> response = await llm.generate(
-         ...     "gpt-5",
-         ...     messages="Hello",
-         ...     options=ModelOptions(temperature=0.7)  # Default is probably fine!
-         ... )
-
-         >>> # Multi-turn conversation
-         >>> messages = AIMessages([
-         ...     "What is Python?",
-         ...     previous_response,
-         ...     "Can you give an example?"
-         ... ])
-         >>> response = await llm.generate("gpt-5", messages=messages)
+         - Changes with each API call
+         - Never cached, always processed fresh

      Performance:
          - Context caching saves ~50-90% tokens on repeated calls
          - First call: full token cost
-         - Subsequent calls (within 120s): only messages tokens
-         - Default retry delay is 10s (configurable via ModelOptions.retry_delay_seconds)
+         - Subsequent calls (within cache TTL): only messages tokens
+         - Default cache TTL is 300s/5 minutes (production-optimized)
+         - Default retry logic: 3 attempts with 20s delay (production-optimized)

      Caching:
          When enabled in your LiteLLM proxy and supported by the upstream provider,
-         context messages may be cached (typical TTL ~120s) to reduce token usage on
-         repeated calls. Savings depend on provider and payload; treat this as an
-         optimization, not a guarantee. Cache behavior varies by proxy configuration.
-
-     Note:
-         - Context argument is ignored by the tracer to avoid recording large data
-         - All models are accessed via LiteLLM proxy
-         - Automatic retry with configurable delay between attempts
-         - Cost tracking via response headers
-
-     See Also:
-         - generate_structured: For typed/structured output
-         - AIMessages: Message container with document support
-         - ModelOptions: Configuration options
+         context messages may be cached to reduce token usage on repeated calls.
+         Default TTL is 5m (optimized for production workloads). Configure caching
+         behavior centrally via your LiteLLM proxy settings, not per API call.
+         Savings depend on provider and payload; treat this as an optimization, not a guarantee.
+
+     Configuration:
+         All model behavior should be configured at the LiteLLM proxy level:
+         - Temperature, max_tokens: Set in litellm_config.yaml model_list
+         - Retry logic: Configure in proxy general_settings
+         - Timeouts: Set via proxy configuration
+         - Caching: Enable/configure in proxy cache settings
+
+         This centralizes configuration and ensures consistency across all API calls.
+
+     All models are accessed via LiteLLM proxy with automatic retry and
+     cost tracking via response headers.
      """
      if isinstance(messages, str):
          messages = AIMessages([messages])
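Usage sketch (editorial, reconstructed from the signature and docstring above; the import location and report_doc are assumptions): static material goes in context so that repeated calls within the cache TTL only pay for the dynamic messages, while purpose and expected_cost become tracing span attributes:

    from ai_pipeline_core.llm import AIMessages, generate  # assumed exports per the docstring examples

    async def ask_report(report_doc, question: str) -> str:
        static_context = AIMessages([report_doc, "Answer using the report only."])
        response = await generate(
            "gpt-5.1",
            context=static_context,   # cached; reused by later calls within the TTL
            messages=question,        # dynamic, never cached
            purpose="report-qa",      # optional tracing span name
            expected_cost=0.01,       # optional cost-tracking attribute
        )
        return response.content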
@@ -337,9 +558,22 @@ async def generate(
          context = AIMessages()
      if options is None:
          options = ModelOptions()
+     else:
+         # Create a copy to avoid mutating the caller's options object
+         options = options.model_copy()
+
+     with contextlib.suppress(Exception):
+         track_llm_documents(context, messages)

      try:
-         return await _generate_with_retry(model, context, messages, options)
+         return await _generate_with_retry(
+             model,
+             context,
+             messages,
+             options,
+             purpose=purpose,
+             expected_cost=expected_cost,
+         )
      except (ValueError, LLMError):
          raise  # Explicitly re-raise to satisfy DOC502

@@ -348,36 +582,87 @@ T = TypeVar("T", bound=BaseModel)
  """Type variable for Pydantic model types in structured generation."""


- @trace(ignore_inputs=["context"])
- async def generate_structured(
-     model: ModelName | str,
+ async def generate_structured(  # noqa: UP047
+     model: ModelName,
      response_format: type[T],
      *,
      context: AIMessages | None = None,
      messages: AIMessages | str,
      options: ModelOptions | None = None,
+     purpose: str | None = None,
+     expected_cost: float | None = None,
  ) -> StructuredModelResponse[T]:
      """Generate structured output conforming to a Pydantic model.

-     @public
-
      Type-safe generation that returns validated Pydantic model instances.
      Uses OpenAI's structured output feature for guaranteed schema compliance.

-     Best Practices (same as generate):
-         1. OPTIONS: Omit in 90% of cases - defaults are optimized
+     IMPORTANT: Search models (models with '-search' suffix) do not support
+     structured output. Use generate() instead for search models.
+
+     Best Practices:
+         1. OPTIONS: DO NOT use the options parameter - omit it entirely for production use
          2. MESSAGES: Use AIMessages or str - wrap Documents in AIMessages
-         3. CONTEXT vs MESSAGES: Use context for static/cacheable, messages for dynamic
+         3. CONFIGURATION: Configure model behavior via LiteLLM proxy or environment variables
+         4. See generate() documentation for more details
+
+     Context vs Messages Strategy:
+         context: Static, reusable content for caching efficiency
+         - Schemas, examples, instructions
+         - Remains constant across multiple calls
+         - Cached when supported by provider/proxy configuration
+
+         messages: Dynamic, per-call specific content
+         - Data to be structured, user queries
+         - Changes with each API call
+         - Never cached, always processed fresh
+
+     Complex Task Pattern:
+         For complex tasks like research or deep analysis, it's recommended to use
+         a two-step approach:
+         1. First use generate() with a capable model to perform the analysis
+         2. Then use generate_structured() with a smaller model to convert the
+            response into structured output
+
+         This pattern is more reliable than trying to force complex reasoning
+         directly into structured format:
+
+         >>> # Step 1: Research/analysis with generate() - no options parameter
+         >>> research = await llm.generate(
+         ...     "gpt-5.1",
+         ...     messages="Research and analyze this complex topic..."
+         ... )
+         >>>
+         >>> # Step 2: Structure the results with generate_structured()
+         >>> structured = await llm.generate_structured(
+         ...     "gpt-5-mini",  # Smaller model is fine for structuring
+         ...     response_format=ResearchSummary,
+         ...     messages=f"Extract key information: {research.content}"
+         ... )

      Args:
          model: Model to use (must support structured output).
+             Search models (models with '-search' suffix) do not support structured output.
          response_format: Pydantic model class defining the output schema.
              The model will generate JSON matching this schema.
          context: Static context to cache (documents, schemas, examples).
              Defaults to None (empty AIMessages).
          messages: Dynamic prompts/queries. AIMessages or str ONLY.
-             Do not pass Document or DocumentList directly.
-         options: Model configuration. response_format is set automatically.
+             Do not pass Document or list[Document] directly.
+         options: Optional ModelOptions for configuring temperature, retries, etc.
+             If provided, it will NOT be mutated (a copy is created internally).
+             The response_format field is set automatically from the response_format parameter.
+             In most cases, leave as None to use framework defaults.
+             Configure model behavior centrally via LiteLLM proxy settings when possible.
+         purpose: Optional semantic label used as the tracing span name
+             instead of model name. Stored as a span attribute.
+         expected_cost: Optional expected cost stored as a span attribute
+             for cost-tracking and comparison with actual cost.
+
+     Vision/PDF model compatibility: Images require vision-capable models that also support
+     structured output. PDFs require models with both document processing AND structured output
+     support. Consider two-step approach: generate() for analysis, then generate_structured()
+     for formatting.

      Returns:
          StructuredModelResponse[T] containing:
@@ -387,89 +672,57 @@ async def generate_structured(
      Raises:
          TypeError: If response_format is not a Pydantic model class.
          ValueError: If model doesn't support structured output or no parsed content returned.
+             Structured output support varies by provider and model.
          LLMError: If generation fails after retries.
          ValidationError: If response cannot be parsed into response_format.

-     Example:
-         >>> from pydantic import BaseModel, Field
-         >>>
-         >>> class Analysis(BaseModel):
-         ...     summary: str = Field(description="Brief summary")
-         ...     sentiment: float = Field(ge=-1, le=1)
-         ...     key_points: list[str] = Field(max_length=5)
-         >>>
-         >>> response = await llm.generate_structured(
-         ...     model="gpt-5",
-         ...     response_format=Analysis,
-         ...     messages="Analyze this product review: ..."
-         ... )
-         >>>
-         >>> analysis = response.parsed  # Type: Analysis
-         >>> print(f"Sentiment: {analysis.sentiment}")
-         >>> for point in analysis.key_points:
-         ...     print(f"- {point}")
-
      Supported models:
-         Support varies by provider and model. Generally includes:
+         Structured output support varies by provider and model. Generally includes:
          - OpenAI: GPT-4 and newer models
          - Anthropic: Claude 3+ models
          - Google: Gemini Pro models
-         Check provider documentation for specific model support.
+
+         Search models (models with '-search' suffix) do not support structured output.
+         Check provider documentation for specific support.

      Performance:
          - Structured output may use more tokens than free text
          - Complex schemas increase generation time
          - Validation overhead is minimal (Pydantic is fast)

-     Note:
-         - Pydantic model is converted to JSON Schema for the API
-         - The model generates JSON matching the schema
-         - Validation happens automatically via Pydantic
-         - Use Field() descriptions to guide generation
-
-     See Also:
-         - generate: For unstructured text generation
-         - ModelOptions: Configuration including response_format
-         - StructuredModelResponse: Response wrapper with .parsed property
+     Pydantic model is converted to JSON Schema for the API. Validation happens
+     automatically via Pydantic. Search models (models with '-search' suffix) do
+     not support structured output.
      """
      if context is None:
          context = AIMessages()
      if options is None:
          options = ModelOptions()
+     else:
+         # Create a copy to avoid mutating the caller's options object
+         options = options.model_copy()

      options.response_format = response_format

      if isinstance(messages, str):
          messages = AIMessages([messages])

+     assert isinstance(messages, AIMessages)
+
+     with contextlib.suppress(Exception):
+         track_llm_documents(context, messages)
+
      # Call the internal generate function with structured output enabled
      try:
-         response = await _generate_with_retry(model, context, messages, options)
+         response = await _generate_with_retry(
+             model,
+             context,
+             messages,
+             options,
+             purpose=purpose,
+             expected_cost=expected_cost,
+         )
      except (ValueError, LLMError):
          raise  # Explicitly re-raise to satisfy DOC502

-     # Extract the parsed value from the response
-     parsed_value: T | None = None
-
-     # Check if response has choices and parsed content
-     if response.choices and hasattr(response.choices[0].message, "parsed"):
-         parsed: Any = response.choices[0].message.parsed  # type: ignore[attr-defined]
-
-         # If parsed is a dict, instantiate it as the response format class
-         if isinstance(parsed, dict):
-             parsed_value = response_format(**parsed)
-         # If it's already the right type, use it
-         elif isinstance(parsed, response_format):
-             parsed_value = parsed
-         else:
-             # Otherwise try to convert it
-             raise TypeError(
-                 f"Unable to convert parsed response to {response_format.__name__}: "
-                 f"got type {type(parsed).__name__}"  # type: ignore[reportUnknownArgumentType]
-             )
-
-     if parsed_value is None:
-         raise ValueError("No parsed content available from the model response")
-
-     # Create a StructuredModelResponse with the parsed value
-     return StructuredModelResponse[T](chat_completion=response, parsed_value=parsed_value)
+     return StructuredModelResponse[T].from_model_response(response)
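Usage sketch (editorial; the Pydantic model, prompt, and import location are assumptions, while the .parsed accessor follows the docstrings above): generate_structured returns a StructuredModelResponse whose parsed value is a validated instance of the requested schema:

    from pydantic import BaseModel, Field

    from ai_pipeline_core.llm import generate_structured  # assumed export

    class ReviewAnalysis(BaseModel):
        summary: str = Field(description="Brief summary")
        sentiment: float = Field(ge=-1, le=1)

    async def analyze(review: str) -> ReviewAnalysis:
        response = await generate_structured(
            "gpt-5-mini",                    # a smaller model is fine for structuring
            response_format=ReviewAnalysis,
            messages=f"Analyze this product review: {review}",
        )
        return response.parsed               # typed ReviewAnalysis instance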