ai-pipeline-core 0.3.3__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (96)
  1. ai_pipeline_core/__init__.py +70 -144
  2. ai_pipeline_core/deployment/__init__.py +6 -18
  3. ai_pipeline_core/deployment/base.py +392 -212
  4. ai_pipeline_core/deployment/contract.py +6 -10
  5. ai_pipeline_core/{utils → deployment}/deploy.py +50 -69
  6. ai_pipeline_core/deployment/helpers.py +16 -17
  7. ai_pipeline_core/{progress.py → deployment/progress.py} +23 -24
  8. ai_pipeline_core/{utils/remote_deployment.py → deployment/remote.py} +11 -14
  9. ai_pipeline_core/docs_generator/__init__.py +54 -0
  10. ai_pipeline_core/docs_generator/__main__.py +5 -0
  11. ai_pipeline_core/docs_generator/cli.py +196 -0
  12. ai_pipeline_core/docs_generator/extractor.py +324 -0
  13. ai_pipeline_core/docs_generator/guide_builder.py +644 -0
  14. ai_pipeline_core/docs_generator/trimmer.py +35 -0
  15. ai_pipeline_core/docs_generator/validator.py +114 -0
  16. ai_pipeline_core/document_store/__init__.py +13 -0
  17. ai_pipeline_core/document_store/_summary.py +9 -0
  18. ai_pipeline_core/document_store/_summary_worker.py +170 -0
  19. ai_pipeline_core/document_store/clickhouse.py +492 -0
  20. ai_pipeline_core/document_store/factory.py +38 -0
  21. ai_pipeline_core/document_store/local.py +312 -0
  22. ai_pipeline_core/document_store/memory.py +85 -0
  23. ai_pipeline_core/document_store/protocol.py +68 -0
  24. ai_pipeline_core/documents/__init__.py +12 -14
  25. ai_pipeline_core/documents/_context_vars.py +85 -0
  26. ai_pipeline_core/documents/_hashing.py +52 -0
  27. ai_pipeline_core/documents/attachment.py +85 -0
  28. ai_pipeline_core/documents/context.py +128 -0
  29. ai_pipeline_core/documents/document.py +318 -1434
  30. ai_pipeline_core/documents/mime_type.py +37 -82
  31. ai_pipeline_core/documents/utils.py +4 -12
  32. ai_pipeline_core/exceptions.py +10 -62
  33. ai_pipeline_core/images/__init__.py +32 -85
  34. ai_pipeline_core/images/_processing.py +5 -11
  35. ai_pipeline_core/llm/__init__.py +6 -4
  36. ai_pipeline_core/llm/ai_messages.py +106 -81
  37. ai_pipeline_core/llm/client.py +267 -158
  38. ai_pipeline_core/llm/model_options.py +12 -84
  39. ai_pipeline_core/llm/model_response.py +53 -99
  40. ai_pipeline_core/llm/model_types.py +8 -23
  41. ai_pipeline_core/logging/__init__.py +2 -7
  42. ai_pipeline_core/logging/logging.yml +1 -1
  43. ai_pipeline_core/logging/logging_config.py +27 -37
  44. ai_pipeline_core/logging/logging_mixin.py +15 -41
  45. ai_pipeline_core/observability/__init__.py +32 -0
  46. ai_pipeline_core/observability/_debug/__init__.py +30 -0
  47. ai_pipeline_core/observability/_debug/_auto_summary.py +94 -0
  48. ai_pipeline_core/{debug/config.py → observability/_debug/_config.py} +11 -7
  49. ai_pipeline_core/{debug/content.py → observability/_debug/_content.py} +134 -75
  50. ai_pipeline_core/{debug/processor.py → observability/_debug/_processor.py} +16 -17
  51. ai_pipeline_core/{debug/summary.py → observability/_debug/_summary.py} +113 -37
  52. ai_pipeline_core/observability/_debug/_types.py +75 -0
  53. ai_pipeline_core/{debug/writer.py → observability/_debug/_writer.py} +126 -196
  54. ai_pipeline_core/observability/_document_tracking.py +146 -0
  55. ai_pipeline_core/observability/_initialization.py +194 -0
  56. ai_pipeline_core/observability/_logging_bridge.py +57 -0
  57. ai_pipeline_core/observability/_summary.py +81 -0
  58. ai_pipeline_core/observability/_tracking/__init__.py +6 -0
  59. ai_pipeline_core/observability/_tracking/_client.py +178 -0
  60. ai_pipeline_core/observability/_tracking/_internal.py +28 -0
  61. ai_pipeline_core/observability/_tracking/_models.py +138 -0
  62. ai_pipeline_core/observability/_tracking/_processor.py +158 -0
  63. ai_pipeline_core/observability/_tracking/_service.py +311 -0
  64. ai_pipeline_core/observability/_tracking/_writer.py +229 -0
  65. ai_pipeline_core/{tracing.py → observability/tracing.py} +139 -335
  66. ai_pipeline_core/pipeline/__init__.py +10 -0
  67. ai_pipeline_core/pipeline/decorators.py +915 -0
  68. ai_pipeline_core/pipeline/options.py +16 -0
  69. ai_pipeline_core/prompt_manager.py +16 -102
  70. ai_pipeline_core/settings.py +26 -31
  71. ai_pipeline_core/testing.py +9 -0
  72. ai_pipeline_core-0.4.0.dist-info/METADATA +807 -0
  73. ai_pipeline_core-0.4.0.dist-info/RECORD +76 -0
  74. ai_pipeline_core/debug/__init__.py +0 -26
  75. ai_pipeline_core/documents/document_list.py +0 -420
  76. ai_pipeline_core/documents/flow_document.py +0 -112
  77. ai_pipeline_core/documents/task_document.py +0 -117
  78. ai_pipeline_core/documents/temporary_document.py +0 -74
  79. ai_pipeline_core/flow/__init__.py +0 -9
  80. ai_pipeline_core/flow/config.py +0 -494
  81. ai_pipeline_core/flow/options.py +0 -75
  82. ai_pipeline_core/pipeline.py +0 -718
  83. ai_pipeline_core/prefect.py +0 -63
  84. ai_pipeline_core/prompt_builder/__init__.py +0 -5
  85. ai_pipeline_core/prompt_builder/documents_prompt.jinja2 +0 -23
  86. ai_pipeline_core/prompt_builder/global_cache.py +0 -78
  87. ai_pipeline_core/prompt_builder/new_core_documents_prompt.jinja2 +0 -6
  88. ai_pipeline_core/prompt_builder/prompt_builder.py +0 -253
  89. ai_pipeline_core/prompt_builder/system_prompt.jinja2 +0 -41
  90. ai_pipeline_core/storage/__init__.py +0 -8
  91. ai_pipeline_core/storage/storage.py +0 -628
  92. ai_pipeline_core/utils/__init__.py +0 -8
  93. ai_pipeline_core-0.3.3.dist-info/METADATA +0 -569
  94. ai_pipeline_core-0.3.3.dist-info/RECORD +0 -57
  95. {ai_pipeline_core-0.3.3.dist-info → ai_pipeline_core-0.4.0.dist-info}/WHEEL +0 -0
  96. {ai_pipeline_core-0.3.3.dist-info → ai_pipeline_core-0.4.0.dist-info}/licenses/LICENSE +0 -0
ai_pipeline_core/llm/client.py
@@ -1,38 +1,150 @@
  """LLM client implementation for AI model interactions.

- @public
-
  This module provides the core functionality for interacting with language models
  through a unified interface. It handles retries, caching, structured outputs,
  and integration with various LLM providers via LiteLLM.

- Key functions:
- - generate(): Text generation with optional context caching
- - generate_structured(): Type-safe structured output generation
+ Automatic image auto-tiling splits oversized images in attachments to meet
+ model-specific constraints (e.g., 3000x3000 for Gemini, 1000x1000 default).
+ Context caching separates static content from dynamic messages for 50-90% token savings.
+ Optional purpose and expected_cost parameters enable tracing and cost-tracking.
  """

  import asyncio
+ import contextlib
  import time
+ from io import BytesIO
  from typing import Any, TypeVar

  from lmnr import Laminar
  from openai import AsyncOpenAI
  from openai.lib.streaming.chat import ChunkEvent, ContentDeltaEvent, ContentDoneEvent
  from openai.types.chat import (
+     ChatCompletion,
      ChatCompletionMessageParam,
  )
- from prefect.logging import get_logger
+ from PIL import Image
  from pydantic import BaseModel, ValidationError

+ from ai_pipeline_core.documents import Document
+ from ai_pipeline_core.documents.attachment import Attachment
  from ai_pipeline_core.exceptions import LLMError
+ from ai_pipeline_core.images import ImageProcessingConfig, process_image, process_image_to_documents
+ from ai_pipeline_core.logging import get_pipeline_logger
+ from ai_pipeline_core.observability._document_tracking import track_llm_documents
  from ai_pipeline_core.settings import settings

- from .ai_messages import AIMessages
+ from .ai_messages import AIMessages, AIMessageType
  from .model_options import ModelOptions
  from .model_response import ModelResponse, StructuredModelResponse
  from .model_types import ModelName

- logger = get_logger()
+ logger = get_pipeline_logger(__name__)
+
+ # Image splitting configs for automatic large-image handling at the LLM boundary.
+ # Gemini supports up to 3000x3000; all other models use a conservative 1000x1000 default.
+ _GEMINI_IMAGE_CONFIG = ImageProcessingConfig(max_dimension=3000, max_pixels=9_000_000, jpeg_quality=75)
+ _DEFAULT_IMAGE_CONFIG = ImageProcessingConfig(max_dimension=1000, max_pixels=1_000_000, jpeg_quality=75)
+
+
+ def _get_image_config(model: str) -> ImageProcessingConfig:
+     """Return the image splitting config for a model."""
+     if "gemini" in model.lower():
+         return _GEMINI_IMAGE_CONFIG
+     return _DEFAULT_IMAGE_CONFIG
+
+
+ def _prepare_images_for_model(messages: AIMessages, model: str) -> AIMessages:  # noqa: C901, PLR0912, PLR0915, PLR0914
+     """Split image documents and image attachments that exceed model constraints.
+
+     Returns a new AIMessages with oversized images replaced by tiles.
+     Returns the original instance unchanged if no splitting is needed.
+     """
+     if not any(isinstance(m, Document) and (m.is_image or any(att.is_image for att in m.attachments)) for m in messages):
+         return messages
+
+     config = _get_image_config(model)
+     result: list[AIMessageType] = []
+     changed = False
+
+     for msg in messages:
+         if not isinstance(msg, Document):
+             result.append(msg)
+             continue
+
+         # 1. Handle top-level image Documents (existing logic)
+         if msg.is_image:
+             try:
+                 with Image.open(BytesIO(msg.content)) as img:
+                     w, h = img.size
+             except Exception:
+                 result.append(msg)
+                 continue
+
+             within_limits = w <= config.max_dimension and h <= config.max_dimension and w * h <= config.max_pixels
+             if within_limits:
+                 pass  # Falls through to attachment handling
+             else:
+                 name_prefix = msg.name.rsplit(".", 1)[0] if "." in msg.name else msg.name
+                 tiles = process_image_to_documents(msg, config=config, name_prefix=name_prefix)
+                 if msg.attachments and tiles:
+                     tiles[0] = tiles[0].model_copy(update={"attachments": msg.attachments})
+                 result.extend(tiles)
+                 changed = True
+                 continue
+
+         # 2. Handle image attachments
+         if msg.attachments:
+             new_attachments: list[Attachment] = []
+             attachments_changed = False
+
+             for att in msg.attachments:
+                 if not att.is_image:
+                     new_attachments.append(att)
+                     continue
+
+                 try:
+                     with Image.open(BytesIO(att.content)) as img:
+                         w, h = img.size
+                 except Exception:
+                     new_attachments.append(att)
+                     continue
+
+                 att_within_limits = w <= config.max_dimension and h <= config.max_dimension and w * h <= config.max_pixels
+                 if att_within_limits:
+                     new_attachments.append(att)
+                     continue
+
+                 # Tile the oversized attachment image
+                 processed = process_image(att.content, config=config)
+                 att_prefix = att.name.rsplit(".", 1)[0] if "." in att.name else att.name
+
+                 for part in processed.parts:
+                     if part.total == 1:
+                         tile_name = f"{att_prefix}.jpg"
+                         tile_desc = att.description
+                     else:
+                         tile_name = f"{att_prefix}_{part.index + 1:02d}_of_{part.total:02d}.jpg"
+                         tile_desc = f"{att.description} ({part.label})" if att.description else part.label
+
+                     new_attachments.append(
+                         Attachment(
+                             name=tile_name,
+                             content=part.data,
+                             description=tile_desc,
+                         )
+                     )
+                     attachments_changed = True
+
+             if attachments_changed:
+                 msg = msg.model_copy(update={"attachments": tuple(new_attachments)})  # noqa: PLW2901
+                 changed = True
+
+         result.append(msg)
+
+     if not changed:
+         return messages
+     return AIMessages(result)


  def _process_messages(
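The tiling gate in `_prepare_images_for_model` reduces to two checks (per-side dimension and total pixel count) plus a fixed naming scheme for the resulting tiles. A minimal standalone sketch of that logic, using only the limits visible in the diff; `needs_tiling` and `tile_names` are illustrative helpers, not ai_pipeline_core APIs:

```python
# Sketch of the size gate and tile-naming convention used above (illustrative helpers only).

def needs_tiling(width: int, height: int, *, max_dimension: int, max_pixels: int) -> bool:
    """True when an image exceeds either the per-side or the total-pixel limit."""
    return width > max_dimension or height > max_dimension or width * height > max_pixels

def tile_names(prefix: str, total: int) -> list[str]:
    """Reproduce the {prefix}_{index:02d}_of_{total:02d}.jpg naming from the diff."""
    if total == 1:
        return [f"{prefix}.jpg"]
    return [f"{prefix}_{i + 1:02d}_of_{total:02d}.jpg" for i in range(total)]

# A 2048x2048 image fits the Gemini limits (3000 per side, 9 MP) but not the
# default limits (1000 per side, 1 MP), so it is only tiled for non-Gemini models.
assert not needs_tiling(2048, 2048, max_dimension=3000, max_pixels=9_000_000)
assert needs_tiling(2048, 2048, max_dimension=1000, max_pixels=1_000_000)
print(tile_names("scan", 3))  # ['scan_01_of_03.jpg', 'scan_02_of_03.jpg', 'scan_03_of_03.jpg']
```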
@@ -70,9 +182,8 @@ def _process_messages(
      If cache_ttl is None or empty string (falsy), no caching is applied.
      All system and context messages receive cache_control to maximize cache efficiency.

-     Note:
-         This is an internal function used by _generate_with_retry().
-         The context/messages split enables efficient token usage.
+     This is an internal function used by _generate_with_retry().
+     The context/messages split enables efficient token usage.
      """
      processed_messages: list[ChatCompletionMessageParam] = []

@@ -125,20 +236,17 @@ def _remove_cache_control(
          The same message list (modified in-place) with all cache_control
          fields removed from both messages and their content items.

-     Note:
-         This function modifies the input list in-place but also returns it
-         for convenience. Handles both list-based content (multipart) and
-         string content (simple messages).
+     Modifies the input list in-place but also returns it for convenience.
+     Handles both list-based content (multipart) and string content (simple messages).
      """
      for message in messages:
-         if content := message.get("content"):
-             if isinstance(content, list):
-                 for item in content:
-                     if "cache_control" in item:
-                         del item["cache_control"]
+         if (content := message.get("content")) and isinstance(content, list):
+             for item in content:
+                 if "cache_control" in item:
+                     del item["cache_control"]
          if "cache_control" in message:
              del message["cache_control"]
-     return messages  # type: ignore
+     return messages


  def _model_name_to_openrouter_model(model: ModelName) -> str:
@@ -173,30 +281,76 @@ def _model_name_to_openrouter_model(model: ModelName) -> str:
      return model


- async def _generate(
-     model: str, messages: list[ChatCompletionMessageParam], completion_kwargs: dict[str, Any]
+ async def _generate_streaming(client: AsyncOpenAI, model: str, messages: list[ChatCompletionMessageParam], completion_kwargs: dict[str, Any]) -> ModelResponse:
+     """Execute a streaming LLM API call."""
+     start_time = time.time()
+     first_token_time = None
+     usage = None
+     async with client.chat.completions.stream(
+         model=model,
+         messages=messages,
+         **completion_kwargs,
+     ) as s:
+         async for event in s:
+             if isinstance(event, ContentDeltaEvent):
+                 if not first_token_time:
+                     first_token_time = time.time()
+             elif isinstance(event, ContentDoneEvent):
+                 pass
+             elif isinstance(event, ChunkEvent) and event.chunk.usage:
+                 usage = event.chunk.usage
+                 if not first_token_time:
+                     first_token_time = time.time()
+         raw_response = await s.get_final_completion()
+
+     metadata = {
+         "time_taken": round(time.time() - start_time, 2),
+         "first_token_time": round(first_token_time - start_time, 2),
+     }
+     return ModelResponse(raw_response, model_options=completion_kwargs, metadata=metadata, usage=usage)
+
+
+ async def _generate_non_streaming(
+     client: AsyncOpenAI, model: str, messages: list[ChatCompletionMessageParam], completion_kwargs: dict[str, Any]
  ) -> ModelResponse:
-     """Execute a single LLM API call.
+     """Execute a non-streaming LLM API call.

-     Internal function that makes the actual API request to the LLM provider.
-     Handles both regular and structured output generation.
+     Avoids OpenAI SDK delta accumulation some providers (e.g. Grok) send
+     streaming annotation deltas that crash the SDK's accumulate_delta().
+     """
+     start_time = time.time()
+     kwargs = {k: v for k, v in completion_kwargs.items() if k != "stream_options"}
+     response_format = kwargs.get("response_format")
+     if isinstance(response_format, type) and issubclass(response_format, BaseModel):
+         raw_response: ChatCompletion = await client.chat.completions.parse(
+             model=model,
+             messages=messages,
+             **kwargs,
+         )
+     else:
+         raw_response = await client.chat.completions.create(
+             model=model,
+             messages=messages,
+             stream=False,
+             **kwargs,
+         )
+     elapsed = round(time.time() - start_time, 2)
+     metadata = {"time_taken": elapsed, "first_token_time": elapsed}
+     return ModelResponse(raw_response, model_options=completion_kwargs, metadata=metadata)
+
+
+ async def _generate(model: str, messages: list[ChatCompletionMessageParam], completion_kwargs: dict[str, Any], *, stream: bool = True) -> ModelResponse:
+     """Execute a single LLM API call.

      Args:
          model: Model identifier (e.g., "gpt-5.1", "gemini-3-pro").
          messages: Formatted messages for the API.
          completion_kwargs: Additional parameters for the completion API.
+         stream: Whether to use streaming mode (default True). Non-streaming
+             avoids OpenAI SDK delta accumulation issues with some providers.

      Returns:
          ModelResponse with generated content and metadata.
-
-     API selection:
-         - Uses client.chat.completions.parse() for structured output
-         - Uses client.chat.completions.create() for regular text
-
-     Note:
-         - Uses AsyncOpenAI client configured via settings
-         - Captures response headers for cost tracking
-         - Response includes model options for debugging
      """
      if "openrouter" in settings.openai_base_url.lower():
          model = _model_name_to_openrouter_model(model)
@@ -205,45 +359,18 @@ async def _generate(
          api_key=settings.openai_api_key,
          base_url=settings.openai_base_url,
      ) as client:
-         start_time = time.time()
-         first_token_time = None
-         usage = None
-         async with client.chat.completions.stream(
-             model=model,
-             messages=messages,
-             **completion_kwargs,
-         ) as stream:
-             async for event in stream:
-                 if isinstance(event, ContentDeltaEvent):
-                     if not first_token_time:
-                         first_token_time = time.time()
-                 elif isinstance(event, ContentDoneEvent):
-                     pass
-                 elif isinstance(event, ChunkEvent):
-                     if event.chunk.usage:  # used to fix a bug with missing usage data
-                         usage = event.chunk.usage
-                         if not first_token_time:
-                             first_token_time = time.time()
-             raw_response = await stream.get_final_completion()
-
-         metadata = {
-             "time_taken": round(time.time() - start_time, 2),
-             "first_token_time": round(first_token_time - start_time, 2),
-         }
-         response = ModelResponse(
-             raw_response,
-             model_options=completion_kwargs,
-             metadata=metadata,
-             usage=usage,
-         )
-         return response
+         if stream:
+             return await _generate_streaming(client, model, messages, completion_kwargs)
+         return await _generate_non_streaming(client, model, messages, completion_kwargs)


- async def _generate_with_retry(
+ async def _generate_with_retry(  # noqa: PLR0917
      model: str,
      context: AIMessages,
      messages: AIMessages,
      options: ModelOptions,
+     purpose: str | None = None,
+     expected_cost: float | None = None,
  ) -> ModelResponse:
      """Core LLM generation with automatic retry logic.

@@ -255,6 +382,8 @@ async def _generate_with_retry(
          context: Cached context messages (can be empty).
          messages: Dynamic query messages.
          options: Configuration including retries, timeout, temperature.
+         purpose: Optional semantic label for the LLM span name.
+         expected_cost: Optional expected cost for cost-tracking attributes.

      Returns:
          ModelResponse with generated content.
@@ -263,21 +392,22 @@ async def _generate_with_retry(
          ValueError: If model is not provided or both context and messages are empty.
          LLMError: If all retry attempts are exhausted.

-     Note:
-         Empty responses trigger a retry as they indicate API issues.
+     Empty responses trigger a retry as they indicate API issues.
      """
      if not model:
          raise ValueError("Model must be provided")
      if not context and not messages:
          raise ValueError("Either context or messages must be provided")

+     # Auto-split large images based on model-specific constraints
+     context = _prepare_images_for_model(context, model)
+     messages = _prepare_images_for_model(messages, model)
+
      if "gemini" in model.lower() and context.approximate_tokens_count < 10000:
          # Bug fix for minimum explicit context size for Gemini models
          options.cache_ttl = None

-     processed_messages = _process_messages(
-         context, messages, options.system_prompt, options.cache_ttl
-     )
+     processed_messages = _process_messages(context, messages, options.system_prompt, options.cache_ttl)
      completion_kwargs: dict[str, Any] = {
          **options.to_openai_completion_kwargs(),
      }
@@ -287,17 +417,18 @@ async def _generate_with_retry(

      for attempt in range(options.retries):
          try:
-             with Laminar.start_as_current_span(
-                 model, span_type="LLM", input=processed_messages
-             ) as span:
-                 response = await _generate(model, processed_messages, completion_kwargs)
-                 span.set_attributes(response.get_laminar_metadata())  # pyright: ignore[reportArgumentType]
-                 Laminar.set_span_output([
-                     r for r in (response.reasoning_content, response.content) if r
-                 ])
+             with Laminar.start_as_current_span(purpose or model, span_type="LLM", input=processed_messages) as span:
+                 response = await _generate(model, processed_messages, completion_kwargs, stream=options.stream)
+                 laminar_metadata = response.get_laminar_metadata()
+                 if purpose:
+                     laminar_metadata["purpose"] = purpose
+                 if expected_cost is not None:
+                     laminar_metadata["expected_cost"] = expected_cost
+                 span.set_attributes(laminar_metadata)  # pyright: ignore[reportArgumentType]
+                 Laminar.set_span_output([r for r in (response.reasoning_content, response.content) if r])
                  response.validate_output()
                  return response
-         except (asyncio.TimeoutError, ValueError, ValidationError, Exception) as e:
+         except (TimeoutError, ValueError, ValidationError, Exception) as e:
              if not isinstance(e, asyncio.TimeoutError):
                  # disable cache if it's not a timeout because it may cause an error
                  completion_kwargs["extra_body"]["cache"] = {"no-cache": True}
@@ -321,11 +452,11 @@ async def generate(
      context: AIMessages | None = None,
      messages: AIMessages | str,
      options: ModelOptions | None = None,
+     purpose: str | None = None,
+     expected_cost: float | None = None,
  ) -> ModelResponse:
      """Generate text response from a language model.

-     @public
-
      Main entry point for LLM text generation with smart context caching.
      The context/messages split enables efficient token usage by caching
      expensive static content separately from dynamic queries.
@@ -342,13 +473,16 @@ async def generate(
          context: Static context to cache (documents, examples, instructions).
              Defaults to None (empty context). Cached for 5 minutes by default.
          messages: Dynamic messages/queries. AIMessages or str ONLY.
-             Do not pass Document or DocumentList directly.
+             Do not pass Document or list[Document] directly.
              If string, converted to AIMessages internally.
-         options: DEPRECATED - DO NOT USE. Reserved for internal framework usage only.
-             Framework defaults are production-optimized (3 retries, 10s delay, 300s timeout).
-             Configure model behavior centrally via LiteLLM proxy settings or environment
-             variables, not per API call. Provider-specific settings should be configured
-             at the proxy level.
+         options: Internal framework parameter. Framework defaults are production-optimized
+             (3 retries, 20s delay, 600s timeout). Configure model behavior centrally via
+             LiteLLM proxy settings or environment variables, not per API call.
+             Provider-specific settings should be configured at the proxy level.
+         purpose: Optional semantic label used as the tracing span name
+             instead of model name. Stored as a span attribute.
+         expected_cost: Optional expected cost stored as a span attribute
+             for cost-tracking and comparison with actual cost.

      Returns:
          ModelResponse containing:
@@ -391,35 +525,12 @@ async def generate(
          - Changes with each API call
          - Never cached, always processed fresh

-     Example:
-         >>> # CORRECT - No options parameter (this is the recommended pattern)
-         >>> response = await llm.generate("gpt-5.1", messages="Explain quantum computing")
-         >>> print(response.content)  # In production, use get_pipeline_logger instead of print
-
-         >>> # With context caching for efficiency
-         >>> # Context and messages are both AIMessages or str; wrap any Documents
-         >>> static_doc = AIMessages([large_document, "few-shot example: ..."])
-         >>>
-         >>> # First call: caches context
-         >>> r1 = await llm.generate("gpt-5.1", context=static_doc, messages="Summarize")
-         >>>
-         >>> # Second call: reuses cache, saves tokens!
-         >>> r2 = await llm.generate("gpt-5.1", context=static_doc, messages="Key points?")
-
-         >>> # Multi-turn conversation
-         >>> messages = AIMessages([
-         ...     "What is Python?",
-         ...     previous_response,
-         ...     "Can you give an example?"
-         ... ])
-         >>> response = await llm.generate("gpt-5.1", messages=messages)
-
      Performance:
          - Context caching saves ~50-90% tokens on repeated calls
          - First call: full token cost
          - Subsequent calls (within cache TTL): only messages tokens
          - Default cache TTL is 300s/5 minutes (production-optimized)
-         - Default retry logic: 3 attempts with 10s delay (production-optimized)
+         - Default retry logic: 3 attempts with 20s delay (production-optimized)

      Caching:
          When enabled in your LiteLLM proxy and supported by the upstream provider,
@@ -437,10 +548,8 @@ async def generate(

      This centralizes configuration and ensures consistency across all API calls.

-     Note:
-         - All models are accessed via LiteLLM proxy
-         - Automatic retry with configurable delay between attempts
-         - Cost tracking via response headers
+     All models are accessed via LiteLLM proxy with automatic retry and
+     cost tracking via response headers.
      """
      if isinstance(messages, str):
          messages = AIMessages([messages])
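The new purpose and expected_cost arguments documented above surface directly at call sites. A hedged usage sketch modeled on the examples that were removed from this docstring; the import paths and the cost figure are assumptions, not taken from the package:

```python
# Illustrative only: mirrors the removed docstring examples plus the new
# purpose / expected_cost keywords. Import paths are assumed; adjust as needed.
from ai_pipeline_core import llm
from ai_pipeline_core.llm import AIMessages

async def summarize(static_context: AIMessages) -> str:
    response = await llm.generate(
        "gpt-5.1",
        context=static_context,        # cached between calls (300s TTL by default)
        messages="Summarize the documents above.",
        purpose="document-summary",    # used as the LLM span name instead of the model name
        expected_cost=0.05,            # stored as a span attribute for later comparison
    )
    return response.content
```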
@@ -449,9 +558,22 @@ async def generate(
          context = AIMessages()
      if options is None:
          options = ModelOptions()
+     else:
+         # Create a copy to avoid mutating the caller's options object
+         options = options.model_copy()
+
+     with contextlib.suppress(Exception):
+         track_llm_documents(context, messages)

      try:
-         return await _generate_with_retry(model, context, messages, options)
+         return await _generate_with_retry(
+             model,
+             context,
+             messages,
+             options,
+             purpose=purpose,
+             expected_cost=expected_cost,
+         )
      except (ValueError, LLMError):
          raise  # Explicitly re-raise to satisfy DOC502

@@ -460,18 +582,18 @@ T = TypeVar("T", bound=BaseModel)
  """Type variable for Pydantic model types in structured generation."""


- async def generate_structured(
+ async def generate_structured(  # noqa: UP047
      model: ModelName,
      response_format: type[T],
      *,
      context: AIMessages | None = None,
      messages: AIMessages | str,
      options: ModelOptions | None = None,
+     purpose: str | None = None,
+     expected_cost: float | None = None,
  ) -> StructuredModelResponse[T]:
      """Generate structured output conforming to a Pydantic model.

-     @public
-
      Type-safe generation that returns validated Pydantic model instances.
      Uses OpenAI's structured output feature for guaranteed schema compliance.

@@ -526,21 +648,21 @@ async def generate_structured(
          context: Static context to cache (documents, schemas, examples).
              Defaults to None (empty AIMessages).
          messages: Dynamic prompts/queries. AIMessages or str ONLY.
-             Do not pass Document or DocumentList directly.
+             Do not pass Document or list[Document] directly.
          options: Optional ModelOptions for configuring temperature, retries, etc.
              If provided, it will NOT be mutated (a copy is created internally).
              The response_format field is set automatically from the response_format parameter.
              In most cases, leave as None to use framework defaults.
              Configure model behavior centrally via LiteLLM proxy settings when possible.
+         purpose: Optional semantic label used as the tracing span name
+             instead of model name. Stored as a span attribute.
+         expected_cost: Optional expected cost stored as a span attribute
+             for cost-tracking and comparison with actual cost.

-     Note:
-         Vision/PDF model compatibility considerations:
-         - Images require vision-capable models that also support structured output
-         - PDFs require models with both document processing AND structured output support
-         - Many models support either vision OR structured output, but not both
-         - Test your specific model+document combination before production use
-         - Consider two-step approach: generate() for analysis, then generate_structured()
-           for formatting
+     Vision/PDF model compatibility: Images require vision-capable models that also support
+     structured output. PDFs require models with both document processing AND structured output
+     support. Consider two-step approach: generate() for analysis, then generate_structured()
+     for formatting.

      Returns:
          StructuredModelResponse[T] containing:
@@ -554,26 +676,6 @@ async def generate_structured(
          LLMError: If generation fails after retries.
          ValidationError: If response cannot be parsed into response_format.

-     Example:
-         >>> from pydantic import BaseModel, Field
-         >>>
-         >>> class Analysis(BaseModel):
-         ...     summary: str = Field(description="Brief summary")
-         ...     sentiment: float = Field(ge=-1, le=1)
-         ...     key_points: list[str] = Field(max_length=5)
-         >>>
-         >>> # CORRECT - No options parameter
-         >>> response = await llm.generate_structured(
-         ...     "gpt-5.1",
-         ...     response_format=Analysis,
-         ...     messages="Analyze this product review: ..."
-         ... )
-         >>>
-         >>> analysis = response.parsed  # Type: Analysis
-         >>> print(f"Sentiment: {analysis.sentiment}")
-         >>> for point in analysis.key_points:
-         ...     print(f"- {point}")
-
      Supported models:
          Structured output support varies by provider and model. Generally includes:
          - OpenAI: GPT-4 and newer models
@@ -588,12 +690,9 @@ async def generate_structured(
          - Complex schemas increase generation time
          - Validation overhead is minimal (Pydantic is fast)

-     Note:
-         - Pydantic model is converted to JSON Schema for the API
-         - The model generates JSON matching the schema
-         - Validation happens automatically via Pydantic
-         - Use Field() descriptions to guide generation
-         - Search models (models with '-search' suffix) do not support structured output
+     Pydantic model is converted to JSON Schema for the API. Validation happens
+     automatically via Pydantic. Search models (models with '-search' suffix) do
+     not support structured output.
      """
      if context is None:
          context = AIMessages()
@@ -610,9 +709,19 @@ async def generate_structured(

      assert isinstance(messages, AIMessages)

+     with contextlib.suppress(Exception):
+         track_llm_documents(context, messages)
+
      # Call the internal generate function with structured output enabled
      try:
-         response = await _generate_with_retry(model, context, messages, options)
+         response = await _generate_with_retry(
+             model,
+             context,
+             messages,
+             options,
+             purpose=purpose,
+             expected_cost=expected_cost,
+         )
      except (ValueError, LLMError):
          raise  # Explicitly re-raise to satisfy DOC502

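For reference, the structured-output usage that was dropped from the generate_structured() docstring still applies under the new signature. A sketch with the optional purpose keyword added; the import path for the llm module is an assumption:

```python
# Adapted from the docstring example removed in this release; purpose is optional.
from pydantic import BaseModel, Field

from ai_pipeline_core import llm  # assumed import path

class Analysis(BaseModel):
    summary: str = Field(description="Brief summary")
    sentiment: float = Field(ge=-1, le=1)
    key_points: list[str] = Field(max_length=5)

async def analyze(review: str) -> Analysis:
    response = await llm.generate_structured(
        "gpt-5.1",
        response_format=Analysis,
        messages=f"Analyze this product review: {review}",
        purpose="review-analysis",  # optional span name, new in 0.4.0
    )
    return response.parsed  # typed as Analysis
```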