ai-pipeline-core 0.3.4__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (96)
  1. ai_pipeline_core/__init__.py +64 -158
  2. ai_pipeline_core/deployment/__init__.py +6 -18
  3. ai_pipeline_core/deployment/base.py +392 -212
  4. ai_pipeline_core/deployment/contract.py +6 -10
  5. ai_pipeline_core/{utils → deployment}/deploy.py +50 -69
  6. ai_pipeline_core/deployment/helpers.py +16 -17
  7. ai_pipeline_core/{progress.py → deployment/progress.py} +23 -24
  8. ai_pipeline_core/{utils/remote_deployment.py → deployment/remote.py} +11 -14
  9. ai_pipeline_core/docs_generator/__init__.py +54 -0
  10. ai_pipeline_core/docs_generator/__main__.py +5 -0
  11. ai_pipeline_core/docs_generator/cli.py +196 -0
  12. ai_pipeline_core/docs_generator/extractor.py +324 -0
  13. ai_pipeline_core/docs_generator/guide_builder.py +644 -0
  14. ai_pipeline_core/docs_generator/trimmer.py +35 -0
  15. ai_pipeline_core/docs_generator/validator.py +114 -0
  16. ai_pipeline_core/document_store/__init__.py +13 -0
  17. ai_pipeline_core/document_store/_summary.py +9 -0
  18. ai_pipeline_core/document_store/_summary_worker.py +170 -0
  19. ai_pipeline_core/document_store/clickhouse.py +492 -0
  20. ai_pipeline_core/document_store/factory.py +38 -0
  21. ai_pipeline_core/document_store/local.py +312 -0
  22. ai_pipeline_core/document_store/memory.py +85 -0
  23. ai_pipeline_core/document_store/protocol.py +68 -0
  24. ai_pipeline_core/documents/__init__.py +12 -14
  25. ai_pipeline_core/documents/_context_vars.py +85 -0
  26. ai_pipeline_core/documents/_hashing.py +52 -0
  27. ai_pipeline_core/documents/attachment.py +85 -0
  28. ai_pipeline_core/documents/context.py +128 -0
  29. ai_pipeline_core/documents/document.py +318 -1434
  30. ai_pipeline_core/documents/mime_type.py +11 -84
  31. ai_pipeline_core/documents/utils.py +4 -12
  32. ai_pipeline_core/exceptions.py +10 -62
  33. ai_pipeline_core/images/__init__.py +32 -85
  34. ai_pipeline_core/images/_processing.py +5 -11
  35. ai_pipeline_core/llm/__init__.py +6 -4
  36. ai_pipeline_core/llm/ai_messages.py +102 -90
  37. ai_pipeline_core/llm/client.py +229 -183
  38. ai_pipeline_core/llm/model_options.py +12 -84
  39. ai_pipeline_core/llm/model_response.py +53 -99
  40. ai_pipeline_core/llm/model_types.py +8 -23
  41. ai_pipeline_core/logging/__init__.py +2 -7
  42. ai_pipeline_core/logging/logging.yml +1 -1
  43. ai_pipeline_core/logging/logging_config.py +27 -37
  44. ai_pipeline_core/logging/logging_mixin.py +15 -41
  45. ai_pipeline_core/observability/__init__.py +32 -0
  46. ai_pipeline_core/observability/_debug/__init__.py +30 -0
  47. ai_pipeline_core/observability/_debug/_auto_summary.py +94 -0
  48. ai_pipeline_core/{debug/config.py → observability/_debug/_config.py} +11 -7
  49. ai_pipeline_core/{debug/content.py → observability/_debug/_content.py} +133 -75
  50. ai_pipeline_core/{debug/processor.py → observability/_debug/_processor.py} +16 -17
  51. ai_pipeline_core/{debug/summary.py → observability/_debug/_summary.py} +113 -37
  52. ai_pipeline_core/observability/_debug/_types.py +75 -0
  53. ai_pipeline_core/{debug/writer.py → observability/_debug/_writer.py} +126 -196
  54. ai_pipeline_core/observability/_document_tracking.py +146 -0
  55. ai_pipeline_core/observability/_initialization.py +194 -0
  56. ai_pipeline_core/observability/_logging_bridge.py +57 -0
  57. ai_pipeline_core/observability/_summary.py +81 -0
  58. ai_pipeline_core/observability/_tracking/__init__.py +6 -0
  59. ai_pipeline_core/observability/_tracking/_client.py +178 -0
  60. ai_pipeline_core/observability/_tracking/_internal.py +28 -0
  61. ai_pipeline_core/observability/_tracking/_models.py +138 -0
  62. ai_pipeline_core/observability/_tracking/_processor.py +158 -0
  63. ai_pipeline_core/observability/_tracking/_service.py +311 -0
  64. ai_pipeline_core/observability/_tracking/_writer.py +229 -0
  65. ai_pipeline_core/{tracing.py → observability/tracing.py} +139 -335
  66. ai_pipeline_core/pipeline/__init__.py +10 -0
  67. ai_pipeline_core/pipeline/decorators.py +915 -0
  68. ai_pipeline_core/pipeline/options.py +16 -0
  69. ai_pipeline_core/prompt_manager.py +16 -102
  70. ai_pipeline_core/settings.py +26 -31
  71. ai_pipeline_core/testing.py +9 -0
  72. ai_pipeline_core-0.4.0.dist-info/METADATA +807 -0
  73. ai_pipeline_core-0.4.0.dist-info/RECORD +76 -0
  74. ai_pipeline_core/debug/__init__.py +0 -26
  75. ai_pipeline_core/documents/document_list.py +0 -420
  76. ai_pipeline_core/documents/flow_document.py +0 -112
  77. ai_pipeline_core/documents/task_document.py +0 -117
  78. ai_pipeline_core/documents/temporary_document.py +0 -74
  79. ai_pipeline_core/flow/__init__.py +0 -9
  80. ai_pipeline_core/flow/config.py +0 -494
  81. ai_pipeline_core/flow/options.py +0 -75
  82. ai_pipeline_core/pipeline.py +0 -718
  83. ai_pipeline_core/prefect.py +0 -63
  84. ai_pipeline_core/prompt_builder/__init__.py +0 -5
  85. ai_pipeline_core/prompt_builder/documents_prompt.jinja2 +0 -23
  86. ai_pipeline_core/prompt_builder/global_cache.py +0 -78
  87. ai_pipeline_core/prompt_builder/new_core_documents_prompt.jinja2 +0 -6
  88. ai_pipeline_core/prompt_builder/prompt_builder.py +0 -253
  89. ai_pipeline_core/prompt_builder/system_prompt.jinja2 +0 -41
  90. ai_pipeline_core/storage/__init__.py +0 -8
  91. ai_pipeline_core/storage/storage.py +0 -628
  92. ai_pipeline_core/utils/__init__.py +0 -8
  93. ai_pipeline_core-0.3.4.dist-info/METADATA +0 -569
  94. ai_pipeline_core-0.3.4.dist-info/RECORD +0 -57
  95. {ai_pipeline_core-0.3.4.dist-info → ai_pipeline_core-0.4.0.dist-info}/WHEEL +0 -0
  96. {ai_pipeline_core-0.3.4.dist-info → ai_pipeline_core-0.4.0.dist-info}/licenses/LICENSE +0 -0
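The package-level moves above imply new import paths in 0.4.0 (utils → deployment, debug and tracing → observability, plus the new pipeline, document_store, and docs_generator packages). A hedged mapping, inferred only from the renames in this file list and not verified against the 0.4.0 public API:

# Module paths inferred from the renames above; the 0.4.0 packages may
# re-export these symbols elsewhere, so treat every path as an assumption.
# ai_pipeline_core.utils.deploy              -> ai_pipeline_core.deployment.deploy
# ai_pipeline_core.utils.remote_deployment   -> ai_pipeline_core.deployment.remote
# ai_pipeline_core.progress                  -> ai_pipeline_core.deployment.progress
# ai_pipeline_core.tracing                   -> ai_pipeline_core.observability.tracing
from ai_pipeline_core.deployment import deploy, progress, remote
from ai_pipeline_core.observability import tracing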
@@ -1,17 +1,17 @@
  """LLM client implementation for AI model interactions.
 
- @public
-
  This module provides the core functionality for interacting with language models
  through a unified interface. It handles retries, caching, structured outputs,
  and integration with various LLM providers via LiteLLM.
 
- Key functions:
- - generate(): Text generation with optional context caching
- - generate_structured(): Type-safe structured output generation
+ Automatic image auto-tiling splits oversized images in attachments to meet
+ model-specific constraints (e.g., 3000x3000 for Gemini, 1000x1000 default).
+ Context caching separates static content from dynamic messages for 50-90% token savings.
+ Optional purpose and expected_cost parameters enable tracing and cost-tracking.
  """
 
  import asyncio
+ import contextlib
  import time
  from io import BytesIO
  from typing import Any, TypeVar
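The rewritten module docstring above is the best summary of what changed in this file: auto-tiling of oversized images, context caching, and the new purpose / expected_cost parameters. A minimal usage sketch of that surface, assuming llm and AIMessages are importable from ai_pipeline_core as in 0.3.x:

# Sketch only: import locations assumed from the 0.3.x layout.
from ai_pipeline_core import llm
from ai_pipeline_core.llm import AIMessages

async def summarize(report_text: str) -> str:
    # Static material goes in `context` so the proxy can cache it;
    # the short, changing question goes in `messages`.
    context = AIMessages([report_text])
    response = await llm.generate(
        "gpt-5.1",
        context=context,
        messages="Summarize the key findings in five bullet points.",
        purpose="report-summary",   # new in 0.4.0: names the tracing span
        expected_cost=0.02,         # new in 0.4.0: stored as a span attribute
    )
    return response.content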
@@ -20,15 +20,18 @@ from lmnr import Laminar
  from openai import AsyncOpenAI
  from openai.lib.streaming.chat import ChunkEvent, ContentDeltaEvent, ContentDoneEvent
  from openai.types.chat import (
+ ChatCompletion,
  ChatCompletionMessageParam,
  )
  from PIL import Image
- from prefect.logging import get_logger
  from pydantic import BaseModel, ValidationError
 
  from ai_pipeline_core.documents import Document
+ from ai_pipeline_core.documents.attachment import Attachment
  from ai_pipeline_core.exceptions import LLMError
- from ai_pipeline_core.images import ImageProcessingConfig, process_image_to_documents
+ from ai_pipeline_core.images import ImageProcessingConfig, process_image, process_image_to_documents
+ from ai_pipeline_core.logging import get_pipeline_logger
+ from ai_pipeline_core.observability._document_tracking import track_llm_documents
  from ai_pipeline_core.settings import settings
 
  from .ai_messages import AIMessages, AIMessageType
@@ -36,16 +39,12 @@ from .model_options import ModelOptions
  from .model_response import ModelResponse, StructuredModelResponse
  from .model_types import ModelName
 
- logger = get_logger()
+ logger = get_pipeline_logger(__name__)
 
  # Image splitting configs for automatic large-image handling at the LLM boundary.
  # Gemini supports up to 3000x3000; all other models use a conservative 1000x1000 default.
- _GEMINI_IMAGE_CONFIG = ImageProcessingConfig(
- max_dimension=3000, max_pixels=9_000_000, jpeg_quality=75
- )
- _DEFAULT_IMAGE_CONFIG = ImageProcessingConfig(
- max_dimension=1000, max_pixels=1_000_000, jpeg_quality=75
- )
+ _GEMINI_IMAGE_CONFIG = ImageProcessingConfig(max_dimension=3000, max_pixels=9_000_000, jpeg_quality=75)
+ _DEFAULT_IMAGE_CONFIG = ImageProcessingConfig(max_dimension=1000, max_pixels=1_000_000, jpeg_quality=75)
 
 
  def _get_image_config(model: str) -> ImageProcessingConfig:
@@ -55,13 +54,13 @@ def _get_image_config(model: str) -> ImageProcessingConfig:
  return _DEFAULT_IMAGE_CONFIG
 
 
- def _prepare_images_for_model(messages: AIMessages, model: str) -> AIMessages:
- """Split image documents that exceed model constraints.
+ def _prepare_images_for_model(messages: AIMessages, model: str) -> AIMessages: # noqa: C901, PLR0912, PLR0915, PLR0914
+ """Split image documents and image attachments that exceed model constraints.
 
  Returns a new AIMessages with oversized images replaced by tiles.
  Returns the original instance unchanged if no splitting is needed.
  """
- if not any(isinstance(m, Document) and m.is_image for m in messages):
+ if not any(isinstance(m, Document) and (m.is_image or any(att.is_image for att in m.attachments)) for m in messages):
  return messages
 
  config = _get_image_config(model)
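The next hunk extends tiling to image attachments. The gate that decides whether an image is tiled at all is unchanged: both dimensions and the total pixel count must fit the model's ImageProcessingConfig. A standalone sketch of that check using the Pillow calls the module already imports (default limits shown are the non-Gemini config above):

from io import BytesIO
from PIL import Image

def needs_tiling(image_bytes: bytes, max_dimension: int = 1000, max_pixels: int = 1_000_000) -> bool:
    # Unreadable images are passed through untouched, mirroring the except branch below.
    try:
        with Image.open(BytesIO(image_bytes)) as img:
            w, h = img.size
    except Exception:
        return False
    return not (w <= max_dimension and h <= max_dimension and w * h <= max_pixels)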
@@ -69,25 +68,79 @@ def _prepare_images_for_model(messages: AIMessages, model: str) -> AIMessages:
  changed = False
 
  for msg in messages:
- if not (isinstance(msg, Document) and msg.is_image):
+ if not isinstance(msg, Document):
  result.append(msg)
  continue
 
- try:
- with Image.open(BytesIO(msg.content)) as img:
- w, h = img.size
- except Exception:
- result.append(msg)
- continue
-
- if w <= config.max_dimension and h <= config.max_dimension and w * h <= config.max_pixels:
- result.append(msg)
- continue
-
- name_prefix = msg.name.rsplit(".", 1)[0] if "." in msg.name else msg.name
- tiles = process_image_to_documents(msg, config=config, name_prefix=name_prefix)
- result.extend(tiles)
- changed = True
+ # 1. Handle top-level image Documents (existing logic)
+ if msg.is_image:
+ try:
+ with Image.open(BytesIO(msg.content)) as img:
+ w, h = img.size
+ except Exception:
+ result.append(msg)
+ continue
+
+ within_limits = w <= config.max_dimension and h <= config.max_dimension and w * h <= config.max_pixels
+ if within_limits:
+ pass # Falls through to attachment handling
+ else:
+ name_prefix = msg.name.rsplit(".", 1)[0] if "." in msg.name else msg.name
+ tiles = process_image_to_documents(msg, config=config, name_prefix=name_prefix)
+ if msg.attachments and tiles:
+ tiles[0] = tiles[0].model_copy(update={"attachments": msg.attachments})
+ result.extend(tiles)
+ changed = True
+ continue
+
+ # 2. Handle image attachments
+ if msg.attachments:
+ new_attachments: list[Attachment] = []
+ attachments_changed = False
+
+ for att in msg.attachments:
+ if not att.is_image:
+ new_attachments.append(att)
+ continue
+
+ try:
+ with Image.open(BytesIO(att.content)) as img:
+ w, h = img.size
+ except Exception:
+ new_attachments.append(att)
+ continue
+
+ att_within_limits = w <= config.max_dimension and h <= config.max_dimension and w * h <= config.max_pixels
+ if att_within_limits:
+ new_attachments.append(att)
+ continue
+
+ # Tile the oversized attachment image
+ processed = process_image(att.content, config=config)
+ att_prefix = att.name.rsplit(".", 1)[0] if "." in att.name else att.name
+
+ for part in processed.parts:
+ if part.total == 1:
+ tile_name = f"{att_prefix}.jpg"
+ tile_desc = att.description
+ else:
+ tile_name = f"{att_prefix}_{part.index + 1:02d}_of_{part.total:02d}.jpg"
+ tile_desc = f"{att.description} ({part.label})" if att.description else part.label
+
+ new_attachments.append(
+ Attachment(
+ name=tile_name,
+ content=part.data,
+ description=tile_desc,
+ )
+ )
+ attachments_changed = True
+
+ if attachments_changed:
+ msg = msg.model_copy(update={"attachments": tuple(new_attachments)}) # noqa: PLW2901
+ changed = True
+
+ result.append(msg)
 
  if not changed:
  return messages
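The attachment branch above derives tile names from the original attachment name and keeps the original description, appending the tile label when an image is split. A small sketch of that naming scheme; index, total, and label come from process_image(...).parts in the real code and are hard-coded here only to show the resulting strings:

def tile_name_and_desc(att_name: str, description: str | None, index: int, total: int, label: str):
    prefix = att_name.rsplit(".", 1)[0] if "." in att_name else att_name
    if total == 1:
        return f"{prefix}.jpg", description
    name = f"{prefix}_{index + 1:02d}_of_{total:02d}.jpg"
    desc = f"{description} ({label})" if description else label
    return name, desc

# Example with made-up part values:
# tile_name_and_desc("floorplan.png", "Site plan", 2, 4, "tile 3/4")
#   -> ("floorplan_03_of_04.jpg", "Site plan (tile 3/4)")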
@@ -129,9 +182,8 @@ def _process_messages(
  If cache_ttl is None or empty string (falsy), no caching is applied.
  All system and context messages receive cache_control to maximize cache efficiency.
 
- Note:
- This is an internal function used by _generate_with_retry().
- The context/messages split enables efficient token usage.
+ This is an internal function used by _generate_with_retry().
+ The context/messages split enables efficient token usage.
  """
  processed_messages: list[ChatCompletionMessageParam] = []
 
@@ -184,20 +236,17 @@ def _remove_cache_control(
  The same message list (modified in-place) with all cache_control
  fields removed from both messages and their content items.
 
- Note:
- This function modifies the input list in-place but also returns it
- for convenience. Handles both list-based content (multipart) and
- string content (simple messages).
+ Modifies the input list in-place but also returns it for convenience.
+ Handles both list-based content (multipart) and string content (simple messages).
  """
  for message in messages:
- if content := message.get("content"):
- if isinstance(content, list):
- for item in content:
- if "cache_control" in item:
- del item["cache_control"]
+ if (content := message.get("content")) and isinstance(content, list):
+ for item in content:
+ if "cache_control" in item:
+ del item["cache_control"]
  if "cache_control" in message:
  del message["cache_control"]
- return messages # type: ignore
+ return messages
 
 
  def _model_name_to_openrouter_model(model: ModelName) -> str:
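For reference, a worked example (sample data only) of what _remove_cache_control does to a message once caching markers are no longer wanted:

message = {
    "role": "system",
    "content": [{"type": "text", "text": "You are helpful.", "cache_control": {"type": "ephemeral"}}],
    "cache_control": {"type": "ephemeral"},
}
# _remove_cache_control([message]) mutates the dict in place and returns the list:
# {"role": "system", "content": [{"type": "text", "text": "You are helpful."}]}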
@@ -232,30 +281,76 @@ def _model_name_to_openrouter_model(model: ModelName) -> str:
  return model
 
 
- async def _generate(
- model: str, messages: list[ChatCompletionMessageParam], completion_kwargs: dict[str, Any]
+ async def _generate_streaming(client: AsyncOpenAI, model: str, messages: list[ChatCompletionMessageParam], completion_kwargs: dict[str, Any]) -> ModelResponse:
+ """Execute a streaming LLM API call."""
+ start_time = time.time()
+ first_token_time = None
+ usage = None
+ async with client.chat.completions.stream(
+ model=model,
+ messages=messages,
+ **completion_kwargs,
+ ) as s:
+ async for event in s:
+ if isinstance(event, ContentDeltaEvent):
+ if not first_token_time:
+ first_token_time = time.time()
+ elif isinstance(event, ContentDoneEvent):
+ pass
+ elif isinstance(event, ChunkEvent) and event.chunk.usage:
+ usage = event.chunk.usage
+ if not first_token_time:
+ first_token_time = time.time()
+ raw_response = await s.get_final_completion()
+
+ metadata = {
+ "time_taken": round(time.time() - start_time, 2),
+ "first_token_time": round(first_token_time - start_time, 2),
+ }
+ return ModelResponse(raw_response, model_options=completion_kwargs, metadata=metadata, usage=usage)
+
+
+ async def _generate_non_streaming(
+ client: AsyncOpenAI, model: str, messages: list[ChatCompletionMessageParam], completion_kwargs: dict[str, Any]
  ) -> ModelResponse:
- """Execute a single LLM API call.
+ """Execute a non-streaming LLM API call.
+
+ Avoids OpenAI SDK delta accumulation — some providers (e.g. Grok) send
+ streaming annotation deltas that crash the SDK's accumulate_delta().
+ """
+ start_time = time.time()
+ kwargs = {k: v for k, v in completion_kwargs.items() if k != "stream_options"}
+ response_format = kwargs.get("response_format")
+ if isinstance(response_format, type) and issubclass(response_format, BaseModel):
+ raw_response: ChatCompletion = await client.chat.completions.parse(
+ model=model,
+ messages=messages,
+ **kwargs,
+ )
+ else:
+ raw_response = await client.chat.completions.create(
+ model=model,
+ messages=messages,
+ stream=False,
+ **kwargs,
+ )
+ elapsed = round(time.time() - start_time, 2)
+ metadata = {"time_taken": elapsed, "first_token_time": elapsed}
+ return ModelResponse(raw_response, model_options=completion_kwargs, metadata=metadata)
+
 
- Internal function that makes the actual API request to the LLM provider.
- Handles both regular and structured output generation.
+ async def _generate(model: str, messages: list[ChatCompletionMessageParam], completion_kwargs: dict[str, Any], *, stream: bool = True) -> ModelResponse:
+ """Execute a single LLM API call.
 
  Args:
  model: Model identifier (e.g., "gpt-5.1", "gemini-3-pro").
  messages: Formatted messages for the API.
  completion_kwargs: Additional parameters for the completion API.
+ stream: Whether to use streaming mode (default True). Non-streaming
+ avoids OpenAI SDK delta accumulation issues with some providers.
 
  Returns:
  ModelResponse with generated content and metadata.
-
- API selection:
- - Uses client.chat.completions.parse() for structured output
- - Uses client.chat.completions.create() for regular text
-
- Note:
- - Uses AsyncOpenAI client configured via settings
- - Captures response headers for cost tracking
- - Response includes model options for debugging
  """
  if "openrouter" in settings.openai_base_url.lower():
  model = _model_name_to_openrouter_model(model)
@@ -264,45 +359,18 @@ async def _generate(
  api_key=settings.openai_api_key,
  base_url=settings.openai_base_url,
  ) as client:
- start_time = time.time()
- first_token_time = None
- usage = None
- async with client.chat.completions.stream(
- model=model,
- messages=messages,
- **completion_kwargs,
- ) as stream:
- async for event in stream:
- if isinstance(event, ContentDeltaEvent):
- if not first_token_time:
- first_token_time = time.time()
- elif isinstance(event, ContentDoneEvent):
- pass
- elif isinstance(event, ChunkEvent):
- if event.chunk.usage: # used to fix a bug with missing usage data
- usage = event.chunk.usage
- if not first_token_time:
- first_token_time = time.time()
- raw_response = await stream.get_final_completion()
-
- metadata = {
- "time_taken": round(time.time() - start_time, 2),
- "first_token_time": round(first_token_time - start_time, 2),
- }
- response = ModelResponse(
- raw_response,
- model_options=completion_kwargs,
- metadata=metadata,
- usage=usage,
- )
- return response
+ if stream:
+ return await _generate_streaming(client, model, messages, completion_kwargs)
+ return await _generate_non_streaming(client, model, messages, completion_kwargs)
 
 
- async def _generate_with_retry(
+ async def _generate_with_retry( # noqa: PLR0917
  model: str,
  context: AIMessages,
  messages: AIMessages,
  options: ModelOptions,
+ purpose: str | None = None,
+ expected_cost: float | None = None,
  ) -> ModelResponse:
  """Core LLM generation with automatic retry logic.
 
@@ -314,6 +382,8 @@ async def _generate_with_retry(
  context: Cached context messages (can be empty).
  messages: Dynamic query messages.
  options: Configuration including retries, timeout, temperature.
+ purpose: Optional semantic label for the LLM span name.
+ expected_cost: Optional expected cost for cost-tracking attributes.
 
  Returns:
  ModelResponse with generated content.
@@ -322,8 +392,7 @@
  ValueError: If model is not provided or both context and messages are empty.
  LLMError: If all retry attempts are exhausted.
 
- Note:
- Empty responses trigger a retry as they indicate API issues.
+ Empty responses trigger a retry as they indicate API issues.
  """
  if not model:
  raise ValueError("Model must be provided")
@@ -338,9 +407,7 @@
  # Bug fix for minimum explicit context size for Gemini models
  options.cache_ttl = None
 
- processed_messages = _process_messages(
- context, messages, options.system_prompt, options.cache_ttl
- )
+ processed_messages = _process_messages(context, messages, options.system_prompt, options.cache_ttl)
  completion_kwargs: dict[str, Any] = {
  **options.to_openai_completion_kwargs(),
  }
@@ -350,17 +417,18 @@
 
  for attempt in range(options.retries):
  try:
- with Laminar.start_as_current_span(
- model, span_type="LLM", input=processed_messages
- ) as span:
- response = await _generate(model, processed_messages, completion_kwargs)
- span.set_attributes(response.get_laminar_metadata()) # pyright: ignore[reportArgumentType]
- Laminar.set_span_output([
- r for r in (response.reasoning_content, response.content) if r
- ])
+ with Laminar.start_as_current_span(purpose or model, span_type="LLM", input=processed_messages) as span:
+ response = await _generate(model, processed_messages, completion_kwargs, stream=options.stream)
+ laminar_metadata = response.get_laminar_metadata()
+ if purpose:
+ laminar_metadata["purpose"] = purpose
+ if expected_cost is not None:
+ laminar_metadata["expected_cost"] = expected_cost
+ span.set_attributes(laminar_metadata) # pyright: ignore[reportArgumentType]
+ Laminar.set_span_output([r for r in (response.reasoning_content, response.content) if r])
  response.validate_output()
  return response
- except (asyncio.TimeoutError, ValueError, ValidationError, Exception) as e:
+ except (TimeoutError, ValueError, ValidationError, Exception) as e:
  if not isinstance(e, asyncio.TimeoutError):
  # disable cache if it's not a timeout because it may cause an error
  completion_kwargs["extra_body"]["cache"] = {"no-cache": True}
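The retry loop above now threads options.stream into _generate, so callers can route around the streaming SDK path entirely. A hedged sketch of doing that from application code; the diff only shows options.stream being read, so the ModelOptions(stream=False) constructor call is an assumption, and the docstrings below recommend leaving options at framework defaults in normal use:

from ai_pipeline_core import llm
from ai_pipeline_core.llm import ModelOptions

async def ask_without_streaming(question: str) -> str:
    options = ModelOptions(stream=False)  # assumed field; routes to _generate_non_streaming
    response = await llm.generate("gpt-5.1", messages=question, options=options)
    return response.content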
@@ -384,11 +452,11 @@ async def generate(
  context: AIMessages | None = None,
  messages: AIMessages | str,
  options: ModelOptions | None = None,
+ purpose: str | None = None,
+ expected_cost: float | None = None,
  ) -> ModelResponse:
  """Generate text response from a language model.
 
- @public
-
  Main entry point for LLM text generation with smart context caching.
  The context/messages split enables efficient token usage by caching
  expensive static content separately from dynamic queries.
@@ -405,13 +473,16 @@
  context: Static context to cache (documents, examples, instructions).
  Defaults to None (empty context). Cached for 5 minutes by default.
  messages: Dynamic messages/queries. AIMessages or str ONLY.
- Do not pass Document or DocumentList directly.
+ Do not pass Document or list[Document] directly.
  If string, converted to AIMessages internally.
- options: DEPRECATED - DO NOT USE. Reserved for internal framework usage only.
- Framework defaults are production-optimized (3 retries, 10s delay, 300s timeout).
- Configure model behavior centrally via LiteLLM proxy settings or environment
- variables, not per API call. Provider-specific settings should be configured
- at the proxy level.
+ options: Internal framework parameter. Framework defaults are production-optimized
+ (3 retries, 20s delay, 600s timeout). Configure model behavior centrally via
+ LiteLLM proxy settings or environment variables, not per API call.
+ Provider-specific settings should be configured at the proxy level.
+ purpose: Optional semantic label used as the tracing span name
+ instead of model name. Stored as a span attribute.
+ expected_cost: Optional expected cost stored as a span attribute
+ for cost-tracking and comparison with actual cost.
 
  Returns:
  ModelResponse containing:
@@ -454,35 +525,12 @@
  - Changes with each API call
  - Never cached, always processed fresh
 
- Example:
- >>> # CORRECT - No options parameter (this is the recommended pattern)
- >>> response = await llm.generate("gpt-5.1", messages="Explain quantum computing")
- >>> print(response.content) # In production, use get_pipeline_logger instead of print
- >>>
- >>> # With context caching for efficiency
- >>> # Context and messages are both AIMessages or str; wrap any Documents
- >>> static_doc = AIMessages([large_document, "few-shot example: ..."])
- >>>
- >>> # First call: caches context
- >>> r1 = await llm.generate("gpt-5.1", context=static_doc, messages="Summarize")
- >>>
- >>> # Second call: reuses cache, saves tokens!
- >>> r2 = await llm.generate("gpt-5.1", context=static_doc, messages="Key points?")
-
- >>> # Multi-turn conversation
- >>> messages = AIMessages([
- ... "What is Python?",
- ... previous_response,
- ... "Can you give an example?"
- ... ])
- >>> response = await llm.generate("gpt-5.1", messages=messages)
-
  Performance:
  - Context caching saves ~50-90% tokens on repeated calls
  - First call: full token cost
  - Subsequent calls (within cache TTL): only messages tokens
  - Default cache TTL is 300s/5 minutes (production-optimized)
- - Default retry logic: 3 attempts with 10s delay (production-optimized)
+ - Default retry logic: 3 attempts with 20s delay (production-optimized)
 
  Caching:
  When enabled in your LiteLLM proxy and supported by the upstream provider,
@@ -500,10 +548,8 @@
 
  This centralizes configuration and ensures consistency across all API calls.
 
- Note:
- - All models are accessed via LiteLLM proxy
- - Automatic retry with configurable delay between attempts
- - Cost tracking via response headers
+ All models are accessed via LiteLLM proxy with automatic retry and
+ cost tracking via response headers.
  """
  if isinstance(messages, str):
  messages = AIMessages([messages])
@@ -512,9 +558,22 @@
  context = AIMessages()
  if options is None:
  options = ModelOptions()
+ else:
+ # Create a copy to avoid mutating the caller's options object
+ options = options.model_copy()
+
+ with contextlib.suppress(Exception):
+ track_llm_documents(context, messages)
 
  try:
- return await _generate_with_retry(model, context, messages, options)
+ return await _generate_with_retry(
+ model,
+ context,
+ messages,
+ options,
+ purpose=purpose,
+ expected_cost=expected_cost,
+ )
  except (ValueError, LLMError):
  raise # Explicitly re-raise to satisfy DOC502
 
@@ -523,18 +582,18 @@ T = TypeVar("T", bound=BaseModel)
523
582
  """Type variable for Pydantic model types in structured generation."""
524
583
 
525
584
 
526
- async def generate_structured(
585
+ async def generate_structured( # noqa: UP047
527
586
  model: ModelName,
528
587
  response_format: type[T],
529
588
  *,
530
589
  context: AIMessages | None = None,
531
590
  messages: AIMessages | str,
532
591
  options: ModelOptions | None = None,
592
+ purpose: str | None = None,
593
+ expected_cost: float | None = None,
533
594
  ) -> StructuredModelResponse[T]:
534
595
  """Generate structured output conforming to a Pydantic model.
535
596
 
536
- @public
537
-
538
597
  Type-safe generation that returns validated Pydantic model instances.
539
598
  Uses OpenAI's structured output feature for guaranteed schema compliance.
540
599
 
@@ -589,21 +648,21 @@ async def generate_structured(
  context: Static context to cache (documents, schemas, examples).
  Defaults to None (empty AIMessages).
  messages: Dynamic prompts/queries. AIMessages or str ONLY.
- Do not pass Document or DocumentList directly.
+ Do not pass Document or list[Document] directly.
  options: Optional ModelOptions for configuring temperature, retries, etc.
  If provided, it will NOT be mutated (a copy is created internally).
  The response_format field is set automatically from the response_format parameter.
  In most cases, leave as None to use framework defaults.
  Configure model behavior centrally via LiteLLM proxy settings when possible.
+ purpose: Optional semantic label used as the tracing span name
+ instead of model name. Stored as a span attribute.
+ expected_cost: Optional expected cost stored as a span attribute
+ for cost-tracking and comparison with actual cost.
 
- Note:
- Vision/PDF model compatibility considerations:
- - Images require vision-capable models that also support structured output
- - PDFs require models with both document processing AND structured output support
- - Many models support either vision OR structured output, but not both
- - Test your specific model+document combination before production use
- - Consider two-step approach: generate() for analysis, then generate_structured()
- for formatting
+ Vision/PDF model compatibility: Images require vision-capable models that also support
+ structured output. PDFs require models with both document processing AND structured output
+ support. Consider two-step approach: generate() for analysis, then generate_structured()
+ for formatting.
 
  Returns:
  StructuredModelResponse[T] containing:
@@ -617,26 +676,6 @@ async def generate_structured(
  LLMError: If generation fails after retries.
  ValidationError: If response cannot be parsed into response_format.
 
- Example:
- >>> from pydantic import BaseModel, Field
- >>>
- >>> class Analysis(BaseModel):
- ... summary: str = Field(description="Brief summary")
- ... sentiment: float = Field(ge=-1, le=1)
- ... key_points: list[str] = Field(max_length=5)
- >>>
- >>> # CORRECT - No options parameter
- >>> response = await llm.generate_structured(
- ... "gpt-5.1",
- ... response_format=Analysis,
- ... messages="Analyze this product review: ..."
- ... )
- >>>
- >>> analysis = response.parsed # Type: Analysis
- >>> print(f"Sentiment: {analysis.sentiment}")
- >>> for point in analysis.key_points:
- ... print(f"- {point}")
-
  Supported models:
  Structured output support varies by provider and model. Generally includes:
  - OpenAI: GPT-4 and newer models
@@ -651,12 +690,9 @@
  - Complex schemas increase generation time
  - Validation overhead is minimal (Pydantic is fast)
 
- Note:
- - Pydantic model is converted to JSON Schema for the API
- - The model generates JSON matching the schema
- - Validation happens automatically via Pydantic
- - Use Field() descriptions to guide generation
- - Search models (models with '-search' suffix) do not support structured output
+ Pydantic model is converted to JSON Schema for the API. Validation happens
+ automatically via Pydantic. Search models (models with '-search' suffix) do
+ not support structured output.
  """
  if context is None:
  context = AIMessages()
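The 0.4.0 docstring drops the inline example that 0.3.4 shipped; for reference, a sketch of the call shape with the new purpose parameter, using a made-up Pydantic schema modeled on the removed example:

from pydantic import BaseModel, Field
from ai_pipeline_core import llm

class Analysis(BaseModel):
    summary: str = Field(description="Brief summary")
    sentiment: float = Field(ge=-1, le=1)
    key_points: list[str] = Field(max_length=5)

async def analyze(review: str) -> Analysis:
    response = await llm.generate_structured(
        "gpt-5.1",
        response_format=Analysis,
        messages=f"Analyze this product review: {review}",
        purpose="review-analysis",  # new in 0.4.0
    )
    return response.parsed  # validated Analysis instance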
@@ -673,9 +709,19 @@
 
  assert isinstance(messages, AIMessages)
 
+ with contextlib.suppress(Exception):
+ track_llm_documents(context, messages)
+
  # Call the internal generate function with structured output enabled
  try:
- response = await _generate_with_retry(model, context, messages, options)
+ response = await _generate_with_retry(
+ model,
+ context,
+ messages,
+ options,
+ purpose=purpose,
+ expected_cost=expected_cost,
+ )
  except (ValueError, LLMError):
  raise # Explicitly re-raise to satisfy DOC502