ai-pipeline-core 0.1.13-py3-none-any.whl → 0.2.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34)
  1. ai_pipeline_core/__init__.py +25 -14
  2. ai_pipeline_core/documents/__init__.py +2 -1
  3. ai_pipeline_core/documents/document.py +317 -49
  4. ai_pipeline_core/documents/document_list.py +136 -33
  5. ai_pipeline_core/documents/flow_document.py +8 -29
  6. ai_pipeline_core/documents/task_document.py +6 -27
  7. ai_pipeline_core/documents/temporary_document.py +6 -27
  8. ai_pipeline_core/documents/utils.py +64 -1
  9. ai_pipeline_core/flow/config.py +174 -5
  10. ai_pipeline_core/flow/options.py +2 -2
  11. ai_pipeline_core/llm/__init__.py +6 -1
  12. ai_pipeline_core/llm/ai_messages.py +14 -7
  13. ai_pipeline_core/llm/client.py +143 -55
  14. ai_pipeline_core/llm/model_options.py +20 -5
  15. ai_pipeline_core/llm/model_response.py +77 -29
  16. ai_pipeline_core/llm/model_types.py +38 -40
  17. ai_pipeline_core/logging/__init__.py +0 -2
  18. ai_pipeline_core/logging/logging_config.py +0 -6
  19. ai_pipeline_core/logging/logging_mixin.py +2 -10
  20. ai_pipeline_core/pipeline.py +68 -65
  21. ai_pipeline_core/prefect.py +12 -3
  22. ai_pipeline_core/prompt_manager.py +6 -7
  23. ai_pipeline_core/settings.py +13 -5
  24. ai_pipeline_core/simple_runner/__init__.py +1 -11
  25. ai_pipeline_core/simple_runner/cli.py +13 -12
  26. ai_pipeline_core/simple_runner/simple_runner.py +34 -172
  27. ai_pipeline_core/storage/__init__.py +8 -0
  28. ai_pipeline_core/storage/storage.py +628 -0
  29. ai_pipeline_core/tracing.py +110 -26
  30. {ai_pipeline_core-0.1.13.dist-info → ai_pipeline_core-0.2.0.dist-info}/METADATA +60 -23
  31. ai_pipeline_core-0.2.0.dist-info/RECORD +38 -0
  32. ai_pipeline_core-0.1.13.dist-info/RECORD +0 -36
  33. {ai_pipeline_core-0.1.13.dist-info → ai_pipeline_core-0.2.0.dist-info}/WHEEL +0 -0
  34. {ai_pipeline_core-0.1.13.dist-info → ai_pipeline_core-0.2.0.dist-info}/licenses/LICENSE +0 -0
ai_pipeline_core/flow/config.py
@@ -10,11 +10,16 @@ Best Practice:
  to ensure type safety and proper validation of output documents.
  """
 
+ import json
  from abc import ABC
  from typing import Any, ClassVar, Iterable
 
- from ai_pipeline_core.documents import DocumentList, FlowDocument
+ from ai_pipeline_core.documents import Document, DocumentList, FlowDocument
  from ai_pipeline_core.exceptions import DocumentValidationError
+ from ai_pipeline_core.logging import get_pipeline_logger
+ from ai_pipeline_core.storage import Storage
+
+ logger = get_pipeline_logger(__name__)
 
 
  class FlowConfig(ABC):
@@ -51,8 +56,10 @@ class FlowConfig(ABC):
  ... OUTPUT_DOCUMENT_TYPE = ProcessedDocument # Different type!
  >>>
  >>> # Use in @pipeline_flow - RECOMMENDED PATTERN
- >>> @pipeline_flow(name="processing")
- >>> async def process(config: ProcessingFlowConfig, docs: DocumentList) -> DocumentList:
+ >>> @pipeline_flow(config=ProcessingFlowConfig, name="processing")
+ >>> async def process(
+ ... project_name: str, docs: DocumentList, flow_options: FlowOptions
+ ... ) -> DocumentList:
  ... outputs = []
  ... # ... processing logic ...
  ... return config.create_and_validate_output(outputs)
@@ -289,8 +296,10 @@ class FlowConfig(ABC):
  DocumentValidationError: If output type doesn't match OUTPUT_DOCUMENT_TYPE.
 
  Example:
- >>> @pipeline_flow(name="my_flow")
- >>> async def process_flow(config: MyFlowConfig, ...) -> DocumentList:
+ >>> @pipeline_flow(config=MyFlowConfig, name="my_flow")
+ >>> async def process_flow(
+ ... project_name: str, documents: DocumentList, flow_options: FlowOptions
+ ... ) -> DocumentList:
  >>> outputs = []
  >>> # ... processing logic ...
  >>> outputs.append(OutputDoc(...))
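The two docstring examples above reflect the new `@pipeline_flow` contract: the config class is passed to the decorator, and the flow itself receives `project_name`, the input `DocumentList`, and a `FlowOptions` instance. A minimal sketch of a flow written against that contract (the document classes and import locations are illustrative assumptions, not taken from the wheel):

```python
from ai_pipeline_core import DocumentList, FlowDocument, FlowOptions, pipeline_flow
from ai_pipeline_core.flow import FlowConfig  # assumed import path


class RawDoc(FlowDocument):  # hypothetical input document type
    pass


class ProcessedDoc(FlowDocument):  # hypothetical output document type
    pass


class ProcessingFlowConfig(FlowConfig):
    INPUT_DOCUMENT_TYPES = [RawDoc]
    OUTPUT_DOCUMENT_TYPE = ProcessedDoc


@pipeline_flow(config=ProcessingFlowConfig, name="processing")
async def process(
    project_name: str, docs: DocumentList, flow_options: FlowOptions
) -> DocumentList:
    # Placeholder transformation of each input into an output document
    outputs = [ProcessedDoc(name=doc.name, content=doc.content) for doc in docs]
    # Enforces OUTPUT_DOCUMENT_TYPE before the documents leave the flow
    return ProcessingFlowConfig.create_and_validate_output(outputs)
```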
@@ -312,3 +321,163 @@ class FlowConfig(ABC):
      documents = DocumentList(list(output))  # type: ignore[arg-type]
      cls.validate_output_documents(documents)
      return documents
+
+     @classmethod
+     async def load_documents(
+         cls,
+         uri: str,
+     ) -> DocumentList:
+         """Load documents from storage matching INPUT_DOCUMENT_TYPES.
+
+         Loads documents from a storage location based on the class's INPUT_DOCUMENT_TYPES.
+         Supports both local filesystem and Google Cloud Storage backends.
+         Automatically loads metadata (.description.md and .sources.json) when present.
+
+         Args:
+             uri: Storage URI (file://, gs://, or local path)
+
+         Returns:
+             DocumentList containing loaded documents matching INPUT_DOCUMENT_TYPES
+
+         Example:
+             >>> # Load from local filesystem
+             >>> docs = await MyFlowConfig.load_documents("./data")
+             >>>
+             >>> # Load from GCS (uses GCS_SERVICE_ACCOUNT_FILE from settings if configured)
+             >>> docs = await MyFlowConfig.load_documents("gs://bucket/data")
+         """
+         # Use INPUT_DOCUMENT_TYPES if not specified
+         storage = await Storage.from_uri(uri)
+         loaded_documents = DocumentList()
+
+         # Process each document type
+         for doc_type in cls.INPUT_DOCUMENT_TYPES:
+             canonical_name = doc_type.canonical_name()
+             doc_storage = storage.with_base(canonical_name)
+
+             # Check if subdirectory exists
+             if not await doc_storage.exists(""):
+                 logger.debug(f"Subdirectory {canonical_name} not found, skipping")
+                 continue
+
+             # List files in subdirectory
+             objects = await doc_storage.list("", recursive=False, include_dirs=False)
+
+             # Create lookup set for metadata files
+             object_keys = {obj.key for obj in objects}
+
+             # Filter out metadata files
+             doc_files = [
+                 obj
+                 for obj in objects
+                 if not obj.key.endswith(Document.DESCRIPTION_EXTENSION)
+                 and not obj.key.endswith(Document.SOURCES_EXTENSION)
+             ]
+
+             for obj in doc_files:
+                 try:
+                     # Load document content
+                     content = await doc_storage.read_bytes(obj.key)
+
+                     # Load metadata if present
+                     description = None
+                     sources: list[str] = []
+
+                     # Check for description in objects list
+                     desc_path = f"{obj.key}{Document.DESCRIPTION_EXTENSION}"
+                     if desc_path in object_keys:
+                         try:
+                             description = await doc_storage.read_text(desc_path)
+                         except Exception as e:
+                             logger.warning(f"Failed to load description for {obj.key}: {e}")
+
+                     # Check for sources in objects list
+                     sources_path = f"{obj.key}{Document.SOURCES_EXTENSION}"
+                     if sources_path in object_keys:
+                         try:
+                             sources_text = await doc_storage.read_text(sources_path)
+                             sources = json.loads(sources_text)
+                         except Exception as e:
+                             logger.warning(f"Failed to load sources for {obj.key}: {e}")
+
+                     # Create document instance
+                     doc = doc_type(
+                         name=obj.key,
+                         content=content,
+                         description=description,
+                         sources=sources,
+                     )
+
+                     loaded_documents.append(doc)
+                     logger.debug(f"Loaded {doc_type.__name__} document: {obj.key}")
+                 except Exception as e:
+                     logger.error(f"Failed to load {doc_type.__name__} document {obj.key}: {e}")
+
+         logger.info(f"Loaded {len(loaded_documents)} documents from {uri}")
+         return loaded_documents
+
+     @classmethod
+     async def save_documents(
+         cls,
+         uri: str,
+         documents: DocumentList,
+         *,
+         validate_output_type: bool = True,
+     ) -> None:
+         """Save documents to storage with metadata.
+
+         Saves FlowDocument instances to a storage location with their content
+         and metadata files (Document.DESCRIPTION_EXTENSION and Document.SOURCES_EXTENSION).
+         Non-FlowDocument instances (TaskDocument, TemporaryDocument) are skipped.
+
+         Args:
+             uri: Storage URI (file://, gs://, or local path)
+             documents: DocumentList to save
+             validate_output_type: If True, validate documents match cls.OUTPUT_DOCUMENT_TYPE
+
+         Raises:
+             DocumentValidationError: If validate_output_type=True and documents don't match
+                 OUTPUT_DOCUMENT_TYPE
+
+         Example:
+             >>> # Save to local filesystem
+             >>> await MyFlowConfig.save_documents("./output", docs)
+             >>>
+             >>> # Save to GCS (uses GCS_SERVICE_ACCOUNT_FILE from settings if configured)
+             >>> await MyFlowConfig.save_documents("gs://bucket/output", docs)
+         """
+         # Validate output type if requested
+         if validate_output_type:
+             cls.validate_output_documents(documents)
+
+         storage = await Storage.from_uri(uri)
+         saved_count = 0
+
+         for doc in documents:
+             # Skip non-FlowDocument instances
+             if not isinstance(doc, FlowDocument):
+                 logger.warning(f"Skipping non-FlowDocument: {type(doc).__name__}")
+                 continue
+
+             # Get canonical name for subdirectory
+             canonical_name = doc.canonical_name()
+             doc_storage = storage.with_base(canonical_name)
+
+             # Save document content
+             await doc_storage.write_bytes(doc.name, doc.content)
+             saved_count += 1
+
+             # Save description if present
+             if doc.description:
+                 desc_path = f"{doc.name}{Document.DESCRIPTION_EXTENSION}"
+                 await doc_storage.write_text(desc_path, doc.description)
+
+             # Save sources if present
+             if doc.sources:
+                 sources_path = f"{doc.name}{Document.SOURCES_EXTENSION}"
+                 sources_json = json.dumps(doc.sources, indent=2)
+                 await doc_storage.write_text(sources_path, sources_json)
+
+             logger.debug(f"Saved {type(doc).__name__} document: {doc.name}")
+
+         logger.info(f"Saved {saved_count} documents to {uri}")
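Taken together, the new `load_documents` and `save_documents` classmethods let a config round-trip its documents through local or GCS storage outside of a flow run. A rough usage sketch, continuing the hypothetical `ProcessingFlowConfig` / `ProcessedDoc` definitions from the earlier sketch (the `./work` path is also made up):

```python
import asyncio

from ai_pipeline_core import DocumentList


async def main() -> None:
    # Reads ./work/<canonical_name>/* for each type in INPUT_DOCUMENT_TYPES,
    # picking up .description.md / .sources.json metadata when present
    docs = await ProcessingFlowConfig.load_documents("./work")

    outputs = DocumentList(
        [ProcessedDoc(name=f"out_{d.name}", content=d.content) for d in docs]
    )

    # Validates against OUTPUT_DOCUMENT_TYPE by default, then writes content plus metadata;
    # a gs://bucket/path URI would target Google Cloud Storage instead
    await ProcessingFlowConfig.save_documents("./work", outputs)


asyncio.run(main())
```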
ai_pipeline_core/flow/options.py
@@ -60,11 +60,11 @@ class FlowOptions(BaseSettings):
  add flow-specific parameters with appropriate validation.
  """
 
- core_model: ModelName | str = Field(
+ core_model: ModelName = Field(
  default="gpt-5",
  description="Primary model for complex analysis and generation tasks.",
  )
- small_model: ModelName | str = Field(
+ small_model: ModelName = Field(
  default="gpt-5-mini",
  description="Fast, cost-effective model for simple tasks and orchestration.",
  )
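With the `| str` escape hatch removed, `core_model` and `small_model` now accept only `ModelName` values; subclasses still extend `FlowOptions` the same way. A small sketch of a project-specific options class (the extra field is purely illustrative, and the top-level `FlowOptions` import is an assumption):

```python
from pydantic import Field

from ai_pipeline_core import FlowOptions  # assumed re-export of flow/options.py


class MyFlowOptions(FlowOptions):
    # core_model / small_model are inherited and are now validated as ModelName literals
    chunk_size: int = Field(default=2000, gt=0, description="Hypothetical flow-specific knob")
```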
ai_pipeline_core/llm/__init__.py
@@ -8,6 +8,8 @@ from .ai_messages import AIMessages, AIMessageType
  from .client import (
  generate,
  generate_structured,
+ generate_with_retry_for_testing,
+ process_messages_for_testing,
  )
  from .model_options import ModelOptions
  from .model_response import ModelResponse, StructuredModelResponse
@@ -17,9 +19,12 @@ __all__ = [
  "AIMessages",
  "AIMessageType",
  "ModelName",
- "ModelOptions",
  "ModelResponse",
+ "ModelOptions",
  "StructuredModelResponse",
  "generate",
  "generate_structured",
+ # Internal functions exposed for testing only
+ "process_messages_for_testing",
+ "generate_with_retry_for_testing",
  ]
ai_pipeline_core/llm/ai_messages.py
@@ -48,22 +48,31 @@ class AIMessages(list[AIMessageType]):
  - ModelResponse: Becomes {"role": "assistant", "content": response.content}
 
  Note: Document conversion is automatic. Text content becomes user text messages.
- Images are sent to vision-capable models (non-vision models will raise ValueError).
- PDFs are attached when supported by the model, otherwise a text extraction
- fallback is used. LiteLLM proxy handles the specific encoding requirements
- for each provider.
+
+ VISION/PDF MODEL COMPATIBILITY WARNING:
+ Images require vision-capable models (e.g., gpt-4o, gemini-pro-vision, claude-3-haiku).
+ Non-vision models will raise ValueError when encountering image documents.
+ PDFs require models with document processing support - check your model's capabilities
+ before including PDF documents in messages. Unsupported models may fall back to
+ text extraction or raise errors depending on provider configuration.
+ LiteLLM proxy handles the specific encoding requirements for each provider.
 
  IMPORTANT: Although AIMessages can contain Document entries, the LLM client functions
  expect `messages` to be `AIMessages` or `str`. If you start from a Document or a list
  of Documents, build AIMessages first (e.g., `AIMessages([doc])` or `AIMessages(docs)`).
 
+ CAUTION: AIMessages is a list subclass. Always use list construction (e.g.,
+ `AIMessages(["text"])`) or empty constructor with append (e.g.,
+ `AIMessages(); messages.append("text")`). Never pass raw strings directly to the
+ constructor (`AIMessages("text")`) as this will iterate over the string characters
+ instead of treating it as a single message.
+
  Example:
  >>> from ai_pipeline_core import llm
  >>> messages = AIMessages()
  >>> messages.append("What is the capital of France?")
  >>> response = await llm.generate("gpt-5", messages=messages)
  >>> messages.append(response) # Add the actual response
- >>> prompt = messages.get_last_message_as_str() # Get the last message as a string
  """
 
  def get_last_message(self) -> AIMessageType:
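The CAUTION above exists because `AIMessages` inherits list construction semantics, so `AIMessages("text")` iterates the string. A quick illustration of the difference:

```python
from ai_pipeline_core.llm import AIMessages

good = AIMessages(["What is the capital of France?"])
print(len(good))  # 1 -- a single user message

bad = AIMessages("What is the capital of France?")
print(len(bad))   # 30 -- every character became its own entry

# Building incrementally is also safe:
messages = AIMessages()
messages.append("What is the capital of France?")
```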
@@ -78,8 +87,6 @@ class AIMessages(list[AIMessageType]):
  def get_last_message_as_str(self) -> str:
  """Get the last message as a string, raising if not a string.
 
- @public
-
  Returns:
  The last message as a string.
 
ai_pipeline_core/llm/client.py
@@ -24,7 +24,6 @@ from pydantic import BaseModel
 
  from ai_pipeline_core.exceptions import LLMError
  from ai_pipeline_core.settings import settings
- from ai_pipeline_core.tracing import trace
 
  from .ai_messages import AIMessages
  from .model_options import ModelOptions
@@ -38,6 +37,7 @@ def _process_messages(
  context: AIMessages,
  messages: AIMessages,
  system_prompt: str | None = None,
+ cache_ttl: str | None = "120s",
  ) -> list[ChatCompletionMessageParam]:
  """Process and format messages for LLM API consumption.
 
@@ -49,21 +49,25 @@ def _process_messages(
  context: Messages to be cached (typically expensive/static content).
  messages: Regular messages without caching (dynamic queries).
  system_prompt: Optional system instructions for the model.
+ cache_ttl: Cache TTL for context messages (e.g. "120s", "5m", "1h").
+ Set to None or empty string to disable caching.
 
  Returns:
  List of formatted messages ready for API calls, with:
  - System prompt at the beginning (if provided)
- - Context messages with cache_control on the last one
+ - Context messages with cache_control on the last one (if cache_ttl)
  - Regular messages without caching
 
  System Prompt Location:
- The system prompt from ModelOptions.system_prompt is always injected
- as the FIRST message with role="system". It is NOT cached with context,
- allowing dynamic system prompts without breaking cache efficiency.
+ The system prompt parameter is always injected as the FIRST message
+ with role="system". It is NOT cached with context, allowing dynamic
+ system prompts without breaking cache efficiency.
 
  Cache behavior:
- The last context message gets ephemeral caching (120s TTL)
+ The last context message gets ephemeral caching with specified TTL
  to reduce token usage on repeated calls with same context.
+ If cache_ttl is None or empty string (falsy), no caching is applied.
+ Only the last context message receives cache_control to maximize efficiency.
 
  Note:
  This is an internal function used by _generate_with_retry().
@@ -80,11 +84,12 @@ def _process_messages(
  # Use AIMessages.to_prompt() for context
  context_messages = context.to_prompt()
 
- # Apply caching to last context message
- context_messages[-1]["cache_control"] = { # type: ignore
-     "type": "ephemeral",
-     "ttl": "120s", # Cache for 2m
- }
+ # Apply caching to last context message if cache_ttl is set
+ if cache_ttl:
+     context_messages[-1]["cache_control"] = { # type: ignore
+         "type": "ephemeral",
+         "ttl": cache_ttl,
+     }
 
  processed_messages.extend(context_messages)
 
@@ -173,7 +178,9 @@ async def _generate_with_retry(
  if not context and not messages:
  raise ValueError("Either context or messages must be provided")
 
- processed_messages = _process_messages(context, messages, options.system_prompt)
+ processed_messages = _process_messages(
+     context, messages, options.system_prompt, options.cache_ttl
+ )
  completion_kwargs: dict[str, Any] = {
  "model": model,
  "messages": processed_messages,
@@ -213,9 +220,8 @@ async def _generate_with_retry(
  raise LLMError("Unknown error occurred during LLM generation.")
 
 
- @trace(ignore_inputs=["context"])
  async def generate(
- model: ModelName | str,
+ model: ModelName,
  *,
  context: AIMessages | None = None,
  messages: AIMessages | str,
@@ -230,20 +236,24 @@ async def generate(
  expensive static content separately from dynamic queries.
 
  Best Practices:
- 1. OPTIONS: Omit in 90% of cases - defaults are optimized
+ 1. OPTIONS: DO NOT use the options parameter - omit it entirely for production use
  2. MESSAGES: Use AIMessages or str - wrap Documents in AIMessages
  3. CONTEXT vs MESSAGES: Use context for static/cacheable, messages for dynamic
+ 4. CONFIGURATION: Configure model behavior via LiteLLM proxy or environment variables
 
  Args:
  model: Model to use (e.g., "gpt-5", "gemini-2.5-pro", "grok-4").
- Can be ModelName literal or any string for custom models.
+ Accepts predefined models or any string for custom models.
  context: Static context to cache (documents, examples, instructions).
  Defaults to None (empty context). Cached for 120 seconds.
  messages: Dynamic messages/queries. AIMessages or str ONLY.
  Do not pass Document or DocumentList directly.
  If string, converted to AIMessages internally.
- options: Model configuration (temperature, retries, timeout, etc.).
- Defaults to None (uses ModelOptions() with standard settings).
+ options: DEPRECATED - DO NOT USE. Reserved for internal framework usage only.
+ Framework defaults are production-optimized (3 retries, 10s delay, 300s timeout).
+ Configure model behavior centrally via LiteLLM proxy settings or environment
+ variables, not per API call. Provider-specific settings should be configured
+ at the proxy level.
 
  Returns:
  ModelResponse containing:
@@ -268,17 +278,26 @@ async def generate(
  # WRONG - don't convert to string yourself
  response = await llm.generate("gpt-5", messages=my_document.text) # NO!
 
+ VISION/PDF MODEL COMPATIBILITY:
+ When using Documents containing images or PDFs, ensure your model supports these formats:
+ - Images require vision-capable models (gpt-4o, gemini-pro-vision, claude-3-sonnet)
+ - PDFs require document processing support (varies by provider)
+ - Non-compatible models will raise ValueError or fall back to text extraction
+ - Check model capabilities before including visual/PDF content
+
  Context vs Messages Strategy:
- context: Static, reusable content (cached 120 seconds)
+ context: Static, reusable content for caching efficiency
  - Large documents, instructions, examples
- - Same across multiple calls
+ - Remains constant across multiple calls
+ - Cached when supported by provider/proxy configuration
 
- messages: Dynamic, query-specific content
+ messages: Dynamic, per-call specific content
  - User questions, current conversation turn
- - Changes every call
+ - Changes with each API call
+ - Never cached, always processed fresh
 
  Example:
- >>> # Simple case - no options needed (90% of cases)
+ >>> # CORRECT - No options parameter (this is the recommended pattern)
  >>> response = await llm.generate("gpt-5", messages="Explain quantum computing")
  >>> print(response.content) # In production, use get_pipeline_logger instead of print
 
@@ -292,13 +311,6 @@ async def generate(
  >>> # Second call: reuses cache, saves tokens!
  >>> r2 = await llm.generate("gpt-5", context=static_doc, messages="Key points?")
 
- >>> # AVOID unnecessary options (defaults are optimal)
- >>> response = await llm.generate(
- ... "gpt-5",
- ... messages="Hello",
- ... options=ModelOptions(temperature=0.7) # Default is probably fine!
- ... )
-
  >>> # Multi-turn conversation
  >>> messages = AIMessages([
  ... "What is Python?",
@@ -307,28 +319,48 @@ async def generate(
  ... ])
  >>> response = await llm.generate("gpt-5", messages=messages)
 
+ Configuration via LiteLLM Proxy:
+ >>> # Configure temperature in litellm_config.yaml:
+ >>> # model_list:
+ >>> #   - model_name: gpt-5
+ >>> #     litellm_params:
+ >>> #       model: openai/gpt-4o
+ >>> #       temperature: 0.3
+ >>> #       max_tokens: 1000
+ >>>
+ >>> # Configure retry logic in proxy:
+ >>> # general_settings:
+ >>> #   master_key: sk-1234
+ >>> #   max_retries: 5
+ >>> #   retry_delay: 15
+
  Performance:
  - Context caching saves ~50-90% tokens on repeated calls
  - First call: full token cost
- - Subsequent calls (within 120s): only messages tokens
- - Default retry delay is 10s (configurable via ModelOptions.retry_delay_seconds)
+ - Subsequent calls (within cache TTL): only messages tokens
+ - Default cache TTL is 120s (production-optimized)
+ - Default retry logic: 3 attempts with 10s delay (production-optimized)
 
  Caching:
  When enabled in your LiteLLM proxy and supported by the upstream provider,
- context messages may be cached (typical TTL ~120s) to reduce token usage on
- repeated calls. Savings depend on provider and payload; treat this as an
- optimization, not a guarantee. Cache behavior varies by proxy configuration.
+ context messages may be cached to reduce token usage on repeated calls.
+ Default TTL is 120s (optimized for production workloads). Configure caching
+ behavior centrally via your LiteLLM proxy settings, not per API call.
+ Savings depend on provider and payload; treat this as an optimization, not a guarantee.
+
+ Configuration:
+ All model behavior should be configured at the LiteLLM proxy level:
+ - Temperature, max_tokens: Set in litellm_config.yaml model_list
+ - Retry logic: Configure in proxy general_settings
+ - Timeouts: Set via proxy configuration
+ - Caching: Enable/configure in proxy cache settings
+
+ This centralizes configuration and ensures consistency across all API calls.
 
  Note:
- - Context argument is ignored by the tracer to avoid recording large data
  - All models are accessed via LiteLLM proxy
  - Automatic retry with configurable delay between attempts
  - Cost tracking via response headers
-
- See Also:
- - generate_structured: For typed/structured output
- - AIMessages: Message container with document support
- - ModelOptions: Configuration options
  """
  if isinstance(messages, str):
  messages = AIMessages([messages])
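For reference, the smallest end-to-end call under the 0.2.0 conventions: no `options` argument, with model behavior configured at the LiteLLM proxy. This assumes a configured proxy and the top-level `llm` re-export shown in the docstring examples:

```python
import asyncio

from ai_pipeline_core import llm


async def main() -> None:
    # Dynamic question only; static material would go in the `context` argument
    response = await llm.generate("gpt-5", messages="Explain quantum computing in two sentences.")
    print(response.content)


asyncio.run(main())
```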
@@ -348,9 +380,8 @@ T = TypeVar("T", bound=BaseModel)
  """Type variable for Pydantic model types in structured generation."""
 
 
- @trace(ignore_inputs=["context"])
  async def generate_structured(
- model: ModelName | str,
+ model: ModelName,
  response_format: type[T],
  *,
  context: AIMessages | None = None,
@@ -364,20 +395,71 @@ async def generate_structured(
  Type-safe generation that returns validated Pydantic model instances.
  Uses OpenAI's structured output feature for guaranteed schema compliance.
 
- Best Practices (same as generate):
- 1. OPTIONS: Omit in 90% of cases - defaults are optimized
+ IMPORTANT: Search models (models with '-search' suffix) do not support
+ structured output. Use generate() instead for search models.
+
+ Best Practices:
+ 1. OPTIONS: DO NOT use the options parameter - omit it entirely for production use
  2. MESSAGES: Use AIMessages or str - wrap Documents in AIMessages
- 3. CONTEXT vs MESSAGES: Use context for static/cacheable, messages for dynamic
+ 3. CONFIGURATION: Configure model behavior via LiteLLM proxy or environment variables
+ 4. See generate() documentation for more details
+
+ Context vs Messages Strategy:
+ context: Static, reusable content for caching efficiency
+ - Schemas, examples, instructions
+ - Remains constant across multiple calls
+ - Cached when supported by provider/proxy configuration
+
+ messages: Dynamic, per-call specific content
+ - Data to be structured, user queries
+ - Changes with each API call
+ - Never cached, always processed fresh
+
+ Complex Task Pattern:
+ For complex tasks like research or deep analysis, it's recommended to use
+ a two-step approach:
+ 1. First use generate() with a capable model to perform the analysis
+ 2. Then use generate_structured() with a smaller model to convert the
+ response into structured output
+
+ This pattern is more reliable than trying to force complex reasoning
+ directly into structured format:
+
+ >>> # Step 1: Research/analysis with generate() - no options parameter
+ >>> research = await llm.generate(
+ ... "gpt-5",
+ ... messages="Research and analyze this complex topic..."
+ ... )
+ >>>
+ >>> # Step 2: Structure the results with generate_structured()
+ >>> structured = await llm.generate_structured(
+ ... "gpt-5-mini", # Smaller model is fine for structuring
+ ... response_format=ResearchSummary,
+ ... messages=f"Extract key information: {research.content}"
+ ... )
 
  Args:
  model: Model to use (must support structured output).
+ Search models (models with '-search' suffix) do not support structured output.
  response_format: Pydantic model class defining the output schema.
  The model will generate JSON matching this schema.
  context: Static context to cache (documents, schemas, examples).
  Defaults to None (empty AIMessages).
  messages: Dynamic prompts/queries. AIMessages or str ONLY.
  Do not pass Document or DocumentList directly.
- options: Model configuration. response_format is set automatically.
+ options: DEPRECATED - DO NOT USE. Reserved for internal framework usage only.
+ Framework defaults are production-optimized. Configure model behavior
+ centrally via LiteLLM proxy settings, not per API call.
+ The response_format is set automatically from the response_format parameter.
+
+ VISION/PDF MODEL COMPATIBILITY:
+ When using Documents with images/PDFs in structured output:
+ - Images require vision-capable models that also support structured output
+ - PDFs require models with both document processing AND structured output support
+ - Many models support either vision OR structured output, but not both
+ - Test your specific model+document combination before production use
+ - Consider two-step approach: generate() for analysis, then generate_structured()
+ for formatting
 
  Returns:
  StructuredModelResponse[T] containing:
@@ -387,6 +469,7 @@ async def generate_structured(
  Raises:
  TypeError: If response_format is not a Pydantic model class.
  ValueError: If model doesn't support structured output or no parsed content returned.
+ Structured output support varies by provider and model.
  LLMError: If generation fails after retries.
  ValidationError: If response cannot be parsed into response_format.
 
@@ -398,8 +481,9 @@ async def generate_structured(
  ... sentiment: float = Field(ge=-1, le=1)
  ... key_points: list[str] = Field(max_length=5)
  >>>
+ >>> # CORRECT - No options parameter
  >>> response = await llm.generate_structured(
- ... model="gpt-5",
+ ... "gpt-5",
  ... response_format=Analysis,
  ... messages="Analyze this product review: ..."
  ... )
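The Example above, expanded into a self-contained script (the review text is made up; this assumes a configured LiteLLM proxy and the `.parsed` accessor described for `StructuredModelResponse`):

```python
import asyncio

from pydantic import BaseModel, Field

from ai_pipeline_core import llm


class Analysis(BaseModel):
    sentiment: float = Field(ge=-1, le=1)
    key_points: list[str] = Field(max_length=5)


async def main() -> None:
    response = await llm.generate_structured(
        "gpt-5",
        response_format=Analysis,
        messages="Analyze this product review: battery life is great, but the screen scratches easily.",
    )
    analysis = response.parsed
    print(f"Sentiment: {analysis.sentiment}")
    for point in analysis.key_points:
        print(f"- {point}")


asyncio.run(main())
```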
@@ -410,11 +494,13 @@ async def generate_structured(
  ... print(f"- {point}")
 
  Supported models:
- Support varies by provider and model. Generally includes:
+ Structured output support varies by provider and model. Generally includes:
  - OpenAI: GPT-4 and newer models
  - Anthropic: Claude 3+ models
  - Google: Gemini Pro models
- Check provider documentation for specific model support.
+
+ Search models (models with '-search' suffix) do not support structured output.
+ Check provider documentation for specific support.
 
  Performance:
  - Structured output may use more tokens than free text
@@ -426,11 +512,7 @@ async def generate_structured(
  - The model generates JSON matching the schema
  - Validation happens automatically via Pydantic
  - Use Field() descriptions to guide generation
-
- See Also:
- - generate: For unstructured text generation
- - ModelOptions: Configuration including response_format
- - StructuredModelResponse: Response wrapper with .parsed property
+ - Search models (models with '-search' suffix) do not support structured output
  """
  if context is None:
  context = AIMessages()
@@ -473,3 +555,9 @@ async def generate_structured(
 
  # Create a StructuredModelResponse with the parsed value
  return StructuredModelResponse[T](chat_completion=response, parsed_value=parsed_value)
+
+
+ # Public aliases for testing internal functions
+ # These are exported to allow testing of implementation details
+ process_messages_for_testing = _process_messages
+ generate_with_retry_for_testing = _generate_with_retry
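A sketch of how the new testing aliases might be used to pin down the `cache_ttl` behavior without a live model; the assertions assume the message-dict shape implied by the code above (system message first, `cache_control` applied to the last context message):

```python
from ai_pipeline_core.llm import AIMessages, process_messages_for_testing


def test_cache_ttl_applied_only_when_truthy() -> None:
    context = AIMessages(["<large static reference>"])
    messages = AIMessages(["dynamic question"])

    cached = process_messages_for_testing(context, messages, "be terse")
    assert cached[0]["role"] == "system"
    assert cached[1]["cache_control"]["ttl"] == "120s"  # default TTL of the new parameter

    uncached = process_messages_for_testing(context, messages, "be terse", None)
    assert all("cache_control" not in m for m in uncached)
```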