ai-pipeline-core 0.2.6__py3-none-any.whl → 0.4.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (94)
  1. ai_pipeline_core/__init__.py +78 -125
  2. ai_pipeline_core/deployment/__init__.py +34 -0
  3. ai_pipeline_core/deployment/base.py +861 -0
  4. ai_pipeline_core/deployment/contract.py +80 -0
  5. ai_pipeline_core/deployment/deploy.py +561 -0
  6. ai_pipeline_core/deployment/helpers.py +97 -0
  7. ai_pipeline_core/deployment/progress.py +126 -0
  8. ai_pipeline_core/deployment/remote.py +116 -0
  9. ai_pipeline_core/docs_generator/__init__.py +54 -0
  10. ai_pipeline_core/docs_generator/__main__.py +5 -0
  11. ai_pipeline_core/docs_generator/cli.py +196 -0
  12. ai_pipeline_core/docs_generator/extractor.py +324 -0
  13. ai_pipeline_core/docs_generator/guide_builder.py +644 -0
  14. ai_pipeline_core/docs_generator/trimmer.py +35 -0
  15. ai_pipeline_core/docs_generator/validator.py +114 -0
  16. ai_pipeline_core/document_store/__init__.py +13 -0
  17. ai_pipeline_core/document_store/_summary.py +9 -0
  18. ai_pipeline_core/document_store/_summary_worker.py +170 -0
  19. ai_pipeline_core/document_store/clickhouse.py +492 -0
  20. ai_pipeline_core/document_store/factory.py +38 -0
  21. ai_pipeline_core/document_store/local.py +312 -0
  22. ai_pipeline_core/document_store/memory.py +85 -0
  23. ai_pipeline_core/document_store/protocol.py +68 -0
  24. ai_pipeline_core/documents/__init__.py +12 -14
  25. ai_pipeline_core/documents/_context_vars.py +85 -0
  26. ai_pipeline_core/documents/_hashing.py +52 -0
  27. ai_pipeline_core/documents/attachment.py +85 -0
  28. ai_pipeline_core/documents/context.py +128 -0
  29. ai_pipeline_core/documents/document.py +318 -1434
  30. ai_pipeline_core/documents/mime_type.py +37 -82
  31. ai_pipeline_core/documents/utils.py +4 -12
  32. ai_pipeline_core/exceptions.py +10 -62
  33. ai_pipeline_core/images/__init__.py +309 -0
  34. ai_pipeline_core/images/_processing.py +151 -0
  35. ai_pipeline_core/llm/__init__.py +6 -4
  36. ai_pipeline_core/llm/ai_messages.py +130 -81
  37. ai_pipeline_core/llm/client.py +327 -193
  38. ai_pipeline_core/llm/model_options.py +14 -86
  39. ai_pipeline_core/llm/model_response.py +60 -103
  40. ai_pipeline_core/llm/model_types.py +16 -34
  41. ai_pipeline_core/logging/__init__.py +2 -7
  42. ai_pipeline_core/logging/logging.yml +1 -1
  43. ai_pipeline_core/logging/logging_config.py +27 -37
  44. ai_pipeline_core/logging/logging_mixin.py +15 -41
  45. ai_pipeline_core/observability/__init__.py +32 -0
  46. ai_pipeline_core/observability/_debug/__init__.py +30 -0
  47. ai_pipeline_core/observability/_debug/_auto_summary.py +94 -0
  48. ai_pipeline_core/observability/_debug/_config.py +95 -0
  49. ai_pipeline_core/observability/_debug/_content.py +764 -0
  50. ai_pipeline_core/observability/_debug/_processor.py +98 -0
  51. ai_pipeline_core/observability/_debug/_summary.py +312 -0
  52. ai_pipeline_core/observability/_debug/_types.py +75 -0
  53. ai_pipeline_core/observability/_debug/_writer.py +843 -0
  54. ai_pipeline_core/observability/_document_tracking.py +146 -0
  55. ai_pipeline_core/observability/_initialization.py +194 -0
  56. ai_pipeline_core/observability/_logging_bridge.py +57 -0
  57. ai_pipeline_core/observability/_summary.py +81 -0
  58. ai_pipeline_core/observability/_tracking/__init__.py +6 -0
  59. ai_pipeline_core/observability/_tracking/_client.py +178 -0
  60. ai_pipeline_core/observability/_tracking/_internal.py +28 -0
  61. ai_pipeline_core/observability/_tracking/_models.py +138 -0
  62. ai_pipeline_core/observability/_tracking/_processor.py +158 -0
  63. ai_pipeline_core/observability/_tracking/_service.py +311 -0
  64. ai_pipeline_core/observability/_tracking/_writer.py +229 -0
  65. ai_pipeline_core/{tracing.py → observability/tracing.py} +139 -283
  66. ai_pipeline_core/pipeline/__init__.py +10 -0
  67. ai_pipeline_core/pipeline/decorators.py +915 -0
  68. ai_pipeline_core/pipeline/options.py +16 -0
  69. ai_pipeline_core/prompt_manager.py +16 -102
  70. ai_pipeline_core/settings.py +26 -31
  71. ai_pipeline_core/testing.py +9 -0
  72. ai_pipeline_core-0.4.1.dist-info/METADATA +807 -0
  73. ai_pipeline_core-0.4.1.dist-info/RECORD +76 -0
  74. {ai_pipeline_core-0.2.6.dist-info → ai_pipeline_core-0.4.1.dist-info}/WHEEL +1 -1
  75. ai_pipeline_core/documents/document_list.py +0 -420
  76. ai_pipeline_core/documents/flow_document.py +0 -112
  77. ai_pipeline_core/documents/task_document.py +0 -117
  78. ai_pipeline_core/documents/temporary_document.py +0 -74
  79. ai_pipeline_core/flow/__init__.py +0 -9
  80. ai_pipeline_core/flow/config.py +0 -483
  81. ai_pipeline_core/flow/options.py +0 -75
  82. ai_pipeline_core/pipeline.py +0 -718
  83. ai_pipeline_core/prefect.py +0 -63
  84. ai_pipeline_core/simple_runner/__init__.py +0 -14
  85. ai_pipeline_core/simple_runner/cli.py +0 -254
  86. ai_pipeline_core/simple_runner/simple_runner.py +0 -247
  87. ai_pipeline_core/storage/__init__.py +0 -8
  88. ai_pipeline_core/storage/storage.py +0 -628
  89. ai_pipeline_core/utils/__init__.py +0 -8
  90. ai_pipeline_core/utils/deploy.py +0 -373
  91. ai_pipeline_core/utils/remote_deployment.py +0 -269
  92. ai_pipeline_core-0.2.6.dist-info/METADATA +0 -500
  93. ai_pipeline_core-0.2.6.dist-info/RECORD +0 -41
  94. {ai_pipeline_core-0.2.6.dist-info → ai_pipeline_core-0.4.1.dist-info}/licenses/LICENSE +0 -0
ai_pipeline_core/llm/client.py
@@ -1,99 +1,216 @@
  """LLM client implementation for AI model interactions.

- @public
-
  This module provides the core functionality for interacting with language models
  through a unified interface. It handles retries, caching, structured outputs,
  and integration with various LLM providers via LiteLLM.

- Key functions:
- - generate(): Text generation with optional context caching
- - generate_structured(): Type-safe structured output generation
+ Automatic image auto-tiling splits oversized images in attachments to meet
+ model-specific constraints (e.g., 3000x3000 for Gemini, 1000x1000 default).
+ Context caching separates static content from dynamic messages for 50-90% token savings.
+ Optional purpose and expected_cost parameters enable tracing and cost-tracking.
  """

  import asyncio
+ import contextlib
  import time
+ from io import BytesIO
  from typing import Any, TypeVar

  from lmnr import Laminar
  from openai import AsyncOpenAI
  from openai.lib.streaming.chat import ChunkEvent, ContentDeltaEvent, ContentDoneEvent
  from openai.types.chat import (
+     ChatCompletion,
      ChatCompletionMessageParam,
  )
- from prefect.logging import get_logger
+ from PIL import Image
  from pydantic import BaseModel, ValidationError

+ from ai_pipeline_core.documents import Document
+ from ai_pipeline_core.documents.attachment import Attachment
  from ai_pipeline_core.exceptions import LLMError
+ from ai_pipeline_core.images import ImageProcessingConfig, process_image, process_image_to_documents
+ from ai_pipeline_core.logging import get_pipeline_logger
+ from ai_pipeline_core.observability._document_tracking import track_llm_documents
  from ai_pipeline_core.settings import settings

- from .ai_messages import AIMessages
+ from .ai_messages import AIMessages, AIMessageType
  from .model_options import ModelOptions
  from .model_response import ModelResponse, StructuredModelResponse
  from .model_types import ModelName

- logger = get_logger()
+ logger = get_pipeline_logger(__name__)
+
+ # Image splitting configs for automatic large-image handling at the LLM boundary.
+ # Gemini supports up to 3000x3000; all other models use a conservative 1000x1000 default.
+ _GEMINI_IMAGE_CONFIG = ImageProcessingConfig(max_dimension=3000, max_pixels=9_000_000, jpeg_quality=75)
+ _DEFAULT_IMAGE_CONFIG = ImageProcessingConfig(max_dimension=1000, max_pixels=1_000_000, jpeg_quality=75)
+
+
+ def _get_image_config(model: str) -> ImageProcessingConfig:
+     """Return the image splitting config for a model."""
+     if "gemini" in model.lower():
+         return _GEMINI_IMAGE_CONFIG
+     return _DEFAULT_IMAGE_CONFIG
+
+
+ def _prepare_images_for_model(messages: AIMessages, model: str) -> AIMessages:  # noqa: C901, PLR0912, PLR0915, PLR0914
+     """Split image documents and image attachments that exceed model constraints.
+
+     Returns a new AIMessages with oversized images replaced by tiles.
+     Returns the original instance unchanged if no splitting is needed.
+     """
+     if not any(isinstance(m, Document) and (m.is_image or any(att.is_image for att in m.attachments)) for m in messages):
+         return messages
+
+     config = _get_image_config(model)
+     result: list[AIMessageType] = []
+     changed = False
+
+     for msg in messages:
+         if not isinstance(msg, Document):
+             result.append(msg)
+             continue
+
+         # 1. Handle top-level image Documents (existing logic)
+         if msg.is_image:
+             try:
+                 with Image.open(BytesIO(msg.content)) as img:
+                     w, h = img.size
+             except Exception:
+                 result.append(msg)
+                 continue
+
+             within_limits = w <= config.max_dimension and h <= config.max_dimension and w * h <= config.max_pixels
+             if within_limits:
+                 pass  # Falls through to attachment handling
+             else:
+                 name_prefix = msg.name.rsplit(".", 1)[0] if "." in msg.name else msg.name
+                 tiles = process_image_to_documents(msg, config=config, name_prefix=name_prefix)
+                 if msg.attachments and tiles:
+                     tiles[0] = tiles[0].model_copy(update={"attachments": msg.attachments})
+                 result.extend(tiles)
+                 changed = True
+                 continue
+
+         # 2. Handle image attachments
+         if msg.attachments:
+             new_attachments: list[Attachment] = []
+             attachments_changed = False
+
+             for att in msg.attachments:
+                 if not att.is_image:
+                     new_attachments.append(att)
+                     continue
+
+                 try:
+                     with Image.open(BytesIO(att.content)) as img:
+                         w, h = img.size
+                 except Exception:
+                     new_attachments.append(att)
+                     continue
+
+                 att_within_limits = w <= config.max_dimension and h <= config.max_dimension and w * h <= config.max_pixels
+                 if att_within_limits:
+                     new_attachments.append(att)
+                     continue
+
+                 # Tile the oversized attachment image
+                 processed = process_image(att.content, config=config)
+                 att_prefix = att.name.rsplit(".", 1)[0] if "." in att.name else att.name
+
+                 for part in processed.parts:
+                     if part.total == 1:
+                         tile_name = f"{att_prefix}.jpg"
+                         tile_desc = att.description
+                     else:
+                         tile_name = f"{att_prefix}_{part.index + 1:02d}_of_{part.total:02d}.jpg"
+                         tile_desc = f"{att.description} ({part.label})" if att.description else part.label
+
+                     new_attachments.append(
+                         Attachment(
+                             name=tile_name,
+                             content=part.data,
+                             description=tile_desc,
+                         )
+                     )
+                     attachments_changed = True
+
+             if attachments_changed:
+                 msg = msg.model_copy(update={"attachments": tuple(new_attachments)})  # noqa: PLW2901
+                 changed = True
+
+         result.append(msg)
+
+     if not changed:
+         return messages
+     return AIMessages(result)


  def _process_messages(
      context: AIMessages,
      messages: AIMessages,
      system_prompt: str | None = None,
-     cache_ttl: str | None = "5m",
+     cache_ttl: str | None = "300s",
  ) -> list[ChatCompletionMessageParam]:
      """Process and format messages for LLM API consumption.

      Internal function that combines context and messages into a single
      list of API-compatible messages. Applies caching directives to
-     context messages for efficiency.
+     system prompt and context messages for efficiency.

      Args:
          context: Messages to be cached (typically expensive/static content).
          messages: Regular messages without caching (dynamic queries).
          system_prompt: Optional system instructions for the model.
-         cache_ttl: Cache TTL for context messages (e.g. "120s", "5m", "1h").
+         cache_ttl: Cache TTL for system and context messages (e.g. "120s", "300s", "1h").
              Set to None or empty string to disable caching.

      Returns:
          List of formatted messages ready for API calls, with:
-         - System prompt at the beginning (if provided)
-         - Context messages with cache_control on the last one (if cache_ttl)
+         - System prompt at the beginning with cache_control (if provided and cache_ttl set)
+         - Context messages with cache_control on all messages (if cache_ttl set)
          - Regular messages without caching

      System Prompt Location:
          The system prompt parameter is always injected as the FIRST message
-         with role="system". It is NOT cached with context, allowing dynamic
-         system prompts without breaking cache efficiency.
+         with role="system". It is cached along with context when cache_ttl is set.

      Cache behavior:
-         The last context message gets ephemeral caching with specified TTL
+         All system and context messages get ephemeral caching with specified TTL
          to reduce token usage on repeated calls with same context.
          If cache_ttl is None or empty string (falsy), no caching is applied.
-         Only the last context message receives cache_control to maximize efficiency.
+         All system and context messages receive cache_control to maximize cache efficiency.

-     Note:
-         This is an internal function used by _generate_with_retry().
-         The context/messages split enables efficient token usage.
+     This is an internal function used by _generate_with_retry().
+     The context/messages split enables efficient token usage.
      """
      processed_messages: list[ChatCompletionMessageParam] = []

      # Add system prompt if provided
      if system_prompt:
-         processed_messages.append({"role": "system", "content": system_prompt})
+         processed_messages.append({
+             "role": "system",
+             "content": [{"type": "text", "text": system_prompt}],
+         })

      # Process context messages with caching if provided
      if context:
          # Use AIMessages.to_prompt() for context
          context_messages = context.to_prompt()
+         processed_messages.extend(context_messages)

-         # Apply caching to last context message if cache_ttl is set
-         if cache_ttl:
-             context_messages[-1]["cache_control"] = {  # type: ignore
+         if cache_ttl:
+             for message in processed_messages:
+                 message["cache_control"] = {  # type: ignore
                      "type": "ephemeral",
                      "ttl": cache_ttl,
                  }
-
-         processed_messages.extend(context_messages)
+                 if isinstance(message["content"], list):  # type: ignore
+                     message["content"][-1]["cache_control"] = {  # type: ignore
+                         "type": "ephemeral",
+                         "ttl": cache_ttl,
+                     }

      # Process regular messages without caching
      if messages:
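Editor's illustration (not part of the diff): with the new caching scheme above, every system and context entry carries a cache_control marker, including the last content part of multipart entries, while dynamic messages stay uncached. A minimal standalone sketch of the resulting shape, with a single hypothetical user entry standing in for whatever context.to_prompt() returns:

    # Illustrative shape only; real entries come from AIMessages.to_prompt().
    cache_ttl = "300s"
    processed_messages = [
        {
            "role": "system",
            "content": [{"type": "text", "text": "You are a careful analyst."}],
        },
        {
            "role": "user",
            "content": [{"type": "text", "text": "<large cached document>"}],
        },
    ]
    if cache_ttl:
        for message in processed_messages:
            # Message-level marker plus one on the last content part.
            message["cache_control"] = {"type": "ephemeral", "ttl": cache_ttl}
            if isinstance(message["content"], list):
                message["content"][-1]["cache_control"] = {"type": "ephemeral", "ttl": cache_ttl}
    # Dynamic messages are appended afterwards without any cache_control.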
@@ -103,6 +220,35 @@ def _process_messages(
      return processed_messages


+ def _remove_cache_control(
+     messages: list[ChatCompletionMessageParam],
+ ) -> list[ChatCompletionMessageParam]:
+     """Remove cache control directives from messages.
+
+     Internal utility that strips cache_control fields from both message-level
+     and content-level entries. Used in retry logic when cache-related errors
+     occur during LLM API calls.
+
+     Args:
+         messages: List of messages that may contain cache_control directives.
+
+     Returns:
+         The same message list (modified in-place) with all cache_control
+         fields removed from both messages and their content items.
+
+     Modifies the input list in-place but also returns it for convenience.
+     Handles both list-based content (multipart) and string content (simple messages).
+     """
+     for message in messages:
+         if (content := message.get("content")) and isinstance(content, list):
+             for item in content:
+                 if "cache_control" in item:
+                     del item["cache_control"]
+         if "cache_control" in message:
+             del message["cache_control"]
+     return messages
+
+
  def _model_name_to_openrouter_model(model: ModelName) -> str:
      """Convert a model name to an OpenRouter model name.
@@ -112,14 +258,10 @@ def _model_name_to_openrouter_model(model: ModelName) -> str:
      Returns:
          OpenRouter model name.
      """
-     if model == "gpt-4o-search":
-         return "openai/gpt-4o-search-preview"
-     if model == "gemini-2.5-flash-search":
-         return "google/gemini-2.5-flash:online"
-     if model == "grok-4-fast-search":
-         return "x-ai/grok-4-fast:online"
+     if model == "gemini-3-flash-search":
+         return "google/gemini-3-flash:online"
      if model == "sonar-pro-search":
-         return "perplexity/sonar-reasoning-pro"
+         return "perplexity/sonar-pro-search"
      if model.startswith("gemini"):
          return f"google/{model}"
      elif model.startswith("gpt"):
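Editor's illustration (not part of the diff): the mappings below follow only from the branches visible in this hunk; the gpt branch is cut off here, so it is not exercised. It assumes the private helper is importable from ai_pipeline_core.llm.client.

    from ai_pipeline_core.llm.client import _model_name_to_openrouter_model

    # Search variants map to explicit OpenRouter names; other gemini names get the google/ prefix.
    assert _model_name_to_openrouter_model("gemini-3-flash-search") == "google/gemini-3-flash:online"
    assert _model_name_to_openrouter_model("sonar-pro-search") == "perplexity/sonar-pro-search"
    assert _model_name_to_openrouter_model("gemini-3-pro") == "google/gemini-3-pro"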
@@ -139,30 +281,76 @@ def _model_name_to_openrouter_model(model: ModelName) -> str:
      return model


- async def _generate(
-     model: str, messages: list[ChatCompletionMessageParam], completion_kwargs: dict[str, Any]
+ async def _generate_streaming(client: AsyncOpenAI, model: str, messages: list[ChatCompletionMessageParam], completion_kwargs: dict[str, Any]) -> ModelResponse:
+     """Execute a streaming LLM API call."""
+     start_time = time.time()
+     first_token_time = None
+     usage = None
+     async with client.chat.completions.stream(
+         model=model,
+         messages=messages,
+         **completion_kwargs,
+     ) as s:
+         async for event in s:
+             if isinstance(event, ContentDeltaEvent):
+                 if not first_token_time:
+                     first_token_time = time.time()
+             elif isinstance(event, ContentDoneEvent):
+                 pass
+             elif isinstance(event, ChunkEvent) and event.chunk.usage:
+                 usage = event.chunk.usage
+                 if not first_token_time:
+                     first_token_time = time.time()
+         raw_response = await s.get_final_completion()
+
+     metadata = {
+         "time_taken": round(time.time() - start_time, 2),
+         "first_token_time": round(first_token_time - start_time, 2),
+     }
+     return ModelResponse(raw_response, model_options=completion_kwargs, metadata=metadata, usage=usage)
+
+
+ async def _generate_non_streaming(
+     client: AsyncOpenAI, model: str, messages: list[ChatCompletionMessageParam], completion_kwargs: dict[str, Any]
  ) -> ModelResponse:
-     """Execute a single LLM API call.
+     """Execute a non-streaming LLM API call.
+
+     Avoids OpenAI SDK delta accumulation — some providers (e.g. Grok) send
+     streaming annotation deltas that crash the SDK's accumulate_delta().
+     """
+     start_time = time.time()
+     kwargs = {k: v for k, v in completion_kwargs.items() if k != "stream_options"}
+     response_format = kwargs.get("response_format")
+     if isinstance(response_format, type) and issubclass(response_format, BaseModel):
+         raw_response: ChatCompletion = await client.chat.completions.parse(
+             model=model,
+             messages=messages,
+             **kwargs,
+         )
+     else:
+         raw_response = await client.chat.completions.create(
+             model=model,
+             messages=messages,
+             stream=False,
+             **kwargs,
+         )
+     elapsed = round(time.time() - start_time, 2)
+     metadata = {"time_taken": elapsed, "first_token_time": elapsed}
+     return ModelResponse(raw_response, model_options=completion_kwargs, metadata=metadata)
+

-     Internal function that makes the actual API request to the LLM provider.
-     Handles both regular and structured output generation.
+ async def _generate(model: str, messages: list[ChatCompletionMessageParam], completion_kwargs: dict[str, Any], *, stream: bool = True) -> ModelResponse:
+     """Execute a single LLM API call.

      Args:
-         model: Model identifier (e.g., "gpt-5", "gemini-2.5-pro").
+         model: Model identifier (e.g., "gpt-5.1", "gemini-3-pro").
          messages: Formatted messages for the API.
          completion_kwargs: Additional parameters for the completion API.
+         stream: Whether to use streaming mode (default True). Non-streaming
+             avoids OpenAI SDK delta accumulation issues with some providers.

      Returns:
          ModelResponse with generated content and metadata.
-
-     API selection:
-     - Uses client.chat.completions.parse() for structured output
-     - Uses client.chat.completions.create() for regular text
-
-     Note:
-     - Uses AsyncOpenAI client configured via settings
-     - Captures response headers for cost tracking
-     - Response includes model options for debugging
      """
      if "openrouter" in settings.openai_base_url.lower():
          model = _model_name_to_openrouter_model(model)
@@ -171,45 +359,18 @@ async def _generate(
          api_key=settings.openai_api_key,
          base_url=settings.openai_base_url,
      ) as client:
-         start_time = time.time()
-         first_token_time = None
-         usage = None
-         async with client.chat.completions.stream(
-             model=model,
-             messages=messages,
-             **completion_kwargs,
-         ) as stream:
-             async for event in stream:
-                 if isinstance(event, ContentDeltaEvent):
-                     if not first_token_time:
-                         first_token_time = time.time()
-                 elif isinstance(event, ContentDoneEvent):
-                     pass
-                 elif isinstance(event, ChunkEvent):
-                     if event.chunk.usage:  # used to fix a bug with missing usage data
-                         usage = event.chunk.usage
-                     if not first_token_time:
-                         first_token_time = time.time()
-             raw_response = await stream.get_final_completion()
-
-         metadata = {
-             "time_taken": round(time.time() - start_time, 2),
-             "first_token_time": round(first_token_time - start_time, 2),
-         }
-         response = ModelResponse(
-             raw_response,
-             model_options=completion_kwargs,
-             metadata=metadata,
-             usage=usage,
-         )
-         return response
+         if stream:
+             return await _generate_streaming(client, model, messages, completion_kwargs)
+         return await _generate_non_streaming(client, model, messages, completion_kwargs)


- async def _generate_with_retry(
+ async def _generate_with_retry(  # noqa: PLR0917
      model: str,
      context: AIMessages,
      messages: AIMessages,
      options: ModelOptions,
+     purpose: str | None = None,
+     expected_cost: float | None = None,
  ) -> ModelResponse:
      """Core LLM generation with automatic retry logic.

@@ -221,6 +382,8 @@ async def _generate_with_retry(
          context: Cached context messages (can be empty).
          messages: Dynamic query messages.
          options: Configuration including retries, timeout, temperature.
+         purpose: Optional semantic label for the LLM span name.
+         expected_cost: Optional expected cost for cost-tracking attributes.

      Returns:
          ModelResponse with generated content.
@@ -229,17 +392,22 @@ async def _generate_with_retry(
          ValueError: If model is not provided or both context and messages are empty.
          LLMError: If all retry attempts are exhausted.

-     Note:
-         Empty responses trigger a retry as they indicate API issues.
+     Empty responses trigger a retry as they indicate API issues.
      """
      if not model:
          raise ValueError("Model must be provided")
      if not context and not messages:
          raise ValueError("Either context or messages must be provided")

-     processed_messages = _process_messages(
-         context, messages, options.system_prompt, options.cache_ttl
-     )
+     # Auto-split large images based on model-specific constraints
+     context = _prepare_images_for_model(context, model)
+     messages = _prepare_images_for_model(messages, model)
+
+     if "gemini" in model.lower() and context.approximate_tokens_count < 10000:
+         # Bug fix for minimum explicit context size for Gemini models
+         options.cache_ttl = None
+
+     processed_messages = _process_messages(context, messages, options.system_prompt, options.cache_ttl)
      completion_kwargs: dict[str, Any] = {
          **options.to_openai_completion_kwargs(),
      }
@@ -249,20 +417,23 @@ async def _generate_with_retry(

      for attempt in range(options.retries):
          try:
-             with Laminar.start_as_current_span(
-                 model, span_type="LLM", input=processed_messages
-             ) as span:
-                 response = await _generate(model, processed_messages, completion_kwargs)
-                 span.set_attributes(response.get_laminar_metadata())
-                 Laminar.set_span_output([
-                     r for r in (response.reasoning_content, response.content) if r
-                 ])
+             with Laminar.start_as_current_span(purpose or model, span_type="LLM", input=processed_messages) as span:
+                 response = await _generate(model, processed_messages, completion_kwargs, stream=options.stream)
+                 laminar_metadata = response.get_laminar_metadata()
+                 if purpose:
+                     laminar_metadata["purpose"] = purpose
+                 if expected_cost is not None:
+                     laminar_metadata["expected_cost"] = expected_cost
+                 span.set_attributes(laminar_metadata)  # pyright: ignore[reportArgumentType]
+                 Laminar.set_span_output([r for r in (response.reasoning_content, response.content) if r])
                  response.validate_output()
                  return response
-         except (asyncio.TimeoutError, ValueError, ValidationError, Exception) as e:
+         except (TimeoutError, ValueError, ValidationError, Exception) as e:
              if not isinstance(e, asyncio.TimeoutError):
                  # disable cache if it's not a timeout because it may cause an error
                  completion_kwargs["extra_body"]["cache"] = {"no-cache": True}
+                 # sometimes there are issues with cache so cache is removed in case of failure
+                 processed_messages = _remove_cache_control(processed_messages)

              logger.warning(
                  f"LLM generation failed (attempt {attempt + 1}/{options.retries}): {e}",
@@ -281,11 +452,11 @@ async def generate(
      context: AIMessages | None = None,
      messages: AIMessages | str,
      options: ModelOptions | None = None,
+     purpose: str | None = None,
+     expected_cost: float | None = None,
  ) -> ModelResponse:
      """Generate text response from a language model.

-     @public
-
      Main entry point for LLM text generation with smart context caching.
      The context/messages split enables efficient token usage by caching
      expensive static content separately from dynamic queries.
@@ -297,18 +468,21 @@ async def generate(
      4. CONFIGURATION: Configure model behavior via LiteLLM proxy or environment variables

      Args:
-         model: Model to use (e.g., "gpt-5", "gemini-2.5-pro", "grok-4").
+         model: Model to use (e.g., "gpt-5.1", "gemini-3-pro", "grok-4.1-fast").
              Accepts predefined models or any string for custom models.
          context: Static context to cache (documents, examples, instructions).
              Defaults to None (empty context). Cached for 5 minutes by default.
          messages: Dynamic messages/queries. AIMessages or str ONLY.
-             Do not pass Document or DocumentList directly.
+             Do not pass Document or list[Document] directly.
              If string, converted to AIMessages internally.
-         options: DEPRECATED - DO NOT USE. Reserved for internal framework usage only.
-             Framework defaults are production-optimized (3 retries, 10s delay, 300s timeout).
-             Configure model behavior centrally via LiteLLM proxy settings or environment
-             variables, not per API call. Provider-specific settings should be configured
-             at the proxy level.
+         options: Internal framework parameter. Framework defaults are production-optimized
+             (3 retries, 20s delay, 600s timeout). Configure model behavior centrally via
+             LiteLLM proxy settings or environment variables, not per API call.
+             Provider-specific settings should be configured at the proxy level.
+         purpose: Optional semantic label used as the tracing span name
+             instead of model name. Stored as a span attribute.
+         expected_cost: Optional expected cost stored as a span attribute
+             for cost-tracking and comparison with actual cost.

      Returns:
          ModelResponse containing:
@@ -325,17 +499,17 @@ async def generate(
      Wrap Documents in AIMessages - DO NOT pass directly or convert to .text:

          # CORRECT - wrap Document in AIMessages
-         response = await llm.generate("gpt-5", messages=AIMessages([my_document]))
+         response = await llm.generate("gpt-5.1", messages=AIMessages([my_document]))

          # WRONG - don't pass Document directly
-         response = await llm.generate("gpt-5", messages=my_document)  # NO!
+         response = await llm.generate("gpt-5.1", messages=my_document)  # NO!

          # WRONG - don't convert to string yourself
-         response = await llm.generate("gpt-5", messages=my_document.text)  # NO!
+         response = await llm.generate("gpt-5.1", messages=my_document.text)  # NO!

      VISION/PDF MODEL COMPATIBILITY:
      When using Documents containing images or PDFs, ensure your model supports these formats:
-     - Images require vision-capable models (gpt-4o, gemini-pro-vision, claude-3-sonnet)
+     - Images require vision-capable models (gpt-5.1, gemini-3-flash, gemini-3-pro)
      - PDFs require document processing support (varies by provider)
      - Non-compatible models will raise ValueError or fall back to text extraction
      - Check model capabilities before including visual/PDF content
@@ -351,50 +525,12 @@ async def generate(
      - Changes with each API call
      - Never cached, always processed fresh

-     Example:
-         >>> # CORRECT - No options parameter (this is the recommended pattern)
-         >>> response = await llm.generate("gpt-5", messages="Explain quantum computing")
-         >>> print(response.content)  # In production, use get_pipeline_logger instead of print
-
-         >>> # With context caching for efficiency
-         >>> # Context and messages are both AIMessages or str; wrap any Documents
-         >>> static_doc = AIMessages([large_document, "few-shot example: ..."])
-         >>>
-         >>> # First call: caches context
-         >>> r1 = await llm.generate("gpt-5", context=static_doc, messages="Summarize")
-         >>>
-         >>> # Second call: reuses cache, saves tokens!
-         >>> r2 = await llm.generate("gpt-5", context=static_doc, messages="Key points?")
-
-         >>> # Multi-turn conversation
-         >>> messages = AIMessages([
-         ...     "What is Python?",
-         ...     previous_response,
-         ...     "Can you give an example?"
-         ... ])
-         >>> response = await llm.generate("gpt-5", messages=messages)
-
-     Configuration via LiteLLM Proxy:
-         >>> # Configure temperature in litellm_config.yaml:
-         >>> # model_list:
-         >>> #   - model_name: gpt-5
-         >>> #     litellm_params:
-         >>> #       model: openai/gpt-4o
-         >>> #       temperature: 0.3
-         >>> #       max_tokens: 1000
-         >>>
-         >>> # Configure retry logic in proxy:
-         >>> # general_settings:
-         >>> #   master_key: sk-1234
-         >>> #   max_retries: 5
-         >>> #   retry_delay: 15
-
      Performance:
      - Context caching saves ~50-90% tokens on repeated calls
      - First call: full token cost
      - Subsequent calls (within cache TTL): only messages tokens
-     - Default cache TTL is 5m (production-optimized)
-     - Default retry logic: 3 attempts with 10s delay (production-optimized)
+     - Default cache TTL is 300s/5 minutes (production-optimized)
+     - Default retry logic: 3 attempts with 20s delay (production-optimized)

      Caching:
      When enabled in your LiteLLM proxy and supported by the upstream provider,
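Editor's illustration (not part of the diff): the worked examples were dropped from this docstring in 0.4.1, so for readers of the diff here is a minimal usage sketch of the new call shape, based on the signature and the retained guidance above. It assumes generate and AIMessages are re-exported from ai_pipeline_core.llm, as the llm.generate(...) snippets in the docstring suggest; the document strings are placeholders.

    import asyncio

    from ai_pipeline_core.llm import AIMessages, generate  # assumed re-exports


    async def main() -> None:
        # Static, cacheable material goes in context; the dynamic question goes in messages.
        context = AIMessages(["<large reference document>", "few-shot example: ..."])

        # First call pays full token cost and caches the context (default TTL 300s).
        summary = await generate(
            "gpt-5.1",
            context=context,
            messages="Summarize the document",
            purpose="doc-summary",  # optional tracing span name
            expected_cost=0.05,     # optional cost-tracking attribute
        )

        # A second call within the TTL reuses the cached context and only pays for messages.
        key_points = await generate("gpt-5.1", context=context, messages="List the key points")
        print(summary.content, key_points.content)


    asyncio.run(main())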
@@ -412,10 +548,8 @@ async def generate(

      This centralizes configuration and ensures consistency across all API calls.

-     Note:
-         - All models are accessed via LiteLLM proxy
-         - Automatic retry with configurable delay between attempts
-         - Cost tracking via response headers
+     All models are accessed via LiteLLM proxy with automatic retry and
+     cost tracking via response headers.
      """
      if isinstance(messages, str):
          messages = AIMessages([messages])
@@ -424,9 +558,22 @@ async def generate(
          context = AIMessages()
      if options is None:
          options = ModelOptions()
+     else:
+         # Create a copy to avoid mutating the caller's options object
+         options = options.model_copy()
+
+     with contextlib.suppress(Exception):
+         track_llm_documents(context, messages)

      try:
-         return await _generate_with_retry(model, context, messages, options)
+         return await _generate_with_retry(
+             model,
+             context,
+             messages,
+             options,
+             purpose=purpose,
+             expected_cost=expected_cost,
+         )
      except (ValueError, LLMError):
          raise  # Explicitly re-raise to satisfy DOC502
@@ -435,18 +582,18 @@ T = TypeVar("T", bound=BaseModel)
  """Type variable for Pydantic model types in structured generation."""


- async def generate_structured(
+ async def generate_structured(  # noqa: UP047
      model: ModelName,
      response_format: type[T],
      *,
      context: AIMessages | None = None,
      messages: AIMessages | str,
      options: ModelOptions | None = None,
+     purpose: str | None = None,
+     expected_cost: float | None = None,
  ) -> StructuredModelResponse[T]:
      """Generate structured output conforming to a Pydantic model.

-     @public
-
      Type-safe generation that returns validated Pydantic model instances.
      Uses OpenAI's structured output feature for guaranteed schema compliance.

@@ -482,7 +629,7 @@ async def generate_structured(

          >>> # Step 1: Research/analysis with generate() - no options parameter
          >>> research = await llm.generate(
-         ...     "gpt-5",
+         ...     "gpt-5.1",
          ...     messages="Research and analyze this complex topic..."
          ... )
          >>>
@@ -501,21 +648,21 @@ async def generate_structured(
          context: Static context to cache (documents, schemas, examples).
              Defaults to None (empty AIMessages).
          messages: Dynamic prompts/queries. AIMessages or str ONLY.
-             Do not pass Document or DocumentList directly.
+             Do not pass Document or list[Document] directly.
          options: Optional ModelOptions for configuring temperature, retries, etc.
              If provided, it will NOT be mutated (a copy is created internally).
              The response_format field is set automatically from the response_format parameter.
              In most cases, leave as None to use framework defaults.
              Configure model behavior centrally via LiteLLM proxy settings when possible.
+         purpose: Optional semantic label used as the tracing span name
+             instead of model name. Stored as a span attribute.
+         expected_cost: Optional expected cost stored as a span attribute
+             for cost-tracking and comparison with actual cost.

-     Note:
-         Vision/PDF model compatibility considerations:
-         - Images require vision-capable models that also support structured output
-         - PDFs require models with both document processing AND structured output support
-         - Many models support either vision OR structured output, but not both
-         - Test your specific model+document combination before production use
-         - Consider two-step approach: generate() for analysis, then generate_structured()
-           for formatting
+     Vision/PDF model compatibility: Images require vision-capable models that also support
+     structured output. PDFs require models with both document processing AND structured output
+     support. Consider two-step approach: generate() for analysis, then generate_structured()
+     for formatting.

      Returns:
          StructuredModelResponse[T] containing:
@@ -529,26 +676,6 @@ async def generate_structured(
          LLMError: If generation fails after retries.
          ValidationError: If response cannot be parsed into response_format.

-     Example:
-         >>> from pydantic import BaseModel, Field
-         >>>
-         >>> class Analysis(BaseModel):
-         ...     summary: str = Field(description="Brief summary")
-         ...     sentiment: float = Field(ge=-1, le=1)
-         ...     key_points: list[str] = Field(max_length=5)
-         >>>
-         >>> # CORRECT - No options parameter
-         >>> response = await llm.generate_structured(
-         ...     "gpt-5",
-         ...     response_format=Analysis,
-         ...     messages="Analyze this product review: ..."
-         ... )
-         >>>
-         >>> analysis = response.parsed  # Type: Analysis
-         >>> print(f"Sentiment: {analysis.sentiment}")
-         >>> for point in analysis.key_points:
-         ...     print(f"- {point}")
-
      Supported models:
          Structured output support varies by provider and model. Generally includes:
          - OpenAI: GPT-4 and newer models
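Editor's illustration (not part of the diff): the inline doctest was removed here as well, so below is a short sketch of structured generation against 0.4.1, adapted from the deleted example and updated to the new signature. It assumes the same ai_pipeline_core.llm re-export as in the earlier sketch; the model name and purpose label are illustrative.

    import asyncio

    from pydantic import BaseModel, Field

    from ai_pipeline_core.llm import generate_structured  # assumed re-export


    class Analysis(BaseModel):
        summary: str = Field(description="Brief summary")
        sentiment: float = Field(ge=-1, le=1)
        key_points: list[str] = Field(max_length=5)


    async def main() -> None:
        response = await generate_structured(
            "gpt-5.1",
            response_format=Analysis,
            messages="Analyze this product review: ...",
            purpose="review-analysis",  # optional tracing label
        )
        analysis = response.parsed  # typed as Analysis, validated by Pydantic
        print(analysis.sentiment, analysis.key_points)


    asyncio.run(main())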
@@ -563,12 +690,9 @@ async def generate_structured(
      - Complex schemas increase generation time
      - Validation overhead is minimal (Pydantic is fast)

-     Note:
-         - Pydantic model is converted to JSON Schema for the API
-         - The model generates JSON matching the schema
-         - Validation happens automatically via Pydantic
-         - Use Field() descriptions to guide generation
-         - Search models (models with '-search' suffix) do not support structured output
+     Pydantic model is converted to JSON Schema for the API. Validation happens
+     automatically via Pydantic. Search models (models with '-search' suffix) do
+     not support structured output.
      """
      if context is None:
          context = AIMessages()
@@ -585,9 +709,19 @@ async def generate_structured(

      assert isinstance(messages, AIMessages)

+     with contextlib.suppress(Exception):
+         track_llm_documents(context, messages)
+
      # Call the internal generate function with structured output enabled
      try:
-         response = await _generate_with_retry(model, context, messages, options)
+         response = await _generate_with_retry(
+             model,
+             context,
+             messages,
+             options,
+             purpose=purpose,
+             expected_cost=expected_cost,
+         )
      except (ValueError, LLMError):
          raise  # Explicitly re-raise to satisfy DOC502