ai-pipeline-core 0.1.10__py3-none-any.whl → 0.1.12__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ai_pipeline_core/__init__.py +84 -4
- ai_pipeline_core/documents/__init__.py +9 -0
- ai_pipeline_core/documents/document.py +1044 -152
- ai_pipeline_core/documents/document_list.py +147 -38
- ai_pipeline_core/documents/flow_document.py +112 -11
- ai_pipeline_core/documents/mime_type.py +173 -15
- ai_pipeline_core/documents/task_document.py +117 -12
- ai_pipeline_core/documents/temporary_document.py +84 -5
- ai_pipeline_core/documents/utils.py +41 -9
- ai_pipeline_core/exceptions.py +47 -11
- ai_pipeline_core/flow/__init__.py +2 -0
- ai_pipeline_core/flow/config.py +236 -27
- ai_pipeline_core/flow/options.py +50 -1
- ai_pipeline_core/llm/__init__.py +6 -0
- ai_pipeline_core/llm/ai_messages.py +125 -27
- ai_pipeline_core/llm/client.py +278 -26
- ai_pipeline_core/llm/model_options.py +130 -1
- ai_pipeline_core/llm/model_response.py +239 -35
- ai_pipeline_core/llm/model_types.py +67 -0
- ai_pipeline_core/logging/__init__.py +13 -0
- ai_pipeline_core/logging/logging_config.py +72 -20
- ai_pipeline_core/logging/logging_mixin.py +38 -32
- ai_pipeline_core/pipeline.py +363 -60
- ai_pipeline_core/prefect.py +48 -1
- ai_pipeline_core/prompt_manager.py +209 -24
- ai_pipeline_core/settings.py +108 -4
- ai_pipeline_core/simple_runner/__init__.py +5 -0
- ai_pipeline_core/simple_runner/cli.py +96 -11
- ai_pipeline_core/simple_runner/simple_runner.py +237 -4
- ai_pipeline_core/tracing.py +253 -30
- ai_pipeline_core-0.1.12.dist-info/METADATA +450 -0
- ai_pipeline_core-0.1.12.dist-info/RECORD +36 -0
- ai_pipeline_core-0.1.10.dist-info/METADATA +0 -538
- ai_pipeline_core-0.1.10.dist-info/RECORD +0 -36
- {ai_pipeline_core-0.1.10.dist-info → ai_pipeline_core-0.1.12.dist-info}/WHEEL +0 -0
- {ai_pipeline_core-0.1.10.dist-info → ai_pipeline_core-0.1.12.dist-info}/licenses/LICENSE +0 -0
ai_pipeline_core/llm/client.py
CHANGED
@@ -1,3 +1,16 @@
+"""LLM client implementation for AI model interactions.
+
+@public
+
+This module provides the core functionality for interacting with language models
+through a unified interface. It handles retries, caching, structured outputs,
+and integration with various LLM providers via LiteLLM.
+
+Key functions:
+    - generate(): Text generation with optional context caching
+    - generate_structured(): Type-safe structured output generation
+"""
+
 import asyncio
 from typing import Any, TypeVar
 
@@ -26,17 +39,36 @@ def _process_messages(
     messages: AIMessages,
     system_prompt: str | None = None,
 ) -> list[ChatCompletionMessageParam]:
-    """
+    """Process and format messages for LLM API consumption.
+
+    Internal function that combines context and messages into a single
+    list of API-compatible messages. Applies caching directives to
+    context messages for efficiency.
 
     Args:
-        context: Messages to be cached (
-        messages: Regular messages
-        system_prompt: Optional system
+        context: Messages to be cached (typically expensive/static content).
+        messages: Regular messages without caching (dynamic queries).
+        system_prompt: Optional system instructions for the model.
 
     Returns:
-        List of formatted messages for
+        List of formatted messages ready for API calls, with:
+        - System prompt at the beginning (if provided)
+        - Context messages with cache_control on the last one
+        - Regular messages without caching
+
+    System Prompt Location:
+        The system prompt from ModelOptions.system_prompt is always injected
+        as the FIRST message with role="system". It is NOT cached with context,
+        allowing dynamic system prompts without breaking cache efficiency.
+
+    Cache behavior:
+        The last context message gets ephemeral caching (120s TTL)
+        to reduce token usage on repeated calls with same context.
+
+    Note:
+        This is an internal function used by _generate_with_retry().
+        The context/messages split enables efficient token usage.
     """
-
     processed_messages: list[ChatCompletionMessageParam] = []
 
     # Add system prompt if provided
@@ -67,6 +99,28 @@ def _process_messages(
 async def _generate(
     model: str, messages: list[ChatCompletionMessageParam], completion_kwargs: dict[str, Any]
 ) -> ModelResponse:
+    """Execute a single LLM API call.
+
+    Internal function that makes the actual API request to the LLM provider.
+    Handles both regular and structured output generation.
+
+    Args:
+        model: Model identifier (e.g., "gpt-5", "gemini-2.5-pro").
+        messages: Formatted messages for the API.
+        completion_kwargs: Additional parameters for the completion API.
+
+    Returns:
+        ModelResponse with generated content and metadata.
+
+    API selection:
+        - Uses client.chat.completions.parse() for structured output
+        - Uses client.chat.completions.create() for regular text
+
+    Note:
+        - Uses AsyncOpenAI client configured via settings
+        - Captures response headers for cost tracking
+        - Response includes model options for debugging
+    """
     async with AsyncOpenAI(
         api_key=settings.openai_api_key,
         base_url=settings.openai_base_url,
@@ -93,7 +147,27 @@ async def _generate_with_retry(
     messages: AIMessages,
     options: ModelOptions,
 ) -> ModelResponse:
-    """Core generation
+    """Core LLM generation with automatic retry logic.
+
+    Internal function that orchestrates the complete generation process
+    including message processing, retries, caching, and tracing.
+
+    Args:
+        model: Model identifier string.
+        context: Cached context messages (can be empty).
+        messages: Dynamic query messages.
+        options: Configuration including retries, timeout, temperature.
+
+    Returns:
+        ModelResponse with generated content.
+
+    Raises:
+        ValueError: If model is not provided or both context and messages are empty.
+        LLMError: If all retry attempts are exhausted.
+
+    Note:
+        Empty responses trigger a retry as they indicate API issues.
+    """
     if not model:
         raise ValueError("Model must be provided")
     if not context and not messages:
@@ -143,28 +217,135 @@ async def _generate_with_retry(
 async def generate(
     model: ModelName | str,
     *,
-    context: AIMessages =
+    context: AIMessages | None = None,
     messages: AIMessages | str,
-    options: ModelOptions =
+    options: ModelOptions | None = None,
 ) -> ModelResponse:
-    """Generate response
+    """Generate text response from a language model.
+
+    @public
+
+    Main entry point for LLM text generation with smart context caching.
+    The context/messages split enables efficient token usage by caching
+    expensive static content separately from dynamic queries.
+
+    Best Practices:
+        1. OPTIONS: Omit in 90% of cases - defaults are optimized
+        2. MESSAGES: Use AIMessages or str - wrap Documents in AIMessages
+        3. CONTEXT vs MESSAGES: Use context for static/cacheable, messages for dynamic
 
     Args:
-        model:
-
-
-
+        model: Model to use (e.g., "gpt-5", "gemini-2.5-pro", "grok-4").
+            Can be ModelName literal or any string for custom models.
+        context: Static context to cache (documents, examples, instructions).
+            Defaults to None (empty context). Cached for 120 seconds.
+        messages: Dynamic messages/queries. AIMessages or str ONLY.
+            Do not pass Document or DocumentList directly.
+            If string, converted to AIMessages internally.
+        options: Model configuration (temperature, retries, timeout, etc.).
+            Defaults to None (uses ModelOptions() with standard settings).
 
     Returns:
-
+        ModelResponse containing:
+        - Generated text content
+        - Usage statistics
+        - Cost information (if available)
+        - Model metadata
+
+    Raises:
+        ValueError: If model is empty or messages are invalid.
+        LLMError: If generation fails after all retries.
+
+    Document Handling:
+        Wrap Documents in AIMessages - DO NOT pass directly or convert to .text:
+
+        # CORRECT - wrap Document in AIMessages
+        response = await llm.generate("gpt-5", messages=AIMessages([my_document]))
+
+        # WRONG - don't pass Document directly
+        response = await llm.generate("gpt-5", messages=my_document)  # NO!
+
+        # WRONG - don't convert to string yourself
+        response = await llm.generate("gpt-5", messages=my_document.text)  # NO!
+
+    Context vs Messages Strategy:
+        context: Static, reusable content (cached 120 seconds)
+        - Large documents, instructions, examples
+        - Same across multiple calls
+
+        messages: Dynamic, query-specific content
+        - User questions, current conversation turn
+        - Changes every call
+
+    Example:
+        >>> # Simple case - no options needed (90% of cases)
+        >>> response = await llm.generate("gpt-5", messages="Explain quantum computing")
+        >>> print(response.content)  # In production, use get_pipeline_logger instead of print
+
+        >>> # With context caching for efficiency
+        >>> # Context and messages are both AIMessages or str; wrap any Documents
+        >>> static_doc = AIMessages([large_document, "few-shot example: ..."])
+        >>>
+        >>> # First call: caches context
+        >>> r1 = await llm.generate("gpt-5", context=static_doc, messages="Summarize")
+        >>>
+        >>> # Second call: reuses cache, saves tokens!
+        >>> r2 = await llm.generate("gpt-5", context=static_doc, messages="Key points?")
+
+        >>> # AVOID unnecessary options (defaults are optimal)
+        >>> response = await llm.generate(
+        ...     "gpt-5",
+        ...     messages="Hello",
+        ...     options=ModelOptions(temperature=0.7)  # Default is probably fine!
+        ... )
+
+        >>> # Multi-turn conversation
+        >>> messages = AIMessages([
+        ...     "What is Python?",
+        ...     previous_response,
+        ...     "Can you give an example?"
+        ... ])
+        >>> response = await llm.generate("gpt-5", messages=messages)
+
+    Performance:
+        - Context caching saves ~50-90% tokens on repeated calls
+        - First call: full token cost
+        - Subsequent calls (within 120s): only messages tokens
+        - Default retry delay is 10s (configurable via ModelOptions.retry_delay_seconds)
+
+    Caching:
+        When enabled in your LiteLLM proxy and supported by the upstream provider,
+        context messages may be cached (typical TTL ~120s) to reduce token usage on
+        repeated calls. Savings depend on provider and payload; treat this as an
+        optimization, not a guarantee. Cache behavior varies by proxy configuration.
+
+    Note:
+        - Context argument is ignored by the tracer to avoid recording large data
+        - All models are accessed via LiteLLM proxy
+        - Automatic retry with configurable delay between attempts
+        - Cost tracking via response headers
+
+    See Also:
+        - generate_structured: For typed/structured output
+        - AIMessages: Message container with document support
+        - ModelOptions: Configuration options
     """
     if isinstance(messages, str):
         messages = AIMessages([messages])
 
-
+    if context is None:
+        context = AIMessages()
+    if options is None:
+        options = ModelOptions()
+
+    try:
+        return await _generate_with_retry(model, context, messages, options)
+    except (ValueError, LLMError):
+        raise  # Explicitly re-raise to satisfy DOC502
 
 
 T = TypeVar("T", bound=BaseModel)
+"""Type variable for Pydantic model types in structured generation."""
 
 
 @trace(ignore_inputs=["context"])
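The caching behavior documented in _process_messages() and generate() above implies a specific payload ordering. The sketch below illustrates that ordering only; the exact dict layout (content blocks, cache_control placement) is an assumption based on the common LiteLLM/Anthropic convention and is not shown in this diff.

    # Sketch only: ordering as described by the new docstrings.
    # cache_control placement is an assumed convention, not code from this package.
    processed_messages = [
        # system prompt is always injected first and is never cached
        {"role": "system", "content": "You are a helpful assistant"},
        # last context message carries the ephemeral cache directive (~120s TTL)
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": "<large static document>",
                    "cache_control": {"type": "ephemeral"},
                }
            ],
        },
        # dynamic query messages follow, uncached
        {"role": "user", "content": "Summarize the document"},
    ]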
@@ -172,29 +353,100 @@ async def generate_structured(
     model: ModelName | str,
     response_format: type[T],
     *,
-    context: AIMessages =
+    context: AIMessages | None = None,
     messages: AIMessages | str,
-    options: ModelOptions =
+    options: ModelOptions | None = None,
 ) -> StructuredModelResponse[T]:
-    """Generate structured
+    """Generate structured output conforming to a Pydantic model.
+
+    @public
+
+    Type-safe generation that returns validated Pydantic model instances.
+    Uses OpenAI's structured output feature for guaranteed schema compliance.
+
+    Best Practices (same as generate):
+        1. OPTIONS: Omit in 90% of cases - defaults are optimized
+        2. MESSAGES: Use AIMessages or str - wrap Documents in AIMessages
+        3. CONTEXT vs MESSAGES: Use context for static/cacheable, messages for dynamic
 
     Args:
-        model:
-        response_format:
-
-
-
+        model: Model to use (must support structured output).
+        response_format: Pydantic model class defining the output schema.
+            The model will generate JSON matching this schema.
+        context: Static context to cache (documents, schemas, examples).
+            Defaults to None (empty AIMessages).
+        messages: Dynamic prompts/queries. AIMessages or str ONLY.
+            Do not pass Document or DocumentList directly.
+        options: Model configuration. response_format is set automatically.
 
     Returns:
-
+        StructuredModelResponse[T] containing:
+        - parsed: Validated instance of response_format class
+        - All fields from regular ModelResponse (content, usage, etc.)
+
+    Raises:
+        TypeError: If response_format is not a Pydantic model class.
+        ValueError: If model doesn't support structured output or no parsed content returned.
+        LLMError: If generation fails after retries.
+        ValidationError: If response cannot be parsed into response_format.
+
+    Example:
+        >>> from pydantic import BaseModel, Field
+        >>>
+        >>> class Analysis(BaseModel):
+        ...     summary: str = Field(description="Brief summary")
+        ...     sentiment: float = Field(ge=-1, le=1)
+        ...     key_points: list[str] = Field(max_length=5)
+        >>>
+        >>> response = await llm.generate_structured(
+        ...     model="gpt-5",
+        ...     response_format=Analysis,
+        ...     messages="Analyze this product review: ..."
+        ... )
+        >>>
+        >>> analysis = response.parsed  # Type: Analysis
+        >>> print(f"Sentiment: {analysis.sentiment}")
+        >>> for point in analysis.key_points:
+        ...     print(f"- {point}")
+
+    Supported models:
+        Support varies by provider and model. Generally includes:
+        - OpenAI: GPT-4 and newer models
+        - Anthropic: Claude 3+ models
+        - Google: Gemini Pro models
+        Check provider documentation for specific model support.
+
+    Performance:
+        - Structured output may use more tokens than free text
+        - Complex schemas increase generation time
+        - Validation overhead is minimal (Pydantic is fast)
+
+    Note:
+        - Pydantic model is converted to JSON Schema for the API
+        - The model generates JSON matching the schema
+        - Validation happens automatically via Pydantic
+        - Use Field() descriptions to guide generation
+
+    See Also:
+        - generate: For unstructured text generation
+        - ModelOptions: Configuration including response_format
+        - StructuredModelResponse: Response wrapper with .parsed property
     """
+    if context is None:
+        context = AIMessages()
+    if options is None:
+        options = ModelOptions()
+
     options.response_format = response_format
 
     if isinstance(messages, str):
         messages = AIMessages([messages])
 
     # Call the internal generate function with structured output enabled
-
+    try:
+        response = await _generate_with_retry(model, context, messages, options)
+    except (ValueError, LLMError):
+        raise  # Explicitly re-raise to satisfy DOC502
 
     # Extract the parsed value from the response
     parsed_value: T | None = None
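Taken together, the two public entry points diffed above can be exercised as in the sketch below. This is drawn from the docstrings added in 0.1.12; the import paths are assumptions inferred from the changed-file list at the top of this diff, and "gpt-5" stands in for whatever model your LiteLLM proxy exposes.

    # Sketch under assumptions: import paths not verified against the package.
    import asyncio
    from pydantic import BaseModel, Field

    from ai_pipeline_core import llm                             # assumed public import path
    from ai_pipeline_core.llm import AIMessages, ModelOptions    # assumed re-exports


    class Verdict(BaseModel):
        summary: str = Field(description="One-sentence summary")
        confident: bool


    async def main() -> None:
        # context and options now default to None, so the minimal call is just:
        first = await llm.generate("gpt-5", messages="Explain context caching in one line")

        # Structured output: response.parsed is a validated Verdict instance.
        structured = await llm.generate_structured(
            "gpt-5",
            response_format=Verdict,
            messages=AIMessages([first, "Is the explanation above accurate?"]),
            options=ModelOptions(temperature=0.2),  # usually unnecessary; defaults are fine
        )
        print(structured.parsed.summary)


    asyncio.run(main())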
ai_pipeline_core/llm/model_options.py
CHANGED
@@ -1,9 +1,103 @@
+"""Configuration options for LLM generation.
+
+@public
+
+Provides the ModelOptions class for configuring model behavior,
+retry logic, and advanced features like web search and reasoning.
+"""
+
 from typing import Any, Literal
 
 from pydantic import BaseModel
 
 
 class ModelOptions(BaseModel):
+    """Configuration options for LLM generation requests.
+
+    @public
+
+    ModelOptions encapsulates all configuration parameters for model
+    generation, including model behavior settings, retry logic, and
+    advanced features. All fields are optional with sensible defaults.
+
+    Attributes:
+        temperature: Controls randomness in generation (0.0-2.0).
+            Lower values = more deterministic, higher = more creative.
+            If None, the parameter is omitted from the API call,
+            causing the provider to use its own default (often 1.0).
+
+        system_prompt: System-level instructions for the model.
+            Sets the model's behavior and persona.
+
+        search_context_size: Web search result depth for search-enabled models.
+            Literal["low", "medium", "high"] | None
+            "low": Minimal context (~1-2 results)
+            "medium": Moderate context (~3-5 results)
+            "high": Extensive context (~6+ results)
+
+        reasoning_effort: Reasoning intensity for models that support explicit reasoning.
+            Literal["low", "medium", "high"] | None
+            "low": Quick reasoning
+            "medium": Balanced reasoning
+            "high": Deep, thorough reasoning
+            Note: Availability and effect vary by provider and model. Only models
+            that expose an explicit reasoning control will honor this parameter.
+
+        retries: Number of retry attempts on failure (default: 3).
+
+        retry_delay_seconds: Seconds to wait between retries (default: 10).
+
+        timeout: Maximum seconds to wait for response (default: 300).
+
+        service_tier: API tier selection for performance/cost trade-offs.
+            "auto": Let API choose
+            "default": Standard tier
+            "flex": Flexible (cheaper, may be slower)
+            "scale": Scaled performance
+            "priority": Priority processing
+            Note: Service tiers are correct as of Q3 2025. Only OpenAI models
+            support this parameter. Other providers (Anthropic, Google, Grok)
+            silently ignore it.
+
+        max_completion_tokens: Maximum tokens to generate.
+            None uses model default.
+
+        response_format: Pydantic model class for structured output.
+            Pass a Pydantic model; the client converts it to JSON Schema.
+            Set automatically by generate_structured(). Provider support varies.
+
+    Example:
+        >>> # Basic configuration
+        >>> options = ModelOptions(
+        ...     temperature=0.7,
+        ...     max_completion_tokens=1000
+        ... )
+        >>>
+        >>> # With system prompt
+        >>> options = ModelOptions(
+        ...     system_prompt="You are a helpful coding assistant",
+        ...     temperature=0.3  # Lower for code generation
+        ... )
+        >>>
+        >>> # For search-enabled models
+        >>> options = ModelOptions(
+        ...     search_context_size="high",  # Get more search results
+        ...     max_completion_tokens=2000
+        ... )
+        >>>
+        >>> # For reasoning models
+        >>> options = ModelOptions(
+        ...     reasoning_effort="high",  # Deep reasoning
+        ...     timeout=600  # More time for complex reasoning
+        ... )
+
+    Note:
+        - Not all options apply to all models
+        - search_context_size only works with search models
+        - reasoning_effort only works with models that support explicit reasoning
+        - response_format is set internally by generate_structured()
+    """
+
     temperature: float | None = None
     system_prompt: str | None = None
     search_context_size: Literal["low", "medium", "high"] | None = None
@@ -16,7 +110,42 @@ class ModelOptions(BaseModel):
     response_format: type[BaseModel] | None = None
 
     def to_openai_completion_kwargs(self) -> dict[str, Any]:
-        """Convert
+        """Convert options to OpenAI API completion parameters.
+
+        Transforms ModelOptions fields into the format expected by
+        the OpenAI completion API. Only includes non-None values.
+
+        Returns:
+            Dictionary with OpenAI API parameters:
+            - Always includes 'timeout' and 'extra_body'
+            - Conditionally includes other parameters if set
+            - Maps search_context_size to web_search_options
+            - Passes reasoning_effort directly
+
+        API parameter mapping:
+            - temperature -> temperature
+            - max_completion_tokens -> max_completion_tokens
+            - reasoning_effort -> reasoning_effort
+            - search_context_size -> web_search_options.search_context_size
+            - response_format -> response_format
+            - service_tier -> service_tier
+
+        Web Search Structure:
+            When search_context_size is set, creates:
+            {"web_search_options": {"search_context_size": "low|medium|high"}}
+            Non-search models silently ignore this parameter.
+
+        Example:
+            >>> options = ModelOptions(temperature=0.5, timeout=60)
+            >>> kwargs = options.to_openai_completion_kwargs()
+            >>> kwargs
+            {'timeout': 60, 'extra_body': {}, 'temperature': 0.5}
+
+        Note:
+            - system_prompt is handled separately in _process_messages()
+            - retries and retry_delay_seconds are used by retry logic
+            - extra_body is always included for potential extensions
+        """
        kwargs: dict[str, Any] = {
            "timeout": self.timeout,
            "extra_body": {},
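A small sketch of the kwargs mapping documented above; the expected output follows the docstring's own example, and the web_search_options shape follows its "Web Search Structure" note. The import path is an assumption based on this diff's file layout.

    # Sketch of the parameter mapping described in to_openai_completion_kwargs().
    from ai_pipeline_core.llm.model_options import ModelOptions  # assumed import path

    opts = ModelOptions(temperature=0.5, timeout=60)
    print(opts.to_openai_completion_kwargs())
    # -> {'timeout': 60, 'extra_body': {}, 'temperature': 0.5}   (per the docstring example)

    search_opts = ModelOptions(search_context_size="high")
    kwargs = search_opts.to_openai_completion_kwargs()
    # Per the "Web Search Structure" note, kwargs should now include:
    # {"web_search_options": {"search_context_size": "high"}}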