ai-pipeline-core 0.1.10__py3-none-any.whl → 0.1.12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36)
  1. ai_pipeline_core/__init__.py +84 -4
  2. ai_pipeline_core/documents/__init__.py +9 -0
  3. ai_pipeline_core/documents/document.py +1044 -152
  4. ai_pipeline_core/documents/document_list.py +147 -38
  5. ai_pipeline_core/documents/flow_document.py +112 -11
  6. ai_pipeline_core/documents/mime_type.py +173 -15
  7. ai_pipeline_core/documents/task_document.py +117 -12
  8. ai_pipeline_core/documents/temporary_document.py +84 -5
  9. ai_pipeline_core/documents/utils.py +41 -9
  10. ai_pipeline_core/exceptions.py +47 -11
  11. ai_pipeline_core/flow/__init__.py +2 -0
  12. ai_pipeline_core/flow/config.py +236 -27
  13. ai_pipeline_core/flow/options.py +50 -1
  14. ai_pipeline_core/llm/__init__.py +6 -0
  15. ai_pipeline_core/llm/ai_messages.py +125 -27
  16. ai_pipeline_core/llm/client.py +278 -26
  17. ai_pipeline_core/llm/model_options.py +130 -1
  18. ai_pipeline_core/llm/model_response.py +239 -35
  19. ai_pipeline_core/llm/model_types.py +67 -0
  20. ai_pipeline_core/logging/__init__.py +13 -0
  21. ai_pipeline_core/logging/logging_config.py +72 -20
  22. ai_pipeline_core/logging/logging_mixin.py +38 -32
  23. ai_pipeline_core/pipeline.py +363 -60
  24. ai_pipeline_core/prefect.py +48 -1
  25. ai_pipeline_core/prompt_manager.py +209 -24
  26. ai_pipeline_core/settings.py +108 -4
  27. ai_pipeline_core/simple_runner/__init__.py +5 -0
  28. ai_pipeline_core/simple_runner/cli.py +96 -11
  29. ai_pipeline_core/simple_runner/simple_runner.py +237 -4
  30. ai_pipeline_core/tracing.py +253 -30
  31. ai_pipeline_core-0.1.12.dist-info/METADATA +450 -0
  32. ai_pipeline_core-0.1.12.dist-info/RECORD +36 -0
  33. ai_pipeline_core-0.1.10.dist-info/METADATA +0 -538
  34. ai_pipeline_core-0.1.10.dist-info/RECORD +0 -36
  35. {ai_pipeline_core-0.1.10.dist-info → ai_pipeline_core-0.1.12.dist-info}/WHEEL +0 -0
  36. {ai_pipeline_core-0.1.10.dist-info → ai_pipeline_core-0.1.12.dist-info}/licenses/LICENSE +0 -0
@@ -1,27 +1,130 @@
1
- """Flow configuration base class."""
1
+ """Flow configuration system for type-safe pipeline definitions.
2
+
3
+ @public
4
+
5
+ This module provides the FlowConfig abstract base class that enforces
6
+ type safety for flow inputs and outputs in the pipeline system.
7
+
8
+ Best Practice:
9
+ Always finish @pipeline_flow functions with create_and_validate_output()
10
+ to ensure type safety and proper validation of output documents.
11
+ """
2
12
 
3
13
  from abc import ABC
4
- from typing import Any, ClassVar
14
+ from typing import Any, ClassVar, Iterable
5
15
 
6
16
  from ai_pipeline_core.documents import DocumentList, FlowDocument
17
+ from ai_pipeline_core.exceptions import DocumentValidationError
7
18
 
8
19
 
9
20
  class FlowConfig(ABC):
10
- """
11
- Configuration for a flow. It makes flow easier to implement and test.
21
+ """Abstract base class for type-safe flow configuration.
22
+
23
+ @public
24
+
25
+ FlowConfig defines the contract for flow inputs and outputs, ensuring
26
+ type safety and preventing circular dependencies in pipeline flows.
27
+ Each flow must have a corresponding FlowConfig subclass that specifies
28
+ its input document types and output document type.
29
+
30
+ CRITICAL RULE: OUTPUT_DOCUMENT_TYPE must NEVER be in INPUT_DOCUMENT_TYPES!
31
+ This prevents circular dependencies as flows chain together.
32
+ Each flow transforms input types to a DIFFERENT output type.
33
+
34
+ Class Variables:
35
+ INPUT_DOCUMENT_TYPES: List of FlowDocument types this flow accepts
36
+ OUTPUT_DOCUMENT_TYPE: Single FlowDocument type this flow produces
37
+
38
+ Validation Rules:
39
+ - INPUT_DOCUMENT_TYPES and OUTPUT_DOCUMENT_TYPE must be defined
40
+ - OUTPUT_DOCUMENT_TYPE cannot be in INPUT_DOCUMENT_TYPES (prevents cycles)
41
+ - Field names must be exact (common typos are detected)
42
+
43
+ Why this matters:
44
+ Flows connect in pipelines where one flow's output becomes another's input.
45
+ Same input/output types would create infinite loops or circular dependencies.
46
+
47
+ Example:
48
+ >>> # CORRECT - Different output type from inputs
49
+ >>> class ProcessingFlowConfig(FlowConfig):
50
+ ... INPUT_DOCUMENT_TYPES = [RawDataDocument]
51
+ ... OUTPUT_DOCUMENT_TYPE = ProcessedDocument # Different type!
52
+ >>>
53
+ >>> # Use in @pipeline_flow - RECOMMENDED PATTERN
54
+ >>> @pipeline_flow(name="processing")
55
+ >>> async def process(config: ProcessingFlowConfig, docs: DocumentList) -> DocumentList:
56
+ ... outputs = []
57
+ ... # ... processing logic ...
58
+ ... return config.create_and_validate_output(outputs)
59
+
60
+ >>> # WRONG - Will raise TypeError
61
+ >>> class BadConfig(FlowConfig):
62
+ ... INPUT_DOCUMENT_TYPES = [DataDocument]
63
+ ... OUTPUT_DOCUMENT_TYPE = DataDocument # SAME TYPE - NOT ALLOWED!
64
+
65
+ Note:
66
+ - Validation happens at class definition time
67
+ - Helps catch configuration errors early
68
+ - Used by simple_runner to manage document flow
12
69
  """
13
70
 
14
71
  INPUT_DOCUMENT_TYPES: ClassVar[list[type[FlowDocument]]]
15
72
  OUTPUT_DOCUMENT_TYPE: ClassVar[type[FlowDocument]]
16
73
 
17
74
  def __init_subclass__(cls, **kwargs: Any):
18
- """Validate that OUTPUT_DOCUMENT_TYPE is not in INPUT_DOCUMENT_TYPES."""
75
+ """Validate flow configuration at subclass definition time.
76
+
77
+ Performs comprehensive validation when a FlowConfig subclass is defined:
78
+ 1. Checks for common field name mistakes (typos)
79
+ 2. Ensures required fields are defined
80
+ 3. Prevents circular dependencies (output != input)
81
+
82
+ Args:
83
+ **kwargs: Additional arguments for parent __init_subclass__.
84
+
85
+ Raises:
86
+ TypeError: If configuration violates any validation rules:
87
+ - Missing required fields
88
+ - Incorrect field names
89
+ - Circular dependency detected
90
+
91
+ Note:
92
+ This runs at class definition time, not instantiation,
93
+ providing immediate feedback during development.
94
+ """
19
95
  super().__init_subclass__(**kwargs)
20
96
 
21
97
  # Skip validation for the abstract base class itself
22
98
  if cls.__name__ == "FlowConfig":
23
99
  return
24
100
 
101
+ # Check for invalid field names (common mistakes)
102
+ allowed_fields = {"INPUT_DOCUMENT_TYPES", "OUTPUT_DOCUMENT_TYPE"}
103
+ class_attrs = {name for name in dir(cls) if not name.startswith("_") and name.isupper()}
104
+
105
+ # Find fields that look like they might be mistakes
106
+ suspicious_fields = class_attrs - allowed_fields
107
+ common_mistakes = {
108
+ "OUTPUT_DOCUMENT_TYPES": "OUTPUT_DOCUMENT_TYPE",
109
+ "INPUT_DOCUMENT_TYPE": "INPUT_DOCUMENT_TYPES",
110
+ }
111
+
112
+ for field in suspicious_fields:
113
+ # Skip inherited attributes from parent classes
114
+ if any(hasattr(base, field) for base in cls.__bases__):
115
+ continue
116
+
117
+ if field in common_mistakes:
118
+ raise TypeError(
119
+ f"FlowConfig {cls.__name__}: Found '{field}' but expected "
120
+ f"'{common_mistakes[field]}'. Please use the correct field name."
121
+ )
122
+ elif "DOCUMENT" in field:
123
+ raise TypeError(
124
+ f"FlowConfig {cls.__name__}: Invalid field '{field}'. "
125
+ f"Only 'INPUT_DOCUMENT_TYPES' and 'OUTPUT_DOCUMENT_TYPE' are allowed."
126
+ )
127
+
25
128
  # Ensure required attributes are defined
26
129
  if not hasattr(cls, "INPUT_DOCUMENT_TYPES"):
27
130
  raise TypeError(f"FlowConfig {cls.__name__} must define INPUT_DOCUMENT_TYPES")
@@ -37,22 +140,55 @@ class FlowConfig(ABC):
37
140
 
38
141
  @classmethod
39
142
  def get_input_document_types(cls) -> list[type[FlowDocument]]:
40
- """
41
- Get the input document types for the flow.
143
+ """Get the list of input document types this flow accepts.
144
+
145
+ Returns:
146
+ List of FlowDocument subclasses that this flow requires
147
+ as input.
148
+
149
+ Example:
150
+ >>> types = MyFlowConfig.get_input_document_types()
151
+ >>> print([t.__name__ for t in types])
152
+ ['InputDoc', 'ConfigDoc']
42
153
  """
43
154
  return cls.INPUT_DOCUMENT_TYPES
44
155
 
45
156
  @classmethod
46
157
  def get_output_document_type(cls) -> type[FlowDocument]:
47
- """
48
- Get the output document type for the flow.
158
+ """Get the output document type this flow produces.
159
+
160
+ Returns:
161
+ Single FlowDocument subclass that this flow outputs.
162
+
163
+ Example:
164
+ >>> output_type = MyFlowConfig.get_output_document_type()
165
+ >>> print(output_type.__name__)
166
+ 'ProcessedDataDocument'
49
167
  """
50
168
  return cls.OUTPUT_DOCUMENT_TYPE
51
169
 
52
170
  @classmethod
53
171
  def has_input_documents(cls, documents: DocumentList) -> bool:
54
- """
55
- Check if the flow has all required input documents.
172
+ """Check if all required input documents are present.
173
+
174
+ Verifies that the document list contains at least one instance
175
+ of each required input document type.
176
+
177
+ Args:
178
+ documents: DocumentList to check for required inputs.
179
+
180
+ Returns:
181
+ True if all required document types are present,
182
+ False if any are missing.
183
+
184
+ Example:
185
+ >>> docs = DocumentList([input_doc, config_doc])
186
+ >>> if MyFlowConfig.has_input_documents(docs):
187
+ ... # Safe to proceed with flow
188
+ ... pass
189
+
190
+ Note:
191
+ Use this before get_input_documents() to avoid exceptions.
56
192
  """
57
193
  for doc_cls in cls.INPUT_DOCUMENT_TYPES:
58
194
  if not any(isinstance(doc, doc_cls) for doc in documents):
@@ -61,8 +197,29 @@ class FlowConfig(ABC):
61
197
 
62
198
  @classmethod
63
199
  def get_input_documents(cls, documents: DocumentList) -> DocumentList:
64
- """
65
- Get the input documents for the flow.
200
+ """Extract and return all required input documents.
201
+
202
+ Filters the provided document list to return only documents
203
+ matching the required input types. Returns all matching documents,
204
+ not just the first of each type.
205
+
206
+ Args:
207
+ documents: DocumentList containing mixed document types.
208
+
209
+ Returns:
210
+ DocumentList containing only the required input documents.
211
+
212
+ Raises:
213
+ ValueError: If any required document type is missing.
214
+
215
+ Example:
216
+ >>> all_docs = DocumentList([input1, input2, other_doc])
217
+ >>> input_docs = MyFlowConfig.get_input_documents(all_docs)
218
+ >>> len(input_docs) # Contains only input1 and input2
219
+ 2
220
+
221
+ Note:
222
+ Call has_input_documents() first to check availability.
66
223
  """
67
224
  input_documents = DocumentList()
68
225
  for doc_cls in cls.INPUT_DOCUMENT_TYPES:
@@ -73,25 +230,77 @@ class FlowConfig(ABC):
73
230
  return input_documents
74
231
 
75
232
  @classmethod
76
- def validate_output_documents(cls, documents: DocumentList) -> None:
77
- """
78
- Validate the output documents of the flow.
233
+ def validate_output_documents(cls, documents: Any) -> None:
234
+ """Validate that output documents match the expected type.
235
+
236
+ Ensures all documents in the list are instances of the
237
+ declared OUTPUT_DOCUMENT_TYPE.
238
+
239
+ Args:
240
+ documents: DocumentList to validate.
241
+
242
+ Raises:
243
+ DocumentValidationError: If documents is not a DocumentList or if any
244
+ document has incorrect type.
245
+
246
+ Example:
247
+ >>> output = DocumentList([ProcessedDoc(...)])
248
+ >>> MyFlowConfig.validate_output_documents(output)
249
+ >>> # No exception means valid
250
+
251
+ Note:
252
+ Used internally by create_and_validate_output().
253
+ Uses explicit exceptions for validation (works with python -O).
79
254
  """
80
- assert isinstance(documents, DocumentList), "Documents must be a DocumentList"
255
+ if not isinstance(documents, DocumentList):
256
+ raise DocumentValidationError("Documents must be a DocumentList")
257
+
81
258
  output_document_class = cls.get_output_document_type()
82
259
 
83
- invalid = [type(d).__name__ for d in documents if not isinstance(d, output_document_class)]
84
- assert not invalid, (
85
- "Documents must be of the correct type. "
86
- f"Expected: {output_document_class.__name__}, Got invalid: {invalid}"
87
- )
260
+ for doc in documents:
261
+ if not isinstance(doc, output_document_class):
262
+ raise DocumentValidationError(
263
+ f"Document '{doc.name}' has incorrect type. "
264
+ f"Expected: {output_document_class.__name__}, "
265
+ f"Got: {type(doc).__name__}"
266
+ )
88
267
 
89
268
  @classmethod
90
269
  def create_and_validate_output(
91
- cls, output: FlowDocument | list[FlowDocument] | DocumentList
270
+ cls, output: FlowDocument | Iterable[FlowDocument] | DocumentList
92
271
  ) -> DocumentList:
93
- """
94
- Create the output documents for the flow.
272
+ """Create and validate flow output documents.
273
+
274
+ @public
275
+
276
+ RECOMMENDED: Always use this method at the end of @pipeline_flow functions
277
+ to ensure type safety and proper output validation.
278
+
279
+ Convenience method that wraps output in a DocumentList if needed
280
+ and validates it matches the expected OUTPUT_DOCUMENT_TYPE.
281
+
282
+ Args:
283
+ output: Single document, iterable of documents, or DocumentList.
284
+
285
+ Returns:
286
+ Validated DocumentList containing the output documents.
287
+
288
+ Raises:
289
+ DocumentValidationError: If output type doesn't match OUTPUT_DOCUMENT_TYPE.
290
+
291
+ Example:
292
+ >>> @pipeline_flow(name="my_flow")
293
+ >>> async def process_flow(config: MyFlowConfig, ...) -> DocumentList:
294
+ >>> outputs = []
295
+ >>> # ... processing logic ...
296
+ >>> outputs.append(OutputDoc(...))
297
+ >>>
298
+ >>> # Always finish with this validation
299
+ >>> return config.create_and_validate_output(outputs)
300
+
301
+ Note:
302
+ This is the recommended pattern for all @pipeline_flow functions.
303
+ It ensures type safety and catches output errors immediately.
95
304
  """
96
305
  documents: DocumentList
97
306
  if isinstance(output, FlowDocument):
@@ -99,7 +308,7 @@ class FlowConfig(ABC):
99
308
  elif isinstance(output, DocumentList):
100
309
  documents = output
101
310
  else:
102
- assert isinstance(output, list)
103
- documents = DocumentList(output) # type: ignore[arg-type]
311
+ # Handle any iterable of FlowDocuments
312
+ documents = DocumentList(list(output)) # type: ignore[arg-type]
104
313
  cls.validate_output_documents(documents)
105
314
  return documents
@@ -1,3 +1,11 @@
1
+ """Flow options configuration for pipeline execution.
2
+
3
+ @public
4
+
5
+ Provides base configuration settings for AI pipeline flows,
6
+ including model selection and runtime parameters.
7
+ """
8
+
1
9
  from typing import TypeVar
2
10
 
3
11
  from pydantic import Field
@@ -9,7 +17,48 @@ T = TypeVar("T", bound="FlowOptions")
9
17
 
10
18
 
11
19
  class FlowOptions(BaseSettings):
12
- """Base configuration for AI Pipeline flows."""
20
+ """Base configuration settings for AI pipeline flows.
21
+
22
+ @public
23
+
24
+ FlowOptions provides runtime configuration for pipeline flows,
25
+ including model selection and other parameters. It uses pydantic-settings
26
+ to support environment variable overrides and is immutable (frozen) by default.
27
+
28
+ This class is designed to be subclassed for flow-specific configuration:
29
+
30
+ Example:
31
+ >>> class MyFlowOptions(FlowOptions):
32
+ ... temperature: float = Field(0.7, ge=0, le=2)
33
+ ... batch_size: int = Field(10, gt=0)
34
+ ... custom_param: str = "default"
35
+
36
+ >>> # Use in CLI with run_cli:
37
+ >>> run_cli(
38
+ ... flows=[my_flow],
39
+ ... options_cls=MyFlowOptions # Will parse CLI args
40
+ ... )
41
+
42
+ >>> # Or create programmatically:
43
+ >>> options = MyFlowOptions(
44
+ ... core_model="gemini-2.5-pro",
45
+ ... temperature=0.9
46
+ ... )
47
+
48
+ Attributes:
49
+ core_model: Primary LLM for complex tasks (default: gpt-5)
50
+ small_model: Fast model for simple tasks (default: gpt-5-mini)
51
+
52
+ Configuration:
53
+ - Frozen (immutable) after creation
54
+ - Extra fields ignored (not strict)
55
+ - Can be populated from environment variables
56
+ - Used by simple_runner.cli for command-line parsing
57
+
58
+ Note:
59
+ The base class provides model selection. Subclasses should
60
+ add flow-specific parameters with appropriate validation.
61
+ """
13
62
 
14
63
  core_model: ModelName | str = Field(
15
64
  default="gpt-5",
@@ -1,3 +1,9 @@
1
+ """Large Language Model integration via LiteLLM proxy.
2
+
3
+ This package provides OpenAI API-compatible LLM interactions with built-in retry logic,
4
+ LMNR tracing, and structured output generation using Pydantic models.
5
+ """
6
+
1
7
  from .ai_messages import AIMessages, AIMessageType
2
8
  from .client import (
3
9
  generate,
@@ -1,3 +1,11 @@
1
+ """AI message handling for LLM interactions.
2
+
3
+ @public
4
+
5
+ Provides AIMessages container for managing conversations with mixed content types
6
+ including text, documents, and model responses.
7
+ """
8
+
1
9
  import base64
2
10
  import hashlib
3
11
  import json
@@ -13,13 +21,82 @@ from ai_pipeline_core.documents import Document
13
21
  from .model_response import ModelResponse
14
22
 
15
23
  AIMessageType = str | Document | ModelResponse
24
+ """Type for messages in AIMessages container.
25
+
26
+ @public
27
+
28
+ Represents the allowed types for conversation messages:
29
+ - str: Plain text messages
30
+ - Document: Structured document content
31
+ - ModelResponse: LLM generation responses
32
+ """
16
33
 
17
34
 
18
35
  class AIMessages(list[AIMessageType]):
36
+ """Container for AI conversation messages supporting mixed types.
37
+
38
+ @public
39
+
40
+ This class extends list to manage conversation messages between user
41
+ and AI, supporting text, Document objects, and ModelResponse instances.
42
+ Messages are converted to OpenAI-compatible format for LLM interactions.
43
+
44
+ Conversion Rules:
45
+ - str: Becomes {"role": "user", "content": text}
46
+ - Document: Becomes {"role": "user", "content": document_content}
47
+ (automatically handles text, images, PDFs based on MIME type)
48
+ - ModelResponse: Becomes {"role": "assistant", "content": response.content}
49
+
50
+ Note: Document conversion is automatic. Text content becomes user text messages.
51
+ Images are sent to vision-capable models (non-vision models will raise ValueError).
52
+ PDFs are attached when supported by the model, otherwise a text extraction
53
+ fallback is used. LiteLLM proxy handles the specific encoding requirements
54
+ for each provider.
55
+
56
+ IMPORTANT: Although AIMessages can contain Document entries, the LLM client functions
57
+ expect `messages` to be `AIMessages` or `str`. If you start from a Document or a list
58
+ of Documents, build AIMessages first (e.g., `AIMessages([doc])` or `AIMessages(docs)`).
59
+
60
+ Example:
61
+ >>> from ai_pipeline_core import llm
62
+ >>> messages = AIMessages()
63
+ >>> messages.append("What is the capital of France?")
64
+ >>> response = await llm.generate("gpt-5", messages=messages)
65
+ >>> messages.append(response) # Add the actual response
66
+ >>> prompt = messages.get_last_message_as_str() # Get the last message as a string
67
+ """
68
+
19
69
  def get_last_message(self) -> AIMessageType:
70
+ """Get the last message in the conversation.
71
+
72
+ Returns:
73
+ The last message in the conversation, which can be a string,
74
+ Document, or ModelResponse.
75
+ """
20
76
  return self[-1]
21
77
 
22
78
  def get_last_message_as_str(self) -> str:
79
+ """Get the last message as a string, raising if not a string.
80
+
81
+ @public
82
+
83
+ Returns:
84
+ The last message as a string.
85
+
86
+ Raises:
87
+ ValueError: If the last message is not a string.
88
+
89
+ Safer Pattern:
90
+ Instead of catching ValueError, check type first:
91
+ >>> messages = AIMessages([user_msg, response, followup])
92
+ >>> last = messages.get_last_message()
93
+ >>> if isinstance(last, str):
94
+ ... text = last
95
+ >>> elif isinstance(last, ModelResponse):
96
+ ... text = last.content
97
+ >>> elif isinstance(last, Document):
98
+ ... text = last.text if last.is_text else "<binary>"
99
+ """
23
100
  last_message = self.get_last_message()
24
101
  if isinstance(last_message, str):
25
102
  return last_message
@@ -28,8 +105,25 @@ class AIMessages(list[AIMessageType]):
28
105
  def to_prompt(self) -> list[ChatCompletionMessageParam]:
29
106
  """Convert AIMessages to OpenAI-compatible format.
30
107
 
108
+ Transforms the message list into the format expected by OpenAI API.
109
+ Each message type is converted according to its role and content.
110
+
31
111
  Returns:
32
- List of ChatCompletionMessageParam for OpenAI API
112
+ List of ChatCompletionMessageParam dicts (from openai.types.chat)
113
+ with 'role' and 'content' keys. Ready to be passed to generate()
114
+ or OpenAI API directly.
115
+
116
+ Raises:
117
+ ValueError: If message type is not supported.
118
+
119
+ Example:
120
+ >>> messages = AIMessages(["Hello", response, "Follow up"])
121
+ >>> prompt = messages.to_prompt()
122
+ >>> # Result: [
123
+ >>> # {"role": "user", "content": "Hello"},
124
+ >>> # {"role": "assistant", "content": "..."},
125
+ >>> # {"role": "user", "content": "Follow up"}
126
+ >>> # ]
33
127
  """
34
128
  messages: list[ChatCompletionMessageParam] = []
35
129
 
@@ -46,7 +140,11 @@ class AIMessages(list[AIMessageType]):
46
140
  return messages
47
141
 
48
142
  def to_tracing_log(self) -> list[str]:
49
- """Convert AIMessages to a list of strings for tracing."""
143
+ """Convert AIMessages to a list of strings for tracing.
144
+
145
+ Returns:
146
+ List of string representations for tracing logs.
147
+ """
50
148
  messages: list[str] = []
51
149
  for message in self:
52
150
  if isinstance(message, Document):
@@ -61,20 +159,27 @@ class AIMessages(list[AIMessageType]):
61
159
  return messages
62
160
 
63
161
  def get_prompt_cache_key(self, system_prompt: str | None = None) -> str:
162
+ """Generate cache key for message set.
163
+
164
+ Args:
165
+ system_prompt: Optional system prompt to include in cache key.
166
+
167
+ Returns:
168
+ SHA256 hash as hex string for cache key.
169
+ """
64
170
  if not system_prompt:
65
171
  system_prompt = ""
66
172
  return hashlib.sha256((system_prompt + json.dumps(self.to_prompt())).encode()).hexdigest()
67
173
 
68
174
  @staticmethod
69
175
  def document_to_prompt(document: Document) -> list[ChatCompletionContentPartParam]:
70
- """
71
- Convert a document to prompt format for LLM consumption.
176
+ """Convert a document to prompt format for LLM consumption.
72
177
 
73
178
  Args:
74
- document: The document to convert
179
+ document: The document to convert.
75
180
 
76
181
  Returns:
77
- List of chat completion content parts for the prompt
182
+ List of chat completion content parts for the prompt.
78
183
  """
79
184
  prompt: list[ChatCompletionContentPartParam] = []
80
185
 
@@ -88,9 +193,8 @@ class AIMessages(list[AIMessageType]):
88
193
 
89
194
  # Handle text documents
90
195
  if document.is_text:
91
- content_text = (
92
- f"{header_text}<content>\n{document.as_text()}\n</content>\n</document>\n"
93
- )
196
+ text_content = document.content.decode("utf-8")
197
+ content_text = f"{header_text}<content>\n{text_content}\n</content>\n</document>\n"
94
198
  prompt.append({"type": "text", "text": content_text})
95
199
  return prompt
96
200
 
@@ -102,12 +206,10 @@ class AIMessages(list[AIMessageType]):
102
206
  return []
103
207
 
104
208
  # Add header for binary content
105
- prompt.append(
106
- {
107
- "type": "text",
108
- "text": f"{header_text}<content>\n",
109
- }
110
- )
209
+ prompt.append({
210
+ "type": "text",
211
+ "text": f"{header_text}<content>\n",
212
+ })
111
213
 
112
214
  # Encode binary content
113
215
  base64_content = base64.b64encode(document.content).decode("utf-8")
@@ -115,19 +217,15 @@ class AIMessages(list[AIMessageType]):
115
217
 
116
218
  # Add appropriate content type
117
219
  if document.is_pdf:
118
- prompt.append(
119
- {
120
- "type": "file",
121
- "file": {"file_data": data_uri},
122
- }
123
- )
220
+ prompt.append({
221
+ "type": "file",
222
+ "file": {"file_data": data_uri},
223
+ })
124
224
  else: # is_image
125
- prompt.append(
126
- {
127
- "type": "image_url",
128
- "image_url": {"url": data_uri, "detail": "high"},
129
- }
130
- )
225
+ prompt.append({
226
+ "type": "image_url",
227
+ "image_url": {"url": data_uri, "detail": "high"},
228
+ })
131
229
 
132
230
  # Close the document tag
133
231
  prompt.append({"type": "text", "text": "</content>\n</document>\n"})