PyPI - ai-pipeline-core - Versions diffs - 0.1.14__py3-none-any.whl → 0.2.1__py3-none-any.whl - Mend

ai-pipeline-core 0.1.14__py3-none-any.whl → 0.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (31) hide show

ai_pipeline_core/__init__.py +21 -13
ai_pipeline_core/documents/document.py +202 -51
ai_pipeline_core/documents/document_list.py +148 -24
ai_pipeline_core/documents/flow_document.py +2 -6
ai_pipeline_core/documents/task_document.py +0 -4
ai_pipeline_core/documents/temporary_document.py +1 -8
ai_pipeline_core/flow/config.py +174 -5
ai_pipeline_core/llm/__init__.py +1 -6
ai_pipeline_core/llm/ai_messages.py +137 -4
ai_pipeline_core/llm/client.py +118 -65
ai_pipeline_core/llm/model_options.py +6 -7
ai_pipeline_core/llm/model_response.py +17 -16
ai_pipeline_core/llm/model_types.py +3 -7
ai_pipeline_core/logging/__init__.py +0 -2
ai_pipeline_core/logging/logging_config.py +0 -6
ai_pipeline_core/logging/logging_mixin.py +2 -10
ai_pipeline_core/pipeline.py +54 -68
ai_pipeline_core/prefect.py +12 -3
ai_pipeline_core/prompt_manager.py +14 -7
ai_pipeline_core/settings.py +13 -5
ai_pipeline_core/simple_runner/__init__.py +1 -11
ai_pipeline_core/simple_runner/cli.py +13 -12
ai_pipeline_core/simple_runner/simple_runner.py +34 -189
ai_pipeline_core/storage/__init__.py +8 -0
ai_pipeline_core/storage/storage.py +628 -0
ai_pipeline_core/tracing.py +234 -30
{ai_pipeline_core-0.1.14.dist-info → ai_pipeline_core-0.2.1.dist-info}/METADATA +35 -20
ai_pipeline_core-0.2.1.dist-info/RECORD +38 -0
ai_pipeline_core-0.1.14.dist-info/RECORD +0 -36
{ai_pipeline_core-0.1.14.dist-info → ai_pipeline_core-0.2.1.dist-info}/WHEEL +0 -0
{ai_pipeline_core-0.1.14.dist-info → ai_pipeline_core-0.2.1.dist-info}/licenses/LICENSE +0 -0

ai_pipeline_core/documents/document_list.py CHANGED Viewed

@@ -3,7 +3,8 @@
 @public
 """
-from typing import Any, Iterable, SupportsIndex, Union, overload
+from copy import deepcopy
+from typing import Any, Callable, Iterable, SupportsIndex, Union, overload
 from typing_extensions import Self
@@ -17,8 +18,8 @@ class DocumentList(list[Document]):
     Specialized list with validation and filtering for documents.
-    Best Practice: Use default constructor in 90% of cases. Only enable
-    validate_same_type or validate_duplicates when you explicitly need them.
+    Best Practice: Use default constructor by default, unless instructed otherwise.
+    Only enable validate_same_type or validate_duplicates when you explicitly need them.
     Example:
         >>> # RECOMMENDED - default constructor for most cases
@@ -37,6 +38,7 @@ class DocumentList(list[Document]):
         documents: list[Document] | None = None,
         validate_same_type: bool = False,
         validate_duplicates: bool = False,
+        frozen: bool = False,
     ) -> None:
         """Initialize DocumentList.
@@ -46,12 +48,15 @@ class DocumentList(list[Document]):
             documents: Initial list of documents.
             validate_same_type: Enforce same document type.
             validate_duplicates: Prevent duplicate filenames.
+            frozen: If True, list is immutable from creation.
         """
         super().__init__()
         self._validate_same_type = validate_same_type
         self._validate_duplicates = validate_duplicates
+        self._frozen = False  # Initialize as unfrozen to allow initial population
         if documents:
             self.extend(documents)
+        self._frozen = frozen  # Set frozen state after initial population
     def _validate_no_duplicates(self) -> None:
         """Check for duplicate document names.
@@ -109,18 +114,51 @@ class DocumentList(list[Document]):
         self._validate_no_description_files()
         self._validate_types()
+    def freeze(self) -> None:
+        """Permanently freeze the list, preventing modifications.
+        Once frozen, the list cannot be unfrozen.
+        """
+        self._frozen = True
+    def copy(self) -> "DocumentList":
+        """Create an unfrozen deep copy of the list.
+        Returns:
+            New unfrozen DocumentList with deep-copied documents.
+        """
+        copied_docs = deepcopy(list(self))
+        return DocumentList(
+            documents=copied_docs,
+            validate_same_type=self._validate_same_type,
+            validate_duplicates=self._validate_duplicates,
+            frozen=False,  # Copies are always unfrozen
+        )
+    def _check_frozen(self) -> None:
+        """Check if list is frozen and raise if it is.
+        Raises:
+            RuntimeError: If the list is frozen.
+        """
+        if self._frozen:
+            raise RuntimeError("Cannot modify frozen DocumentList")
     def append(self, document: Document) -> None:
         """Add a document to the end of the list."""
+        self._check_frozen()
         super().append(document)
         self._validate()
     def extend(self, documents: Iterable[Document]) -> None:
         """Add multiple documents to the list."""
+        self._check_frozen()
         super().extend(documents)
         self._validate()
     def insert(self, index: SupportsIndex, document: Document) -> None:
         """Insert a document at the specified position."""
+        self._check_frozen()
         super().insert(index, document)
         self._validate()
@@ -132,6 +170,7 @@ class DocumentList(list[Document]):
     def __setitem__(self, index: Union[SupportsIndex, slice], value: Any) -> None:
         """Set item or slice with validation."""
+        self._check_frozen()
         super().__setitem__(index, value)
         self._validate()
@@ -141,10 +180,48 @@ class DocumentList(list[Document]):
         Returns:
             Self: This DocumentList after modification.
         """
+        self._check_frozen()
         result = super().__iadd__(other)
         self._validate()
         return result
+    def __delitem__(self, index: Union[SupportsIndex, slice]) -> None:
+        """Delete item or slice from list."""
+        self._check_frozen()
+        super().__delitem__(index)
+    def pop(self, index: SupportsIndex = -1) -> Document:
+        """Remove and return item at index.
+        Returns:
+            Document removed from the list.
+        """
+        self._check_frozen()
+        return super().pop(index)
+    def remove(self, document: Document) -> None:
+        """Remove first occurrence of document."""
+        self._check_frozen()
+        super().remove(document)
+    def clear(self) -> None:
+        """Remove all items from list."""
+        self._check_frozen()
+        super().clear()
+    def reverse(self) -> None:
+        """Reverse list in place."""
+        self._check_frozen()
+        super().reverse()
+    def sort(self, *, key: Callable[[Document], Any] | None = None, reverse: bool = False) -> None:
+        """Sort list in place."""
+        self._check_frozen()
+        if key is None:
+            super().sort(reverse=reverse)  # type: ignore[call-arg]
+        else:
+            super().sort(key=key, reverse=reverse)
     @overload
     def filter_by(self, arg: str) -> "DocumentList": ...
@@ -164,6 +241,9 @@ class DocumentList(list[Document]):
         @public
+        ALWAYS returns a DocumentList (which may be empty), never raises an exception
+        for no matches. Use this when you want to process all matching documents.
         Args:
             arg: Can be one of:
                 - str: Single document name to filter by
@@ -174,7 +254,9 @@ class DocumentList(list[Document]):
                   (list, tuple, set, generator, or any iterable)
         Returns:
-            New DocumentList with filtered documents.
+            New DocumentList with filtered documents (may be empty).
+            - Returns ALL matching documents
+            - Empty DocumentList if no matches found
         Raises:
             TypeError: If arg is not a valid type (not str, type, or iterable),
@@ -182,12 +264,19 @@ class DocumentList(list[Document]):
             AttributeError: If arg is expected to be iterable but doesn't support iteration.
         Example:
-            >>> docs.filter_by("file.txt")  # Filter by single name
-            >>> docs.filter_by(MyDocument)  # Filter by single type
-            >>> docs.filter_by([Doc1, Doc2])  # Filter by multiple types (list)
-            >>> docs.filter_by({"file1.txt", "file2.txt"})  # Filter by multiple names (set)
-            >>> docs.filter_by((SubDoc, AnotherDoc))  # Filter by multiple types (tuple)
-            >>> docs.filter_by(name for name in ["a.txt", "b.txt"])  # Generator expression
+            >>> # Returns list with all matching documents
+            >>> matching_docs = docs.filter_by("file.txt")  # May be empty
+            >>> for doc in matching_docs:
+            ...     process(doc)
+            >>>
+            >>> # Filter by type - returns all instances
+            >>> config_docs = docs.filter_by(ConfigDocument)
+            >>> print(f"Found {len(config_docs)} config documents")
+            >>>
+            >>> # Filter by multiple names
+            >>> important_docs = docs.filter_by(["config.yaml", "settings.json"])
+            >>> if not important_docs:  # Check if empty
+            ...     print("No important documents found")
         """
         if isinstance(arg, str):
             # Filter by single name
@@ -257,38 +346,73 @@ class DocumentList(list[Document]):
     def get_by(self, arg: type[Document], required: bool = True) -> Document | None: ...
     def get_by(self, arg: str | type[Document], required: bool = True) -> Document | None:
-        """Get a single document by name or type.
+        """Get EXACTLY ONE document by name or type.
         @public
+        IMPORTANT: This method expects to find exactly one matching document.
+        - If no matches and required=True: raises ValueError
+        - If no matches and required=False: returns None
+        - If multiple matches: ALWAYS raises ValueError (ambiguous)
+        When required=True (default), you do NOT need to check for None:
+            >>> doc = docs.get_by("config.yaml")  # Will raise if not found
+            >>> # No need for: if doc is not None  <- This is redundant!
+            >>> print(doc.content)  # Safe to use directly
         Args:
             arg: Document name (str) or document type.
-            required: If True, raises ValueError when not found. If False, returns None.
+            required: If True (default), raises ValueError when not found.
+                     If False, returns None when not found.
         Returns:
-            The first matching document, or None if not found and required=False.
+            The single matching document, or None if not found and required=False.
         Raises:
-            ValueError: If required=True and document not found.
+            ValueError: If required=True and document not found, OR if multiple
+                       documents match (ambiguous result).
             TypeError: If arg is not a string or Document type.
         Example:
-            >>> doc = docs.get_by("file.txt")  # Get by name, raises if not found
-            >>> doc = docs.get_by(MyDocument, required=False)  # Returns None if not found
+            >>> # CORRECT - No need to check for None when required=True (default)
+            >>> doc = docs.get_by("file.txt")  # Raises if not found
+            >>> print(doc.content)  # Safe to use directly
+            >>>
+            >>> # When using required=False, check for None
+            >>> doc = docs.get_by("optional.txt", required=False)
+            >>> if doc is not None:
+            ...     print(doc.content)
+            >>>
+            >>> # Will raise if multiple documents have same type
+            >>> # Use filter_by() instead if you want all matches
+            >>> try:
+            ...     doc = docs.get_by(ConfigDocument)  # Error if 2+ configs
+            >>> except ValueError as e:
+            ...     configs = docs.filter_by(ConfigDocument)  # Get all instead
         """
         if isinstance(arg, str):
-            # Get by name
-            for doc in self:
-                if doc.name == arg:
-                    return doc
+            # Get by name - collect all matches to check for duplicates
+            matches = [doc for doc in self if doc.name == arg]
+            if len(matches) > 1:
+                raise ValueError(
+                    f"Multiple documents found with name '{arg}'. "
+                    f"Found {len(matches)} matches. Use filter_by() to get all matches."
+                )
+            if matches:
+                return matches[0]
             if required:
                 raise ValueError(f"Document with name '{arg}' not found")
             return None
         elif isinstance(arg, type):  # type: ignore[reportUnnecessaryIsInstance]
-            # Get by type (including subclasses)
-            for doc in self:
-                if isinstance(doc, arg):
-                    return doc
+            # Get by type (including subclasses) - collect all matches
+            matches = [doc for doc in self if isinstance(doc, arg)]
+            if len(matches) > 1:
+                raise ValueError(
+                    f"Multiple documents found of type '{arg.__name__}'. "
+                    f"Found {len(matches)} matches. Use filter_by() to get all matches."
+                )
+            if matches:
+                return matches[0]
             if required:
                 raise ValueError(f"Document of type '{arg.__name__}' not found")
             return None

ai_pipeline_core/documents/flow_document.py CHANGED Viewed

@@ -24,24 +24,20 @@ class FlowDocument(Document):
     - Persisted to file system between pipeline steps
     - Survives across multiple flow runs
     - Used for flow inputs and outputs
-    - Saved in directories named after the document's canonical name
+    - Saved in directories organized by the document's type/name
     Creating FlowDocuments:
         Same as Document - use `create()` for automatic conversion, `__init__` for bytes.
         See Document.create() for detailed usage examples.
     Persistence:
-        Documents are saved to: {output_dir}/{canonical_name}/{filename}
+        Documents are saved under an output directory path associated with the document's type/name.
         For example: output/my_doc/data.json
     Note:
         - Cannot instantiate FlowDocument directly - must subclass
         - Used with FlowConfig to define flow input/output types
         - No additional abstract methods to implement
-    See Also:
-        TaskDocument: For temporary documents within task execution
-        TemporaryDocument: For documents that are never persisted
     """
     def __init__(

ai_pipeline_core/documents/task_document.py CHANGED Viewed

@@ -43,10 +43,6 @@ class TaskDocument(Document):
         - Not saved by simple_runner utilities
         - Reduces I/O overhead for temporary data
         - No additional abstract methods to implement
-    See Also:
-        FlowDocument: For documents that persist across flow runs
-        TemporaryDocument: Alternative for non-persistent documents
     """
     def __init__(

ai_pipeline_core/documents/temporary_document.py CHANGED Viewed

@@ -1,7 +1,5 @@
 """Temporary document implementation for non-persistent data.
-@public
 This module provides the TemporaryDocument class for documents that
 are never persisted, regardless of context.
 """
@@ -15,8 +13,6 @@ from .document import Document
 class TemporaryDocument(Document):
     r"""Concrete document class for data that is never persisted.
-    @public
     TemporaryDocument is a final (non-subclassable) document type for
     data that should never be saved to disk, regardless of whether it's
     used in a flow or task context. Unlike FlowDocument and TaskDocument
@@ -28,6 +24,7 @@ class TemporaryDocument(Document):
     - Cannot be subclassed (annotated with Python's @final decorator in code)
     - Useful for transient data like API responses or intermediate calculations
     - Ignored by simple_runner save operations
+    - Useful for tests and debugging
     Creating TemporaryDocuments:
         Same as Document - use `create()` for automatic conversion, `__init__` for bytes.
@@ -48,10 +45,6 @@ class TemporaryDocument(Document):
         - This is a final class and cannot be subclassed
         - Use when you explicitly want to prevent persistence
         - Useful for sensitive data that shouldn't be written to disk
-    See Also:
-        FlowDocument: For documents that persist across flow runs
-        TaskDocument: For documents temporary within task execution
     """
     def __init_subclass__(cls, **kwargs: Any) -> None:

ai_pipeline_core/flow/config.py CHANGED Viewed

@@ -10,11 +10,16 @@ Best Practice:
     to ensure type safety and proper validation of output documents.
 """
+import json
 from abc import ABC
 from typing import Any, ClassVar, Iterable
-from ai_pipeline_core.documents import DocumentList, FlowDocument
+from ai_pipeline_core.documents import Document, DocumentList, FlowDocument
 from ai_pipeline_core.exceptions import DocumentValidationError
+from ai_pipeline_core.logging import get_pipeline_logger
+from ai_pipeline_core.storage import Storage
+logger = get_pipeline_logger(__name__)
 class FlowConfig(ABC):
@@ -51,8 +56,10 @@ class FlowConfig(ABC):
         ...     OUTPUT_DOCUMENT_TYPE = ProcessedDocument  # Different type!
         >>>
         >>> # Use in @pipeline_flow - RECOMMENDED PATTERN
-        >>> @pipeline_flow(name="processing")
-        >>> async def process(config: ProcessingFlowConfig, docs: DocumentList) -> DocumentList:
+        >>> @pipeline_flow(config=ProcessingFlowConfig, name="processing")
+        >>> async def process(
+        ...     project_name: str, docs: DocumentList, flow_options: FlowOptions
+        ... ) -> DocumentList:
         ...     outputs = []
         ...     # ... processing logic ...
         ...     return config.create_and_validate_output(outputs)
@@ -289,8 +296,10 @@ class FlowConfig(ABC):
             DocumentValidationError: If output type doesn't match OUTPUT_DOCUMENT_TYPE.
         Example:
-            >>> @pipeline_flow(name="my_flow")
-            >>> async def process_flow(config: MyFlowConfig, ...) -> DocumentList:
+            >>> @pipeline_flow(config=MyFlowConfig, name="my_flow")
+            >>> async def process_flow(
+            ...     project_name: str, documents: DocumentList, flow_options: FlowOptions
+            ... ) -> DocumentList:
             >>>     outputs = []
             >>>     # ... processing logic ...
             >>>     outputs.append(OutputDoc(...))
@@ -312,3 +321,163 @@ class FlowConfig(ABC):
             documents = DocumentList(list(output))  # type: ignore[arg-type]
         cls.validate_output_documents(documents)
         return documents
+    @classmethod
+    async def load_documents(
+        cls,
+        uri: str,
+    ) -> DocumentList:
+        """Load documents from storage matching INPUT_DOCUMENT_TYPES.
+        Loads documents from a storage location based on the class's INPUT_DOCUMENT_TYPES.
+        Supports both local filesystem and Google Cloud Storage backends.
+        Automatically loads metadata (.description.md and .sources.json) when present.
+        Args:
+            uri: Storage URI (file://, gs://, or local path)
+        Returns:
+            DocumentList containing loaded documents matching INPUT_DOCUMENT_TYPES
+        Example:
+            >>> # Load from local filesystem
+            >>> docs = await MyFlowConfig.load_documents("./data")
+            >>>
+            >>> # Load from GCS (uses GCS_SERVICE_ACCOUNT_FILE from settings if configured)
+            >>> docs = await MyFlowConfig.load_documents("gs://bucket/data")
+        """
+        # Use INPUT_DOCUMENT_TYPES if not specified
+        storage = await Storage.from_uri(uri)
+        loaded_documents = DocumentList()
+        # Process each document type
+        for doc_type in cls.INPUT_DOCUMENT_TYPES:
+            canonical_name = doc_type.canonical_name()
+            doc_storage = storage.with_base(canonical_name)
+            # Check if subdirectory exists
+            if not await doc_storage.exists(""):
+                logger.debug(f"Subdirectory {canonical_name} not found, skipping")
+                continue
+            # List files in subdirectory
+            objects = await doc_storage.list("", recursive=False, include_dirs=False)
+            # Create lookup set for metadata files
+            object_keys = {obj.key for obj in objects}
+            # Filter out metadata files
+            doc_files = [
+                obj
+                for obj in objects
+                if not obj.key.endswith(Document.DESCRIPTION_EXTENSION)
+                and not obj.key.endswith(Document.SOURCES_EXTENSION)
+            ]
+            for obj in doc_files:
+                try:
+                    # Load document content
+                    content = await doc_storage.read_bytes(obj.key)
+                    # Load metadata if present
+                    description = None
+                    sources: list[str] = []
+                    # Check for description in objects list
+                    desc_path = f"{obj.key}{Document.DESCRIPTION_EXTENSION}"
+                    if desc_path in object_keys:
+                        try:
+                            description = await doc_storage.read_text(desc_path)
+                        except Exception as e:
+                            logger.warning(f"Failed to load description for {obj.key}: {e}")
+                    # Check for sources in objects list
+                    sources_path = f"{obj.key}{Document.SOURCES_EXTENSION}"
+                    if sources_path in object_keys:
+                        try:
+                            sources_text = await doc_storage.read_text(sources_path)
+                            sources = json.loads(sources_text)
+                        except Exception as e:
+                            logger.warning(f"Failed to load sources for {obj.key}: {e}")
+                    # Create document instance
+                    doc = doc_type(
+                        name=obj.key,
+                        content=content,
+                        description=description,
+                        sources=sources,
+                    )
+                    loaded_documents.append(doc)
+                    logger.debug(f"Loaded {doc_type.__name__} document: {obj.key}")
+                except Exception as e:
+                    logger.error(f"Failed to load {doc_type.__name__} document {obj.key}: {e}")
+        logger.info(f"Loaded {len(loaded_documents)} documents from {uri}")
+        return loaded_documents
+    @classmethod
+    async def save_documents(
+        cls,
+        uri: str,
+        documents: DocumentList,
+        *,
+        validate_output_type: bool = True,
+    ) -> None:
+        """Save documents to storage with metadata.
+        Saves FlowDocument instances to a storage location with their content
+        and metadata files (Document.DESCRIPTION_EXTENSION and Document.SOURCES_EXTENSION).
+        Non-FlowDocument instances (TaskDocument, TemporaryDocument) are skipped.
+        Args:
+            uri: Storage URI (file://, gs://, or local path)
+            documents: DocumentList to save
+            validate_output_type: If True, validate documents match cls.OUTPUT_DOCUMENT_TYPE
+        Raises:
+            DocumentValidationError: If validate_output_type=True and documents don't match
+                                   OUTPUT_DOCUMENT_TYPE
+        Example:
+            >>> # Save to local filesystem
+            >>> await MyFlowConfig.save_documents("./output", docs)
+            >>>
+            >>> # Save to GCS (uses GCS_SERVICE_ACCOUNT_FILE from settings if configured)
+            >>> await MyFlowConfig.save_documents("gs://bucket/output", docs)
+        """
+        # Validate output type if requested
+        if validate_output_type:
+            cls.validate_output_documents(documents)
+        storage = await Storage.from_uri(uri)
+        saved_count = 0
+        for doc in documents:
+            # Skip non-FlowDocument instances
+            if not isinstance(doc, FlowDocument):
+                logger.warning(f"Skipping non-FlowDocument: {type(doc).__name__}")
+                continue
+            # Get canonical name for subdirectory
+            canonical_name = doc.canonical_name()
+            doc_storage = storage.with_base(canonical_name)
+            # Save document content
+            await doc_storage.write_bytes(doc.name, doc.content)
+            saved_count += 1
+            # Save description if present
+            if doc.description:
+                desc_path = f"{doc.name}{Document.DESCRIPTION_EXTENSION}"
+                await doc_storage.write_text(desc_path, doc.description)
+            # Save sources if present
+            if doc.sources:
+                sources_path = f"{doc.name}{Document.SOURCES_EXTENSION}"
+                sources_json = json.dumps(doc.sources, indent=2)
+                await doc_storage.write_text(sources_path, sources_json)
+            logger.debug(f"Saved {type(doc).__name__} document: {doc.name}")
+        logger.info(f"Saved {saved_count} documents to {uri}")

ai_pipeline_core/llm/__init__.py CHANGED Viewed

@@ -8,8 +8,6 @@ from .ai_messages import AIMessages, AIMessageType
 from .client import (
     generate,
     generate_structured,
-    generate_with_retry_for_testing,
-    process_messages_for_testing,
 )
 from .model_options import ModelOptions
 from .model_response import ModelResponse, StructuredModelResponse
@@ -19,12 +17,9 @@ __all__ = [
     "AIMessages",
     "AIMessageType",
     "ModelName",
-    "ModelOptions",
     "ModelResponse",
+    "ModelOptions",
     "StructuredModelResponse",
     "generate",
     "generate_structured",
-    # Internal functions exposed for testing only
-    "process_messages_for_testing",
-    "generate_with_retry_for_testing",
 ]

ai-pipeline-core 0.1.14__py3-none-any.whl → 0.2.1__py3-none-any.whl

ai-pipeline-core 0.1.14__py3-none-any.whl → 0.2.1__py3-none-any.whl