ai-pipeline-core: 0.1.13 (py3-none-any.whl) → 0.2.0 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34)
  1. ai_pipeline_core/__init__.py +25 -14
  2. ai_pipeline_core/documents/__init__.py +2 -1
  3. ai_pipeline_core/documents/document.py +317 -49
  4. ai_pipeline_core/documents/document_list.py +136 -33
  5. ai_pipeline_core/documents/flow_document.py +8 -29
  6. ai_pipeline_core/documents/task_document.py +6 -27
  7. ai_pipeline_core/documents/temporary_document.py +6 -27
  8. ai_pipeline_core/documents/utils.py +64 -1
  9. ai_pipeline_core/flow/config.py +174 -5
  10. ai_pipeline_core/flow/options.py +2 -2
  11. ai_pipeline_core/llm/__init__.py +6 -1
  12. ai_pipeline_core/llm/ai_messages.py +14 -7
  13. ai_pipeline_core/llm/client.py +143 -55
  14. ai_pipeline_core/llm/model_options.py +20 -5
  15. ai_pipeline_core/llm/model_response.py +77 -29
  16. ai_pipeline_core/llm/model_types.py +38 -40
  17. ai_pipeline_core/logging/__init__.py +0 -2
  18. ai_pipeline_core/logging/logging_config.py +0 -6
  19. ai_pipeline_core/logging/logging_mixin.py +2 -10
  20. ai_pipeline_core/pipeline.py +68 -65
  21. ai_pipeline_core/prefect.py +12 -3
  22. ai_pipeline_core/prompt_manager.py +6 -7
  23. ai_pipeline_core/settings.py +13 -5
  24. ai_pipeline_core/simple_runner/__init__.py +1 -11
  25. ai_pipeline_core/simple_runner/cli.py +13 -12
  26. ai_pipeline_core/simple_runner/simple_runner.py +34 -172
  27. ai_pipeline_core/storage/__init__.py +8 -0
  28. ai_pipeline_core/storage/storage.py +628 -0
  29. ai_pipeline_core/tracing.py +110 -26
  30. {ai_pipeline_core-0.1.13.dist-info → ai_pipeline_core-0.2.0.dist-info}/METADATA +60 -23
  31. ai_pipeline_core-0.2.0.dist-info/RECORD +38 -0
  32. ai_pipeline_core-0.1.13.dist-info/RECORD +0 -36
  33. {ai_pipeline_core-0.1.13.dist-info → ai_pipeline_core-0.2.0.dist-info}/WHEEL +0 -0
  34. {ai_pipeline_core-0.1.13.dist-info → ai_pipeline_core-0.2.0.dist-info}/licenses/LICENSE +0 -0

ai_pipeline_core/documents/document_list.py

@@ -17,8 +17,8 @@ class DocumentList(list[Document]):

  Specialized list with validation and filtering for documents.

- Best Practice: Use default constructor in 90% of cases. Only enable
- validate_same_type or validate_duplicates when you explicitly need them.
+ Best Practice: Use default constructor by default, unless instructed otherwise.
+ Only enable validate_same_type or validate_duplicates when you explicitly need them.

  Example:
  >>> # RECOMMENDED - default constructor for most cases
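
A minimal sketch of the recommended pattern from the hunk above, assuming DocumentList is importable from ai_pipeline_core.documents, that validate_same_type and validate_duplicates are optional boolean keyword arguments (their exact signature is not shown in this diff), and that doc_a and doc_b are placeholder Document instances:

    from ai_pipeline_core.documents import DocumentList

    # Default constructor - the recommended path for most pipelines
    docs = DocumentList([doc_a, doc_b])

    # Opt-in validation, only when you explicitly need it
    strict = DocumentList([doc_a, doc_b], validate_same_type=True, validate_duplicates=True)
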
@@ -152,41 +152,109 @@ class DocumentList(list[Document]):
  def filter_by(self, arg: type[Document]) -> "DocumentList": ...

  @overload
- def filter_by(self, arg: list[type[Document]]) -> "DocumentList": ...
+ def filter_by(self, arg: Iterable[type[Document]]) -> "DocumentList": ...

- def filter_by(self, arg: str | type[Document] | list[type[Document]]) -> "DocumentList":
- """Filter documents by name or type(s).
+ @overload
+ def filter_by(self, arg: Iterable[str]) -> "DocumentList": ...
+
+ def filter_by(
+ self, arg: str | type[Document] | Iterable[type[Document]] | Iterable[str]
+ ) -> "DocumentList":
+ """Filter documents by name(s) or type(s).

  @public

+ ALWAYS returns a DocumentList (which may be empty), never raises an exception
+ for no matches. Use this when you want to process all matching documents.
+
  Args:
- arg: Document name (str), single document type, or list of document types.
+ arg: Can be one of:
+ - str: Single document name to filter by
+ - type[Document]: Single document type to filter by (includes subclasses)
+ - Iterable[type[Document]]: Multiple document types to filter by
+ (list, tuple, set, generator, or any iterable)
+ - Iterable[str]: Multiple document names to filter by
+ (list, tuple, set, generator, or any iterable)

  Returns:
- New DocumentList with filtered documents.
+ New DocumentList with filtered documents (may be empty).
+ - Returns ALL matching documents
+ - Empty DocumentList if no matches found

  Raises:
- TypeError: If arg is not a valid type (str, Document type, or list of Document types).
+ TypeError: If arg is not a valid type (not str, type, or iterable),
+ or if iterable contains mixed types (strings and types together).
+ AttributeError: If arg is expected to be iterable but doesn't support iteration.

  Example:
- >>> docs.filter_by("file.txt") # Filter by name
- >>> docs.filter_by(MyDocument) # Filter by type
- >>> docs.filter_by([Doc1, Doc2]) # Filter by multiple types
+ >>> # Returns list with all matching documents
+ >>> matching_docs = docs.filter_by("file.txt") # May be empty
+ >>> for doc in matching_docs:
+ ... process(doc)
+ >>>
+ >>> # Filter by type - returns all instances
+ >>> config_docs = docs.filter_by(ConfigDocument)
+ >>> print(f"Found {len(config_docs)} config documents")
+ >>>
+ >>> # Filter by multiple names
+ >>> important_docs = docs.filter_by(["config.yaml", "settings.json"])
+ >>> if not important_docs: # Check if empty
+ ... print("No important documents found")
  """
  if isinstance(arg, str):
- # Filter by name
+ # Filter by single name
  return DocumentList([doc for doc in self if doc.name == arg])
  elif isinstance(arg, type):
  # Filter by single type (including subclasses)
+ # The type system ensures arg is type[Document] due to overloads
  return DocumentList([doc for doc in self if isinstance(doc, arg)])
- elif isinstance(arg, list): # type: ignore[reportUnnecessaryIsInstance]
- # Filter by multiple types
- documents = DocumentList()
- for document_type in arg:
- documents.extend([doc for doc in self if isinstance(doc, document_type)])
- return documents
  else:
- raise TypeError(f"Invalid argument type for filter_by: {type(arg)}")
+ # Try to consume as iterable
+ try:
+ # Convert to list to check the first element and allow reuse
+ items = list(arg) # type: ignore[arg-type]
+ if not items:
+ return DocumentList()
+
+ first_item = items[0]
+ if isinstance(first_item, str):
+ # Iterable of names - validate all items are strings
+ for item in items:
+ if not isinstance(item, str):
+ raise TypeError(
+ "Iterable must contain only strings or only Document types, "
+ "not mixed types"
+ )
+ names_set = set(items)
+ return DocumentList([doc for doc in self if doc.name in names_set])
+ elif isinstance(first_item, type): # type: ignore[reportUnnecessaryIsInstance]
+ # Iterable of document types - validate all items are types
+ for item in items:
+ if not isinstance(item, type):
+ raise TypeError(
+ "Iterable must contain only strings or only Document types, "
+ "not mixed types"
+ )
+ # Convert to set for efficient lookup
+ types_set = set(items)
+ # Filter documents that match any of the requested types
+ matching = [
+ doc
+ for doc in self
+ if any(isinstance(doc, doc_type) for doc_type in types_set) # type: ignore[arg-type]
+ ]
+ return DocumentList(matching)
+ else:
+ raise TypeError(
+ f"Iterable must contain strings or Document types, "
+ f"got {type(first_item).__name__}"
+ )
+ except (TypeError, AttributeError) as e:
+ # If the error message already mentions Iterable, re-raise it
+ if "Iterable" in str(e) or "strings or Document types" in str(e):
+ raise
+ # Otherwise, provide a generic error message
+ raise TypeError(f"Invalid argument type for filter_by: {type(arg).__name__}") from e

  @overload
  def get_by(self, arg: str) -> Document: ...
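
Since filter_by now accepts any iterable of names or of Document types, a short usage sketch of the new call patterns (ConfigDocument and ReportDocument are hypothetical Document subclasses; the behavior follows the implementation shown in the hunk above):

    # A set of names behaves the same as a list of names
    selected = docs.filter_by({"config.yaml", "settings.json"})

    # A generator of Document types is consumed once and converted to a list internally
    reports = docs.filter_by(t for t in (ConfigDocument, ReportDocument))

    # Mixing names and types in one iterable raises TypeError, per the validation above
    try:
        docs.filter_by(["config.yaml", ConfigDocument])
    except TypeError as exc:
        print(exc)  # "Iterable must contain only strings or only Document types, not mixed types"
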
@@ -201,38 +269,73 @@ class DocumentList(list[Document]):
  def get_by(self, arg: type[Document], required: bool = True) -> Document | None: ...

  def get_by(self, arg: str | type[Document], required: bool = True) -> Document | None:
- """Get a single document by name or type.
+ """Get EXACTLY ONE document by name or type.

  @public

+ IMPORTANT: This method expects to find exactly one matching document.
+ - If no matches and required=True: raises ValueError
+ - If no matches and required=False: returns None
+ - If multiple matches: ALWAYS raises ValueError (ambiguous)
+
+ When required=True (default), you do NOT need to check for None:
+ >>> doc = docs.get_by("config.yaml") # Will raise if not found
+ >>> # No need for: if doc is not None <- This is redundant!
+ >>> print(doc.content) # Safe to use directly
+
  Args:
  arg: Document name (str) or document type.
- required: If True, raises ValueError when not found. If False, returns None.
+ required: If True (default), raises ValueError when not found.
+ If False, returns None when not found.

  Returns:
- The first matching document, or None if not found and required=False.
+ The single matching document, or None if not found and required=False.

  Raises:
- ValueError: If required=True and document not found.
+ ValueError: If required=True and document not found, OR if multiple
+ documents match (ambiguous result).
  TypeError: If arg is not a string or Document type.

  Example:
- >>> doc = docs.get_by("file.txt") # Get by name, raises if not found
- >>> doc = docs.get_by(MyDocument, required=False) # Returns None if not found
+ >>> # CORRECT - No need to check for None when required=True (default)
+ >>> doc = docs.get_by("file.txt") # Raises if not found
+ >>> print(doc.content) # Safe to use directly
+ >>>
+ >>> # When using required=False, check for None
+ >>> doc = docs.get_by("optional.txt", required=False)
+ >>> if doc is not None:
+ ... print(doc.content)
+ >>>
+ >>> # Will raise if multiple documents have same type
+ >>> # Use filter_by() instead if you want all matches
+ >>> try:
+ ... doc = docs.get_by(ConfigDocument) # Error if 2+ configs
+ >>> except ValueError as e:
+ ... configs = docs.filter_by(ConfigDocument) # Get all instead
  """
  if isinstance(arg, str):
- # Get by name
- for doc in self:
- if doc.name == arg:
- return doc
+ # Get by name - collect all matches to check for duplicates
+ matches = [doc for doc in self if doc.name == arg]
+ if len(matches) > 1:
+ raise ValueError(
+ f"Multiple documents found with name '{arg}'. "
+ f"Found {len(matches)} matches. Use filter_by() to get all matches."
+ )
+ if matches:
+ return matches[0]
  if required:
  raise ValueError(f"Document with name '{arg}' not found")
  return None
  elif isinstance(arg, type): # type: ignore[reportUnnecessaryIsInstance]
- # Get by type (including subclasses)
- for doc in self:
- if isinstance(doc, arg):
- return doc
+ # Get by type (including subclasses) - collect all matches
+ matches = [doc for doc in self if isinstance(doc, arg)]
+ if len(matches) > 1:
+ raise ValueError(
+ f"Multiple documents found of type '{arg.__name__}'. "
+ f"Found {len(matches)} matches. Use filter_by() to get all matches."
+ )
+ if matches:
+ return matches[0]
  if required:
  raise ValueError(f"Document of type '{arg.__name__}' not found")
  return None
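
A short sketch contrasting the tightened get_by semantics with filter_by (ConfigDocument is again a hypothetical Document subclass):

    # Exactly one match expected - no None check needed with required=True (the default)
    settings = docs.get_by("settings.json")

    # Optional lookup - returns None instead of raising
    readme = docs.get_by("README.md", required=False)

    # Ambiguous lookups now raise ValueError; fall back to filter_by for all matches
    try:
        config = docs.get_by(ConfigDocument)
    except ValueError:
        configs = docs.filter_by(ConfigDocument)
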

ai_pipeline_core/documents/flow_document.py

@@ -24,40 +24,20 @@ class FlowDocument(Document):
  - Persisted to file system between pipeline steps
  - Survives across multiple flow runs
  - Used for flow inputs and outputs
- - Saved in directories named after the document's canonical name
+ - Saved in directories organized by the document's type/name

  Creating FlowDocuments:
- **Use the `create` classmethod** for most use cases. It handles automatic
- conversion of various content types. Only use __init__ when you have bytes.
-
- >>> from enum import StrEnum
- >>>
- >>> # Simple document with pass:
- >>> class MyDoc(FlowDocument):
- ... pass
- >>>
- >>> # Document with restricted file names:
- >>> class ConfigDoc(FlowDocument):
- ... class FILES(StrEnum):
- ... CONFIG = "config.yaml"
- ... SETTINGS = "settings.json"
- >>>
- >>> # RECOMMENDED - automatic conversion:
- >>> doc = MyDoc.create(name="data.json", content={"key": "value"})
- >>> doc = ConfigDoc.create(name="config.yaml", content={"host": "localhost"})
+ Same as Document - use `create()` for automatic conversion, `__init__` for bytes.
+ See Document.create() for detailed usage examples.

  Persistence:
- Documents are saved to: {output_dir}/{canonical_name}/{filename}
+ Documents are saved under an output directory path associated with the document's type/name.
  For example: output/my_doc/data.json

  Note:
  - Cannot instantiate FlowDocument directly - must subclass
  - Used with FlowConfig to define flow input/output types
  - No additional abstract methods to implement
-
- See Also:
- TaskDocument: For temporary documents within task execution
- TemporaryDocument: For documents that are never persisted
  """

  def __init__(
@@ -66,13 +46,11 @@ class FlowDocument(Document):
  name: str,
  content: bytes,
  description: str | None = None,
+ sources: list[str] = [],
  ) -> None:
  """Initialize a FlowDocument with raw bytes content.

- Important:
- **Most users should use the `create` classmethod instead of __init__.**
- The create method provides automatic content conversion for various types
- (str, dict, list, Pydantic models) while __init__ only accepts bytes.
+ See Document.__init__() for parameter details and usage notes.

  Prevents direct instantiation of the abstract FlowDocument class.
  FlowDocument must be subclassed for specific document types.
@@ -81,6 +59,7 @@ class FlowDocument(Document):
  name: Document filename (required, keyword-only)
  content: Document content as raw bytes (required, keyword-only)
  description: Optional human-readable description (keyword-only)
+ sources: Optional list of strings for provenance tracking

  Raises:
  TypeError: If attempting to instantiate FlowDocument directly
@@ -109,7 +88,7 @@ class FlowDocument(Document):
  """
  if type(self) is FlowDocument:
  raise TypeError("Cannot instantiate abstract FlowDocument class directly")
- super().__init__(name=name, content=content, description=description)
+ super().__init__(name=name, content=content, description=description, sources=sources)

  @final
  def get_base_type(self) -> Literal["flow"]:
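
The main FlowDocument API change in 0.2.0 is the new `sources` parameter, which is forwarded to Document.__init__ for provenance tracking. A minimal sketch of a subclass using it, assuming FlowDocument is importable from ai_pipeline_core.documents; the subclass name and the provenance string are illustrative, and `create()` remains the recommended path when the content is not already bytes:

    from ai_pipeline_core.documents import FlowDocument

    class SummaryDocument(FlowDocument):
        pass

    doc = SummaryDocument(
        name="summary.json",
        content=b'{"summary": "..."}',                         # __init__ takes raw bytes
        sources=["https://example.com/source-report"],         # provenance strings
    )
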

ai_pipeline_core/documents/task_document.py

@@ -29,24 +29,8 @@ class TaskDocument(Document):
  - Reduces persistent I/O for temporary data

  Creating TaskDocuments:
- **Use the `create` classmethod** for most use cases. It handles automatic
- conversion of various content types. Only use __init__ when you have bytes.
-
- >>> from enum import StrEnum
- >>>
- >>> # Simple task document:
- >>> class TempDoc(TaskDocument):
- ... pass
- >>>
- >>> # With restricted files:
- >>> class CacheDoc(TaskDocument):
- ... class FILES(StrEnum):
- ... CACHE = "cache.json"
- ... INDEX = "index.dat"
- >>>
- >>> # RECOMMENDED - automatic conversion:
- >>> doc = TempDoc.create(name="temp.json", content={"status": "processing"})
- >>> doc = CacheDoc.create(name="cache.json", content={"data": [1, 2, 3]})
+ Same as Document - use `create()` for automatic conversion, `__init__` for bytes.
+ See Document.create() for detailed usage examples.

  Use Cases:
  - Intermediate transformation results
@@ -59,10 +43,6 @@ class TaskDocument(Document):
  - Not saved by simple_runner utilities
  - Reduces I/O overhead for temporary data
  - No additional abstract methods to implement
-
- See Also:
- FlowDocument: For documents that persist across flow runs
- TemporaryDocument: Alternative for non-persistent documents
  """

  def __init__(
@@ -71,13 +51,11 @@ class TaskDocument(Document):
  name: str,
  content: bytes,
  description: str | None = None,
+ sources: list[str] = [],
  ) -> None:
  """Initialize a TaskDocument with raw bytes content.

- Important:
- **Most users should use the `create` classmethod instead of __init__.**
- The create method provides automatic content conversion for various types
- (str, dict, list, Pydantic models) while __init__ only accepts bytes.
+ See Document.__init__() for parameter details and usage notes.

  Prevents direct instantiation of the abstract TaskDocument class.
  TaskDocument must be subclassed for specific temporary document types.
@@ -86,6 +64,7 @@ class TaskDocument(Document):
  name: Document filename (required, keyword-only)
  content: Document content as raw bytes (required, keyword-only)
  description: Optional human-readable description (keyword-only)
+ sources: Optional list of strings for provenance tracking

  Raises:
  TypeError: If attempting to instantiate TaskDocument directly
@@ -114,7 +93,7 @@ class TaskDocument(Document):
  """
  if type(self) is TaskDocument:
  raise TypeError("Cannot instantiate abstract TaskDocument class directly")
- super().__init__(name=name, content=content, description=description)
+ super().__init__(name=name, content=content, description=description, sources=sources)

  @final
  def get_base_type(self) -> Literal["task"]:

ai_pipeline_core/documents/temporary_document.py

@@ -1,7 +1,5 @@
  """Temporary document implementation for non-persistent data.

- @public
-
  This module provides the TemporaryDocument class for documents that
  are never persisted, regardless of context.
  """
@@ -15,8 +13,6 @@ from .document import Document
  class TemporaryDocument(Document):
  r"""Concrete document class for data that is never persisted.

- @public
-
  TemporaryDocument is a final (non-subclassable) document type for
  data that should never be saved to disk, regardless of whether it's
  used in a flow or task context. Unlike FlowDocument and TaskDocument
@@ -28,27 +24,14 @@ class TemporaryDocument(Document):
  - Cannot be subclassed (annotated with Python's @final decorator in code)
  - Useful for transient data like API responses or intermediate calculations
  - Ignored by simple_runner save operations
+ - Useful for tests and debugging

  Creating TemporaryDocuments:
- **Use the `create` classmethod** for most use cases. It handles automatic
- conversion of various content types. Only use __init__ when you have bytes.
-
- >>> # RECOMMENDED - automatic conversion:
- >>> doc = TemporaryDocument.create(
- ... name="api_response.json",
- ... content={"status": "ok", "data": [1, 2, 3]}
- ... )
- >>> doc = TemporaryDocument.create(
- ... name="credentials.txt",
- ... content="secret_token_xyz"
- ... )
- >>>
- >>> # Direct constructor - only for bytes:
- >>> doc = TemporaryDocument(
- ... name="binary.dat",
- ... content=b"\x00\x01\x02"
- ... )
- >>>
+ Same as Document - use `create()` for automatic conversion, `__init__` for bytes.
+ Unlike abstract document types, TemporaryDocument can be instantiated directly.
+ See Document.create() for detailed usage examples.
+
+ >>> doc = TemporaryDocument.create(name="api.json", content={"status": "ok"})
  >>> doc.is_temporary # Always True

  Use Cases:
@@ -62,10 +45,6 @@ class TemporaryDocument(Document):
  - This is a final class and cannot be subclassed
  - Use when you explicitly want to prevent persistence
  - Useful for sensitive data that shouldn't be written to disk
-
- See Also:
- FlowDocument: For documents that persist across flow runs
- TaskDocument: For documents temporary within task execution
  """

  def __init_subclass__(cls, **kwargs: Any) -> None:

ai_pipeline_core/documents/utils.py

@@ -1,7 +1,7 @@
  """Utility functions for document handling.

  Provides helper functions for URL sanitization, naming conventions,
- and canonical key generation used throughout the document system.
+ canonical key generation, and hash validation used throughout the document system.
  """

  import re
@@ -115,3 +115,66 @@ def canonical_name_key(
  break

  return camel_to_snake(name)
+
+
+ def is_document_sha256(value: str) -> bool:
+ """Check if a string is a valid base32-encoded SHA256 hash with proper entropy.
+
+ @public
+
+ This function validates that a string is not just formatted like a SHA256 hash,
+ but actually has the entropy characteristics of a real hash. It checks:
+ 1. Correct length (52 characters without padding)
+ 2. Valid base32 characters (A-Z, 2-7)
+ 3. Sufficient entropy (at least 8 unique characters)
+
+ The entropy check prevents false positives like 'AAAAAAA...AAA' from being
+ identified as valid document hashes.
+
+ Args:
+ value: String to check if it's a document SHA256 hash.
+
+ Returns:
+ True if the string appears to be a real base32-encoded SHA256 hash,
+ False otherwise.
+
+ Examples:
+ >>> # Real SHA256 hash
+ >>> is_document_sha256("P3AEMA2PSYILKFYVBUALJLMIYWVZIS2QDI3S5VTMD2X7SOODF2YQ")
+ True
+
+ >>> # Too uniform - lacks entropy
+ >>> is_document_sha256("A" * 52)
+ False
+
+ >>> # Wrong length
+ >>> is_document_sha256("ABC123")
+ False
+
+ >>> # Invalid characters
+ >>> is_document_sha256("a" * 52) # lowercase
+ False
+ """
+ # Check basic format: exactly 52 uppercase base32 characters
+ try:
+ if not value or len(value) != 52:
+ return False
+ except (TypeError, AttributeError):
+ return False
+
+ # Check if all characters are valid base32 (A-Z, 2-7)
+ try:
+ if not re.match(r"^[A-Z2-7]{52}$", value):
+ return False
+ except TypeError:
+ # re.match raises TypeError for non-string types like bytes
+ return False
+
+ # Check entropy: real SHA256 hashes have high entropy
+ # Require at least 8 unique characters (out of 32 possible in base32)
+ # This prevents patterns like "AAAAAAA..." from being identified as real hashes
+ unique_chars = len(set(value))
+ if unique_chars < 8:
+ return False
+
+ return True
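
The 52-character requirement matches an unpadded base32 encoding of a 32-byte SHA-256 digest. A quick validation sketch; hashing raw bytes here illustrates the expected format only and is not necessarily how Document computes its own hash:

    import base64
    import hashlib

    from ai_pipeline_core.documents.utils import is_document_sha256

    digest = hashlib.sha256(b"example content").digest()            # 32 bytes
    encoded = base64.b32encode(digest).decode("ascii").rstrip("=")  # 52 chars of A-Z, 2-7

    print(len(encoded))                  # 52
    print(is_document_sha256(encoded))   # True - real hash with enough entropy
    print(is_document_sha256("A" * 52))  # False - fails the entropy check
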