PyPI - ai-pipeline-core - Versions diffs - 0.1.12__py3-none-any.whl → 0.1.14__py3-none-any.whl - Mend

ai-pipeline-core 0.1.12__py3-none-any.whl → 0.1.14__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (24)

ai_pipeline_core/__init__.py +5 -2
ai_pipeline_core/documents/__init__.py +2 -1
ai_pipeline_core/documents/document.py +239 -14
ai_pipeline_core/documents/document_list.py +72 -16
ai_pipeline_core/documents/flow_document.py +6 -23
ai_pipeline_core/documents/task_document.py +6 -23
ai_pipeline_core/documents/temporary_document.py +5 -19
ai_pipeline_core/documents/utils.py +64 -1
ai_pipeline_core/flow/options.py +2 -2
ai_pipeline_core/llm/__init__.py +5 -0
ai_pipeline_core/llm/ai_messages.py +0 -3
ai_pipeline_core/llm/client.py +50 -19
ai_pipeline_core/llm/model_options.py +18 -0
ai_pipeline_core/llm/model_response.py +62 -15
ai_pipeline_core/llm/model_types.py +38 -36
ai_pipeline_core/pipeline.py +28 -2
ai_pipeline_core/settings.py +4 -0
ai_pipeline_core/simple_runner/simple_runner.py +18 -1
ai_pipeline_core/tracing.py +115 -7
{ai_pipeline_core-0.1.12.dist-info → ai_pipeline_core-0.1.14.dist-info}/METADATA +42 -7
ai_pipeline_core-0.1.14.dist-info/RECORD +36 -0
ai_pipeline_core-0.1.12.dist-info/RECORD +0 -36
{ai_pipeline_core-0.1.12.dist-info → ai_pipeline_core-0.1.14.dist-info}/WHEEL +0 -0
{ai_pipeline_core-0.1.12.dist-info → ai_pipeline_core-0.1.14.dist-info}/licenses/LICENSE +0 -0

ai_pipeline_core/__init__.py CHANGED Viewed

@@ -88,6 +88,7 @@ from .documents import (
     TaskDocument,
     TemporaryDocument,
     canonical_name_key,
+    is_document_sha256,
     sanitize_url,
 )
 from .flow import FlowConfig, FlowOptions
@@ -111,9 +112,9 @@ from .pipeline import pipeline_flow, pipeline_task
 from .prefect import disable_run_logger, prefect_test_harness
 from .prompt_manager import PromptManager
 from .settings import Settings
-from .tracing import TraceInfo, TraceLevel, trace
+from .tracing import TraceInfo, TraceLevel, set_trace_cost, trace
-__version__ = "0.1.12"
+__version__ = "0.1.14"
 __all__ = [
     # Config/Settings
@@ -132,6 +133,7 @@ __all__ = [
     "TaskDocument",
     "TemporaryDocument",
     "canonical_name_key",
+    "is_document_sha256",
     "sanitize_url",
     # Flow/Task
     "FlowConfig",
@@ -154,6 +156,7 @@ __all__ = [
     "trace",
     "TraceLevel",
     "TraceInfo",
+    "set_trace_cost",
     # Utils
     "PromptManager",
 ]

ai_pipeline_core/documents/__init__.py CHANGED Viewed

@@ -12,7 +12,7 @@ from .document_list import DocumentList
 from .flow_document import FlowDocument
 from .task_document import TaskDocument
 from .temporary_document import TemporaryDocument
-from .utils import canonical_name_key, sanitize_url
+from .utils import canonical_name_key, is_document_sha256, sanitize_url
 __all__ = [
     "Document",
@@ -21,5 +21,6 @@ __all__ = [
     "TaskDocument",
     "TemporaryDocument",
     "canonical_name_key",
+    "is_document_sha256",
     "sanitize_url",
 ]

ai_pipeline_core/documents/document.py CHANGED Viewed

@@ -6,6 +6,8 @@ This module provides the core document abstraction for working with various type
 in AI pipelines. Documents are immutable Pydantic models that wrap binary content with metadata.
 """
+from __future__ import annotations
 import base64
 import hashlib
 import json
@@ -30,13 +32,14 @@ from typing import (
 from pydantic import (
     BaseModel,
     ConfigDict,
+    Field,
     ValidationInfo,
     field_serializer,
     field_validator,
 )
 from ruamel.yaml import YAML
-from ai_pipeline_core.documents.utils import canonical_name_key
+from ai_pipeline_core.documents.utils import canonical_name_key, is_document_sha256
 from ai_pipeline_core.exceptions import DocumentNameError, DocumentSizeError
 from .mime_type import (
@@ -94,6 +97,7 @@ class Document(BaseModel, ABC):
     - SHA256 hashing for deduplication
     - Support for text, JSON, YAML, PDF, and image formats
     - Conversion utilities between different formats
+    - Source provenance tracking via sources field
     Class Variables:
         MAX_CONTENT_SIZE: Maximum allowed content size in bytes (default 25MB)
@@ -102,6 +106,7 @@ class Document(BaseModel, ABC):
         name: Document filename (validated for security)
         description: Optional human-readable description
         content: Raw document content as bytes
+        sources: List of source references tracking document provenance
     Creating Documents:
         **Use the `create` classmethod** for most use cases. It accepts various
@@ -117,7 +122,7 @@ class Document(BaseModel, ABC):
     Warning:
         - Document subclasses should NOT start with 'Test' prefix (pytest conflict)
         - Cannot instantiate Document directly - must subclass FlowDocument or TaskDocument
-        - Cannot add custom fields - only name, description, content are allowed
+        - Cannot add custom fields - only name, description, content, sources are allowed
         - Document is an abstract class and cannot be instantiated directly
     Metadata Attachment Patterns:
@@ -145,6 +150,15 @@ class Document(BaseModel, ABC):
         >>> doc = MyDocument.create(name="data.json", content={"key": "value"})
         >>> print(doc.is_text)  # True
         >>> data = doc.as_json()  # {'key': 'value'}
+        >>>
+        >>> # Track document provenance with sources
+        >>> source_doc = MyDocument.create(name="input.txt", content="raw data")
+        >>> processed = MyDocument.create(
+        ...     name="output.txt",
+        ...     content="processed data",
+        ...     sources=[source_doc.sha256]  # Reference source document
+        ... )
+        >>> processed.has_source(source_doc)  # True
     """
     MAX_CONTENT_SIZE: ClassVar[int] = 25 * 1024 * 1024
@@ -193,7 +207,7 @@ class Document(BaseModel, ABC):
                 )
         # Check that the Document's model_fields only contain the allowed fields
         # It prevents AI models from adding additional fields to documents
-        allowed = {"name", "description", "content"}
+        allowed = {"name", "description", "content", "sources"}
         current = set(getattr(cls, "model_fields", {}).keys())
         extras = current - allowed
         if extras:
@@ -204,25 +218,58 @@ class Document(BaseModel, ABC):
     @overload
     @classmethod
-    def create(cls, *, name: str, content: bytes, description: str | None = None) -> Self: ...
+    def create(
+        cls,
+        *,
+        name: str,
+        content: bytes,
+        description: str | None = None,
+        sources: list[str] = [],
+    ) -> Self: ...
     @overload
     @classmethod
-    def create(cls, *, name: str, content: str, description: str | None = None) -> Self: ...
+    def create(
+        cls,
+        *,
+        name: str,
+        content: str,
+        description: str | None = None,
+        sources: list[str] = [],
+    ) -> Self: ...
     @overload
     @classmethod
     def create(
-        cls, *, name: str, content: dict[str, Any], description: str | None = None
+        cls,
+        *,
+        name: str,
+        content: dict[str, Any],
+        description: str | None = None,
+        sources: list[str] = [],
     ) -> Self: ...
     @overload
     @classmethod
-    def create(cls, *, name: str, content: list[Any], description: str | None = None) -> Self: ...
+    def create(
+        cls,
+        *,
+        name: str,
+        content: list[Any],
+        description: str | None = None,
+        sources: list[str] = [],
+    ) -> Self: ...
     @overload
     @classmethod
-    def create(cls, *, name: str, content: BaseModel, description: str | None = None) -> Self: ...
+    def create(
+        cls,
+        *,
+        name: str,
+        content: BaseModel,
+        description: str | None = None,
+        sources: list[str] = [],
+    ) -> Self: ...
     @classmethod
     def create(
@@ -231,6 +278,7 @@ class Document(BaseModel, ABC):
         name: str,
         content: str | bytes | dict[str, Any] | list[Any] | BaseModel,
         description: str | None = None,
+        sources: list[str] = [],
     ) -> Self:
         r"""Create a Document with automatic content type conversion (recommended).
@@ -260,6 +308,11 @@ class Document(BaseModel, ABC):
                 - BaseModel: Serialized to JSON or YAML based on extension
             description: Optional description - USUALLY OMIT THIS (defaults to None).
                         Only use when meaningful metadata helps downstream processing
+            sources: Optional list of source strings (document SHA256 hashes or references).
+                    Used to track what sources contributed to creating this document.
+                    Can contain document SHA256 hashes (for referencing other documents)
+                    or arbitrary reference strings (URLs, file paths, descriptions).
+                    Defaults to empty list
         Returns:
             New Document instance with content converted to bytes
@@ -306,11 +359,31 @@ class Document(BaseModel, ABC):
             >>> items = ["Section 1", "Section 2"]
             >>> doc = MyDocument.create(name="sections.md", content=items)
             >>> doc.parse(list)  # ["Section 1", "Section 2"]
+            >>> # Document with sources for provenance tracking
+            >>> source_doc = MyDocument.create(name="source.txt", content="original")
+            >>> derived = MyDocument.create(
+            ...     name="result.txt",
+            ...     content="processed",
+            ...     sources=[source_doc.sha256, "https://api.example.com/data"]
+            ... )
+            >>> derived.get_source_documents()  # [source_doc.sha256]
+            >>> derived.get_source_references()  # ["https://api.example.com/data"]
         """
         # Use model_validate to leverage the existing validator logic
-        temp = cls.model_validate({"name": name, "content": content, "description": description})
+        temp = cls.model_validate({
+            "name": name,
+            "content": content,
+            "description": description,
+            "sources": sources,
+        })
         # Now construct with type-checker-friendly call (bytes only)
-        return cls(name=temp.name, content=temp.content, description=temp.description)
+        return cls(
+            name=temp.name,
+            content=temp.content,
+            description=temp.description,
+            sources=temp.sources,
+        )
     def __init__(
         self,
@@ -318,6 +391,7 @@ class Document(BaseModel, ABC):
         name: str,
         content: bytes,
         description: str | None = None,
+        sources: list[str] = [],
     ) -> None:
         """Initialize a Document instance with raw bytes content.
@@ -335,6 +409,10 @@ class Document(BaseModel, ABC):
             name: Document filename (required, keyword-only)
             content: Document content as raw bytes (required, keyword-only)
             description: Optional human-readable description (keyword-only)
+            sources: Optional list of source strings for provenance tracking.
+                    Can contain document SHA256 hashes (for referencing other documents)
+                    or arbitrary reference strings (URLs, file paths, descriptions).
+                    Defaults to empty list
         Raises:
             TypeError: If attempting to instantiate Document directly.
@@ -357,11 +435,17 @@ class Document(BaseModel, ABC):
         if type(self) is Document:
             raise TypeError("Cannot instantiate abstract Document class directly")
-        super().__init__(name=name, content=content, description=description)
+        super().__init__(name=name, content=content, description=description, sources=sources)
     name: str
     description: str | None = None
     content: bytes  # Note: constructor accepts str | bytes, but field stores bytes only
+    sources: list[str] = Field(
+        default_factory=list,
+        description="List of source references for tracking document provenance. "
+        "Can contain document SHA256 hashes (for referencing other documents) "
+        "or arbitrary reference strings (URLs, file paths, descriptions)",
+    )
     # Pydantic configuration
     model_config = ConfigDict(
@@ -795,7 +879,7 @@ class Document(BaseModel, ABC):
             This is computed once and cached for performance.
             The hash is deterministic based on content only.
         """
-        return b32encode(hashlib.sha256(self.content).digest()).decode("ascii").upper()
+        return b32encode(hashlib.sha256(self.content).digest()).decode("ascii").upper().rstrip("=")
     @final
     @property
@@ -1215,6 +1299,144 @@ class Document(BaseModel, ABC):
         raise ValueError(f"Unsupported type {type_} for file {self.name}")
+    def get_source_documents(self) -> list[str]:
+        """Get list of document SHA256 hashes referenced as sources.
+        Retrieves all document references from this document's sources list,
+        filtering for valid SHA256 hashes that reference other documents.
+        This is useful for building dependency graphs and tracking document
+        lineage in processing pipelines.
+        Returns:
+            List of SHA256 hashes (base32 encoded) for documents referenced
+            as sources. Each hash uniquely identifies another document that
+            contributed to creating this one.
+        Example:
+            >>> # Create a derived document from multiple sources
+            >>> source1 = MyDocument.create(name="data1.txt", content="First")
+            >>> source2 = MyDocument.create(name="data2.txt", content="Second")
+            >>>
+            >>> merged = MyDocument.create(
+            ...     name="merged.txt",
+            ...     content="Combined data",
+            ...     sources=[source1.sha256, source2.sha256, "https://api.example.com"]
+            ... )
+            >>>
+            >>> # Get only document references (not URLs)
+            >>> doc_refs = merged.get_source_documents()
+            >>> print(doc_refs)  # [source1.sha256, source2.sha256]
+            >>>
+            >>> # Check if specific document is a source
+            >>> if source1.sha256 in doc_refs:
+            ...     print("Document derived from source1")
+        See Also:
+            - get_source_references: Get non-document source references (URLs, etc.)
+            - has_source: Check if a specific source is tracked
+            - Document.create: Add sources when creating documents
+        """
+        return [src for src in self.sources if is_document_sha256(src)]
+    def get_source_references(self) -> list[str]:
+        """Get list of arbitrary reference strings from sources.
+        Retrieves all non-document references from this document's sources list.
+        These are typically URLs, file paths, API endpoints, or descriptive strings
+        that indicate where the document's content originated from, but are not
+        references to other documents in the pipeline.
+        Returns:
+            List of reference strings that are not document SHA256 hashes.
+            Can include URLs, file paths, API endpoints, dataset names,
+            or any other string that provides source context.
+        Example:
+            >>> # Create document with mixed source types
+            >>> doc = MyDocument.create(
+            ...     name="report.txt",
+            ...     content="Analysis results",
+            ...     sources=[
+            ...         other_doc.sha256,  # Document reference
+            ...         "https://api.example.com/data",  # API URL
+            ...         "dataset:customer-2024",  # Dataset identifier
+            ...         "/path/to/source.csv",  # File path
+            ...     ]
+            ... )
+            >>>
+            >>> # Get only non-document references
+            >>> refs = doc.get_source_references()
+            >>> print(refs)
+            >>> # ["https://api.example.com/data", "dataset:customer-2024", "/path/to/source.csv"]
+            >>>
+            >>> # Use for attribution or debugging
+            >>> for ref in refs:
+            ...     print(f"Data sourced from: {ref}")
+        See Also:
+            - get_source_documents: Get document SHA256 references
+            - has_source: Check if a specific source is tracked
+            - Document.create: Add sources when creating documents
+        """
+        return [src for src in self.sources if not is_document_sha256(src)]
+    def has_source(self, source: Document | str) -> bool:
+        """Check if a specific source is tracked for this document.
+        Verifies whether a given source (document or reference string) is
+        included in this document's sources list. Useful for dependency
+        checking, lineage verification, and conditional processing based
+        on document origins.
+        Args:
+            source: Source to check for. Can be:
+                    - Document: Checks if document's SHA256 is in sources
+                    - str: Checks if exact string is in sources (hash or reference)
+        Returns:
+            True if the source is tracked in this document's sources,
+            False otherwise.
+        Raises:
+            TypeError: If source is not a Document or string.
+        Example:
+            >>> # Check if document was derived from specific source
+            >>> source_doc = MyDocument.create(name="original.txt", content="Data")
+            >>> api_url = "https://api.example.com/data"
+            >>>
+            >>> derived = MyDocument.create(
+            ...     name="processed.txt",
+            ...     content="Processed data",
+            ...     sources=[source_doc.sha256, api_url]
+            ... )
+            >>>
+            >>> # Check document source
+            >>> if derived.has_source(source_doc):
+            ...     print("Derived from source_doc")
+            >>>
+            >>> # Check string reference
+            >>> if derived.has_source(api_url):
+            ...     print("Data from API")
+            >>>
+            >>> # Check by SHA256 directly
+            >>> if derived.has_source(source_doc.sha256):
+            ...     print("Has specific hash")
+        See Also:
+            - get_source_documents: Get all document sources
+            - get_source_references: Get all reference sources
+            - Document.create: Add sources when creating documents
+        """
+        if isinstance(source, str):
+            # Direct string comparison
+            return source in self.sources
+        elif isinstance(source, Document):  # type: ignore[misc]
+            # Check if document's SHA256 is in sources
+            return source.sha256 in self.sources
+        else:
+            raise TypeError(f"Invalid source type: {type(source)}")
     @final
     def serialize_model(self) -> dict[str, Any]:
         """Serialize document to dictionary for storage or transmission.
@@ -1230,8 +1452,9 @@ class Document(BaseModel, ABC):
                 - base_type: Persistence type - "flow", "task", or "temporary" (str)
                 - size: Content size in bytes (int)
                 - id: Short hash identifier, first 6 chars of SHA256 (str)
-                - sha256: Full SHA256 hash in base32 encoding (str)
+                - sha256: Full SHA256 hash in base32 encoding without padding (str)
                 - mime_type: Detected MIME type (str)
+                - sources: List of source strings (list[dict])
                 - content: Encoded content (str)
                 - content_encoding: Either "utf-8" or "base64" (str)
@@ -1254,6 +1477,7 @@ class Document(BaseModel, ABC):
             "id": self.id,
             "sha256": self.sha256,
             "mime_type": self.mime_type,
+            "sources": self.sources,
         }
         # Try to encode content as UTF-8, fall back to base64
@@ -1288,6 +1512,7 @@ class Document(BaseModel, ABC):
                 Optional keys:
                 - description: Document description (str | None)
                 - content_encoding: "utf-8" or "base64" (defaults to "utf-8")
+                - sources: List of source strings
         Returns:
             New Document instance with restored content.
@@ -1326,9 +1551,9 @@ class Document(BaseModel, ABC):
         else:
             raise ValueError(f"Invalid content type: {type(content_raw)}")
-        # Create document with the required fields
         return cls(
             name=data["name"],
             content=content,
             description=data.get("description"),
+            sources=data.get("sources", []),
         )

ai_pipeline_core/documents/document_list.py CHANGED Viewed

@@ -152,41 +152,97 @@ class DocumentList(list[Document]):
     def filter_by(self, arg: type[Document]) -> "DocumentList": ...
     @overload
-    def filter_by(self, arg: list[type[Document]]) -> "DocumentList": ...
+    def filter_by(self, arg: Iterable[type[Document]]) -> "DocumentList": ...
-    def filter_by(self, arg: str | type[Document] | list[type[Document]]) -> "DocumentList":
-        """Filter documents by name or type(s).
+    @overload
+    def filter_by(self, arg: Iterable[str]) -> "DocumentList": ...
+    def filter_by(
+        self, arg: str | type[Document] | Iterable[type[Document]] | Iterable[str]
+    ) -> "DocumentList":
+        """Filter documents by name(s) or type(s).
         @public
         Args:
-            arg: Document name (str), single document type, or list of document types.
+            arg: Can be one of:
+                - str: Single document name to filter by
+                - type[Document]: Single document type to filter by (includes subclasses)
+                - Iterable[type[Document]]: Multiple document types to filter by
+                  (list, tuple, set, generator, or any iterable)
+                - Iterable[str]: Multiple document names to filter by
+                  (list, tuple, set, generator, or any iterable)
         Returns:
             New DocumentList with filtered documents.
         Raises:
-            TypeError: If arg is not a valid type (str, Document type, or list of Document types).
+            TypeError: If arg is not a valid type (not str, type, or iterable),
+                or if iterable contains mixed types (strings and types together).
+            AttributeError: If arg is expected to be iterable but doesn't support iteration.
         Example:
-            >>> docs.filter_by("file.txt")  # Filter by name
-            >>> docs.filter_by(MyDocument)  # Filter by type
-            >>> docs.filter_by([Doc1, Doc2])  # Filter by multiple types
+            >>> docs.filter_by("file.txt")  # Filter by single name
+            >>> docs.filter_by(MyDocument)  # Filter by single type
+            >>> docs.filter_by([Doc1, Doc2])  # Filter by multiple types (list)
+            >>> docs.filter_by({"file1.txt", "file2.txt"})  # Filter by multiple names (set)
+            >>> docs.filter_by((SubDoc, AnotherDoc))  # Filter by multiple types (tuple)
+            >>> docs.filter_by(name for name in ["a.txt", "b.txt"])  # Generator expression
         """
         if isinstance(arg, str):
-            # Filter by name
+            # Filter by single name
             return DocumentList([doc for doc in self if doc.name == arg])
         elif isinstance(arg, type):
             # Filter by single type (including subclasses)
+            # The type system ensures arg is type[Document] due to overloads
             return DocumentList([doc for doc in self if isinstance(doc, arg)])
-        elif isinstance(arg, list):  # type: ignore[reportUnnecessaryIsInstance]
-            # Filter by multiple types
-            documents = DocumentList()
-            for document_type in arg:
-                documents.extend([doc for doc in self if isinstance(doc, document_type)])
-            return documents
         else:
-            raise TypeError(f"Invalid argument type for filter_by: {type(arg)}")
+            # Try to consume as iterable
+            try:
+                # Convert to list to check the first element and allow reuse
+                items = list(arg)  # type: ignore[arg-type]
+                if not items:
+                    return DocumentList()
+                first_item = items[0]
+                if isinstance(first_item, str):
+                    # Iterable of names - validate all items are strings
+                    for item in items:
+                        if not isinstance(item, str):
+                            raise TypeError(
+                                "Iterable must contain only strings or only Document types, "
+                                "not mixed types"
+                            )
+                    names_set = set(items)
+                    return DocumentList([doc for doc in self if doc.name in names_set])
+                elif isinstance(first_item, type):  # type: ignore[reportUnnecessaryIsInstance]
+                    # Iterable of document types - validate all items are types
+                    for item in items:
+                        if not isinstance(item, type):
+                            raise TypeError(
+                                "Iterable must contain only strings or only Document types, "
+                                "not mixed types"
+                            )
+                    # Convert to set for efficient lookup
+                    types_set = set(items)
+                    # Filter documents that match any of the requested types
+                    matching = [
+                        doc
+                        for doc in self
+                        if any(isinstance(doc, doc_type) for doc_type in types_set)  # type: ignore[arg-type]
+                    ]
+                    return DocumentList(matching)
+                else:
+                    raise TypeError(
+                        f"Iterable must contain strings or Document types, "
+                        f"got {type(first_item).__name__}"
+                    )
+            except (TypeError, AttributeError) as e:
+                # If the error message already mentions Iterable, re-raise it
+                if "Iterable" in str(e) or "strings or Document types" in str(e):
+                    raise
+                # Otherwise, provide a generic error message
+                raise TypeError(f"Invalid argument type for filter_by: {type(arg).__name__}") from e
     @overload
     def get_by(self, arg: str) -> Document: ...

ai_pipeline_core/documents/flow_document.py CHANGED Viewed

@@ -27,24 +27,8 @@ class FlowDocument(Document):
     - Saved in directories named after the document's canonical name
     Creating FlowDocuments:
-        **Use the `create` classmethod** for most use cases. It handles automatic
-        conversion of various content types. Only use __init__ when you have bytes.
-        >>> from enum import StrEnum
-        >>>
-        >>> # Simple document with pass:
-        >>> class MyDoc(FlowDocument):
-        ...     pass
-        >>>
-        >>> # Document with restricted file names:
-        >>> class ConfigDoc(FlowDocument):
-        ...     class FILES(StrEnum):
-        ...         CONFIG = "config.yaml"
-        ...         SETTINGS = "settings.json"
-        >>>
-        >>> # RECOMMENDED - automatic conversion:
-        >>> doc = MyDoc.create(name="data.json", content={"key": "value"})
-        >>> doc = ConfigDoc.create(name="config.yaml", content={"host": "localhost"})
+        Same as Document - use `create()` for automatic conversion, `__init__` for bytes.
+        See Document.create() for detailed usage examples.
     Persistence:
         Documents are saved to: {output_dir}/{canonical_name}/{filename}
@@ -66,13 +50,11 @@ class FlowDocument(Document):
         name: str,
         content: bytes,
         description: str | None = None,
+        sources: list[str] = [],
     ) -> None:
         """Initialize a FlowDocument with raw bytes content.
-        Important:
-            **Most users should use the `create` classmethod instead of __init__.**
-            The create method provides automatic content conversion for various types
-            (str, dict, list, Pydantic models) while __init__ only accepts bytes.
+        See Document.__init__() for parameter details and usage notes.
         Prevents direct instantiation of the abstract FlowDocument class.
         FlowDocument must be subclassed for specific document types.
@@ -81,6 +63,7 @@ class FlowDocument(Document):
             name: Document filename (required, keyword-only)
             content: Document content as raw bytes (required, keyword-only)
             description: Optional human-readable description (keyword-only)
+            sources: Optional list of strings for provenance tracking
         Raises:
             TypeError: If attempting to instantiate FlowDocument directly
@@ -109,7 +92,7 @@ class FlowDocument(Document):
         """
         if type(self) is FlowDocument:
             raise TypeError("Cannot instantiate abstract FlowDocument class directly")
-        super().__init__(name=name, content=content, description=description)
+        super().__init__(name=name, content=content, description=description, sources=sources)
     @final
     def get_base_type(self) -> Literal["flow"]:

ai-pipeline-core 0.1.12__py3-none-any.whl → 0.1.14__py3-none-any.whl

ai-pipeline-core 0.1.12__py3-none-any.whl → 0.1.14__py3-none-any.whl