ai-pipeline-core 0.1.13__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34)
  1. ai_pipeline_core/__init__.py +25 -14
  2. ai_pipeline_core/documents/__init__.py +2 -1
  3. ai_pipeline_core/documents/document.py +317 -49
  4. ai_pipeline_core/documents/document_list.py +136 -33
  5. ai_pipeline_core/documents/flow_document.py +8 -29
  6. ai_pipeline_core/documents/task_document.py +6 -27
  7. ai_pipeline_core/documents/temporary_document.py +6 -27
  8. ai_pipeline_core/documents/utils.py +64 -1
  9. ai_pipeline_core/flow/config.py +174 -5
  10. ai_pipeline_core/flow/options.py +2 -2
  11. ai_pipeline_core/llm/__init__.py +6 -1
  12. ai_pipeline_core/llm/ai_messages.py +14 -7
  13. ai_pipeline_core/llm/client.py +143 -55
  14. ai_pipeline_core/llm/model_options.py +20 -5
  15. ai_pipeline_core/llm/model_response.py +77 -29
  16. ai_pipeline_core/llm/model_types.py +38 -40
  17. ai_pipeline_core/logging/__init__.py +0 -2
  18. ai_pipeline_core/logging/logging_config.py +0 -6
  19. ai_pipeline_core/logging/logging_mixin.py +2 -10
  20. ai_pipeline_core/pipeline.py +68 -65
  21. ai_pipeline_core/prefect.py +12 -3
  22. ai_pipeline_core/prompt_manager.py +6 -7
  23. ai_pipeline_core/settings.py +13 -5
  24. ai_pipeline_core/simple_runner/__init__.py +1 -11
  25. ai_pipeline_core/simple_runner/cli.py +13 -12
  26. ai_pipeline_core/simple_runner/simple_runner.py +34 -172
  27. ai_pipeline_core/storage/__init__.py +8 -0
  28. ai_pipeline_core/storage/storage.py +628 -0
  29. ai_pipeline_core/tracing.py +110 -26
  30. {ai_pipeline_core-0.1.13.dist-info → ai_pipeline_core-0.2.0.dist-info}/METADATA +60 -23
  31. ai_pipeline_core-0.2.0.dist-info/RECORD +38 -0
  32. ai_pipeline_core-0.1.13.dist-info/RECORD +0 -36
  33. {ai_pipeline_core-0.1.13.dist-info → ai_pipeline_core-0.2.0.dist-info}/WHEEL +0 -0
  34. {ai_pipeline_core-0.1.13.dist-info → ai_pipeline_core-0.2.0.dist-info}/licenses/LICENSE +0 -0
@@ -6,6 +6,8 @@ This module provides the core document abstraction for working with various type
  in AI pipelines. Documents are immutable Pydantic models that wrap binary content with metadata.
  """

+ from __future__ import annotations
+
  import base64
  import hashlib
  import json
@@ -30,13 +32,14 @@ from typing import (
  from pydantic import (
  BaseModel,
  ConfigDict,
+ Field,
  ValidationInfo,
  field_serializer,
  field_validator,
  )
  from ruamel.yaml import YAML

- from ai_pipeline_core.documents.utils import canonical_name_key
+ from ai_pipeline_core.documents.utils import canonical_name_key, is_document_sha256
  from ai_pipeline_core.exceptions import DocumentNameError, DocumentSizeError

  from .mime_type import (
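The newly imported is_document_sha256 helper lives in documents/utils.py (+64 lines in this release) and is used further down to split a document's sources into document hashes versus free-form references. Its implementation is not part of this diff; a minimal sketch of what such a check plausibly does, assuming it accepts exactly the format produced by the sha256 property shown later in this diff (uppercase base32 with the "=" padding stripped, hence 52 characters):

import re

# Hypothetical sketch only -- the real implementation is in
# ai_pipeline_core/documents/utils.py and is not shown in this diff.
# Base32 alphabet is A-Z and 2-7; a 32-byte SHA-256 digest encodes to
# 56 characters, of which the last 4 are "=" padding, leaving 52.
_DOC_SHA256_RE = re.compile(r"^[A-Z2-7]{52}$")

def is_document_sha256(value: str) -> bool:
    """Return True if value looks like a Document.sha256 identifier."""
    return bool(_DOC_SHA256_RE.fullmatch(value))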
@@ -58,8 +61,7 @@ class Document(BaseModel, ABC):
  Document is the fundamental data abstraction for all content flowing through
  pipelines. It provides automatic encoding, MIME type detection, serialization,
  and validation. All documents must be subclassed from FlowDocument or TaskDocument
- based on their persistence requirements. TemporaryDocument is a special concrete
- class that can be instantiated directly (not abstract).
+ based on their persistence requirements.

  VALIDATION IS AUTOMATIC - Do not add manual validation!
  Size validation, name validation, and MIME type detection are built-in.
@@ -71,7 +73,7 @@ class Document(BaseModel, ABC):
  document.validate_file_name(document.name) # NO! Automatic

  Best Practices:
- - Use create() classmethod for automatic type conversion (90% of cases)
+ - Use create() classmethod for automatic type conversion (default preferred)
  - Omit description parameter unless truly needed for metadata
  - When using LLM functions, pass AIMessages or str. Wrap any Document values
  in AIMessages([...]). Do not call .text yourself
@@ -94,6 +96,7 @@ class Document(BaseModel, ABC):
  - SHA256 hashing for deduplication
  - Support for text, JSON, YAML, PDF, and image formats
  - Conversion utilities between different formats
+ - Source provenance tracking via sources field

  Class Variables:
  MAX_CONTENT_SIZE: Maximum allowed content size in bytes (default 25MB)
@@ -102,6 +105,7 @@ class Document(BaseModel, ABC):
  name: Document filename (validated for security)
  description: Optional human-readable description
  content: Raw document content as bytes
+ sources: List of source references tracking document provenance

  Creating Documents:
  **Use the `create` classmethod** for most use cases. It accepts various
@@ -117,7 +121,7 @@ class Document(BaseModel, ABC):
  Warning:
  - Document subclasses should NOT start with 'Test' prefix (pytest conflict)
  - Cannot instantiate Document directly - must subclass FlowDocument or TaskDocument
- - Cannot add custom fields - only name, description, content are allowed
+ - Cannot add custom fields - only name, description, content, sources are allowed
  - Document is an abstract class and cannot be instantiated directly

  Metadata Attachment Patterns:
@@ -126,10 +130,62 @@ class Document(BaseModel, ABC):
  2. Embed metadata in content (e.g., JSON with data + metadata fields)
  3. Create a separate MetadataDocument type to accompany data documents
  4. Use document naming conventions (e.g., "data_v2_2024.json")
- 5. Store metadata in flow_options or pass through TraceInfo
+ 5. Store metadata in flow_options
+
+ FILES Enum Best Practice:
+ When defining a FILES enum, NEVER use magic strings to reference files.
+ Always use the enum values to maintain type safety and refactorability.
+
+ WRONG - Magic strings/numbers:
+ doc = ConfigDocument.create(name="config.yaml", content=data) # NO!
+ doc = docs.get_by("settings.json") # NO! Magic string
+ files = ["config.yaml", "settings.json"] # NO! Magic strings
+
+ CORRECT - Use enum references:
+ doc = ConfigDocument.create(
+ name=ConfigDocument.FILES.CONFIG, # YES! Type-safe
+ content=data
+ )
+ doc = docs.get_by(ConfigDocument.FILES.SETTINGS) # YES!
+ files = [
+ ConfigDocument.FILES.CONFIG,
+ ConfigDocument.FILES.SETTINGS
+ ] # YES! Refactorable
+
+ Pydantic Model Interaction:
+ Documents provide DIRECT support for Pydantic models. Use the built-in
+ methods instead of manual JSON conversion.
+
+ WRONG - Manual JSON conversion:
+ # Don't do this - manual JSON handling
+ json_str = doc.text
+ json_data = json.loads(json_str)
+ model = MyModel(**json_data) # NO! Use as_pydantic_model
+
+ # Don't do this - manual serialization
+ json_str = model.model_dump_json()
+ doc = MyDocument.create(name="data.json", content=json_str) # NO!
+
+ CORRECT - Direct Pydantic interaction:
+ # Reading Pydantic model from document
+ model = doc.as_pydantic_model(MyModel) # Direct conversion
+ models = doc.as_pydantic_model(list[MyModel]) # List support
+
+ # Creating document from Pydantic model
+ doc = MyDocument.create(
+ name="data.json",
+ content=model # Direct BaseModel support
+ )
+
+ # Round-trip is seamless
+ original_model = MyModel(field="value")
+ doc = MyDocument.create(name="data.json", content=original_model)
+ restored = doc.as_pydantic_model(MyModel)
+ assert restored == original_model # Perfect round-trip

  Example:
  >>> from enum import StrEnum
+ >>> from pydantic import BaseModel
  >>>
  >>> # Simple document:
  >>> class MyDocument(FlowDocument):
@@ -141,10 +197,32 @@ class Document(BaseModel, ABC):
  ... CONFIG = "config.yaml"
  ... SETTINGS = "settings.json"
  >>>
- >>> # RECOMMENDED: Use create for automatic conversion
- >>> doc = MyDocument.create(name="data.json", content={"key": "value"})
- >>> print(doc.is_text) # True
- >>> data = doc.as_json() # {'key': 'value'}
+ >>> # CORRECT FILES usage - no magic strings:
+ >>> doc = ConfigDocument.create(
+ ... name=ConfigDocument.FILES.CONFIG, # Use enum
+ ... content={"key": "value"}
+ ... )
+ >>>
+ >>> # CORRECT Pydantic usage:
+ >>> class Config(BaseModel):
+ ... key: str
+ >>>
+ >>> # Direct creation from Pydantic model
+ >>> config_model = Config(key="value")
+ >>> doc = MyDocument.create(name="data.json", content=config_model)
+ >>>
+ >>> # Direct extraction to Pydantic model
+ >>> restored = doc.as_pydantic_model(Config)
+ >>> print(restored.key) # "value"
+ >>>
+ >>> # Track document provenance with sources
+ >>> source_doc = MyDocument.create(name="input.txt", content="raw data")
+ >>> processed = MyDocument.create(
+ ... name="output.txt",
+ ... content="processed data",
+ ... sources=[source_doc.sha256] # Reference source document
+ ... )
+ >>> processed.has_source(source_doc) # True
  """

  MAX_CONTENT_SIZE: ClassVar[int] = 25 * 1024 * 1024
@@ -156,6 +234,9 @@ class Document(BaseModel, ABC):
  DESCRIPTION_EXTENSION: ClassVar[str] = ".description.md"
  """File extension for description files."""

+ SOURCES_EXTENSION: ClassVar[str] = ".sources.json"
+ """File extension for sources metadata files."""
+
  MARKDOWN_LIST_SEPARATOR: ClassVar[str] = "\n\n-----------------\n\n"
  """Separator for markdown list items."""

@@ -193,7 +274,7 @@ class Document(BaseModel, ABC):
  )
  # Check that the Document's model_fields only contain the allowed fields
  # It prevents AI models from adding additional fields to documents
- allowed = {"name", "description", "content"}
+ allowed = {"name", "description", "content", "sources"}
  current = set(getattr(cls, "model_fields", {}).keys())
  extras = current - allowed
  if extras:
@@ -204,25 +285,58 @@ class Document(BaseModel, ABC):

  @overload
  @classmethod
- def create(cls, *, name: str, content: bytes, description: str | None = None) -> Self: ...
+ def create(
+ cls,
+ *,
+ name: str,
+ content: bytes,
+ description: str | None = None,
+ sources: list[str] = [],
+ ) -> Self: ...

  @overload
  @classmethod
- def create(cls, *, name: str, content: str, description: str | None = None) -> Self: ...
+ def create(
+ cls,
+ *,
+ name: str,
+ content: str,
+ description: str | None = None,
+ sources: list[str] = [],
+ ) -> Self: ...

  @overload
  @classmethod
  def create(
- cls, *, name: str, content: dict[str, Any], description: str | None = None
+ cls,
+ *,
+ name: str,
+ content: dict[str, Any],
+ description: str | None = None,
+ sources: list[str] = [],
  ) -> Self: ...

  @overload
  @classmethod
- def create(cls, *, name: str, content: list[Any], description: str | None = None) -> Self: ...
+ def create(
+ cls,
+ *,
+ name: str,
+ content: list[Any],
+ description: str | None = None,
+ sources: list[str] = [],
+ ) -> Self: ...

  @overload
  @classmethod
- def create(cls, *, name: str, content: BaseModel, description: str | None = None) -> Self: ...
+ def create(
+ cls,
+ *,
+ name: str,
+ content: BaseModel,
+ description: str | None = None,
+ sources: list[str] = [],
+ ) -> Self: ...

  @classmethod
  def create(
@@ -231,6 +345,7 @@ class Document(BaseModel, ABC):
  name: str,
  content: str | bytes | dict[str, Any] | list[Any] | BaseModel,
  description: str | None = None,
+ sources: list[str] = [],
  ) -> Self:
  r"""Create a Document with automatic content type conversion (recommended).

@@ -240,7 +355,7 @@ class Document(BaseModel, ABC):
  content types and automatically converts them to bytes based on the file
  extension. Use the `parse` method to reverse this conversion.

- Best Practice (90% of cases):
+ Best Practice (by default, unless instructed otherwise):
  Only provide name and content. The description parameter is RARELY needed.

  Args:
@@ -254,19 +369,24 @@ class Document(BaseModel, ABC):
  - bytes: Used directly without conversion
  - str: Encoded to UTF-8 bytes
  - dict[str, Any]: Serialized to JSON (.json) or YAML (.yaml/.yml)
- - list[str]: Joined with separator for .md (validates no items
- contain separator), else JSON/YAML
+ - list[str]: Joined automatically for .md (validates format compatibility),
+ else JSON/YAML
  - list[BaseModel]: Serialized to JSON or YAML based on extension
  - BaseModel: Serialized to JSON or YAML based on extension
  description: Optional description - USUALLY OMIT THIS (defaults to None).
  Only use when meaningful metadata helps downstream processing
+ sources: Optional list of source strings (document SHA256 hashes or references).
+ Used to track what sources contributed to creating this document.
+ Can contain document SHA256 hashes (for referencing other documents)
+ or arbitrary reference strings (URLs, file paths, descriptions).
+ Defaults to empty list

  Returns:
  New Document instance with content converted to bytes

  Raises:
  ValueError: If content type is not supported for the file extension,
- or if markdown list items contain the separator
+ or if markdown list format is incompatible
  DocumentNameError: If filename violates validation rules
  DocumentSizeError: If content exceeds MAX_CONTENT_SIZE

@@ -276,7 +396,7 @@ class Document(BaseModel, ABC):
  returns the original dictionary {"key": "value"}.

  Example:
- >>> # CORRECT - no description needed (90% of cases)
+ >>> # CORRECT - no description needed (by default, unless instructed otherwise)
  >>> doc = MyDocument.create(name="test.txt", content="Hello World")
  >>> doc.content # b'Hello World'
  >>> doc.parse(str) # "Hello World"
@@ -306,11 +426,31 @@ class Document(BaseModel, ABC):
  >>> items = ["Section 1", "Section 2"]
  >>> doc = MyDocument.create(name="sections.md", content=items)
  >>> doc.parse(list) # ["Section 1", "Section 2"]
+
+ >>> # Document with sources for provenance tracking
+ >>> source_doc = MyDocument.create(name="source.txt", content="original")
+ >>> derived = MyDocument.create(
+ ... name="result.txt",
+ ... content="processed",
+ ... sources=[source_doc.sha256, "https://api.example.com/data"]
+ ... )
+ >>> derived.get_source_documents() # [source_doc.sha256]
+ >>> derived.get_source_references() # ["https://api.example.com/data"]
  """
  # Use model_validate to leverage the existing validator logic
- temp = cls.model_validate({"name": name, "content": content, "description": description})
+ temp = cls.model_validate({
+ "name": name,
+ "content": content,
+ "description": description,
+ "sources": sources,
+ })
  # Now construct with type-checker-friendly call (bytes only)
- return cls(name=temp.name, content=temp.content, description=temp.description)
+ return cls(
+ name=temp.name,
+ content=temp.content,
+ description=temp.description,
+ sources=temp.sources,
+ )

  def __init__(
  self,
@@ -318,6 +458,7 @@ class Document(BaseModel, ABC):
  name: str,
  content: bytes,
  description: str | None = None,
+ sources: list[str] = [],
  ) -> None:
  """Initialize a Document instance with raw bytes content.

@@ -335,6 +476,10 @@ class Document(BaseModel, ABC):
  name: Document filename (required, keyword-only)
  content: Document content as raw bytes (required, keyword-only)
  description: Optional human-readable description (keyword-only)
+ sources: Optional list of source strings for provenance tracking.
+ Can contain document SHA256 hashes (for referencing other documents)
+ or arbitrary reference strings (URLs, file paths, descriptions).
+ Defaults to empty list

  Raises:
  TypeError: If attempting to instantiate Document directly.
@@ -349,19 +494,21 @@ class Document(BaseModel, ABC):
  >>> doc = MyDocument.create(name="data.json", content={"key": "value"})
  >>> doc = MyDocument.create(name="config.yaml", content=my_model)
  >>> doc = MyDocument.create(name="items.md", content=["item1", "item2"])
-
- See Also:
- create: Recommended factory method with automatic type conversion
- parse: Method to reverse the conversion done by create
  """
  if type(self) is Document:
  raise TypeError("Cannot instantiate abstract Document class directly")

- super().__init__(name=name, content=content, description=description)
+ super().__init__(name=name, content=content, description=description, sources=sources)

  name: str
  description: str | None = None
  content: bytes # Note: constructor accepts str | bytes, but field stores bytes only
+ sources: list[str] = Field(
+ default_factory=list,
+ description="List of source references for tracking document provenance. "
+ "Can contain document SHA256 hashes (for referencing other documents) "
+ "or arbitrary reference strings (URLs, file paths, descriptions)",
+ )

  # Pydantic configuration
  model_config = ConfigDict(
@@ -383,8 +530,7 @@ class Document(BaseModel, ABC):

  Note:
  This method determines document persistence and lifecycle.
- FlowDocument returns "flow", TaskDocument returns "task",
- TemporaryDocument returns "temporary".
+ FlowDocument returns "flow", TaskDocument returns "task".
  """
  raise NotImplementedError("Subclasses must implement this method")

@@ -436,7 +582,7 @@ class Document(BaseModel, ABC):
  during execution.

  Returns:
- True if this is a TemporaryDocument, False otherwise.
+ True if this document is temporary, False otherwise.
  """
  return self.get_base_type() == "temporary"

@@ -481,8 +627,6 @@ class Document(BaseModel, ABC):
  def validate_file_name(cls, name: str) -> None:
  """Validate that a file name matches allowed patterns.

- @public
-
  DO NOT OVERRIDE this method if you define a FILES enum!
  The validation is automatic when FILES enum is present.

@@ -526,7 +670,7 @@ class Document(BaseModel, ABC):

  Ensures the document name is secure and follows conventions:
  - No path traversal characters (.., \\, /)
- - Cannot end with .description.md
+ - Cannot end with .description.md or .sources.json
  - No leading/trailing whitespace
  - Must match FILES enum if defined

@@ -551,6 +695,9 @@ class Document(BaseModel, ABC):
  f"Document names cannot end with {cls.DESCRIPTION_EXTENSION}: {v}"
  )

+ if v.endswith(cls.SOURCES_EXTENSION):
+ raise DocumentNameError(f"Document names cannot end with {cls.SOURCES_EXTENSION}: {v}")
+
  if ".." in v or "\\" in v or "/" in v:
  raise DocumentNameError(f"Invalid filename - contains path traversal characters: {v}")

@@ -575,7 +722,7 @@ class Document(BaseModel, ABC):
  2. str → UTF-8 encoding
  3. dict/BaseModel + .json → JSON serialization (indented)
  4. dict/BaseModel + .yaml/.yml → YAML serialization
- 5. list[str] + .md → Join with markdown separator (validates no items contain separator)
+ 5. list[str] + .md → Join with markdown sections (validates format compatibility)
  6. list[Any] + .json/.yaml → JSON/YAML array
  7. int/float/bool + .json → JSON primitive

@@ -795,7 +942,7 @@ class Document(BaseModel, ABC):
  This is computed once and cached for performance.
  The hash is deterministic based on content only.
  """
- return b32encode(hashlib.sha256(self.content).digest()).decode("ascii").upper()
+ return b32encode(hashlib.sha256(self.content).digest()).decode("ascii").upper().rstrip("=")

  @final
  @property
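The change above strips base32 padding from the content hash. This is lossless, not a new hash: a 32-byte SHA-256 digest always base32-encodes to 56 characters ending in exactly four "=" signs, so removing them leaves a fixed 52-character identifier. A quick illustration (our own, not package code):

import hashlib
from base64 import b32encode

digest = hashlib.sha256(b"example content").digest()
padded = b32encode(digest).decode("ascii")  # already uppercase

assert len(digest) == 32
assert len(padded) == 56 and padded.endswith("====")
assert len(padded.rstrip("=")) == 52  # what sha256 now returns

Note that hashes produced by 0.1.13 still carry the trailing "====", so values persisted by older releases will not compare equal to 0.2.0 hashes without the same stripping.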
@@ -944,8 +1091,6 @@ class Document(BaseModel, ABC):
  def as_yaml(self) -> Any:
  r"""Parse document content as YAML.

- @public
-
  Parses the document's text content as YAML and returns Python objects.
  Uses ruamel.yaml which is safe by default (no code execution).

@@ -973,8 +1118,6 @@ class Document(BaseModel, ABC):
  def as_json(self) -> Any:
  """Parse document content as JSON.

- @public
-
  Parses the document's text content as JSON and returns Python objects.
  Document must contain valid JSON text.

@@ -1069,7 +1212,7 @@ class Document(BaseModel, ABC):

  @public

- Splits text content using markdown separator ("\n\n-----------------\n\n").
+ Splits text content automatically using markdown section separators.
  Designed for markdown documents with multiple sections.

  Returns:
@@ -1084,9 +1227,9 @@ class Document(BaseModel, ABC):
  >>> doc = MyDocument.create(name="book.md", content=sections)
  >>> doc.as_markdown_list() # Returns original sections

- >>> # Manual creation with separator
- >>> content = "Part 1\n\n-----------------\n\nPart 2\n\n-----------------\n\nPart 3"
- >>> doc2 = MyDocument(name="parts.md", content=content.encode())
+ >>> # Round-trip conversion works automatically
+ >>> sections = ["Part 1", "Part 2", "Part 3"]
+ >>> doc2 = MyDocument.create(name="parts.md", content=sections)
  >>> doc2.as_markdown_list() # ['Part 1', 'Part 2', 'Part 3']
  """
  return self.text.split(self.MARKDOWN_LIST_SEPARATOR)
@@ -1123,7 +1266,7 @@ class Document(BaseModel, ABC):
  Extension Rules:
  - .json → JSON parsing for dict/list/BaseModel
  - .yaml/.yml → YAML parsing for dict/list/BaseModel
- - .md + list → Split by markdown separator
+ - .md + list → Split automatically into sections
  - Any + str → UTF-8 decode
  - Any + bytes → Raw content

@@ -1139,8 +1282,7 @@ class Document(BaseModel, ABC):

  >>> # Markdown list
  >>> items = ["Item 1", "Item 2"]
- >>> content = "\n\n---\n\n".join(items).encode()
- >>> doc = MyDocument(name="list.md", content=content)
+ >>> doc = MyDocument.create(name="list.md", content=items)
  >>> doc.parse(list)
  ['Item 1', 'Item 2']
  """
@@ -1215,6 +1357,129 @@ class Document(BaseModel, ABC):

  raise ValueError(f"Unsupported type {type_} for file {self.name}")

+ def get_source_documents(self) -> list[str]:
+ """Get list of document SHA256 hashes referenced as sources.
+
+ Retrieves all document references from this document's sources list,
+ filtering for valid SHA256 hashes that reference other documents.
+ This is useful for building dependency graphs and tracking document
+ lineage in processing pipelines.
+
+ Returns:
+ List of SHA256 hashes (base32 encoded) for documents referenced
+ as sources. Each hash uniquely identifies another document that
+ contributed to creating this one.
+
+ Example:
+ >>> # Create a derived document from multiple sources
+ >>> source1 = MyDocument.create(name="data1.txt", content="First")
+ >>> source2 = MyDocument.create(name="data2.txt", content="Second")
+ >>>
+ >>> merged = MyDocument.create(
+ ... name="merged.txt",
+ ... content="Combined data",
+ ... sources=[source1.sha256, source2.sha256, "https://api.example.com"]
+ ... )
+ >>>
+ >>> # Get only document references (not URLs)
+ >>> doc_refs = merged.get_source_documents()
+ >>> print(doc_refs) # [source1.sha256, source2.sha256]
+ >>>
+ >>> # Check if specific document is a source
+ >>> if source1.sha256 in doc_refs:
+ ... print("Document derived from source1")
+ """
+ return [src for src in self.sources if is_document_sha256(src)]
+
+ def get_source_references(self) -> list[str]:
+ """Get list of arbitrary reference strings from sources.
+
+ Retrieves all non-document references from this document's sources list.
+ These are typically URLs, file paths, API endpoints, or descriptive strings
+ that indicate where the document's content originated from, but are not
+ references to other documents in the pipeline.
+
+ Returns:
+ List of reference strings that are not document SHA256 hashes.
+ Can include URLs, file paths, API endpoints, dataset names,
+ or any other string that provides source context.
+
+ Example:
+ >>> # Create document with mixed source types
+ >>> doc = MyDocument.create(
+ ... name="report.txt",
+ ... content="Analysis results",
+ ... sources=[
+ ... other_doc.sha256, # Document reference
+ ... "https://api.example.com/data", # API URL
+ ... "dataset:customer-2024", # Dataset identifier
+ ... "/path/to/source.csv", # File path
+ ... ]
+ ... )
+ >>>
+ >>> # Get only non-document references
+ >>> refs = doc.get_source_references()
+ >>> print(refs)
+ >>> # ["https://api.example.com/data", "dataset:customer-2024", "/path/to/source.csv"]
+ >>>
+ >>> # Use for attribution or debugging
+ >>> for ref in refs:
+ ... print(f"Data sourced from: {ref}")
+ """
+ return [src for src in self.sources if not is_document_sha256(src)]
+
+ def has_source(self, source: Document | str) -> bool:
+ """Check if a specific source is tracked for this document.
+
+ Verifies whether a given source (document or reference string) is
+ included in this document's sources list. Useful for dependency
+ checking, lineage verification, and conditional processing based
+ on document origins.
+
+ Args:
+ source: Source to check for. Can be:
+ - Document: Checks if document's SHA256 is in sources
+ - str: Checks if exact string is in sources (hash or reference)
+
+ Returns:
+ True if the source is tracked in this document's sources,
+ False otherwise.
+
+ Raises:
+ TypeError: If source is not a Document or string.
+
+ Example:
+ >>> # Check if document was derived from specific source
+ >>> source_doc = MyDocument.create(name="original.txt", content="Data")
+ >>> api_url = "https://api.example.com/data"
+ >>>
+ >>> derived = MyDocument.create(
+ ... name="processed.txt",
+ ... content="Processed data",
+ ... sources=[source_doc.sha256, api_url]
+ ... )
+ >>>
+ >>> # Check document source
+ >>> if derived.has_source(source_doc):
+ ... print("Derived from source_doc")
+ >>>
+ >>> # Check string reference
+ >>> if derived.has_source(api_url):
+ ... print("Data from API")
+ >>>
+ >>> # Check by SHA256 directly
+ >>> if derived.has_source(source_doc.sha256):
+ ... print("Has specific hash")
+ """
+ if isinstance(source, str):
+ # Direct string comparison
+ return source in self.sources
+ elif isinstance(source, Document): # type: ignore[misc]
+ # Check if document's SHA256 is in sources
+ return source.sha256 in self.sources
+ else:
+ raise TypeError(f"Invalid source type: {type(source)}")
+
  @final
  def serialize_model(self) -> dict[str, Any]:
  """Serialize document to dictionary for storage or transmission.
@@ -1230,8 +1495,9 @@ class Document(BaseModel, ABC):
  - base_type: Persistence type - "flow", "task", or "temporary" (str)
  - size: Content size in bytes (int)
  - id: Short hash identifier, first 6 chars of SHA256 (str)
- - sha256: Full SHA256 hash in base32 encoding (str)
+ - sha256: Full SHA256 hash in base32 encoding without padding (str)
  - mime_type: Detected MIME type (str)
+ - sources: List of source strings (list[str])
  - content: Encoded content (str)
  - content_encoding: Either "utf-8" or "base64" (str)

@@ -1254,6 +1520,7 @@ class Document(BaseModel, ABC):
  "id": self.id,
  "sha256": self.sha256,
  "mime_type": self.mime_type,
+ "sources": self.sources,
  }

  # Try to encode content as UTF-8, fall back to base64
@@ -1288,6 +1555,7 @@ class Document(BaseModel, ABC):
  Optional keys:
  - description: Document description (str | None)
  - content_encoding: "utf-8" or "base64" (defaults to "utf-8")
+ - sources: List of source strings

  Returns:
  New Document instance with restored content.
@@ -1326,9 +1594,9 @@ class Document(BaseModel, ABC):
  else:
  raise ValueError(f"Invalid content type: {type(content_raw)}")

- # Create document with the required fields
  return cls(
  name=data["name"],
  content=content,
  description=data.get("description"),
+ sources=data.get("sources", []),
  )
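The serialization hunks above close the loop on provenance: serialize_model now emits the sources list, and the deserializing classmethod restores it, defaulting to [] for payloads written by 0.1.13. A short sketch of the serializing half (MyDocument is a hypothetical FlowDocument subclass; the deserializing classmethod's name is not visible in this diff, so it is not exercised here):

source = MyDocument.create(name="input.txt", content="raw data")
derived = MyDocument.create(
    name="output.txt",
    content="processed",
    sources=[source.sha256, "https://example.com/feed"],
)

data = derived.serialize_model()
assert data["sources"] == [source.sha256, "https://example.com/feed"]
assert data["sha256"] == derived.sha256  # base32, no "=" padding
assert data["content_encoding"] == "utf-8"  # text content round-trips as UTF-8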