PyPI - ai-pipeline-core - Versions diffs - 0.1.14__py3-none-any.whl → 0.2.0__py3-none-any.whl - Mend

ai-pipeline-core 0.1.14__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (31) hide show

ai_pipeline_core/__init__.py +21 -13
ai_pipeline_core/documents/document.py +93 -50
ai_pipeline_core/documents/document_list.py +70 -23
ai_pipeline_core/documents/flow_document.py +2 -6
ai_pipeline_core/documents/task_document.py +0 -4
ai_pipeline_core/documents/temporary_document.py +1 -8
ai_pipeline_core/flow/config.py +174 -5
ai_pipeline_core/llm/__init__.py +1 -1
ai_pipeline_core/llm/ai_messages.py +14 -4
ai_pipeline_core/llm/client.py +116 -59
ai_pipeline_core/llm/model_options.py +2 -5
ai_pipeline_core/llm/model_response.py +17 -16
ai_pipeline_core/llm/model_types.py +0 -4
ai_pipeline_core/logging/__init__.py +0 -2
ai_pipeline_core/logging/logging_config.py +0 -6
ai_pipeline_core/logging/logging_mixin.py +2 -10
ai_pipeline_core/pipeline.py +45 -68
ai_pipeline_core/prefect.py +12 -3
ai_pipeline_core/prompt_manager.py +6 -7
ai_pipeline_core/settings.py +13 -5
ai_pipeline_core/simple_runner/__init__.py +1 -11
ai_pipeline_core/simple_runner/cli.py +13 -12
ai_pipeline_core/simple_runner/simple_runner.py +34 -189
ai_pipeline_core/storage/__init__.py +8 -0
ai_pipeline_core/storage/storage.py +628 -0
ai_pipeline_core/tracing.py +3 -26
{ai_pipeline_core-0.1.14.dist-info → ai_pipeline_core-0.2.0.dist-info}/METADATA +19 -17
ai_pipeline_core-0.2.0.dist-info/RECORD +38 -0
ai_pipeline_core-0.1.14.dist-info/RECORD +0 -36
{ai_pipeline_core-0.1.14.dist-info → ai_pipeline_core-0.2.0.dist-info}/WHEEL +0 -0
{ai_pipeline_core-0.1.14.dist-info → ai_pipeline_core-0.2.0.dist-info}/licenses/LICENSE +0 -0

ai_pipeline_core/__init__.py CHANGED Viewed

@@ -7,7 +7,7 @@ It combines document processing, LLM integration, and workflow orchestration int
 system designed for production use.
 The framework enforces best practices through strong typing (Pydantic), automatic retries,
-cost tracking, and distributed tracing. All I/O operations are async for maximum throughput.
+and cost tracking. All I/O operations are async for maximum throughput.
 **CRITICAL IMPORT RULE**:
     Always import from the top-level package:
@@ -18,12 +18,12 @@ cost tracking, and distributed tracing. All I/O operations are async for maximum
         from ai_pipeline_core.llm import generate  # NO!
         from ai_pipeline_core.documents import FlowDocument  # NO!
-FRAMEWORK RULES (90% Use Cases):
-    1. Decorators: Use @trace, @pipeline_task, @pipeline_flow WITHOUT parameters
+FRAMEWORK RULES (Use by default, unless instructed otherwise):
+    1. Decorators: Use @pipeline_task WITHOUT parameters, @pipeline_flow WITH config
     2. Logging: Use get_pipeline_logger(__name__) - NEVER print() or logging module
     3. LLM calls: Use AIMessages or str. Wrap Documents in AIMessages; do not call .text yourself
-    4. Options: Omit ModelOptions unless specifically needed (defaults are optimal)
-    5. Documents: Create with just name and content - skip description
+    4. Options: DO NOT use options parameter - omit it entirely (defaults are optimal)
+    5. Documents: Create with just name and content - skip description unless needed
     6. FlowConfig: OUTPUT_DOCUMENT_TYPE must differ from all INPUT_DOCUMENT_TYPES
     7. Initialization: PromptManager and logger at module scope, not in functions
     8. DocumentList: Use default constructor - no validation flags needed
@@ -36,18 +36,22 @@ Core Capabilities:
     - **LLM Integration**: Unified interface to any model via LiteLLM with caching
     - **Structured Output**: Type-safe generation with Pydantic model validation
     - **Workflow Orchestration**: Prefect-based flows and tasks with retries
-    - **Observability**: Distributed tracing via Laminar (LMNR) for debugging
+    - **Observability**: Built-in monitoring and debugging capabilities
     - **Local Development**: Simple runner for testing without infrastructure
 Quick Start:
     >>> from ai_pipeline_core import (
-    ...     pipeline_flow, FlowDocument, DocumentList, FlowOptions, llm, AIMessages
+    ...     pipeline_flow, FlowDocument, DocumentList, FlowOptions, FlowConfig, llm, AIMessages
     ... )
     >>>
     >>> class OutputDoc(FlowDocument):
     ...     '''Analysis result document.'''
     >>>
-    >>> @pipeline_flow
+    >>> class MyFlowConfig(FlowConfig):
+    ...     INPUT_DOCUMENT_TYPES = []
+    ...     OUTPUT_DOCUMENT_TYPE = OutputDoc
+    >>>
+    >>> @pipeline_flow(config=MyFlowConfig)
     ... async def analyze_flow(
     ...     project_name: str,
     ...     documents: DocumentList,
@@ -55,7 +59,7 @@ Quick Start:
     ... ) -> DocumentList:
     ...     # Messages accept AIMessages or str. Wrap documents: AIMessages([doc])
     ...     response = await llm.generate(
-    ...         model="gpt-5",
+    ...         "gpt-5",
     ...         messages=AIMessages([documents[0]])
     ...     )
     ...     result = OutputDoc.create(
@@ -76,8 +80,6 @@ Optional Environment Variables:
     - PREFECT_API_KEY: Prefect API authentication key
     - LMNR_PROJECT_API_KEY: Laminar (LMNR) API key for tracing
     - LMNR_DEBUG: Set to "true" to enable debug-level traces
-    - LMNR_SESSION_ID: Default session ID for traces
-    - LMNR_USER_ID: Default user ID for traces
 """
 from . import llm
@@ -99,6 +101,8 @@ from .llm import (
     ModelOptions,
     ModelResponse,
     StructuredModelResponse,
+    generate,
+    generate_structured,
 )
 from .logging import (
     LoggerMixin,
@@ -114,7 +118,7 @@ from .prompt_manager import PromptManager
 from .settings import Settings
 from .tracing import TraceInfo, TraceLevel, set_trace_cost, trace
-__version__ = "0.1.14"
+__version__ = "0.2.0"
 __all__ = [
     # Config/Settings
@@ -145,7 +149,9 @@ __all__ = [
     "prefect_test_harness",
     "disable_run_logger",
     # LLM
-    "llm",
+    "llm",  # for backward compatibility
+    "generate",
+    "generate_structured",
     "ModelName",
     "ModelOptions",
     "ModelResponse",
@@ -159,4 +165,6 @@ __all__ = [
     "set_trace_cost",
     # Utils
     "PromptManager",
+    "generate",
+    "generate_structured",
 ]

ai_pipeline_core/documents/document.py CHANGED Viewed

@@ -61,8 +61,7 @@ class Document(BaseModel, ABC):
     Document is the fundamental data abstraction for all content flowing through
     pipelines. It provides automatic encoding, MIME type detection, serialization,
     and validation. All documents must be subclassed from FlowDocument or TaskDocument
-    based on their persistence requirements. TemporaryDocument is a special concrete
-    class that can be instantiated directly (not abstract).
+    based on their persistence requirements.
     VALIDATION IS AUTOMATIC - Do not add manual validation!
         Size validation, name validation, and MIME type detection are built-in.
@@ -74,7 +73,7 @@ class Document(BaseModel, ABC):
         document.validate_file_name(document.name)  # NO! Automatic
     Best Practices:
-        - Use create() classmethod for automatic type conversion (90% of cases)
+        - Use create() classmethod for automatic type conversion (default preferred)
         - Omit description parameter unless truly needed for metadata
         - When using LLM functions, pass AIMessages or str. Wrap any Document values
           in AIMessages([...]). Do not call .text yourself
@@ -131,10 +130,62 @@ class Document(BaseModel, ABC):
         2. Embed metadata in content (e.g., JSON with data + metadata fields)
         3. Create a separate MetadataDocument type to accompany data documents
         4. Use document naming conventions (e.g., "data_v2_2024.json")
-        5. Store metadata in flow_options or pass through TraceInfo
+        5. Store metadata in flow_options
+    FILES Enum Best Practice:
+        When defining a FILES enum, NEVER use magic strings to reference files.
+        Always use the enum values to maintain type safety and refactorability.
+        WRONG - Magic strings/numbers:
+            doc = ConfigDocument.create(name="config.yaml", content=data)  # NO!
+            doc = docs.get_by("settings.json")  # NO! Magic string
+            files = ["config.yaml", "settings.json"]  # NO! Magic strings
+        CORRECT - Use enum references:
+            doc = ConfigDocument.create(
+                name=ConfigDocument.FILES.CONFIG,  # YES! Type-safe
+                content=data
+            )
+            doc = docs.get_by(ConfigDocument.FILES.SETTINGS)  # YES!
+            files = [
+                ConfigDocument.FILES.CONFIG,
+                ConfigDocument.FILES.SETTINGS
+            ]  # YES! Refactorable
+    Pydantic Model Interaction:
+        Documents provide DIRECT support for Pydantic models. Use the built-in
+        methods instead of manual JSON conversion.
+        WRONG - Manual JSON conversion:
+            # Don't do this - manual JSON handling
+            json_str = doc.text
+            json_data = json.loads(json_str)
+            model = MyModel(**json_data)  # NO! Use as_pydantic_model
+            # Don't do this - manual serialization
+            json_str = model.model_dump_json()
+            doc = MyDocument.create(name="data.json", content=json_str)  # NO!
+        CORRECT - Direct Pydantic interaction:
+            # Reading Pydantic model from document
+            model = doc.as_pydantic_model(MyModel)  # Direct conversion
+            models = doc.as_pydantic_model(list[MyModel])  # List support
+            # Creating document from Pydantic model
+            doc = MyDocument.create(
+                name="data.json",
+                content=model  # Direct BaseModel support
+            )
+            # Round-trip is seamless
+            original_model = MyModel(field="value")
+            doc = MyDocument.create(name="data.json", content=original_model)
+            restored = doc.as_pydantic_model(MyModel)
+            assert restored == original_model  # Perfect round-trip
     Example:
         >>> from enum import StrEnum
+        >>> from pydantic import BaseModel
         >>>
         >>> # Simple document:
         >>> class MyDocument(FlowDocument):
@@ -146,10 +197,23 @@ class Document(BaseModel, ABC):
         ...         CONFIG = "config.yaml"
         ...         SETTINGS = "settings.json"
         >>>
-        >>> # RECOMMENDED: Use create for automatic conversion
-        >>> doc = MyDocument.create(name="data.json", content={"key": "value"})
-        >>> print(doc.is_text)  # True
-        >>> data = doc.as_json()  # {'key': 'value'}
+        >>> # CORRECT FILES usage - no magic strings:
+        >>> doc = ConfigDocument.create(
+        ...     name=ConfigDocument.FILES.CONFIG,  # Use enum
+        ...     content={"key": "value"}
+        ... )
+        >>>
+        >>> # CORRECT Pydantic usage:
+        >>> class Config(BaseModel):
+        ...     key: str
+        >>>
+        >>> # Direct creation from Pydantic model
+        >>> config_model = Config(key="value")
+        >>> doc = MyDocument.create(name="data.json", content=config_model)
+        >>>
+        >>> # Direct extraction to Pydantic model
+        >>> restored = doc.as_pydantic_model(Config)
+        >>> print(restored.key)  # "value"
         >>>
         >>> # Track document provenance with sources
         >>> source_doc = MyDocument.create(name="input.txt", content="raw data")
@@ -170,6 +234,9 @@ class Document(BaseModel, ABC):
     DESCRIPTION_EXTENSION: ClassVar[str] = ".description.md"
     """File extension for description files."""
+    SOURCES_EXTENSION: ClassVar[str] = ".sources.json"
+    """File extension for sources metadata files."""
     MARKDOWN_LIST_SEPARATOR: ClassVar[str] = "\n\n-----------------\n\n"
     """Separator for markdown list items."""
@@ -288,7 +355,7 @@ class Document(BaseModel, ABC):
         content types and automatically converts them to bytes based on the file
         extension. Use the `parse` method to reverse this conversion.
-        Best Practice (90% of cases):
+        Best Practice (by default, unless instructed otherwise):
             Only provide name and content. The description parameter is RARELY needed.
         Args:
@@ -302,8 +369,8 @@ class Document(BaseModel, ABC):
                 - bytes: Used directly without conversion
                 - str: Encoded to UTF-8 bytes
                 - dict[str, Any]: Serialized to JSON (.json) or YAML (.yaml/.yml)
-                - list[str]: Joined with separator for .md (validates no items
-                            contain separator), else JSON/YAML
+                - list[str]: Joined automatically for .md (validates format compatibility),
+                            else JSON/YAML
                 - list[BaseModel]: Serialized to JSON or YAML based on extension
                 - BaseModel: Serialized to JSON or YAML based on extension
             description: Optional description - USUALLY OMIT THIS (defaults to None).
@@ -319,7 +386,7 @@ class Document(BaseModel, ABC):
         Raises:
             ValueError: If content type is not supported for the file extension,
-                       or if markdown list items contain the separator
+                       or if markdown list format is incompatible
             DocumentNameError: If filename violates validation rules
             DocumentSizeError: If content exceeds MAX_CONTENT_SIZE
@@ -329,7 +396,7 @@ class Document(BaseModel, ABC):
             returns the original dictionary {"key": "value"}.
         Example:
-            >>> # CORRECT - no description needed (90% of cases)
+            >>> # CORRECT - no description needed (by default, unless instructed otherwise)
             >>> doc = MyDocument.create(name="test.txt", content="Hello World")
             >>> doc.content  # b'Hello World'
             >>> doc.parse(str)  # "Hello World"
@@ -427,10 +494,6 @@ class Document(BaseModel, ABC):
             >>> doc = MyDocument.create(name="data.json", content={"key": "value"})
             >>> doc = MyDocument.create(name="config.yaml", content=my_model)
             >>> doc = MyDocument.create(name="items.md", content=["item1", "item2"])
-        See Also:
-            create: Recommended factory method with automatic type conversion
-            parse: Method to reverse the conversion done by create
         """
         if type(self) is Document:
             raise TypeError("Cannot instantiate abstract Document class directly")
@@ -467,8 +530,7 @@ class Document(BaseModel, ABC):
         Note:
             This method determines document persistence and lifecycle.
-            FlowDocument returns "flow", TaskDocument returns "task",
-            TemporaryDocument returns "temporary".
+            FlowDocument returns "flow", TaskDocument returns "task".
         """
         raise NotImplementedError("Subclasses must implement this method")
@@ -520,7 +582,7 @@ class Document(BaseModel, ABC):
         during execution.
         Returns:
-            True if this is a TemporaryDocument, False otherwise.
+            True if this document is temporary, False otherwise.
         """
         return self.get_base_type() == "temporary"
@@ -565,8 +627,6 @@ class Document(BaseModel, ABC):
     def validate_file_name(cls, name: str) -> None:
         """Validate that a file name matches allowed patterns.
-        @public
         DO NOT OVERRIDE this method if you define a FILES enum!
         The validation is automatic when FILES enum is present.
@@ -610,7 +670,7 @@ class Document(BaseModel, ABC):
         Ensures the document name is secure and follows conventions:
         - No path traversal characters (.., \\, /)
-        - Cannot end with .description.md
+        - Cannot end with .description.md or .sources.json
         - No leading/trailing whitespace
         - Must match FILES enum if defined
@@ -635,6 +695,9 @@ class Document(BaseModel, ABC):
                 f"Document names cannot end with {cls.DESCRIPTION_EXTENSION}: {v}"
             )
+        if v.endswith(cls.SOURCES_EXTENSION):
+            raise DocumentNameError(f"Document names cannot end with {cls.SOURCES_EXTENSION}: {v}")
         if ".." in v or "\\" in v or "/" in v:
             raise DocumentNameError(f"Invalid filename - contains path traversal characters: {v}")
@@ -659,7 +722,7 @@ class Document(BaseModel, ABC):
             2. str → UTF-8 encoding
             3. dict/BaseModel + .json → JSON serialization (indented)
             4. dict/BaseModel + .yaml/.yml → YAML serialization
-            5. list[str] + .md → Join with markdown separator (validates no items contain separator)
+            5. list[str] + .md → Join with markdown sections (validates format compatibility)
             6. list[Any] + .json/.yaml → JSON/YAML array
             7. int/float/bool + .json → JSON primitive
@@ -1028,8 +1091,6 @@ class Document(BaseModel, ABC):
     def as_yaml(self) -> Any:
         r"""Parse document content as YAML.
-        @public
         Parses the document's text content as YAML and returns Python objects.
         Uses ruamel.yaml which is safe by default (no code execution).
@@ -1057,8 +1118,6 @@ class Document(BaseModel, ABC):
     def as_json(self) -> Any:
         """Parse document content as JSON.
-        @public
         Parses the document's text content as JSON and returns Python objects.
         Document must contain valid JSON text.
@@ -1153,7 +1212,7 @@ class Document(BaseModel, ABC):
         @public
-        Splits text content using markdown separator ("\n\n-----------------\n\n").
+        Splits text content automatically using markdown section separators.
         Designed for markdown documents with multiple sections.
         Returns:
@@ -1168,9 +1227,9 @@ class Document(BaseModel, ABC):
             >>> doc = MyDocument.create(name="book.md", content=sections)
             >>> doc.as_markdown_list()  # Returns original sections
-            >>> # Manual creation with separator
-            >>> content = "Part 1\n\n-----------------\n\nPart 2\n\n-----------------\n\nPart 3"
-            >>> doc2 = MyDocument(name="parts.md", content=content.encode())
+            >>> # Round-trip conversion works automatically
+            >>> sections = ["Part 1", "Part 2", "Part 3"]
+            >>> doc2 = MyDocument.create(name="parts.md", content=sections)
             >>> doc2.as_markdown_list()  # ['Part 1', 'Part 2', 'Part 3']
         """
         return self.text.split(self.MARKDOWN_LIST_SEPARATOR)
@@ -1207,7 +1266,7 @@ class Document(BaseModel, ABC):
         Extension Rules:
             - .json → JSON parsing for dict/list/BaseModel
             - .yaml/.yml → YAML parsing for dict/list/BaseModel
-            - .md + list → Split by markdown separator
+            - .md + list → Split automatically into sections
             - Any + str → UTF-8 decode
             - Any + bytes → Raw content
@@ -1223,8 +1282,7 @@ class Document(BaseModel, ABC):
             >>> # Markdown list
             >>> items = ["Item 1", "Item 2"]
-            >>> content = "\n\n---\n\n".join(items).encode()
-            >>> doc = MyDocument(name="list.md", content=content)
+            >>> doc = MyDocument.create(name="list.md", content=items)
             >>> doc.parse(list)
             ['Item 1', 'Item 2']
         """
@@ -1330,11 +1388,6 @@ class Document(BaseModel, ABC):
             >>> # Check if specific document is a source
             >>> if source1.sha256 in doc_refs:
             ...     print("Document derived from source1")
-        See Also:
-            - get_source_references: Get non-document source references (URLs, etc.)
-            - has_source: Check if a specific source is tracked
-            - Document.create: Add sources when creating documents
         """
         return [src for src in self.sources if is_document_sha256(src)]
@@ -1372,11 +1425,6 @@ class Document(BaseModel, ABC):
             >>> # Use for attribution or debugging
             >>> for ref in refs:
             ...     print(f"Data sourced from: {ref}")
-        See Also:
-            - get_source_documents: Get document SHA256 references
-            - has_source: Check if a specific source is tracked
-            - Document.create: Add sources when creating documents
         """
         return [src for src in self.sources if not is_document_sha256(src)]
@@ -1422,11 +1470,6 @@ class Document(BaseModel, ABC):
             >>> # Check by SHA256 directly
             >>> if derived.has_source(source_doc.sha256):
             ...     print("Has specific hash")
-        See Also:
-            - get_source_documents: Get all document sources
-            - get_source_references: Get all reference sources
-            - Document.create: Add sources when creating documents
         """
         if isinstance(source, str):
             # Direct string comparison

ai_pipeline_core/documents/document_list.py CHANGED Viewed

@@ -17,8 +17,8 @@ class DocumentList(list[Document]):
     Specialized list with validation and filtering for documents.
-    Best Practice: Use default constructor in 90% of cases. Only enable
-    validate_same_type or validate_duplicates when you explicitly need them.
+    Best Practice: Use the default constructor unless instructed otherwise.
+    Only enable validate_same_type or validate_duplicates when you explicitly need them.
     Example:
         >>> # RECOMMENDED - default constructor for most cases
@@ -164,6 +164,9 @@ class DocumentList(list[Document]):
         @public
+        ALWAYS returns a DocumentList (which may be empty), never raises an exception
+        for no matches. Use this when you want to process all matching documents.
         Args:
             arg: Can be one of:
                 - str: Single document name to filter by
@@ -174,7 +177,9 @@ class DocumentList(list[Document]):
                   (list, tuple, set, generator, or any iterable)
         Returns:
-            New DocumentList with filtered documents.
+            New DocumentList with filtered documents (may be empty).
+            - Returns ALL matching documents
+            - Empty DocumentList if no matches found
         Raises:
             TypeError: If arg is not a valid type (not str, type, or iterable),
@@ -182,12 +187,19 @@ class DocumentList(list[Document]):
             AttributeError: If arg is expected to be iterable but doesn't support iteration.
         Example:
-            >>> docs.filter_by("file.txt")  # Filter by single name
-            >>> docs.filter_by(MyDocument)  # Filter by single type
-            >>> docs.filter_by([Doc1, Doc2])  # Filter by multiple types (list)
-            >>> docs.filter_by({"file1.txt", "file2.txt"})  # Filter by multiple names (set)
-            >>> docs.filter_by((SubDoc, AnotherDoc))  # Filter by multiple types (tuple)
-            >>> docs.filter_by(name for name in ["a.txt", "b.txt"])  # Generator expression
+            >>> # Returns list with all matching documents
+            >>> matching_docs = docs.filter_by("file.txt")  # May be empty
+            >>> for doc in matching_docs:
+            ...     process(doc)
+            >>>
+            >>> # Filter by type - returns all instances
+            >>> config_docs = docs.filter_by(ConfigDocument)
+            >>> print(f"Found {len(config_docs)} config documents")
+            >>>
+            >>> # Filter by multiple names
+            >>> important_docs = docs.filter_by(["config.yaml", "settings.json"])
+            >>> if not important_docs:  # Check if empty
+            ...     print("No important documents found")
         """
         if isinstance(arg, str):
             # Filter by single name
@@ -257,38 +269,73 @@ class DocumentList(list[Document]):
     def get_by(self, arg: type[Document], required: bool = True) -> Document | None: ...
     def get_by(self, arg: str | type[Document], required: bool = True) -> Document | None:
-        """Get a single document by name or type.
+        """Get EXACTLY ONE document by name or type.
         @public
+        IMPORTANT: This method expects to find exactly one matching document.
+        - If no matches and required=True: raises ValueError
+        - If no matches and required=False: returns None
+        - If multiple matches: ALWAYS raises ValueError (ambiguous)
+        When required=True (default), you do NOT need to check for None:
+            >>> doc = docs.get_by("config.yaml")  # Will raise if not found
+            >>> # No need for: if doc is not None  <- This is redundant!
+            >>> print(doc.content)  # Safe to use directly
         Args:
             arg: Document name (str) or document type.
-            required: If True, raises ValueError when not found. If False, returns None.
+            required: If True (default), raises ValueError when not found.
+                     If False, returns None when not found.
         Returns:
-            The first matching document, or None if not found and required=False.
+            The single matching document, or None if not found and required=False.
         Raises:
-            ValueError: If required=True and document not found.
+            ValueError: If required=True and document not found, OR if multiple
+                       documents match (ambiguous result).
             TypeError: If arg is not a string or Document type.
         Example:
-            >>> doc = docs.get_by("file.txt")  # Get by name, raises if not found
-            >>> doc = docs.get_by(MyDocument, required=False)  # Returns None if not found
+            >>> # CORRECT - No need to check for None when required=True (default)
+            >>> doc = docs.get_by("file.txt")  # Raises if not found
+            >>> print(doc.content)  # Safe to use directly
+            >>>
+            >>> # When using required=False, check for None
+            >>> doc = docs.get_by("optional.txt", required=False)
+            >>> if doc is not None:
+            ...     print(doc.content)
+            >>>
+            >>> # Will raise if multiple documents have same type
+            >>> # Use filter_by() instead if you want all matches
+            >>> try:
+            ...     doc = docs.get_by(ConfigDocument)  # Error if 2+ configs
+            ... except ValueError as e:
+            ...     configs = docs.filter_by(ConfigDocument)  # Get all instead
         """
         if isinstance(arg, str):
-            # Get by name
-            for doc in self:
-                if doc.name == arg:
-                    return doc
+            # Get by name - collect all matches to check for duplicates
+            matches = [doc for doc in self if doc.name == arg]
+            if len(matches) > 1:
+                raise ValueError(
+                    f"Multiple documents found with name '{arg}'. "
+                    f"Found {len(matches)} matches. Use filter_by() to get all matches."
+                )
+            if matches:
+                return matches[0]
             if required:
                 raise ValueError(f"Document with name '{arg}' not found")
             return None
         elif isinstance(arg, type):  # type: ignore[reportUnnecessaryIsInstance]
-            # Get by type (including subclasses)
-            for doc in self:
-                if isinstance(doc, arg):
-                    return doc
+            # Get by type (including subclasses) - collect all matches
+            matches = [doc for doc in self if isinstance(doc, arg)]
+            if len(matches) > 1:
+                raise ValueError(
+                    f"Multiple documents found of type '{arg.__name__}'. "
+                    f"Found {len(matches)} matches. Use filter_by() to get all matches."
+                )
+            if matches:
+                return matches[0]
             if required:
                 raise ValueError(f"Document of type '{arg.__name__}' not found")
             return None

ai_pipeline_core/documents/flow_document.py CHANGED Viewed

@@ -24,24 +24,20 @@ class FlowDocument(Document):
     - Persisted to file system between pipeline steps
     - Survives across multiple flow runs
     - Used for flow inputs and outputs
-    - Saved in directories named after the document's canonical name
+    - Saved in directories organized by the document's type/name
     Creating FlowDocuments:
         Same as Document - use `create()` for automatic conversion, `__init__` for bytes.
         See Document.create() for detailed usage examples.
     Persistence:
-        Documents are saved to: {output_dir}/{canonical_name}/{filename}
+        Documents are saved under an output directory path associated with the document's type/name.
         For example: output/my_doc/data.json
     Note:
         - Cannot instantiate FlowDocument directly - must subclass
         - Used with FlowConfig to define flow input/output types
         - No additional abstract methods to implement
-    See Also:
-        TaskDocument: For temporary documents within task execution
-        TemporaryDocument: For documents that are never persisted
     """
     def __init__(

ai_pipeline_core/documents/task_document.py CHANGED Viewed

@@ -43,10 +43,6 @@ class TaskDocument(Document):
         - Not saved by simple_runner utilities
         - Reduces I/O overhead for temporary data
         - No additional abstract methods to implement
-    See Also:
-        FlowDocument: For documents that persist across flow runs
-        TemporaryDocument: Alternative for non-persistent documents
     """
     def __init__(

ai_pipeline_core/documents/temporary_document.py CHANGED Viewed

@@ -1,7 +1,5 @@
 """Temporary document implementation for non-persistent data.
-@public
 This module provides the TemporaryDocument class for documents that
 are never persisted, regardless of context.
 """
@@ -15,8 +13,6 @@ from .document import Document
 class TemporaryDocument(Document):
     r"""Concrete document class for data that is never persisted.
-    @public
     TemporaryDocument is a final (non-subclassable) document type for
     data that should never be saved to disk, regardless of whether it's
     used in a flow or task context. Unlike FlowDocument and TaskDocument
@@ -28,6 +24,7 @@ class TemporaryDocument(Document):
     - Cannot be subclassed (annotated with Python's @final decorator in code)
     - Useful for transient data like API responses or intermediate calculations
     - Ignored by simple_runner save operations
+    - Useful for tests and debugging
     Creating TemporaryDocuments:
         Same as Document - use `create()` for automatic conversion, `__init__` for bytes.
@@ -48,10 +45,6 @@ class TemporaryDocument(Document):
         - This is a final class and cannot be subclassed
         - Use when you explicitly want to prevent persistence
         - Useful for sensitive data that shouldn't be written to disk
-    See Also:
-        FlowDocument: For documents that persist across flow runs
-        TaskDocument: For documents temporary within task execution
     """
     def __init_subclass__(cls, **kwargs: Any) -> None:

ai-pipeline-core 0.1.14__py3-none-any.whl → 0.2.0__py3-none-any.whl

ai-pipeline-core 0.1.14__py3-none-any.whl → 0.2.0__py3-none-any.whl