ai-pipeline-core 0.1.10__py3-none-any.whl → 0.1.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ai_pipeline_core/__init__.py +84 -4
- ai_pipeline_core/documents/__init__.py +9 -0
- ai_pipeline_core/documents/document.py +1034 -151
- ai_pipeline_core/documents/document_list.py +147 -38
- ai_pipeline_core/documents/flow_document.py +112 -11
- ai_pipeline_core/documents/mime_type.py +173 -15
- ai_pipeline_core/documents/task_document.py +117 -12
- ai_pipeline_core/documents/temporary_document.py +84 -5
- ai_pipeline_core/documents/utils.py +41 -9
- ai_pipeline_core/exceptions.py +47 -11
- ai_pipeline_core/flow/__init__.py +2 -0
- ai_pipeline_core/flow/config.py +232 -23
- ai_pipeline_core/flow/options.py +50 -1
- ai_pipeline_core/llm/__init__.py +6 -0
- ai_pipeline_core/llm/ai_messages.py +125 -27
- ai_pipeline_core/llm/client.py +278 -26
- ai_pipeline_core/llm/model_options.py +130 -1
- ai_pipeline_core/llm/model_response.py +239 -35
- ai_pipeline_core/llm/model_types.py +67 -0
- ai_pipeline_core/logging/__init__.py +13 -0
- ai_pipeline_core/logging/logging_config.py +72 -20
- ai_pipeline_core/logging/logging_mixin.py +38 -32
- ai_pipeline_core/pipeline.py +308 -60
- ai_pipeline_core/prefect.py +48 -1
- ai_pipeline_core/prompt_manager.py +209 -24
- ai_pipeline_core/settings.py +108 -4
- ai_pipeline_core/simple_runner/__init__.py +5 -0
- ai_pipeline_core/simple_runner/cli.py +96 -11
- ai_pipeline_core/simple_runner/simple_runner.py +237 -4
- ai_pipeline_core/tracing.py +232 -30
- ai_pipeline_core-0.1.11.dist-info/METADATA +450 -0
- ai_pipeline_core-0.1.11.dist-info/RECORD +36 -0
- ai_pipeline_core-0.1.10.dist-info/METADATA +0 -538
- ai_pipeline_core-0.1.10.dist-info/RECORD +0 -36
- {ai_pipeline_core-0.1.10.dist-info → ai_pipeline_core-0.1.11.dist-info}/WHEEL +0 -0
- {ai_pipeline_core-0.1.10.dist-info → ai_pipeline_core-0.1.11.dist-info}/licenses/LICENSE +0 -0
ai_pipeline_core/simple_runner/simple_runner.py
CHANGED
@@ -1,3 +1,40 @@
+"""Simple pipeline runner for local flow execution.
+
+This module provides the core functionality for running AI pipeline flows
+locally without full Prefect orchestration. It handles document I/O,
+flow sequencing, and error management.
+
+Key components:
+    - Document I/O from/to filesystem directories
+    - Single and multi-flow execution
+    - Automatic document validation and passing between flows
+    - Step-based execution control (start/end steps)
+
+Directory structure:
+    working_dir/
+    ├── InputDocument/                  # Documents of type InputDocument
+    │   ├── file1.txt
+    │   └── file1.txt.description.md    # Optional description
+    └── OutputDocument/                 # Documents of type OutputDocument
+        └── result.json
+
+Example:
+    >>> from ai_pipeline_core.simple_runner import run_pipeline
+    >>>
+    >>> # Run single flow
+    >>> results = await run_pipeline(
+    ...     flow_func=MyFlow,
+    ...     config=MyConfig,
+    ...     project_name="test",
+    ...     output_dir=Path("./output"),
+    ...     flow_options=options
+    ... )
+
+Note:
+    Document directories are named using the canonical_name() method
+    of each document type for consistent organization.
+"""
+
 from pathlib import Path
 from typing import Any, Callable, Sequence, Type
 
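The directory convention above keys each subdirectory to the owning document class. A minimal sketch of that mapping, assuming canonical_name() simply echoes the class name (the real method may normalize differently; the classes here are illustrative stand-ins):

    from pathlib import Path

    class FlowDocument:
        @classmethod
        def canonical_name(cls) -> str:
            return cls.__name__  # hypothetical simplification

    class InputDocument(FlowDocument):
        pass

    # ./working_dir/InputDocument, matching the tree shown in the docstring
    doc_dir = Path("./working_dir") / InputDocument.canonical_name()
    doc_dir.mkdir(parents=True, exist_ok=True)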
@@ -9,13 +46,58 @@ from ai_pipeline_core.logging import get_pipeline_logger
 logger = get_pipeline_logger(__name__)
 
 FlowSequence = Sequence[Callable[..., Any]]
+"""Type alias for a sequence of flow functions."""
+
 ConfigSequence = Sequence[Type[FlowConfig]]
+"""Type alias for a sequence of flow configuration classes."""
 
 
 def load_documents_from_directory(
     base_dir: Path, document_types: Sequence[Type[FlowDocument]]
 ) -> DocumentList:
-    """
+    """Load documents from filesystem directories by type.
+
+    Scans subdirectories of base_dir for documents matching the provided
+    types. Each document type has its own subdirectory named after its
+    canonical_name().
+
+    Args:
+        base_dir: Base directory containing document subdirectories.
+        document_types: Sequence of FlowDocument subclasses to load.
+            Each type corresponds to a subdirectory.
+
+    Returns:
+        DocumentList containing all successfully loaded documents.
+        Empty list if no documents found or directories don't exist.
+
+    Directory structure:
+        base_dir/
+        ├── DocumentTypeA/                 # canonical_name() of DocumentTypeA
+        │   ├── doc1.txt
+        │   ├── doc1.txt.description.md    # Optional description file
+        │   └── doc2.json
+        └── DocumentTypeB/
+            └── data.csv
+
+    File handling:
+        - Document content is read as bytes
+        - Optional .description.md files provide document descriptions
+        - Failed loads are logged but don't stop processing
+        - Non-file entries are skipped
+
+    Example:
+        >>> from my_docs import InputDoc, ConfigDoc
+        >>> docs = load_documents_from_directory(
+        ...     Path("./data"),
+        ...     [InputDoc, ConfigDoc]
+        ... )
+        >>> print(f"Loaded {len(docs)} documents")
+
+    Note:
+        - Uses canonical_name() for directory names (e.g., "InputDocument")
+        - Descriptions are loaded from "{filename}.description.md" files
+        - All file types are supported (determined by document class)
+    """
     documents = DocumentList()
 
     for doc_class in document_types:
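A stand-in for the loading behavior documented above: content read as bytes, optional "<name>.description.md" sidecars, non-file entries skipped, and a missing directory yielding an empty result. Doc and load_type_dir are illustrative names, not the library's API:

    from dataclasses import dataclass
    from pathlib import Path

    @dataclass
    class Doc:
        name: str
        content: bytes
        description: str | None = None

    def load_type_dir(type_dir: Path) -> list[Doc]:
        docs: list[Doc] = []
        if not type_dir.is_dir():
            return docs  # missing directory -> empty list, not an error
        for path in sorted(type_dir.iterdir()):
            if not path.is_file() or path.name.endswith(".description.md"):
                continue  # skip sidecars and non-file entries
            sidecar = path.with_name(path.name + ".description.md")
            description = sidecar.read_text() if sidecar.exists() else None
            docs.append(Doc(path.name, path.read_bytes(), description))
        return docs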
@@ -49,7 +131,44 @@ def load_documents_from_directory(
 
 
 def save_documents_to_directory(base_dir: Path, documents: DocumentList) -> None:
-    """
+    """Save documents to filesystem directories by type.
+
+    Creates subdirectories under base_dir for each document type and
+    saves documents with their original filenames. Only FlowDocument
+    instances are saved (temporary documents are skipped).
+
+    Args:
+        base_dir: Base directory for saving document subdirectories.
+            Created if it doesn't exist.
+        documents: DocumentList containing documents to save.
+            Non-FlowDocument instances are silently skipped.
+
+    Side effects:
+        - Creates base_dir and subdirectories as needed
+        - Overwrites existing files with the same name
+        - Logs each saved document
+        - Creates .description.md files for documents with descriptions
+
+    Directory structure created:
+        base_dir/
+        └── DocumentType/                     # canonical_name() of document
+            ├── output.json                   # Document content
+            └── output.json.description.md    # Optional description
+
+    Example:
+        >>> docs = DocumentList([
+        ...     OutputDoc(name="result.txt", content=b"data"),
+        ...     OutputDoc(name="stats.json", content=b'{...}')
+        ... ])
+        >>> save_documents_to_directory(Path("./output"), docs)
+        >>> # Creates ./output/OutputDocument/result.txt
+        >>> # and ./output/OutputDocument/stats.json
+
+    Note:
+        - Only FlowDocument subclasses are saved
+        - TaskDocument and other temporary documents are skipped
+        - Descriptions are saved as separate .description.md files
+    """
     for document in documents:
         if not isinstance(document, FlowDocument):
             continue
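The inverse operation, mirroring the documented save semantics: mkdir -p per type directory, silent overwrite on name collision, and a sidecar file only when a description exists. save_type_dir is an illustrative name:

    from pathlib import Path

    def save_type_dir(type_dir: Path, docs: list[tuple[str, bytes, str | None]]) -> None:
        type_dir.mkdir(parents=True, exist_ok=True)
        for name, content, description in docs:
            (type_dir / name).write_bytes(content)  # overwrites an existing file
            if description is not None:
                (type_dir / f"{name}.description.md").write_text(description)

    save_type_dir(Path("./output/OutputDocument"), [("result.txt", b"data", None)])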
@@ -75,7 +194,61 @@ async def run_pipeline(
     flow_options: FlowOptions,
     flow_name: str | None = None,
 ) -> DocumentList:
-    """Execute a single pipeline flow.
+    """Execute a single pipeline flow with document I/O.
+
+    Runs a flow function with automatic document loading, validation,
+    and saving. The flow receives input documents from the filesystem
+    and saves its output for subsequent flows.
+
+    The execution proceeds through these steps:
+    1. Load input documents from output_dir subdirectories
+    2. Validate input documents against config requirements
+    3. Execute flow function with documents and options
+    4. Validate output documents match config.OUTPUT_DOCUMENT_TYPE
+    5. Save output documents to output_dir subdirectories
+
+    Args:
+        flow_func: Async flow function decorated with @pipeline_flow.
+            Must accept (project_name, documents, flow_options).
+
+        config: FlowConfig subclass defining input/output document types.
+            Used for validation and directory organization.
+
+        project_name: Name of the project/pipeline for logging and tracking.
+
+        output_dir: Directory for loading input and saving output documents.
+            Document subdirectories are created as needed.
+
+        flow_options: Configuration options passed to the flow function.
+            Can be FlowOptions or any subclass.
+
+        flow_name: Optional display name for logging. If None, uses
+            flow_func.name or flow_func.__name__.
+
+    Returns:
+        DocumentList containing the flow's output documents.
+
+    Raises:
+        RuntimeError: If required input documents are missing.
+
+    Example:
+        >>> from my_flows import AnalysisFlow, AnalysisConfig
+        >>>
+        >>> results = await run_pipeline(
+        ...     flow_func=AnalysisFlow,
+        ...     config=AnalysisConfig,
+        ...     project_name="analysis_001",
+        ...     output_dir=Path("./results"),
+        ...     flow_options=FlowOptions(temperature=0.7)
+        ... )
+        >>> print(f"Generated {len(results)} documents")
+
+    Note:
+        - Flow must be async (decorated with @pipeline_flow)
+        - Input documents are loaded based on config.INPUT_DOCUMENT_TYPES
+        - Output is validated against config.OUTPUT_DOCUMENT_TYPE
+        - All I/O is logged for debugging
+    """
     if flow_name is None:
         # For Prefect Flow objects, use their name attribute
         # For regular functions, fall back to __name__
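The five documented steps, compressed into a sketch; load, validate_in, validate_out, and save are hypothetical stand-ins for this module's helpers and the FlowConfig validation hooks:

    from pathlib import Path
    from typing import Any, Awaitable, Callable

    async def run_once(
        flow_func: Callable[[Any], Awaitable[Any]],
        load: Callable[[Path], Any],
        validate_in: Callable[[Any], None],
        validate_out: Callable[[Any], None],
        save: Callable[[Path, Any], None],
        out_dir: Path,
    ) -> Any:
        docs = load(out_dir)             # 1. load inputs from subdirectories
        validate_in(docs)                # 2. check required input types
        results = await flow_func(docs)  # 3. execute the flow
        validate_out(results)            # 4. check OUTPUT_DOCUMENT_TYPE
        save(out_dir, results)           # 5. persist for the next flow
        return results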
@@ -108,7 +281,67 @@ async def run_pipelines(
     start_step: int = 1,
     end_step: int | None = None,
 ) -> None:
-    """
+    """Execute multiple pipeline flows in sequence.
+
+    Runs a series of flows where each flow's output becomes the input
+    for the next flow. Supports partial execution with start/end steps
+    for debugging and resuming failed pipelines.
+
+    Execution proceeds by:
+    1. Validating step indices and sequence lengths
+    2. For each flow in range [start_step, end_step]:
+       a. Loading input documents from output_dir
+       b. Executing flow with documents
+       c. Saving output documents to output_dir
+       d. Output becomes input for next flow
+    3. Logging progress and any failures
+
+    Steps are 1-based for user convenience. Step 1 is the first flow,
+    step N is the Nth flow. Use start_step > 1 to skip initial flows
+    and end_step < N to stop early.
+
+    Args:
+        project_name: Name of the overall pipeline/project.
+        output_dir: Directory for document I/O between flows.
+            Shared by all flows in the sequence.
+        flows: Sequence of flow functions to execute in order.
+            Must all be async functions decorated with @pipeline_flow.
+        flow_configs: Sequence of FlowConfig classes corresponding to flows.
+            Must have the same length as the flows sequence.
+        flow_options: Options passed to all flows in the sequence.
+            Individual flows can use different fields.
+        start_step: First flow to execute (1-based index).
+            Default 1 starts from the beginning.
+        end_step: Last flow to execute (1-based index).
+            None runs through the last flow.
+
+    Raises:
+        ValueError: If flows and configs have different lengths, or if
+            start_step or end_step are out of range.
+
+    Example:
+        >>> # Run full pipeline
+        >>> await run_pipelines(
+        ...     project_name="analysis",
+        ...     output_dir=Path("./work"),
+        ...     flows=[ExtractFlow, AnalyzeFlow, SummarizeFlow],
+        ...     flow_configs=[ExtractConfig, AnalyzeConfig, SummaryConfig],
+        ...     flow_options=options
+        ... )
+        >>>
+        >>> # Run only steps 2-3 (skip extraction)
+        >>> await run_pipelines(
+        ...     ...,
+        ...     start_step=2,
+        ...     end_step=3
+        ... )
+
+    Note:
+        - Each flow's output must match the next flow's input types
+        - Failed flows stop the entire pipeline
+        - Progress is logged with step numbers for debugging
+        - Documents persist in output_dir between runs
+    """
     if len(flows) != len(flow_configs):
         raise ValueError("The number of flows and flow configs must match.")
 
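The 1-based start/end convention maps onto Python's 0-based slicing with a start_step - 1 offset; the bounds checks below mirror the ValueError conditions documented above. select_steps is an illustrative name:

    def select_steps(flows: list, configs: list, start_step: int = 1,
                     end_step: int | None = None) -> list[tuple]:
        if len(flows) != len(configs):
            raise ValueError("The number of flows and flow configs must match.")
        end = len(flows) if end_step is None else end_step
        if not (1 <= start_step <= end <= len(flows)):
            raise ValueError("start_step/end_step out of range")
        return list(zip(flows, configs))[start_step - 1 : end]

    # Steps 2-3 of a three-flow pipeline, skipping the first flow
    assert select_steps(["a", "b", "c"], [1, 2, 3], 2, 3) == [("b", 2), ("c", 3)]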
ai_pipeline_core/tracing.py
CHANGED
@@ -1,9 +1,11 @@
 """Tracing utilities that integrate Laminar (``lmnr``) with our code-base.
 
-
-
-
-``
+@public
+
+This module centralizes:
+- ``TraceInfo`` - a small helper object for propagating contextual metadata.
+- ``trace`` decorator - augments a callable with Laminar tracing, automatic
+  ``observe`` instrumentation, and optional support for test runs.
 """
 
 from __future__ import annotations
@@ -25,13 +27,66 @@ P = ParamSpec("P")
 R = TypeVar("R")
 
 TraceLevel = Literal["always", "debug", "off"]
+"""Control level for tracing activation.
+
+@public
+
+Values:
+    - "always": Always trace (default, production mode)
+    - "debug": Only trace when LMNR_DEBUG == "true"
+    - "off": Disable tracing completely
+"""
 
 
 # ---------------------------------------------------------------------------
 # ``TraceInfo`` – metadata container
 # ---------------------------------------------------------------------------
 class TraceInfo(BaseModel):
-    """
+    """Container for propagating trace context through the pipeline.
+
+    TraceInfo provides a structured way to pass tracing metadata through
+    function calls, ensuring consistent observability across the entire
+    execution flow. It integrates with Laminar (LMNR) for distributed
+    tracing and debugging.
+
+    Attributes:
+        session_id: Unique identifier for the current session/conversation.
+            Falls back to LMNR_SESSION_ID environment variable.
+        user_id: Identifier for the user triggering the operation.
+            Falls back to LMNR_USER_ID environment variable.
+        metadata: Key-value pairs for additional trace context.
+            Useful for filtering and searching in LMNR dashboard.
+        tags: List of tags for categorizing traces (e.g., ["production", "v2"]).
+
+    Environment fallbacks:
+        - LMNR_SESSION_ID: Default session_id if not explicitly set
+        - LMNR_USER_ID: Default user_id if not explicitly set
+        - LMNR_DEBUG: Controls debug-level tracing when set to "true"
+
+    Note: These variables are read directly by the tracing layer and are
+    not part of the Settings configuration.
+
+    Example:
+        >>> # Create trace context
+        >>> trace_info = TraceInfo(
+        ...     session_id="sess_123",
+        ...     user_id="user_456",
+        ...     metadata={"flow": "document_analysis", "version": "1.2"},
+        ...     tags=["production", "high_priority"]
+        ... )
+        >>>
+        >>> # Pass through function calls
+        >>> @trace
+        ... async def process(data, trace_info: TraceInfo):
+        ...     # TraceInfo automatically propagates to nested calls
+        ...     result = await analyze(data, trace_info=trace_info)
+        ...     return result
+
+    Note:
+        TraceInfo is typically created at the entry point of a flow
+        and passed through all subsequent function calls for
+        consistent tracing context.
+    """
 
     session_id: str | None = None
     user_id: str | None = None
@@ -39,7 +94,30 @@ class TraceInfo(BaseModel):
     tags: list[str] = []
 
     def get_observe_kwargs(self) -> dict[str, Any]:
-        """
+        """Convert TraceInfo to kwargs for Laminar's observe decorator.
+
+        Transforms the TraceInfo fields into the format expected by
+        the lmnr.observe() decorator, applying environment variable
+        fallbacks for session_id and user_id.
+
+        Returns:
+            Dictionary with keys:
+            - session_id: From field or LMNR_SESSION_ID env var
+            - user_id: From field or LMNR_USER_ID env var
+            - metadata: Dictionary of custom metadata (if set)
+            - tags: List of tags (if set)
+
+            Only non-empty values are included in the output.
+
+        Example:
+            >>> trace_info = TraceInfo(session_id="sess_123", tags=["test"])
+            >>> kwargs = trace_info.get_observe_kwargs()
+            >>> # Returns: {"session_id": "sess_123", "tags": ["test"]}
+
+        Note:
+            This method is called internally by the trace decorator
+            to configure Laminar observation parameters.
+        """
        kwargs: dict[str, Any] = {}
 
        # Use environment variable fallback for session_id
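The documented fallback order, in isolation: explicit field first, then the LMNR_* environment variable, with empty values omitted from the result. A sketch of the same assembly outside the pydantic model:

    import os
    from typing import Any

    def observe_kwargs(session_id: str | None = None, user_id: str | None = None,
                       metadata: dict[str, Any] | None = None,
                       tags: list[str] | None = None) -> dict[str, Any]:
        kwargs: dict[str, Any] = {}
        if sid := (session_id or os.getenv("LMNR_SESSION_ID")):
            kwargs["session_id"] = sid
        if uid := (user_id or os.getenv("LMNR_USER_ID")):
            kwargs["user_id"] = uid
        if metadata:
            kwargs["metadata"] = metadata
        if tags:
            kwargs["tags"] = tags
        return kwargs

    # With no LMNR_* variables set in the environment:
    assert observe_kwargs(session_id="sess_123", tags=["test"]) == {
        "session_id": "sess_123", "tags": ["test"]}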
@@ -65,7 +143,21 @@ class TraceInfo(BaseModel):
 
 
 def _initialise_laminar() -> None:
-    """
+    """Initialize Laminar SDK with project configuration.
+
+    Sets up the Laminar observability client with the project API key
+    from settings. Disables automatic OpenAI instrumentation to avoid
+    conflicts with our custom tracing.
+
+    Configuration:
+        - Uses settings.lmnr_project_api_key for authentication
+        - Disables OPENAI instrument to prevent double-tracing
+        - Called automatically by trace decorator on first use
+
+    Note:
+        This is an internal function called once per process.
+        Multiple calls are safe (Laminar handles idempotency).
+    """
     if settings.lmnr_project_api_key:
         Laminar.initialize(
             project_api_key=settings.lmnr_project_api_key,
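An init-once guard of the kind described, with the pattern made explicit; the real function delegates idempotency to Laminar and reads the key from Settings rather than the raw environment:

    import os

    _initialized = False

    def init_tracing() -> None:
        global _initialized
        if _initialized or not os.getenv("LMNR_PROJECT_API_KEY"):
            return  # already set up, or no key -> tracing stays disabled
        # Laminar.initialize(project_api_key=...) would be invoked here
        _initialized = True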
@@ -118,38 +210,135 @@ def trace(
     ignore_exceptions: bool = False,
     preserve_global_context: bool = True,
 ) -> Callable[[Callable[P, R]], Callable[P, R]] | Callable[P, R]:
-    """
+    """Add Laminar observability tracing to any function.
+
+    @public
+
+    The trace decorator integrates functions with Laminar (LMNR) for
+    distributed tracing, performance monitoring, and debugging. It
+    automatically handles both sync and async functions, propagates
+    trace context, and provides fine-grained control over what gets traced.
+
+    USAGE GUIDELINE - Defaults First:
+        In 90% of cases, use WITHOUT any parameters.
+        The defaults are optimized for most use cases.
 
     Args:
-        func:
-
-
-        - "
-        - "
-
-
-
-
-
-
-
-
-
-
-
+        func: Function to trace (when used without parentheses: @trace).
+
+        level: Controls when tracing is active:
+            - "always": Always trace (default, production mode)
+            - "debug": Only trace when LMNR_DEBUG == "true"
+            - "off": Disable tracing completely
+
+        name: Custom span name in traces (defaults to function.__name__).
+            Use descriptive names for better trace readability.
+
+        session_id: Override session ID for this function's traces.
+            Typically propagated via TraceInfo instead.
+
+        user_id: Override user ID for this function's traces.
+            Typically propagated via TraceInfo instead.
+
+        metadata: Additional key-value metadata attached to spans.
+            Searchable in LMNR dashboard. Merged with TraceInfo metadata.
+
+        tags: List of tags for categorizing spans (e.g., ["api", "critical"]).
+            Merged with TraceInfo tags.
+
+        span_type: Semantic type of the span (e.g., "LLM", "CHAIN", "TOOL").
+            Affects visualization in LMNR dashboard.
+
+        ignore_input: Don't record function inputs in trace (privacy/size).
+
+        ignore_output: Don't record function output in trace (privacy/size).
+
+        ignore_inputs: List of parameter names to exclude from trace.
+            Useful for sensitive data like API keys.
+
+        input_formatter: Custom function to format inputs for tracing.
+            Receives all function args, returns display string.
+
+        output_formatter: Custom function to format output for tracing.
+            Receives function result, returns display string.
+
+        ignore_exceptions: Don't record exceptions in traces (default False).
+
+        preserve_global_context: Maintain Laminar's global context across
+            calls (default True). Set False for isolated traces.
 
     Returns:
-
+        Decorated function with same signature but added tracing.
+
+    TraceInfo propagation:
+        If the decorated function has a 'trace_info' parameter, the decorator
+        automatically creates or propagates a TraceInfo instance, ensuring
+        consistent session/user tracking across the call chain.
+
+    Example:
+        >>> # RECOMMENDED - No parameters needed for most cases!
+        >>> @trace
+        ... async def process_document(doc):
+        ...     return await analyze(doc)
+        >>>
+        >>> # With parameters (RARE - only when specifically needed):
+        >>> @trace(level="debug")  # Only for debug-specific tracing
+        ... async def debug_operation():
+        ...     pass
+
+        >>> @trace(ignore_inputs=["api_key"])  # Only for sensitive data
+        ... async def api_call(data, api_key):
+        ...     return await external_api(data, api_key)
+        >>>
+        >>> # AVOID unnecessary configuration - defaults handle:
+        >>> # - Automatic naming from function name
+        >>> # - Standard trace level ("always")
+        >>> # - Full input/output capture
+        >>> # - Proper span type inference
+        >>>
+        >>> # Custom formatting
+        >>> @trace(
+        ...     input_formatter=lambda doc: f"Document: {doc.id}",
+        ...     output_formatter=lambda res: f"Results: {len(res)} items"
+        ... )
+        ... def analyze(doc):
+        ...     return results
+
+    Environment variables:
+        - LMNR_DEBUG: Set to "true" to enable debug-level traces
+        - LMNR_SESSION_ID: Default session ID if not in TraceInfo
+        - LMNR_USER_ID: Default user ID if not in TraceInfo
+        - LMNR_PROJECT_API_KEY: Required for trace submission
+
+    Performance:
+        - Tracing overhead is minimal (~1-2ms per call)
+        - When level="off", decorator returns original function unchanged
+        - Large inputs/outputs can be excluded with ignore_* parameters
+
+    Note:
+        - Automatically initializes Laminar on first use
+        - Works with both sync and async functions
+        - Preserves function signature and metadata
+        - Thread-safe and async-safe
+
+    See Also:
+        - TraceInfo: Container for trace metadata
+        - pipeline_task: Task decorator with built-in tracing
+        - pipeline_flow: Flow decorator with built-in tracing
     """
-
     if level == "off":
         if func:
             return func
         return lambda f: f
 
     def decorator(f: Callable[P, R]) -> Callable[P, R]:
-
-
+        """Apply tracing to the target function.
+
+        Returns:
+            Wrapped function with LMNR observability.
+        """
+        # Handle 'debug' level logic - only trace when LMNR_DEBUG is "true"
+        if level == "debug" and os.getenv("LMNR_DEBUG", "").lower() != "true":
            return f
 
        # --- Pre-computation (done once when the function is decorated) ---
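The dual calling convention documented above (bare @trace vs. @trace(...) with parameters) reduces to this skeleton; it is a sync-only sketch, and the real decorator additionally builds lmnr.observe parameters and handles async functions:

    from functools import wraps
    from typing import Any, Callable

    def trace(func: Callable | None = None, *, level: str = "always") -> Any:
        def decorator(f: Callable) -> Callable:
            if level == "off":
                return f  # "off" returns the function unchanged, as documented
            @wraps(f)
            def wrapper(*args: Any, **kwargs: Any) -> Any:
                return f(*args, **kwargs)  # observation would wrap this call
            return wrapper
        # Bare @trace passes the function directly; @trace(...) passes None
        return decorator(func) if func is not None else decorator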
@@ -175,9 +364,12 @@ def trace(
 
        # --- Helper function for runtime logic ---
        def _prepare_and_get_observe_params(runtime_kwargs: dict[str, Any]) -> dict[str, Any]:
-            """
-
+            """Inspects runtime args, manages TraceInfo, and returns params for lmnr.observe.
+
             Modifies runtime_kwargs in place to inject TraceInfo if the function expects it.
+
+            Returns:
+                Dictionary of parameters for lmnr.observe decorator.
             """
             trace_info = runtime_kwargs.get("trace_info")
             if not isinstance(trace_info, TraceInfo):
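The in-place kwargs mutation described above, isolated: when the wrapped function expects a trace_info parameter but the caller did not supply one, a fresh TraceInfo is injected so nested calls share the same context. TraceInfo here is a simplified stand-in for the pydantic model:

    from typing import Any

    class TraceInfo:  # simplified stand-in
        def __init__(self, session_id: str | None = None):
            self.session_id = session_id

    def prepare(runtime_kwargs: dict[str, Any]) -> dict[str, Any]:
        trace_info = runtime_kwargs.get("trace_info")
        if not isinstance(trace_info, TraceInfo):
            trace_info = TraceInfo()
            runtime_kwargs["trace_info"] = trace_info  # mutate in place, as documented
        return {"session_id": trace_info.session_id}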
@@ -223,12 +415,22 @@ def trace(
         # --- The actual wrappers ---
         @wraps(f)
         def sync_wrapper(*args: P.args, **kwargs: P.kwargs) -> R:
+            """Synchronous wrapper for traced function.
+
+            Returns:
+                The result of the wrapped function.
+            """
             observe_params = _prepare_and_get_observe_params(kwargs)
             observed_func = _observe(**observe_params)(f)
             return observed_func(*args, **kwargs)
 
         @wraps(f)
         async def async_wrapper(*args: P.args, **kwargs: P.kwargs) -> R:
+            """Asynchronous wrapper for traced function.
+
+            Returns:
+                The result of the wrapped function.
+            """
             observe_params = _prepare_and_get_observe_params(kwargs)
             observed_func = _observe(**observe_params)(f)
             return await observed_func(*args, **kwargs)