ai-pipeline-core 0.4.7.tar.gz → 0.4.9.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (78)
  1. {ai_pipeline_core-0.4.7 → ai_pipeline_core-0.4.9}/PKG-INFO +2 -1
  2. {ai_pipeline_core-0.4.7 → ai_pipeline_core-0.4.9}/ai_pipeline_core/__init__.py +1 -1
  3. {ai_pipeline_core-0.4.7 → ai_pipeline_core-0.4.9}/ai_pipeline_core/deployment/base.py +26 -2
  4. {ai_pipeline_core-0.4.7 → ai_pipeline_core-0.4.9}/ai_pipeline_core/llm/client.py +8 -0
  5. ai_pipeline_core-0.4.9/ai_pipeline_core/llm/validation.py +176 -0
  6. {ai_pipeline_core-0.4.7 → ai_pipeline_core-0.4.9}/ai_pipeline_core/observability/_initialization.py +4 -3
  7. {ai_pipeline_core-0.4.7 → ai_pipeline_core-0.4.9}/ai_pipeline_core/observability/tracing.py +39 -7
  8. {ai_pipeline_core-0.4.7 → ai_pipeline_core-0.4.9}/pyproject.toml +2 -1
  9. {ai_pipeline_core-0.4.7 → ai_pipeline_core-0.4.9}/.gitignore +0 -0
  10. {ai_pipeline_core-0.4.7 → ai_pipeline_core-0.4.9}/LICENSE +0 -0
  11. {ai_pipeline_core-0.4.7 → ai_pipeline_core-0.4.9}/README.md +0 -0
  12. {ai_pipeline_core-0.4.7 → ai_pipeline_core-0.4.9}/ai_pipeline_core/deployment/__init__.py +0 -0
  13. {ai_pipeline_core-0.4.7 → ai_pipeline_core-0.4.9}/ai_pipeline_core/deployment/contract.py +0 -0
  14. {ai_pipeline_core-0.4.7 → ai_pipeline_core-0.4.9}/ai_pipeline_core/deployment/deploy.py +0 -0
  15. {ai_pipeline_core-0.4.7 → ai_pipeline_core-0.4.9}/ai_pipeline_core/deployment/helpers.py +0 -0
  16. {ai_pipeline_core-0.4.7 → ai_pipeline_core-0.4.9}/ai_pipeline_core/deployment/progress.py +0 -0
  17. {ai_pipeline_core-0.4.7 → ai_pipeline_core-0.4.9}/ai_pipeline_core/deployment/remote.py +0 -0
  18. {ai_pipeline_core-0.4.7 → ai_pipeline_core-0.4.9}/ai_pipeline_core/docs_generator/__init__.py +0 -0
  19. {ai_pipeline_core-0.4.7 → ai_pipeline_core-0.4.9}/ai_pipeline_core/docs_generator/__main__.py +0 -0
  20. {ai_pipeline_core-0.4.7 → ai_pipeline_core-0.4.9}/ai_pipeline_core/docs_generator/cli.py +0 -0
  21. {ai_pipeline_core-0.4.7 → ai_pipeline_core-0.4.9}/ai_pipeline_core/docs_generator/extractor.py +0 -0
  22. {ai_pipeline_core-0.4.7 → ai_pipeline_core-0.4.9}/ai_pipeline_core/docs_generator/guide_builder.py +0 -0
  23. {ai_pipeline_core-0.4.7 → ai_pipeline_core-0.4.9}/ai_pipeline_core/docs_generator/trimmer.py +0 -0
  24. {ai_pipeline_core-0.4.7 → ai_pipeline_core-0.4.9}/ai_pipeline_core/docs_generator/validator.py +0 -0
  25. {ai_pipeline_core-0.4.7 → ai_pipeline_core-0.4.9}/ai_pipeline_core/document_store/__init__.py +0 -0
  26. {ai_pipeline_core-0.4.7 → ai_pipeline_core-0.4.9}/ai_pipeline_core/document_store/_summary.py +0 -0
  27. {ai_pipeline_core-0.4.7 → ai_pipeline_core-0.4.9}/ai_pipeline_core/document_store/_summary_worker.py +0 -0
  28. {ai_pipeline_core-0.4.7 → ai_pipeline_core-0.4.9}/ai_pipeline_core/document_store/clickhouse.py +0 -0
  29. {ai_pipeline_core-0.4.7 → ai_pipeline_core-0.4.9}/ai_pipeline_core/document_store/factory.py +0 -0
  30. {ai_pipeline_core-0.4.7 → ai_pipeline_core-0.4.9}/ai_pipeline_core/document_store/local.py +0 -0
  31. {ai_pipeline_core-0.4.7 → ai_pipeline_core-0.4.9}/ai_pipeline_core/document_store/memory.py +0 -0
  32. {ai_pipeline_core-0.4.7 → ai_pipeline_core-0.4.9}/ai_pipeline_core/document_store/protocol.py +0 -0
  33. {ai_pipeline_core-0.4.7 → ai_pipeline_core-0.4.9}/ai_pipeline_core/documents/__init__.py +0 -0
  34. {ai_pipeline_core-0.4.7 → ai_pipeline_core-0.4.9}/ai_pipeline_core/documents/_context_vars.py +0 -0
  35. {ai_pipeline_core-0.4.7 → ai_pipeline_core-0.4.9}/ai_pipeline_core/documents/_hashing.py +0 -0
  36. {ai_pipeline_core-0.4.7 → ai_pipeline_core-0.4.9}/ai_pipeline_core/documents/attachment.py +0 -0
  37. {ai_pipeline_core-0.4.7 → ai_pipeline_core-0.4.9}/ai_pipeline_core/documents/context.py +0 -0
  38. {ai_pipeline_core-0.4.7 → ai_pipeline_core-0.4.9}/ai_pipeline_core/documents/document.py +0 -0
  39. {ai_pipeline_core-0.4.7 → ai_pipeline_core-0.4.9}/ai_pipeline_core/documents/mime_type.py +0 -0
  40. {ai_pipeline_core-0.4.7 → ai_pipeline_core-0.4.9}/ai_pipeline_core/documents/utils.py +0 -0
  41. {ai_pipeline_core-0.4.7 → ai_pipeline_core-0.4.9}/ai_pipeline_core/exceptions.py +0 -0
  42. {ai_pipeline_core-0.4.7 → ai_pipeline_core-0.4.9}/ai_pipeline_core/images/__init__.py +0 -0
  43. {ai_pipeline_core-0.4.7 → ai_pipeline_core-0.4.9}/ai_pipeline_core/images/_processing.py +0 -0
  44. {ai_pipeline_core-0.4.7 → ai_pipeline_core-0.4.9}/ai_pipeline_core/llm/__init__.py +0 -0
  45. {ai_pipeline_core-0.4.7 → ai_pipeline_core-0.4.9}/ai_pipeline_core/llm/ai_messages.py +0 -0
  46. {ai_pipeline_core-0.4.7 → ai_pipeline_core-0.4.9}/ai_pipeline_core/llm/model_options.py +0 -0
  47. {ai_pipeline_core-0.4.7 → ai_pipeline_core-0.4.9}/ai_pipeline_core/llm/model_response.py +0 -0
  48. {ai_pipeline_core-0.4.7 → ai_pipeline_core-0.4.9}/ai_pipeline_core/llm/model_types.py +0 -0
  49. {ai_pipeline_core-0.4.7 → ai_pipeline_core-0.4.9}/ai_pipeline_core/logging/__init__.py +0 -0
  50. {ai_pipeline_core-0.4.7 → ai_pipeline_core-0.4.9}/ai_pipeline_core/logging/logging.yml +0 -0
  51. {ai_pipeline_core-0.4.7 → ai_pipeline_core-0.4.9}/ai_pipeline_core/logging/logging_config.py +0 -0
  52. {ai_pipeline_core-0.4.7 → ai_pipeline_core-0.4.9}/ai_pipeline_core/logging/logging_mixin.py +0 -0
  53. {ai_pipeline_core-0.4.7 → ai_pipeline_core-0.4.9}/ai_pipeline_core/observability/__init__.py +0 -0
  54. {ai_pipeline_core-0.4.7 → ai_pipeline_core-0.4.9}/ai_pipeline_core/observability/_debug/__init__.py +0 -0
  55. {ai_pipeline_core-0.4.7 → ai_pipeline_core-0.4.9}/ai_pipeline_core/observability/_debug/_auto_summary.py +0 -0
  56. {ai_pipeline_core-0.4.7 → ai_pipeline_core-0.4.9}/ai_pipeline_core/observability/_debug/_config.py +0 -0
  57. {ai_pipeline_core-0.4.7 → ai_pipeline_core-0.4.9}/ai_pipeline_core/observability/_debug/_content.py +0 -0
  58. {ai_pipeline_core-0.4.7 → ai_pipeline_core-0.4.9}/ai_pipeline_core/observability/_debug/_processor.py +0 -0
  59. {ai_pipeline_core-0.4.7 → ai_pipeline_core-0.4.9}/ai_pipeline_core/observability/_debug/_summary.py +0 -0
  60. {ai_pipeline_core-0.4.7 → ai_pipeline_core-0.4.9}/ai_pipeline_core/observability/_debug/_types.py +0 -0
  61. {ai_pipeline_core-0.4.7 → ai_pipeline_core-0.4.9}/ai_pipeline_core/observability/_debug/_writer.py +0 -0
  62. {ai_pipeline_core-0.4.7 → ai_pipeline_core-0.4.9}/ai_pipeline_core/observability/_document_tracking.py +0 -0
  63. {ai_pipeline_core-0.4.7 → ai_pipeline_core-0.4.9}/ai_pipeline_core/observability/_logging_bridge.py +0 -0
  64. {ai_pipeline_core-0.4.7 → ai_pipeline_core-0.4.9}/ai_pipeline_core/observability/_summary.py +0 -0
  65. {ai_pipeline_core-0.4.7 → ai_pipeline_core-0.4.9}/ai_pipeline_core/observability/_tracking/__init__.py +0 -0
  66. {ai_pipeline_core-0.4.7 → ai_pipeline_core-0.4.9}/ai_pipeline_core/observability/_tracking/_client.py +0 -0
  67. {ai_pipeline_core-0.4.7 → ai_pipeline_core-0.4.9}/ai_pipeline_core/observability/_tracking/_internal.py +0 -0
  68. {ai_pipeline_core-0.4.7 → ai_pipeline_core-0.4.9}/ai_pipeline_core/observability/_tracking/_models.py +0 -0
  69. {ai_pipeline_core-0.4.7 → ai_pipeline_core-0.4.9}/ai_pipeline_core/observability/_tracking/_processor.py +0 -0
  70. {ai_pipeline_core-0.4.7 → ai_pipeline_core-0.4.9}/ai_pipeline_core/observability/_tracking/_service.py +0 -0
  71. {ai_pipeline_core-0.4.7 → ai_pipeline_core-0.4.9}/ai_pipeline_core/observability/_tracking/_writer.py +0 -0
  72. {ai_pipeline_core-0.4.7 → ai_pipeline_core-0.4.9}/ai_pipeline_core/pipeline/__init__.py +0 -0
  73. {ai_pipeline_core-0.4.7 → ai_pipeline_core-0.4.9}/ai_pipeline_core/pipeline/decorators.py +0 -0
  74. {ai_pipeline_core-0.4.7 → ai_pipeline_core-0.4.9}/ai_pipeline_core/pipeline/options.py +0 -0
  75. {ai_pipeline_core-0.4.7 → ai_pipeline_core-0.4.9}/ai_pipeline_core/prompt_manager.py +0 -0
  76. {ai_pipeline_core-0.4.7 → ai_pipeline_core-0.4.9}/ai_pipeline_core/py.typed +0 -0
  77. {ai_pipeline_core-0.4.7 → ai_pipeline_core-0.4.9}/ai_pipeline_core/settings.py +0 -0
  78. {ai_pipeline_core-0.4.7 → ai_pipeline_core-0.4.9}/ai_pipeline_core/testing.py +0 -0

PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: ai-pipeline-core
-Version: 0.4.7
+Version: 0.4.9
 Summary: Core utilities for AI-powered processing pipelines using prefect
 Project-URL: Homepage, https://github.com/bbarwik/ai-pipeline-core
 Project-URL: Repository, https://github.com/bbarwik/ai-pipeline-core
@@ -29,6 +29,7 @@ Requires-Dist: prefect-gcp>=0.6.15
 Requires-Dist: prefect>=3.6.15
 Requires-Dist: pydantic-settings>=2.12.0
 Requires-Dist: pydantic>=2.12.5
+Requires-Dist: pypdf>=5.0.0
 Requires-Dist: python-magic>=0.4.27
 Requires-Dist: ruamel-yaml>=0.19.1
 Requires-Dist: tiktoken>=0.12.0

ai_pipeline_core/__init__.py
@@ -64,7 +64,7 @@ from .prompt_manager import PromptManager
 from .settings import Settings
 from .testing import disable_run_logger, prefect_test_harness
 
-__version__ = "0.4.6"
+__version__ = "0.4.9"
 
 __all__ = [
     "AIMessageType",

ai_pipeline_core/deployment/base.py
@@ -661,7 +661,10 @@ class PipelineDeployment(Generic[TOptions, TResult]):
         except Exception as e:
             logger.warning(f"Failed to initialize observability: {e}")
         with contextlib.suppress(Exception):
-            Laminar.initialize(export_timeout_seconds=15)
+            # Use canonical initializer to ensure consistent Laminar setup
+            from ai_pipeline_core.observability import tracing
+
+            tracing._initialise_laminar()
 
         deployment = self
 
@@ -882,13 +885,34 @@ class PipelineDeployment(Generic[TOptions, TResult]):
            options: FlowOptions,
            context: DeploymentContext,
        ) -> DeploymentResult:
+           # Initialize observability for remote workers
+           try:
+               initialize_observability()
+           except Exception as e:
+               logger.warning(f"Failed to initialize observability: {e}")
+           with contextlib.suppress(Exception):
+               # Use canonical initializer to ensure consistent Laminar setup
+               from ai_pipeline_core.observability import tracing
+
+               tracing._initialise_laminar()
+
+           # Set session ID from Prefect flow run for trace grouping
+           flow_run_id = str(runtime.flow_run.get_id()) if runtime.flow_run else str(uuid4())  # pyright: ignore[reportAttributeAccessIssue, reportUnknownMemberType, reportUnknownArgumentType]
+           os.environ["LMNR_SESSION_ID"] = flow_run_id
+
            store = create_document_store(
                settings,
                summary_generator=_build_summary_generator(),
            )
            set_document_store(store)
            try:
-               return await deployment.run(project_name, documents, cast(Any, options), context)
+               # Create parent span to group all traces under a single deployment trace
+               with Laminar.start_as_current_span(
+                   name=f"{deployment.name}-{project_name}",
+                   input={"project_name": project_name, "options": options.model_dump()},
+                   session_id=flow_run_id,
+               ):
+                   return await deployment.run(project_name, documents, cast(Any, options), context)
            finally:
                store.shutdown()
                set_document_store(None)
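
Note: the hunk above groups every remote run under a single Laminar trace keyed by the Prefect flow run. A condensed sketch of that pattern follows (a hedged illustration, not the library's code: the deployment and project names are placeholders, and only calls that appear in the hunk are used):

    import os
    from uuid import uuid4

    from lmnr import Laminar
    from prefect import runtime

    # Key every span from this process to the Prefect flow run so spans from
    # different processes land in one Laminar session (random id outside a flow run).
    flow_run_id = str(runtime.flow_run.get_id()) if runtime.flow_run else str(uuid4())
    os.environ["LMNR_SESSION_ID"] = flow_run_id

    # Parent span that groups everything executed inside it under one deployment trace.
    with Laminar.start_as_current_span(
        name="my-deployment-my-project",          # placeholder name
        input={"project_name": "my-project"},     # placeholder input
        session_id=flow_run_id,
    ):
        ...  # the real code awaits deployment.run(project_name, documents, options, context)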

ai_pipeline_core/llm/client.py
@@ -38,6 +38,7 @@ from .ai_messages import AIMessages, AIMessageType
 from .model_options import ModelOptions
 from .model_response import ModelResponse, StructuredModelResponse
 from .model_types import ModelName
+from .validation import validate_messages
 
 logger = get_pipeline_logger(__name__)
 
@@ -399,6 +400,11 @@ async def _generate_with_retry(  # noqa: PLR0917
     if not context and not messages:
         raise ValueError("Either context or messages must be provided")
 
+    # Validate inputs - filter out empty/corrupted documents and attachments
+    context, ctx_warnings = validate_messages(context)
+    messages, msg_warnings = validate_messages(messages)
+    validation_warnings = ctx_warnings + msg_warnings
+
     # Auto-split large images based on model-specific constraints
     context = _prepare_images_for_model(context, model)
     messages = _prepare_images_for_model(messages, model)
@@ -424,6 +430,8 @@ async def _generate_with_retry(  # noqa: PLR0917
             laminar_metadata["purpose"] = purpose
         if expected_cost is not None:
             laminar_metadata["expected_cost"] = expected_cost
+        if validation_warnings:
+            response._metadata["validation_warnings"] = validation_warnings
         span.set_attributes(laminar_metadata)  # pyright: ignore[reportArgumentType]
         Laminar.set_span_output([r for r in (response.reasoning_content, response.content) if r])
         response.validate_output()

ai_pipeline_core/llm/validation.py (new file)
@@ -0,0 +1,176 @@
+"""Validation for LLM inputs.
+
+Validates documents and attachments before sending to LLM to catch
+empty, corrupted, or invalid content early. Filters invalid content
+and logs warnings instead of failing the entire request.
+"""
+
+from io import BytesIO
+
+from PIL import Image
+from pypdf import PdfReader
+
+from ai_pipeline_core.documents import Document
+from ai_pipeline_core.documents.attachment import Attachment
+from ai_pipeline_core.logging import get_pipeline_logger
+
+from .ai_messages import AIMessages, AIMessageType
+
+logger = get_pipeline_logger(__name__)
+
+
+def _validate_image_content(content: bytes, name: str) -> str | None:
+    """Validate image content. Returns error message or None if valid."""
+    if not content:
+        return f"empty image content in '{name}'"
+    try:
+        with Image.open(BytesIO(content)) as img:
+            img.verify()
+        return None
+    except Exception as e:
+        return f"invalid image in '{name}': {e}"
+
+
+def _validate_pdf_content(content: bytes, name: str) -> str | None:
+    """Validate PDF content. Returns error message or None if valid."""
+    if not content:
+        return f"empty PDF content in '{name}'"
+
+    # Check PDF header signature
+    if not content.lstrip().startswith(b"%PDF-"):
+        return f"invalid PDF header in '{name}' (missing %PDF- signature)"
+
+    # Check page count - catches 0-page and corrupted PDFs
+    try:
+        reader = PdfReader(BytesIO(content))
+        if len(reader.pages) == 0:
+            return f"PDF has no pages in '{name}'"
+    except Exception as e:
+        return f"corrupted PDF in '{name}': {e}"
+
+    return None
+
+
+def _validate_text_content(content: bytes, name: str) -> str | None:
+    """Validate text content. Returns error message or None if valid."""
+    if not content:
+        return f"empty text content in '{name}'"
+
+    # Check for null bytes (indicates binary content)
+    if b"\x00" in content:
+        return f"binary content (null bytes) in text '{name}'"
+
+    # Check UTF-8 encoding
+    try:
+        content.decode("utf-8")
+    except UnicodeDecodeError as e:
+        return f"invalid UTF-8 encoding in '{name}': {e}"
+
+    return None
+
+
+def _validate_attachment(att: Attachment, parent_name: str) -> str | None:
+    """Validate a single attachment. Returns error message or None if valid."""
+    att_name = f"attachment '{att.name}' of '{parent_name}'"
+
+    if att.is_image:
+        return _validate_image_content(att.content, att_name)
+    if att.is_pdf:
+        return _validate_pdf_content(att.content, att_name)
+    if att.is_text:
+        return _validate_text_content(att.content, att_name)
+
+    # Unknown type - let it through, document_to_prompt will handle/skip it
+    return None
+
+
+def _validate_document(doc: Document) -> tuple[Document | None, list[str]]:
+    """Validate a document and its attachments.
+
+    Returns:
+        Tuple of (validated_document_or_None, list_of_error_messages).
+        Returns None for document if main content is invalid.
+        Filters out invalid attachments but keeps the document.
+    """
+    errors: list[str] = []
+
+    # Validate main content based on type
+    err: str | None = None
+    if doc.is_image:
+        err = _validate_image_content(doc.content, doc.name)
+    elif doc.is_pdf:
+        err = _validate_pdf_content(doc.content, doc.name)
+    elif doc.is_text:
+        err = _validate_text_content(doc.content, doc.name)
+    # else: unknown type - let document_to_prompt handle it
+
+    if err:
+        errors.append(err)
+        return None, errors
+
+    # Validate attachments
+    if not doc.attachments:
+        return doc, errors
+
+    valid_attachments: list[Attachment] = []
+    attachments_changed = False
+
+    for att in doc.attachments:
+        if err := _validate_attachment(att, doc.name):
+            errors.append(err)
+            attachments_changed = True
+        else:
+            valid_attachments.append(att)
+
+    if attachments_changed:
+        # Return document with filtered attachments
+        return doc.model_copy(update={"attachments": tuple(valid_attachments)}), errors
+
+    return doc, errors
+
+
+def validate_messages(messages: AIMessages) -> tuple[AIMessages, list[str]]:
+    """Validate all documents in messages and filter out invalid content.
+
+    Validates documents and their attachments. Invalid documents are removed
+    entirely, invalid attachments are filtered from their parent documents.
+    All validation errors are logged as warnings.
+
+    Args:
+        messages: AIMessages to validate.
+
+    Returns:
+        Tuple of (validated_messages, list_of_warning_messages).
+        The validated_messages has invalid documents removed and invalid
+        attachments filtered from remaining documents.
+    """
+    if not messages:
+        return messages, []
+
+    # Quick check: if no documents, nothing to validate
+    has_documents = any(isinstance(m, Document) for m in messages)
+    if not has_documents:
+        return messages, []
+
+    valid_msgs: list[AIMessageType] = []
+    warnings: list[str] = []
+
+    for msg in messages:
+        if isinstance(msg, Document):
+            valid_doc, doc_errors = _validate_document(msg)
+
+            for err in doc_errors:
+                warning_msg = f"LLM input validation: filtering {err}"
+                warnings.append(warning_msg)
+                logger.warning(warning_msg)
+
+            if valid_doc is not None:
+                valid_msgs.append(valid_doc)
+        else:
+            valid_msgs.append(msg)
+
+    # Return original if nothing changed (preserve identity for caching)
+    if len(valid_msgs) == len(messages) and not warnings:
+        return messages, []
+
+    return AIMessages(valid_msgs), warnings
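
Note: the helpers in the new validation module operate on raw bytes, so they can be sanity-checked directly without constructing Document objects. A minimal sketch, using only functions defined above (the file names are placeholders):

    from ai_pipeline_core.llm.validation import _validate_pdf_content, _validate_text_content

    # Valid UTF-8 text passes (None); empty content is reported by name.
    assert _validate_text_content(b"hello", "note.txt") is None
    assert _validate_text_content(b"", "empty.txt") == "empty text content in 'empty.txt'"

    # Bytes without the %PDF- signature are rejected before pypdf is even invoked.
    assert _validate_pdf_content(b"not a pdf", "bad.pdf") == (
        "invalid PDF header in 'bad.pdf' (missing %PDF- signature)"
    )

In `_generate_with_retry` (see the client.py hunks above), `validate_messages` applies these checks to both `context` and `messages`, and any warnings are attached to the response metadata as `validation_warnings`.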

ai_pipeline_core/observability/_initialization.py
@@ -8,7 +8,6 @@ import importlib
 from typing import Any, Protocol
 from uuid import UUID
 
-from lmnr import Laminar
 from opentelemetry import trace as otel_trace
 from pydantic import BaseModel, ConfigDict
 
@@ -180,10 +179,12 @@ def initialize_observability(config: ObservabilityConfig | None = None) -> None:
     if config is None:
         config = _build_config_from_settings()
 
-    # 1. Laminar
+    # 1. Laminar - use canonical initializer from tracing module
     if config.has_lmnr:
         try:
-            Laminar.initialize(project_api_key=config.lmnr_project_api_key, export_timeout_seconds=15)
+            from ai_pipeline_core.observability import tracing  # noqa: PLC0415
+
+            tracing._initialise_laminar()
             logger.info("Laminar initialized")
         except Exception as e:
             logger.warning(f"Laminar initialization failed: {e}")

ai_pipeline_core/observability/tracing.py
@@ -10,6 +10,7 @@ import contextlib
 import inspect
 import json
 import os
+import threading
 from collections.abc import Callable
 from functools import wraps
 from typing import Any, Literal, ParamSpec, TypeVar, cast, overload
@@ -220,19 +221,42 @@ class TraceInfo(BaseModel):
 # ---------------------------------------------------------------------------
 
 
+_laminar_initialized = False
+_laminar_init_lock = threading.Lock()
+
+
 def _initialise_laminar() -> None:
-    """Initialize Laminar SDK with project configuration.
+    """Initialize Laminar SDK with project configuration (lazy, once per process).
 
     Sets up the Laminar observability client with the project API key
     from settings. Disables automatic OpenAI instrumentation to avoid
     conflicts with our custom tracing.
 
-    Called once per process. Multiple calls are safe (Laminar handles idempotency).
+    IMPORTANT: This is called lazily at first trace execution (not at decoration time)
+    to allow LMNR_SPAN_CONTEXT environment variable to be set before initialization.
+    Laminar reads LMNR_SPAN_CONTEXT during initialize() to establish parent context
+    for cross-process tracing.
+
+    Uses double-checked locking pattern for thread safety. The flag is set AFTER
+    successful initialization to prevent permanently disabled tracing on init failure.
     """
-    if settings.lmnr_project_api_key:
-        Laminar.initialize(
-            project_api_key=settings.lmnr_project_api_key, disabled_instruments=[Instruments.OPENAI] if Instruments.OPENAI else [], export_timeout_seconds=15
-        )
+    global _laminar_initialized  # noqa: PLW0603
+
+    # Fast path: already initialized (no lock needed)
+    if _laminar_initialized:
+        return
+
+    with _laminar_init_lock:
+        # Double-check inside lock
+        if _laminar_initialized:
+            return
+
+        if settings.lmnr_project_api_key:
+            disabled = [Instruments.OPENAI] if Instruments.OPENAI else []
+            Laminar.initialize(project_api_key=settings.lmnr_project_api_key, disabled_instruments=disabled, export_timeout_seconds=15)
+
+        # Set flag AFTER successful initialization
+        _laminar_initialized = True
 
 
 # Overload for calls like @trace(name="...", level="debug")
@@ -400,7 +424,9 @@ def trace(  # noqa: UP047
         return f
 
     # --- Pre-computation (done once when the function is decorated) ---
-    _initialise_laminar()
+    # NOTE: _initialise_laminar() is NOT called here (at decoration/import time)
+    # to allow LMNR_SPAN_CONTEXT to be set before Laminar.initialize() runs.
+    # It's called lazily in the wrapper functions at first execution.
    sig = inspect.signature(f)
    is_coroutine = inspect.iscoroutinefunction(f)
    observe_name = name or f.__name__
@@ -550,6 +576,9 @@ def trace(  # noqa: UP047
         Returns:
             The result of the wrapped function.
         """
+        # Lazy initialization: called at first execution, not at decoration time.
+        # This allows LMNR_SPAN_CONTEXT to be set before Laminar.initialize().
+        _initialise_laminar()
         observe_params = _prepare_and_get_observe_params(kwargs)
         observed_func = bound_observe(**observe_params)(f)
         return observed_func(*args, **kwargs)
@@ -561,6 +590,9 @@ def trace(  # noqa: UP047
         Returns:
             The result of the wrapped function.
         """
+        # Lazy initialization: called at first execution, not at decoration time.
+        # This allows LMNR_SPAN_CONTEXT to be set before Laminar.initialize().
+        _initialise_laminar()
         observe_params = _prepare_and_get_observe_params(kwargs)
         observed_func = bound_observe(**observe_params)(f)
         return await observed_func(*args, **kwargs)  # pyright: ignore[reportGeneralTypeIssues, reportUnknownVariableType]
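
Note: the tracing changes above move `_initialise_laminar()` from decoration time to first execution so that a parent span context can be injected via LMNR_SPAN_CONTEXT after import but before the first traced call. A minimal sketch of that ordering (hedged illustration; the function and environment value below are placeholders):

    import os

    from ai_pipeline_core.observability.tracing import trace

    @trace(name="worker-step")  # decorating no longer calls Laminar.initialize()
    def step() -> str:
        return "ok"

    # An orchestrator can inject the serialized parent span context after import...
    os.environ["LMNR_SPAN_CONTEXT"] = "<serialized parent span context>"

    # ...and Laminar.initialize() only runs here, on the first traced call, so it can
    # pick up LMNR_SPAN_CONTEXT and attach this span to the cross-process parent trace.
    step()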

pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "ai-pipeline-core"
-version = "0.4.7"
+version = "0.4.9"
 description = "Core utilities for AI-powered processing pipelines using prefect"
 readme = "README.md"
 license = {text = "MIT"}
@@ -28,6 +28,7 @@ dependencies = [
     "prefect>=3.6.15",
     "pydantic-settings>=2.12.0",
     "pydantic>=2.12.5",
+    "pypdf>=5.0.0",
     "python-magic>=0.4.27",
     "ruamel.yaml>=0.19.1",
     "tiktoken>=0.12.0",