PyPI - ai-pipeline-core - Versions diffs - 0.4.8__py3-none-any.whl → 0.4.9__py3-none-any.whl - Mend

ai-pipeline-core 0.4.8__py3-none-any.whl → 0.4.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7) hide show

ai_pipeline_core/__init__.py CHANGED Viewed

@@ -64,7 +64,7 @@ from .prompt_manager import PromptManager
 from .settings import Settings
 from .testing import disable_run_logger, prefect_test_harness
-__version__ = "0.4.8"
+__version__ = "0.4.9"
 __all__ = [
     "AIMessageType",

ai_pipeline_core/llm/client.py CHANGED Viewed

@@ -38,6 +38,7 @@ from .ai_messages import AIMessages, AIMessageType
 from .model_options import ModelOptions
 from .model_response import ModelResponse, StructuredModelResponse
 from .model_types import ModelName
+from .validation import validate_messages
 logger = get_pipeline_logger(__name__)
@@ -399,6 +400,11 @@ async def _generate_with_retry(  # noqa: PLR0917
     if not context and not messages:
         raise ValueError("Either context or messages must be provided")
+    # Validate inputs - filter out empty/corrupted documents and attachments
+    context, ctx_warnings = validate_messages(context)
+    messages, msg_warnings = validate_messages(messages)
+    validation_warnings = ctx_warnings + msg_warnings
     # Auto-split large images based on model-specific constraints
     context = _prepare_images_for_model(context, model)
     messages = _prepare_images_for_model(messages, model)
@@ -424,6 +430,8 @@ async def _generate_with_retry(  # noqa: PLR0917
                     laminar_metadata["purpose"] = purpose
                 if expected_cost is not None:
                     laminar_metadata["expected_cost"] = expected_cost
+                if validation_warnings:
+                    response._metadata["validation_warnings"] = validation_warnings
                 span.set_attributes(laminar_metadata)  # pyright: ignore[reportArgumentType]
                 Laminar.set_span_output([r for r in (response.reasoning_content, response.content) if r])
                 response.validate_output()

ai_pipeline_core/llm/validation.py ADDED Viewed

@@ -0,0 +1,176 @@
+"""Validation for LLM inputs.
+Validates documents and attachments before sending to LLM to catch
+empty, corrupted, or invalid content early. Filters invalid content
+and logs warnings instead of failing the entire request.
+"""
+from io import BytesIO
+from PIL import Image
+from pypdf import PdfReader
+from ai_pipeline_core.documents import Document
+from ai_pipeline_core.documents.attachment import Attachment
+from ai_pipeline_core.logging import get_pipeline_logger
+from .ai_messages import AIMessages, AIMessageType
+logger = get_pipeline_logger(__name__)
+def _validate_image_content(content: bytes, name: str) -> str | None:
+    """Validate image content. Returns error message or None if valid."""
+    if not content:
+        return f"empty image content in '{name}'"
+    try:
+        with Image.open(BytesIO(content)) as img:
+            img.verify()
+        return None
+    except Exception as e:
+        return f"invalid image in '{name}': {e}"
+def _validate_pdf_content(content: bytes, name: str) -> str | None:
+    """Validate PDF content. Returns error message or None if valid."""
+    if not content:
+        return f"empty PDF content in '{name}'"
+    # Check PDF header signature
+    if not content.lstrip().startswith(b"%PDF-"):
+        return f"invalid PDF header in '{name}' (missing %PDF- signature)"
+    # Check page count - catches 0-page and corrupted PDFs
+    try:
+        reader = PdfReader(BytesIO(content))
+        if len(reader.pages) == 0:
+            return f"PDF has no pages in '{name}'"
+    except Exception as e:
+        return f"corrupted PDF in '{name}': {e}"
+    return None
+def _validate_text_content(content: bytes, name: str) -> str | None:
+    """Validate text content. Returns error message or None if valid."""
+    if not content:
+        return f"empty text content in '{name}'"
+    # Check for null bytes (indicates binary content)
+    if b"\x00" in content:
+        return f"binary content (null bytes) in text '{name}'"
+    # Check UTF-8 encoding
+    try:
+        content.decode("utf-8")
+    except UnicodeDecodeError as e:
+        return f"invalid UTF-8 encoding in '{name}': {e}"
+    return None
+def _validate_attachment(att: Attachment, parent_name: str) -> str | None:
+    """Validate a single attachment. Returns error message or None if valid."""
+    att_name = f"attachment '{att.name}' of '{parent_name}'"
+    if att.is_image:
+        return _validate_image_content(att.content, att_name)
+    if att.is_pdf:
+        return _validate_pdf_content(att.content, att_name)
+    if att.is_text:
+        return _validate_text_content(att.content, att_name)
+    # Unknown type - let it through, document_to_prompt will handle/skip it
+    return None
+def _validate_document(doc: Document) -> tuple[Document | None, list[str]]:
+    """Validate a document and its attachments.
+    Returns:
+        Tuple of (validated_document_or_None, list_of_error_messages).
+        Returns None for document if main content is invalid.
+        Filters out invalid attachments but keeps the document.
+    """
+    errors: list[str] = []
+    # Validate main content based on type
+    err: str | None = None
+    if doc.is_image:
+        err = _validate_image_content(doc.content, doc.name)
+    elif doc.is_pdf:
+        err = _validate_pdf_content(doc.content, doc.name)
+    elif doc.is_text:
+        err = _validate_text_content(doc.content, doc.name)
+    # else: unknown type - let document_to_prompt handle it
+    if err:
+        errors.append(err)
+        return None, errors
+    # Validate attachments
+    if not doc.attachments:
+        return doc, errors
+    valid_attachments: list[Attachment] = []
+    attachments_changed = False
+    for att in doc.attachments:
+        if err := _validate_attachment(att, doc.name):
+            errors.append(err)
+            attachments_changed = True
+        else:
+            valid_attachments.append(att)
+    if attachments_changed:
+        # Return document with filtered attachments
+        return doc.model_copy(update={"attachments": tuple(valid_attachments)}), errors
+    return doc, errors
+def validate_messages(messages: AIMessages) -> tuple[AIMessages, list[str]]:
+    """Validate all documents in messages and filter out invalid content.
+    Validates documents and their attachments. Invalid documents are removed
+    entirely, invalid attachments are filtered from their parent documents.
+    All validation errors are logged as warnings.
+    Args:
+        messages: AIMessages to validate.
+    Returns:
+        Tuple of (validated_messages, list_of_warning_messages).
+        The validated_messages has invalid documents removed and invalid
+        attachments filtered from remaining documents.
+    """
+    if not messages:
+        return messages, []
+    # Quick check: if no documents, nothing to validate
+    has_documents = any(isinstance(m, Document) for m in messages)
+    if not has_documents:
+        return messages, []
+    valid_msgs: list[AIMessageType] = []
+    warnings: list[str] = []
+    for msg in messages:
+        if isinstance(msg, Document):
+            valid_doc, doc_errors = _validate_document(msg)
+            for err in doc_errors:
+                warning_msg = f"LLM input validation: filtering {err}"
+                warnings.append(warning_msg)
+                logger.warning(warning_msg)
+            if valid_doc is not None:
+                valid_msgs.append(valid_doc)
+        else:
+            valid_msgs.append(msg)
+    # Return original if nothing changed (preserve identity for caching)
+    if len(valid_msgs) == len(messages) and not warnings:
+        return messages, []
+    return AIMessages(valid_msgs), warnings

{ai_pipeline_core-0.4.8.dist-info → ai_pipeline_core-0.4.9.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: ai-pipeline-core
-Version: 0.4.8
+Version: 0.4.9
 Summary: Core utilities for AI-powered processing pipelines using prefect
 Project-URL: Homepage, https://github.com/bbarwik/ai-pipeline-core
 Project-URL: Repository, https://github.com/bbarwik/ai-pipeline-core
@@ -29,6 +29,7 @@ Requires-Dist: prefect-gcp>=0.6.15
 Requires-Dist: prefect>=3.6.15
 Requires-Dist: pydantic-settings>=2.12.0
 Requires-Dist: pydantic>=2.12.5
+Requires-Dist: pypdf>=5.0.0
 Requires-Dist: python-magic>=0.4.27
 Requires-Dist: ruamel-yaml>=0.19.1
 Requires-Dist: tiktoken>=0.12.0

{ai_pipeline_core-0.4.8.dist-info → ai_pipeline_core-0.4.9.dist-info}/RECORD RENAMED Viewed

@@ -1,4 +1,4 @@
-ai_pipeline_core/__init__.py,sha256=aJwyMqt4ESan14iAS9guaHbDRk1F97PbOeHBvxShhD4,3270
+ai_pipeline_core/__init__.py,sha256=LwkMjbjJOWUFpZY2kyWNO8wsglvhFMb1gMJ4az2a1TI,3270
 ai_pipeline_core/exceptions.py,sha256=csAl7vq6xjSFBF8-UM9WZODCbhsOdOG5zH6IbA8iteM,1280
 ai_pipeline_core/prompt_manager.py,sha256=3wFkL5rrjtUT1cLInkgyhS8hKnO4MeD1cdXAEuLhgoE,9459
 ai_pipeline_core/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -38,10 +38,11 @@ ai_pipeline_core/images/__init__.py,sha256=Hc2QKR27Q2Q-h5nH-EbzfxdE3dHArBm-st5_x
 ai_pipeline_core/images/_processing.py,sha256=MrCuPGsyyEl9UlXYIPhZs0wN8CPTMZmejV2Lo2wyCZk,4362
 ai_pipeline_core/llm/__init__.py,sha256=oyRvYD5DLDl7JIRTBUaiVz6jUC5dLLujkMNFpfRp2zc,795
 ai_pipeline_core/llm/ai_messages.py,sha256=Ieldm2za0tVd-5ysxYTjietWq1gtJ8kWbP-AqWqNJNg,19308
-ai_pipeline_core/llm/client.py,sha256=N8eH9bY2rF28U5kGK0HQ3ibKvphcipSMLVVxtxtut8Y,30275
+ai_pipeline_core/llm/client.py,sha256=rfnKotEskoargGQG7s3GiGc7ynlzPDAshbX1WOyAwBg,30685
 ai_pipeline_core/llm/model_options.py,sha256=hg8xR0RJdJKp8QJNA4EbLnfFsnkE4HnxD85aYxc--hM,9164
 ai_pipeline_core/llm/model_response.py,sha256=Ml9wcssSssqibReJxCc9EQu488pz69Cmq_XNBs_xmak,12219
 ai_pipeline_core/llm/model_types.py,sha256=qHoUPPEkHu9B4kJ5xcIC09fk72v667ZxvzigxtgLpVo,2174
+ai_pipeline_core/llm/validation.py,sha256=__tTwOnmGBJlXKQXbx6pUAR5uRX1iU09Y7MDrgXcLXc,5675
 ai_pipeline_core/logging/__init__.py,sha256=H8G3bycxwNxc4e4Gjwi-al9e2ufTJbTV5iFKCF1Ticw,495
 ai_pipeline_core/logging/logging.yml,sha256=qsf6vcxtWIHD5xwJGtylibiuy_0KF_Ji7-qb-xvFtaU,1357
 ai_pipeline_core/logging/logging_config.py,sha256=JnTarGSSkpi7eqR7N13TLKeuwNCvZgwJUPlhObiwrHk,6095
@@ -70,7 +71,7 @@ ai_pipeline_core/observability/_tracking/_writer.py,sha256=xZjwYyIxDzzzPxqkKjYAY
 ai_pipeline_core/pipeline/__init__.py,sha256=uMv1jwSyq8Ym8Hbn5097twBJLdwN1iMeqnVM4EWyrhA,282
 ai_pipeline_core/pipeline/decorators.py,sha256=CDJAeOjGLt5Ewc0Jc9zEuwLZwKyutOv89LSRS9dcXmI,37456
 ai_pipeline_core/pipeline/options.py,sha256=KF4FcT085-IwX8r649v0a9ua5xnApM0qG2wJHWbq39A,438
-ai_pipeline_core-0.4.8.dist-info/METADATA,sha256=Ftytzz5IBhleZK7ce8HbE4XMc8pcjdVdOe2oN-fpluA,29947
-ai_pipeline_core-0.4.8.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
-ai_pipeline_core-0.4.8.dist-info/licenses/LICENSE,sha256=kKj8mfbdWwkyG3U6n7ztB3bAZlEwShTkAsvaY657i3I,1074
-ai_pipeline_core-0.4.8.dist-info/RECORD,,
+ai_pipeline_core-0.4.9.dist-info/METADATA,sha256=-kotpepqq68UEB3jHPL3gfIPIL0NeL2lFvqYphO4f1o,29975
+ai_pipeline_core-0.4.9.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+ai_pipeline_core-0.4.9.dist-info/licenses/LICENSE,sha256=kKj8mfbdWwkyG3U6n7ztB3bAZlEwShTkAsvaY657i3I,1074
+ai_pipeline_core-0.4.9.dist-info/RECORD,,

{ai_pipeline_core-0.4.8.dist-info → ai_pipeline_core-0.4.9.dist-info}/WHEEL RENAMED Viewed

File without changes

{ai_pipeline_core-0.4.8.dist-info → ai_pipeline_core-0.4.9.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

ai-pipeline-core 0.4.8__py3-none-any.whl → 0.4.9__py3-none-any.whl

ai-pipeline-core 0.4.8__py3-none-any.whl → 0.4.9__py3-none-any.whl