ai-pipeline-core 0.4.8__py3-none-any.whl → 0.4.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -64,7 +64,7 @@ from .prompt_manager import PromptManager
64
64
  from .settings import Settings
65
65
  from .testing import disable_run_logger, prefect_test_harness
66
66
 
67
- __version__ = "0.4.8"
67
+ __version__ = "0.4.9"
68
68
 
69
69
  __all__ = [
70
70
  "AIMessageType",
@@ -38,6 +38,7 @@ from .ai_messages import AIMessages, AIMessageType
38
38
  from .model_options import ModelOptions
39
39
  from .model_response import ModelResponse, StructuredModelResponse
40
40
  from .model_types import ModelName
41
+ from .validation import validate_messages
41
42
 
42
43
  logger = get_pipeline_logger(__name__)
43
44
 
@@ -399,6 +400,11 @@ async def _generate_with_retry( # noqa: PLR0917
399
400
  if not context and not messages:
400
401
  raise ValueError("Either context or messages must be provided")
401
402
 
403
+ # Validate inputs - filter out empty/corrupted documents and attachments
404
+ context, ctx_warnings = validate_messages(context)
405
+ messages, msg_warnings = validate_messages(messages)
406
+ validation_warnings = ctx_warnings + msg_warnings
407
+
402
408
  # Auto-split large images based on model-specific constraints
403
409
  context = _prepare_images_for_model(context, model)
404
410
  messages = _prepare_images_for_model(messages, model)
@@ -424,6 +430,8 @@ async def _generate_with_retry( # noqa: PLR0917
424
430
  laminar_metadata["purpose"] = purpose
425
431
  if expected_cost is not None:
426
432
  laminar_metadata["expected_cost"] = expected_cost
433
+ if validation_warnings:
434
+ response._metadata["validation_warnings"] = validation_warnings
427
435
  span.set_attributes(laminar_metadata) # pyright: ignore[reportArgumentType]
428
436
  Laminar.set_span_output([r for r in (response.reasoning_content, response.content) if r])
429
437
  response.validate_output()
@@ -0,0 +1,176 @@
1
+ """Validation for LLM inputs.
2
+
3
+ Validates documents and attachments before sending to LLM to catch
4
+ empty, corrupted, or invalid content early. Filters invalid content
5
+ and logs warnings instead of failing the entire request.
6
+ """
7
+
8
+ from io import BytesIO
9
+
10
+ from PIL import Image
11
+ from pypdf import PdfReader
12
+
13
+ from ai_pipeline_core.documents import Document
14
+ from ai_pipeline_core.documents.attachment import Attachment
15
+ from ai_pipeline_core.logging import get_pipeline_logger
16
+
17
+ from .ai_messages import AIMessages, AIMessageType
18
+
19
+ logger = get_pipeline_logger(__name__)
20
+
21
+
22
+ def _validate_image_content(content: bytes, name: str) -> str | None:
23
+ """Validate image content. Returns error message or None if valid."""
24
+ if not content:
25
+ return f"empty image content in '{name}'"
26
+ try:
27
+ with Image.open(BytesIO(content)) as img:
28
+ img.verify()
29
+ return None
30
+ except Exception as e:
31
+ return f"invalid image in '{name}': {e}"
32
+
33
+
34
+ def _validate_pdf_content(content: bytes, name: str) -> str | None:
35
+ """Validate PDF content. Returns error message or None if valid."""
36
+ if not content:
37
+ return f"empty PDF content in '{name}'"
38
+
39
+ # Check PDF header signature
40
+ if not content.lstrip().startswith(b"%PDF-"):
41
+ return f"invalid PDF header in '{name}' (missing %PDF- signature)"
42
+
43
+ # Check page count - catches 0-page and corrupted PDFs
44
+ try:
45
+ reader = PdfReader(BytesIO(content))
46
+ if len(reader.pages) == 0:
47
+ return f"PDF has no pages in '{name}'"
48
+ except Exception as e:
49
+ return f"corrupted PDF in '{name}': {e}"
50
+
51
+ return None
52
+
53
+
54
+ def _validate_text_content(content: bytes, name: str) -> str | None:
55
+ """Validate text content. Returns error message or None if valid."""
56
+ if not content:
57
+ return f"empty text content in '{name}'"
58
+
59
+ # Check for null bytes (indicates binary content)
60
+ if b"\x00" in content:
61
+ return f"binary content (null bytes) in text '{name}'"
62
+
63
+ # Check UTF-8 encoding
64
+ try:
65
+ content.decode("utf-8")
66
+ except UnicodeDecodeError as e:
67
+ return f"invalid UTF-8 encoding in '{name}': {e}"
68
+
69
+ return None
70
+
71
+
72
def _validate_attachment(att: Attachment, parent_name: str) -> str | None:
    """Run the content check that matches the attachment's type.

    Args:
        att: Attachment to inspect.
        parent_name: Name of the owning document, used in the error label.

    Returns:
        An error description, or None when the attachment is valid or its
        type is not one of image / PDF / text.
    """
    label = f"attachment '{att.name}' of '{parent_name}'"

    if att.is_image:
        checker = _validate_image_content
    elif att.is_pdf:
        checker = _validate_pdf_content
    elif att.is_text:
        checker = _validate_text_content
    else:
        # Unknown type - let it through, document_to_prompt will handle/skip it
        return None

    return checker(att.content, label)
85
+
86
+
87
def _validate_document(doc: Document) -> tuple[Document | None, list[str]]:
    """Validate a document's main content and then each attachment.

    Returns:
        Tuple of (document_or_None, error_messages).
        The document slot is None when the main content is invalid; in that
        case attachments are not inspected. Otherwise it is the original
        document, or a copy carrying only the attachments that passed.
    """
    # Main content: choose the check by type; unknown types are left for
    # document_to_prompt to deal with.
    if doc.is_image:
        content_error = _validate_image_content(doc.content, doc.name)
    elif doc.is_pdf:
        content_error = _validate_pdf_content(doc.content, doc.name)
    elif doc.is_text:
        content_error = _validate_text_content(doc.content, doc.name)
    else:
        content_error = None

    if content_error:
        return None, [content_error]

    if not doc.attachments:
        return doc, []

    problems: list[str] = []
    kept: list[Attachment] = []
    for attachment in doc.attachments:
        problem = _validate_attachment(attachment, doc.name)
        if problem:
            problems.append(problem)
        else:
            kept.append(attachment)

    if not problems:
        # Nothing was dropped, so hand back the very same object.
        return doc, problems

    # At least one attachment was rejected: return a copy without it.
    return doc.model_copy(update={"attachments": tuple(kept)}), problems
130
+
131
+
132
def validate_messages(messages: AIMessages) -> tuple[AIMessages, list[str]]:
    """Filter invalid documents and attachments out of *messages*.

    Every Document entry is validated. A document whose main content fails
    is dropped; a document with failing attachments is replaced by a copy
    without them. Non-document entries pass through untouched. Each problem
    is logged as a warning and also returned to the caller.

    Args:
        messages: AIMessages to validate.

    Returns:
        Tuple of (validated_messages, warning_messages). When nothing had
        to be filtered the original ``messages`` object itself is returned
        together with an empty list.
    """
    if not messages:
        return messages, []

    # Fast path: without any Document there is nothing to check.
    if not any(isinstance(item, Document) for item in messages):
        return messages, []

    kept: list[AIMessageType] = []
    collected: list[str] = []

    for item in messages:
        if not isinstance(item, Document):
            kept.append(item)
            continue

        checked, problems = _validate_document(item)
        for problem in problems:
            text = f"LLM input validation: filtering {problem}"
            collected.append(text)
            logger.warning(text)

        if checked is not None:
            kept.append(checked)

    # Return original if nothing changed (preserve identity for caching)
    if not collected and len(kept) == len(messages):
        return messages, []

    return AIMessages(kept), collected
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: ai-pipeline-core
3
- Version: 0.4.8
3
+ Version: 0.4.9
4
4
  Summary: Core utilities for AI-powered processing pipelines using prefect
5
5
  Project-URL: Homepage, https://github.com/bbarwik/ai-pipeline-core
6
6
  Project-URL: Repository, https://github.com/bbarwik/ai-pipeline-core
@@ -29,6 +29,7 @@ Requires-Dist: prefect-gcp>=0.6.15
29
29
  Requires-Dist: prefect>=3.6.15
30
30
  Requires-Dist: pydantic-settings>=2.12.0
31
31
  Requires-Dist: pydantic>=2.12.5
32
+ Requires-Dist: pypdf>=5.0.0
32
33
  Requires-Dist: python-magic>=0.4.27
33
34
  Requires-Dist: ruamel-yaml>=0.19.1
34
35
  Requires-Dist: tiktoken>=0.12.0
@@ -1,4 +1,4 @@
1
- ai_pipeline_core/__init__.py,sha256=aJwyMqt4ESan14iAS9guaHbDRk1F97PbOeHBvxShhD4,3270
1
+ ai_pipeline_core/__init__.py,sha256=LwkMjbjJOWUFpZY2kyWNO8wsglvhFMb1gMJ4az2a1TI,3270
2
2
  ai_pipeline_core/exceptions.py,sha256=csAl7vq6xjSFBF8-UM9WZODCbhsOdOG5zH6IbA8iteM,1280
3
3
  ai_pipeline_core/prompt_manager.py,sha256=3wFkL5rrjtUT1cLInkgyhS8hKnO4MeD1cdXAEuLhgoE,9459
4
4
  ai_pipeline_core/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -38,10 +38,11 @@ ai_pipeline_core/images/__init__.py,sha256=Hc2QKR27Q2Q-h5nH-EbzfxdE3dHArBm-st5_x
38
38
  ai_pipeline_core/images/_processing.py,sha256=MrCuPGsyyEl9UlXYIPhZs0wN8CPTMZmejV2Lo2wyCZk,4362
39
39
  ai_pipeline_core/llm/__init__.py,sha256=oyRvYD5DLDl7JIRTBUaiVz6jUC5dLLujkMNFpfRp2zc,795
40
40
  ai_pipeline_core/llm/ai_messages.py,sha256=Ieldm2za0tVd-5ysxYTjietWq1gtJ8kWbP-AqWqNJNg,19308
41
- ai_pipeline_core/llm/client.py,sha256=N8eH9bY2rF28U5kGK0HQ3ibKvphcipSMLVVxtxtut8Y,30275
41
+ ai_pipeline_core/llm/client.py,sha256=rfnKotEskoargGQG7s3GiGc7ynlzPDAshbX1WOyAwBg,30685
42
42
  ai_pipeline_core/llm/model_options.py,sha256=hg8xR0RJdJKp8QJNA4EbLnfFsnkE4HnxD85aYxc--hM,9164
43
43
  ai_pipeline_core/llm/model_response.py,sha256=Ml9wcssSssqibReJxCc9EQu488pz69Cmq_XNBs_xmak,12219
44
44
  ai_pipeline_core/llm/model_types.py,sha256=qHoUPPEkHu9B4kJ5xcIC09fk72v667ZxvzigxtgLpVo,2174
45
+ ai_pipeline_core/llm/validation.py,sha256=__tTwOnmGBJlXKQXbx6pUAR5uRX1iU09Y7MDrgXcLXc,5675
45
46
  ai_pipeline_core/logging/__init__.py,sha256=H8G3bycxwNxc4e4Gjwi-al9e2ufTJbTV5iFKCF1Ticw,495
46
47
  ai_pipeline_core/logging/logging.yml,sha256=qsf6vcxtWIHD5xwJGtylibiuy_0KF_Ji7-qb-xvFtaU,1357
47
48
  ai_pipeline_core/logging/logging_config.py,sha256=JnTarGSSkpi7eqR7N13TLKeuwNCvZgwJUPlhObiwrHk,6095
@@ -70,7 +71,7 @@ ai_pipeline_core/observability/_tracking/_writer.py,sha256=xZjwYyIxDzzzPxqkKjYAY
70
71
  ai_pipeline_core/pipeline/__init__.py,sha256=uMv1jwSyq8Ym8Hbn5097twBJLdwN1iMeqnVM4EWyrhA,282
71
72
  ai_pipeline_core/pipeline/decorators.py,sha256=CDJAeOjGLt5Ewc0Jc9zEuwLZwKyutOv89LSRS9dcXmI,37456
72
73
  ai_pipeline_core/pipeline/options.py,sha256=KF4FcT085-IwX8r649v0a9ua5xnApM0qG2wJHWbq39A,438
73
- ai_pipeline_core-0.4.8.dist-info/METADATA,sha256=Ftytzz5IBhleZK7ce8HbE4XMc8pcjdVdOe2oN-fpluA,29947
74
- ai_pipeline_core-0.4.8.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
75
- ai_pipeline_core-0.4.8.dist-info/licenses/LICENSE,sha256=kKj8mfbdWwkyG3U6n7ztB3bAZlEwShTkAsvaY657i3I,1074
76
- ai_pipeline_core-0.4.8.dist-info/RECORD,,
74
+ ai_pipeline_core-0.4.9.dist-info/METADATA,sha256=-kotpepqq68UEB3jHPL3gfIPIL0NeL2lFvqYphO4f1o,29975
75
+ ai_pipeline_core-0.4.9.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
76
+ ai_pipeline_core-0.4.9.dist-info/licenses/LICENSE,sha256=kKj8mfbdWwkyG3U6n7ztB3bAZlEwShTkAsvaY657i3I,1074
77
+ ai_pipeline_core-0.4.9.dist-info/RECORD,,