ai-pipeline-core 0.4.8__py3-none-any.whl → 0.4.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ai_pipeline_core/__init__.py +1 -1
- ai_pipeline_core/llm/client.py +8 -0
- ai_pipeline_core/llm/validation.py +176 -0
- {ai_pipeline_core-0.4.8.dist-info → ai_pipeline_core-0.4.9.dist-info}/METADATA +2 -1
- {ai_pipeline_core-0.4.8.dist-info → ai_pipeline_core-0.4.9.dist-info}/RECORD +7 -6
- {ai_pipeline_core-0.4.8.dist-info → ai_pipeline_core-0.4.9.dist-info}/WHEEL +0 -0
- {ai_pipeline_core-0.4.8.dist-info → ai_pipeline_core-0.4.9.dist-info}/licenses/LICENSE +0 -0
ai_pipeline_core/__init__.py
CHANGED
ai_pipeline_core/llm/client.py
CHANGED
|
@@ -38,6 +38,7 @@ from .ai_messages import AIMessages, AIMessageType
|
|
|
38
38
|
from .model_options import ModelOptions
|
|
39
39
|
from .model_response import ModelResponse, StructuredModelResponse
|
|
40
40
|
from .model_types import ModelName
|
|
41
|
+
from .validation import validate_messages
|
|
41
42
|
|
|
42
43
|
logger = get_pipeline_logger(__name__)
|
|
43
44
|
|
|
@@ -399,6 +400,11 @@ async def _generate_with_retry( # noqa: PLR0917
|
|
|
399
400
|
if not context and not messages:
|
|
400
401
|
raise ValueError("Either context or messages must be provided")
|
|
401
402
|
|
|
403
|
+
# Validate inputs - filter out empty/corrupted documents and attachments
|
|
404
|
+
context, ctx_warnings = validate_messages(context)
|
|
405
|
+
messages, msg_warnings = validate_messages(messages)
|
|
406
|
+
validation_warnings = ctx_warnings + msg_warnings
|
|
407
|
+
|
|
402
408
|
# Auto-split large images based on model-specific constraints
|
|
403
409
|
context = _prepare_images_for_model(context, model)
|
|
404
410
|
messages = _prepare_images_for_model(messages, model)
|
|
@@ -424,6 +430,8 @@ async def _generate_with_retry( # noqa: PLR0917
|
|
|
424
430
|
laminar_metadata["purpose"] = purpose
|
|
425
431
|
if expected_cost is not None:
|
|
426
432
|
laminar_metadata["expected_cost"] = expected_cost
|
|
433
|
+
if validation_warnings:
|
|
434
|
+
response._metadata["validation_warnings"] = validation_warnings
|
|
427
435
|
span.set_attributes(laminar_metadata) # pyright: ignore[reportArgumentType]
|
|
428
436
|
Laminar.set_span_output([r for r in (response.reasoning_content, response.content) if r])
|
|
429
437
|
response.validate_output()
|
|
@@ -0,0 +1,176 @@
|
|
|
1
|
+
"""Validation for LLM inputs.
|
|
2
|
+
|
|
3
|
+
Validates documents and attachments before sending to LLM to catch
|
|
4
|
+
empty, corrupted, or invalid content early. Filters invalid content
|
|
5
|
+
and logs warnings instead of failing the entire request.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from io import BytesIO
|
|
9
|
+
|
|
10
|
+
from PIL import Image
|
|
11
|
+
from pypdf import PdfReader
|
|
12
|
+
|
|
13
|
+
from ai_pipeline_core.documents import Document
|
|
14
|
+
from ai_pipeline_core.documents.attachment import Attachment
|
|
15
|
+
from ai_pipeline_core.logging import get_pipeline_logger
|
|
16
|
+
|
|
17
|
+
from .ai_messages import AIMessages, AIMessageType
|
|
18
|
+
|
|
19
|
+
logger = get_pipeline_logger(__name__)
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def _validate_image_content(content: bytes, name: str) -> str | None:
|
|
23
|
+
"""Validate image content. Returns error message or None if valid."""
|
|
24
|
+
if not content:
|
|
25
|
+
return f"empty image content in '{name}'"
|
|
26
|
+
try:
|
|
27
|
+
with Image.open(BytesIO(content)) as img:
|
|
28
|
+
img.verify()
|
|
29
|
+
return None
|
|
30
|
+
except Exception as e:
|
|
31
|
+
return f"invalid image in '{name}': {e}"
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def _validate_pdf_content(content: bytes, name: str) -> str | None:
|
|
35
|
+
"""Validate PDF content. Returns error message or None if valid."""
|
|
36
|
+
if not content:
|
|
37
|
+
return f"empty PDF content in '{name}'"
|
|
38
|
+
|
|
39
|
+
# Check PDF header signature
|
|
40
|
+
if not content.lstrip().startswith(b"%PDF-"):
|
|
41
|
+
return f"invalid PDF header in '{name}' (missing %PDF- signature)"
|
|
42
|
+
|
|
43
|
+
# Check page count - catches 0-page and corrupted PDFs
|
|
44
|
+
try:
|
|
45
|
+
reader = PdfReader(BytesIO(content))
|
|
46
|
+
if len(reader.pages) == 0:
|
|
47
|
+
return f"PDF has no pages in '{name}'"
|
|
48
|
+
except Exception as e:
|
|
49
|
+
return f"corrupted PDF in '{name}': {e}"
|
|
50
|
+
|
|
51
|
+
return None
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def _validate_text_content(content: bytes, name: str) -> str | None:
|
|
55
|
+
"""Validate text content. Returns error message or None if valid."""
|
|
56
|
+
if not content:
|
|
57
|
+
return f"empty text content in '{name}'"
|
|
58
|
+
|
|
59
|
+
# Check for null bytes (indicates binary content)
|
|
60
|
+
if b"\x00" in content:
|
|
61
|
+
return f"binary content (null bytes) in text '{name}'"
|
|
62
|
+
|
|
63
|
+
# Check UTF-8 encoding
|
|
64
|
+
try:
|
|
65
|
+
content.decode("utf-8")
|
|
66
|
+
except UnicodeDecodeError as e:
|
|
67
|
+
return f"invalid UTF-8 encoding in '{name}': {e}"
|
|
68
|
+
|
|
69
|
+
return None
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def _validate_attachment(att: Attachment, parent_name: str) -> str | None:
    """Validate one attachment with the checker that matches its type.

    Args:
        att: Attachment to inspect.
        parent_name: Name of the owning document, used in the error label.

    Returns:
        The error message from the matching content validator, or ``None``
        when the attachment is valid or of a type with no validator.
    """
    label = f"attachment '{att.name}' of '{parent_name}'"

    # Pick the validator first, then run it once; image takes precedence
    # over PDF, which takes precedence over text.
    checker = None
    if att.is_image:
        checker = _validate_image_content
    elif att.is_pdf:
        checker = _validate_pdf_content
    elif att.is_text:
        checker = _validate_text_content

    if checker is None:
        # Unknown type - let it through, document_to_prompt will handle/skip it
        return None
    return checker(att.content, label)
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def _validate_document(doc: Document) -> tuple[Document | None, list[str]]:
    """Validate a document's main content and then each of its attachments.

    Args:
        doc: Document to inspect.

    Returns:
        Tuple of ``(document_or_None, error_messages)``:

        * ``(None, [error])`` when the main content fails validation;
          attachments are not inspected in that case.
        * ``(doc, [])`` when nothing is wrong (the same object is returned).
        * ``(copy, errors)`` when some attachments failed; the copy comes
          from ``doc.model_copy`` with only the valid attachments kept.
    """
    # Main content: choose the validator by type; unknown types are left
    # for document_to_prompt to handle.
    if doc.is_image:
        main_error = _validate_image_content(doc.content, doc.name)
    elif doc.is_pdf:
        main_error = _validate_pdf_content(doc.content, doc.name)
    elif doc.is_text:
        main_error = _validate_text_content(doc.content, doc.name)
    else:
        main_error = None

    if main_error:
        return None, [main_error]

    if not doc.attachments:
        return doc, []

    kept: list[Attachment] = []
    problems: list[str] = []
    for attachment in doc.attachments:
        problem = _validate_attachment(attachment, doc.name)
        if problem:
            problems.append(problem)
        else:
            kept.append(attachment)

    if not problems:
        # Nothing was filtered, so hand back the original object untouched.
        return doc, problems

    # Return document with filtered attachments
    return doc.model_copy(update={"attachments": tuple(kept)}), problems
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
def validate_messages(messages: AIMessages) -> tuple[AIMessages, list[str]]:
    """Drop invalid documents and attachments from ``messages``.

    Every ``Document`` in ``messages`` is validated. A document whose main
    content is invalid is removed; a document with invalid attachments is
    replaced by the filtered version returned from ``_validate_document``.
    Non-document messages pass through unchanged. Each problem is logged
    at warning level and also returned to the caller.

    Args:
        messages: AIMessages to validate.

    Returns:
        Tuple of ``(validated_messages, warning_messages)``. When nothing was
        filtered, the original ``messages`` object is returned together
        with an empty list.
    """
    # Fast path: empty input or no documents at all means nothing to check.
    if not messages or not any(isinstance(item, Document) for item in messages):
        return messages, []

    kept: list[AIMessageType] = []
    collected: list[str] = []

    for item in messages:
        if not isinstance(item, Document):
            kept.append(item)
            continue

        checked, problems = _validate_document(item)
        for problem in problems:
            text = f"LLM input validation: filtering {problem}"
            collected.append(text)
            logger.warning(text)

        if checked is not None:
            kept.append(checked)

    # Return original if nothing changed (preserve identity for caching)
    unchanged = len(kept) == len(messages) and not collected
    if unchanged:
        return messages, []

    return AIMessages(kept), collected
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: ai-pipeline-core
|
|
3
|
-
Version: 0.4.8
|
|
3
|
+
Version: 0.4.9
|
|
4
4
|
Summary: Core utilities for AI-powered processing pipelines using prefect
|
|
5
5
|
Project-URL: Homepage, https://github.com/bbarwik/ai-pipeline-core
|
|
6
6
|
Project-URL: Repository, https://github.com/bbarwik/ai-pipeline-core
|
|
@@ -29,6 +29,7 @@ Requires-Dist: prefect-gcp>=0.6.15
|
|
|
29
29
|
Requires-Dist: prefect>=3.6.15
|
|
30
30
|
Requires-Dist: pydantic-settings>=2.12.0
|
|
31
31
|
Requires-Dist: pydantic>=2.12.5
|
|
32
|
+
Requires-Dist: pypdf>=5.0.0
|
|
32
33
|
Requires-Dist: python-magic>=0.4.27
|
|
33
34
|
Requires-Dist: ruamel-yaml>=0.19.1
|
|
34
35
|
Requires-Dist: tiktoken>=0.12.0
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
ai_pipeline_core/__init__.py,sha256=
|
|
1
|
+
ai_pipeline_core/__init__.py,sha256=LwkMjbjJOWUFpZY2kyWNO8wsglvhFMb1gMJ4az2a1TI,3270
|
|
2
2
|
ai_pipeline_core/exceptions.py,sha256=csAl7vq6xjSFBF8-UM9WZODCbhsOdOG5zH6IbA8iteM,1280
|
|
3
3
|
ai_pipeline_core/prompt_manager.py,sha256=3wFkL5rrjtUT1cLInkgyhS8hKnO4MeD1cdXAEuLhgoE,9459
|
|
4
4
|
ai_pipeline_core/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
@@ -38,10 +38,11 @@ ai_pipeline_core/images/__init__.py,sha256=Hc2QKR27Q2Q-h5nH-EbzfxdE3dHArBm-st5_x
|
|
|
38
38
|
ai_pipeline_core/images/_processing.py,sha256=MrCuPGsyyEl9UlXYIPhZs0wN8CPTMZmejV2Lo2wyCZk,4362
|
|
39
39
|
ai_pipeline_core/llm/__init__.py,sha256=oyRvYD5DLDl7JIRTBUaiVz6jUC5dLLujkMNFpfRp2zc,795
|
|
40
40
|
ai_pipeline_core/llm/ai_messages.py,sha256=Ieldm2za0tVd-5ysxYTjietWq1gtJ8kWbP-AqWqNJNg,19308
|
|
41
|
-
ai_pipeline_core/llm/client.py,sha256=
|
|
41
|
+
ai_pipeline_core/llm/client.py,sha256=rfnKotEskoargGQG7s3GiGc7ynlzPDAshbX1WOyAwBg,30685
|
|
42
42
|
ai_pipeline_core/llm/model_options.py,sha256=hg8xR0RJdJKp8QJNA4EbLnfFsnkE4HnxD85aYxc--hM,9164
|
|
43
43
|
ai_pipeline_core/llm/model_response.py,sha256=Ml9wcssSssqibReJxCc9EQu488pz69Cmq_XNBs_xmak,12219
|
|
44
44
|
ai_pipeline_core/llm/model_types.py,sha256=qHoUPPEkHu9B4kJ5xcIC09fk72v667ZxvzigxtgLpVo,2174
|
|
45
|
+
ai_pipeline_core/llm/validation.py,sha256=__tTwOnmGBJlXKQXbx6pUAR5uRX1iU09Y7MDrgXcLXc,5675
|
|
45
46
|
ai_pipeline_core/logging/__init__.py,sha256=H8G3bycxwNxc4e4Gjwi-al9e2ufTJbTV5iFKCF1Ticw,495
|
|
46
47
|
ai_pipeline_core/logging/logging.yml,sha256=qsf6vcxtWIHD5xwJGtylibiuy_0KF_Ji7-qb-xvFtaU,1357
|
|
47
48
|
ai_pipeline_core/logging/logging_config.py,sha256=JnTarGSSkpi7eqR7N13TLKeuwNCvZgwJUPlhObiwrHk,6095
|
|
@@ -70,7 +71,7 @@ ai_pipeline_core/observability/_tracking/_writer.py,sha256=xZjwYyIxDzzzPxqkKjYAY
|
|
|
70
71
|
ai_pipeline_core/pipeline/__init__.py,sha256=uMv1jwSyq8Ym8Hbn5097twBJLdwN1iMeqnVM4EWyrhA,282
|
|
71
72
|
ai_pipeline_core/pipeline/decorators.py,sha256=CDJAeOjGLt5Ewc0Jc9zEuwLZwKyutOv89LSRS9dcXmI,37456
|
|
72
73
|
ai_pipeline_core/pipeline/options.py,sha256=KF4FcT085-IwX8r649v0a9ua5xnApM0qG2wJHWbq39A,438
|
|
73
|
-
ai_pipeline_core-0.4.
|
|
74
|
-
ai_pipeline_core-0.4.
|
|
75
|
-
ai_pipeline_core-0.4.
|
|
76
|
-
ai_pipeline_core-0.4.
|
|
74
|
+
ai_pipeline_core-0.4.9.dist-info/METADATA,sha256=-kotpepqq68UEB3jHPL3gfIPIL0NeL2lFvqYphO4f1o,29975
|
|
75
|
+
ai_pipeline_core-0.4.9.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
|
|
76
|
+
ai_pipeline_core-0.4.9.dist-info/licenses/LICENSE,sha256=kKj8mfbdWwkyG3U6n7ztB3bAZlEwShTkAsvaY657i3I,1074
|
|
77
|
+
ai_pipeline_core-0.4.9.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|