ai-pipeline-core 0.4.6__py3-none-any.whl → 0.4.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ai_pipeline_core/llm/ai_messages.py +47 -4
- {ai_pipeline_core-0.4.6.dist-info → ai_pipeline_core-0.4.7.dist-info}/METADATA +1 -1
- {ai_pipeline_core-0.4.6.dist-info → ai_pipeline_core-0.4.7.dist-info}/RECORD +5 -5
- {ai_pipeline_core-0.4.6.dist-info → ai_pipeline_core-0.4.7.dist-info}/WHEEL +0 -0
- {ai_pipeline_core-0.4.6.dist-info → ai_pipeline_core-0.4.7.dist-info}/licenses/LICENSE +0 -0
|
@@ -38,6 +38,34 @@ def _ensure_llm_compatible_image(content: bytes, mime_type: str) -> tuple[bytes,
|
|
|
38
38
|
return buf.getvalue(), "image/png"
|
|
39
39
|
|
|
40
40
|
|
|
41
|
+
def _looks_like_text(content: bytes) -> bool:
|
|
42
|
+
"""Check if content is valid UTF-8 text (not binary).
|
|
43
|
+
|
|
44
|
+
Uses heuristics: must decode as UTF-8 and have no null bytes.
|
|
45
|
+
Null bytes are common in binary files but rare in text.
|
|
46
|
+
"""
|
|
47
|
+
if not content:
|
|
48
|
+
return True
|
|
49
|
+
# Null bytes indicate binary content
|
|
50
|
+
if b"\x00" in content:
|
|
51
|
+
return False
|
|
52
|
+
try:
|
|
53
|
+
content.decode("utf-8")
|
|
54
|
+
return True
|
|
55
|
+
except UnicodeDecodeError:
|
|
56
|
+
return False
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def _has_pdf_signature(content: bytes) -> bool:
|
|
60
|
+
"""Check if content starts with PDF magic bytes (%PDF-).
|
|
61
|
+
|
|
62
|
+
Real PDFs start with %PDF- (possibly after whitespace).
|
|
63
|
+
This prevents false positives when a real PDF happens to be
|
|
64
|
+
partly UTF-8 decodable (e.g., ASCII-heavy PDF metadata).
|
|
65
|
+
"""
|
|
66
|
+
return content.lstrip().startswith(b"%PDF-")
|
|
67
|
+
|
|
68
|
+
|
|
41
69
|
AIMessageType = str | Document | ModelResponse
|
|
42
70
|
"""Type for messages in AIMessages container.
|
|
43
71
|
|
|
@@ -350,7 +378,7 @@ class AIMessages(list[AIMessageType]): # noqa: PLR0904
|
|
|
350
378
|
return count
|
|
351
379
|
|
|
352
380
|
@staticmethod
|
|
353
|
-
def document_to_prompt(document: Document) -> list[ChatCompletionContentPartParam]: # noqa: PLR0912, PLR0914
|
|
381
|
+
def document_to_prompt(document: Document) -> list[ChatCompletionContentPartParam]: # noqa: C901, PLR0912, PLR0914, PLR0915
|
|
354
382
|
"""Convert a document to prompt format for LLM consumption.
|
|
355
383
|
|
|
356
384
|
Renders the document as XML with text/image/PDF content, followed by any
|
|
@@ -368,8 +396,15 @@ class AIMessages(list[AIMessageType]): # noqa: PLR0904
|
|
|
368
396
|
description = f"<description>{document.description}</description>\n" if document.description else ""
|
|
369
397
|
header_text = f"<document>\n<id>{document.id}</id>\n<name>{document.name}</name>\n{description}"
|
|
370
398
|
|
|
399
|
+
# Check if "PDF" is actually text (misnamed file from URL ending in .pdf)
|
|
400
|
+
# Real PDFs start with %PDF- magic bytes; if missing and content is UTF-8, it's text
|
|
401
|
+
is_text = document.is_text
|
|
402
|
+
if not is_text and document.is_pdf and _looks_like_text(document.content) and not _has_pdf_signature(document.content):
|
|
403
|
+
is_text = True
|
|
404
|
+
logger.debug(f"Document '{document.name}' has PDF extension but contains text content - sending as text")
|
|
405
|
+
|
|
371
406
|
# Handle text documents
|
|
372
|
-
if document.is_text:
|
|
407
|
+
if is_text:
|
|
373
408
|
text_content = document.content.decode("utf-8")
|
|
374
409
|
content_text = f"{header_text}<content>\n{text_content}\n</content>\n"
|
|
375
410
|
prompt.append({"type": "text", "text": content_text})
|
|
@@ -407,8 +442,16 @@ class AIMessages(list[AIMessageType]): # noqa: PLR0904
|
|
|
407
442
|
desc_attr = f' description="{att.description}"' if att.description else ""
|
|
408
443
|
att_open = f'<attachment name="{att.name}"{desc_attr}>\n'
|
|
409
444
|
|
|
410
|
-
if att.is_text:
|
|
411
|
-
|
|
445
|
+
# Check if "PDF" attachment is actually text (same logic as document)
|
|
446
|
+
att_is_text = att.is_text
|
|
447
|
+
if not att_is_text and att.is_pdf and _looks_like_text(att.content) and not _has_pdf_signature(att.content):
|
|
448
|
+
att_is_text = True
|
|
449
|
+
logger.debug(f"Attachment '{att.name}' has PDF extension but contains text content - sending as text")
|
|
450
|
+
|
|
451
|
+
if att_is_text:
|
|
452
|
+
# Use content.decode() directly - att.text property raises ValueError if is_text is False
|
|
453
|
+
att_text = att.content.decode("utf-8")
|
|
454
|
+
prompt.append({"type": "text", "text": f"{att_open}{att_text}\n</attachment>\n"})
|
|
412
455
|
elif att.is_image or att.is_pdf:
|
|
413
456
|
prompt.append({"type": "text", "text": att_open})
|
|
414
457
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: ai-pipeline-core
|
|
3
|
-
Version: 0.4.6
|
|
3
|
+
Version: 0.4.7
|
|
4
4
|
Summary: Core utilities for AI-powered processing pipelines using prefect
|
|
5
5
|
Project-URL: Homepage, https://github.com/bbarwik/ai-pipeline-core
|
|
6
6
|
Project-URL: Repository, https://github.com/bbarwik/ai-pipeline-core
|
|
@@ -37,7 +37,7 @@ ai_pipeline_core/documents/utils.py,sha256=9WOW3zvKYxQPnM8LjYFy3V9-yqc6hwgCaiog3
|
|
|
37
37
|
ai_pipeline_core/images/__init__.py,sha256=Hc2QKR27Q2Q-h5nH-EbzfxdE3dHArBm-st5_xjOKFh0,8854
|
|
38
38
|
ai_pipeline_core/images/_processing.py,sha256=MrCuPGsyyEl9UlXYIPhZs0wN8CPTMZmejV2Lo2wyCZk,4362
|
|
39
39
|
ai_pipeline_core/llm/__init__.py,sha256=oyRvYD5DLDl7JIRTBUaiVz6jUC5dLLujkMNFpfRp2zc,795
|
|
40
|
-
ai_pipeline_core/llm/ai_messages.py,sha256=
|
|
40
|
+
ai_pipeline_core/llm/ai_messages.py,sha256=Ieldm2za0tVd-5ysxYTjietWq1gtJ8kWbP-AqWqNJNg,19308
|
|
41
41
|
ai_pipeline_core/llm/client.py,sha256=N8eH9bY2rF28U5kGK0HQ3ibKvphcipSMLVVxtxtut8Y,30275
|
|
42
42
|
ai_pipeline_core/llm/model_options.py,sha256=hg8xR0RJdJKp8QJNA4EbLnfFsnkE4HnxD85aYxc--hM,9164
|
|
43
43
|
ai_pipeline_core/llm/model_response.py,sha256=Ml9wcssSssqibReJxCc9EQu488pz69Cmq_XNBs_xmak,12219
|
|
@@ -70,7 +70,7 @@ ai_pipeline_core/observability/_tracking/_writer.py,sha256=xZjwYyIxDzzzPxqkKjYAY
|
|
|
70
70
|
ai_pipeline_core/pipeline/__init__.py,sha256=uMv1jwSyq8Ym8Hbn5097twBJLdwN1iMeqnVM4EWyrhA,282
|
|
71
71
|
ai_pipeline_core/pipeline/decorators.py,sha256=CDJAeOjGLt5Ewc0Jc9zEuwLZwKyutOv89LSRS9dcXmI,37456
|
|
72
72
|
ai_pipeline_core/pipeline/options.py,sha256=KF4FcT085-IwX8r649v0a9ua5xnApM0qG2wJHWbq39A,438
|
|
73
|
-
ai_pipeline_core-0.4.
|
|
74
|
-
ai_pipeline_core-0.4.
|
|
75
|
-
ai_pipeline_core-0.4.
|
|
76
|
-
ai_pipeline_core-0.4.
|
|
73
|
+
ai_pipeline_core-0.4.7.dist-info/METADATA,sha256=yFjXJ9fHXFtmrF2jIFx62k5spfR1PEipR_Uekbn3bmo,29947
|
|
74
|
+
ai_pipeline_core-0.4.7.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
|
|
75
|
+
ai_pipeline_core-0.4.7.dist-info/licenses/LICENSE,sha256=kKj8mfbdWwkyG3U6n7ztB3bAZlEwShTkAsvaY657i3I,1074
|
|
76
|
+
ai_pipeline_core-0.4.7.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|