ai-pipeline-core 0.4.6__py3-none-any.whl → 0.4.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -38,6 +38,34 @@ def _ensure_llm_compatible_image(content: bytes, mime_type: str) -> tuple[bytes,
38
38
  return buf.getvalue(), "image/png"
39
39
 
40
40
 
41
+ def _looks_like_text(content: bytes) -> bool:
42
+ """Check if content is valid UTF-8 text (not binary).
43
+
44
+ Uses heuristics: must decode as UTF-8 and have no null bytes.
45
+ Null bytes are common in binary files but rare in text.
46
+ """
47
+ if not content:
48
+ return True
49
+ # Null bytes indicate binary content
50
+ if b"\x00" in content:
51
+ return False
52
+ try:
53
+ content.decode("utf-8")
54
+ return True
55
+ except UnicodeDecodeError:
56
+ return False
57
+
58
+
59
+ def _has_pdf_signature(content: bytes) -> bool:
60
+ """Check if content starts with PDF magic bytes (%PDF-).
61
+
62
+ Real PDFs start with %PDF- (possibly after whitespace).
63
+ This prevents false positives when a real PDF happens to be
64
+ partly UTF-8 decodable (e.g., ASCII-heavy PDF metadata).
65
+ """
66
+ return content.lstrip().startswith(b"%PDF-")
67
+
68
+
41
69
  AIMessageType = str | Document | ModelResponse
42
70
  """Type for messages in AIMessages container.
43
71
 
@@ -350,7 +378,7 @@ class AIMessages(list[AIMessageType]): # noqa: PLR0904
350
378
  return count
351
379
 
352
380
  @staticmethod
353
- def document_to_prompt(document: Document) -> list[ChatCompletionContentPartParam]: # noqa: PLR0912, PLR0914
381
+ def document_to_prompt(document: Document) -> list[ChatCompletionContentPartParam]: # noqa: C901, PLR0912, PLR0914, PLR0915
354
382
  """Convert a document to prompt format for LLM consumption.
355
383
 
356
384
  Renders the document as XML with text/image/PDF content, followed by any
@@ -368,8 +396,15 @@ class AIMessages(list[AIMessageType]): # noqa: PLR0904
368
396
  description = f"<description>{document.description}</description>\n" if document.description else ""
369
397
  header_text = f"<document>\n<id>{document.id}</id>\n<name>{document.name}</name>\n{description}"
370
398
 
399
+ # Check if "PDF" is actually text (misnamed file from URL ending in .pdf)
400
+ # Real PDFs start with %PDF- magic bytes; if missing and content is UTF-8, it's text
401
+ is_text = document.is_text
402
+ if not is_text and document.is_pdf and _looks_like_text(document.content) and not _has_pdf_signature(document.content):
403
+ is_text = True
404
+ logger.debug(f"Document '{document.name}' has PDF extension but contains text content - sending as text")
405
+
371
406
  # Handle text documents
372
- if document.is_text:
407
+ if is_text:
373
408
  text_content = document.content.decode("utf-8")
374
409
  content_text = f"{header_text}<content>\n{text_content}\n</content>\n"
375
410
  prompt.append({"type": "text", "text": content_text})
@@ -407,8 +442,16 @@ class AIMessages(list[AIMessageType]): # noqa: PLR0904
407
442
  desc_attr = f' description="{att.description}"' if att.description else ""
408
443
  att_open = f'<attachment name="{att.name}"{desc_attr}>\n'
409
444
 
410
- if att.is_text:
411
- prompt.append({"type": "text", "text": f"{att_open}{att.text}\n</attachment>\n"})
445
+ # Check if "PDF" attachment is actually text (same logic as document)
446
+ att_is_text = att.is_text
447
+ if not att_is_text and att.is_pdf and _looks_like_text(att.content) and not _has_pdf_signature(att.content):
448
+ att_is_text = True
449
+ logger.debug(f"Attachment '{att.name}' has PDF extension but contains text content - sending as text")
450
+
451
+ if att_is_text:
452
+ # Use content.decode() directly - att.text property raises ValueError if is_text is False
453
+ att_text = att.content.decode("utf-8")
454
+ prompt.append({"type": "text", "text": f"{att_open}{att_text}\n</attachment>\n"})
412
455
  elif att.is_image or att.is_pdf:
413
456
  prompt.append({"type": "text", "text": att_open})
414
457
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: ai-pipeline-core
3
- Version: 0.4.6
3
+ Version: 0.4.7
4
4
  Summary: Core utilities for AI-powered processing pipelines using prefect
5
5
  Project-URL: Homepage, https://github.com/bbarwik/ai-pipeline-core
6
6
  Project-URL: Repository, https://github.com/bbarwik/ai-pipeline-core
@@ -37,7 +37,7 @@ ai_pipeline_core/documents/utils.py,sha256=9WOW3zvKYxQPnM8LjYFy3V9-yqc6hwgCaiog3
37
37
  ai_pipeline_core/images/__init__.py,sha256=Hc2QKR27Q2Q-h5nH-EbzfxdE3dHArBm-st5_xjOKFh0,8854
38
38
  ai_pipeline_core/images/_processing.py,sha256=MrCuPGsyyEl9UlXYIPhZs0wN8CPTMZmejV2Lo2wyCZk,4362
39
39
  ai_pipeline_core/llm/__init__.py,sha256=oyRvYD5DLDl7JIRTBUaiVz6jUC5dLLujkMNFpfRp2zc,795
40
- ai_pipeline_core/llm/ai_messages.py,sha256=Ycmntk5d6NUFqVVsnNR_IDwJUFuHYEH7CPvmmDfYaJI,17424
40
+ ai_pipeline_core/llm/ai_messages.py,sha256=Ieldm2za0tVd-5ysxYTjietWq1gtJ8kWbP-AqWqNJNg,19308
41
41
  ai_pipeline_core/llm/client.py,sha256=N8eH9bY2rF28U5kGK0HQ3ibKvphcipSMLVVxtxtut8Y,30275
42
42
  ai_pipeline_core/llm/model_options.py,sha256=hg8xR0RJdJKp8QJNA4EbLnfFsnkE4HnxD85aYxc--hM,9164
43
43
  ai_pipeline_core/llm/model_response.py,sha256=Ml9wcssSssqibReJxCc9EQu488pz69Cmq_XNBs_xmak,12219
@@ -70,7 +70,7 @@ ai_pipeline_core/observability/_tracking/_writer.py,sha256=xZjwYyIxDzzzPxqkKjYAY
70
70
  ai_pipeline_core/pipeline/__init__.py,sha256=uMv1jwSyq8Ym8Hbn5097twBJLdwN1iMeqnVM4EWyrhA,282
71
71
  ai_pipeline_core/pipeline/decorators.py,sha256=CDJAeOjGLt5Ewc0Jc9zEuwLZwKyutOv89LSRS9dcXmI,37456
72
72
  ai_pipeline_core/pipeline/options.py,sha256=KF4FcT085-IwX8r649v0a9ua5xnApM0qG2wJHWbq39A,438
73
- ai_pipeline_core-0.4.6.dist-info/METADATA,sha256=hyy3vHyR5xZ5GRg4Nrp8BxKojp3pa-RDg1KWPPm5_O8,29947
74
- ai_pipeline_core-0.4.6.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
75
- ai_pipeline_core-0.4.6.dist-info/licenses/LICENSE,sha256=kKj8mfbdWwkyG3U6n7ztB3bAZlEwShTkAsvaY657i3I,1074
76
- ai_pipeline_core-0.4.6.dist-info/RECORD,,
73
+ ai_pipeline_core-0.4.7.dist-info/METADATA,sha256=yFjXJ9fHXFtmrF2jIFx62k5spfR1PEipR_Uekbn3bmo,29947
74
+ ai_pipeline_core-0.4.7.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
75
+ ai_pipeline_core-0.4.7.dist-info/licenses/LICENSE,sha256=kKj8mfbdWwkyG3U6n7ztB3bAZlEwShTkAsvaY657i3I,1074
76
+ ai_pipeline_core-0.4.7.dist-info/RECORD,,