ai-pipeline-core 0.1.12__py3-none-any.whl → 0.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ai_pipeline_core/__init__.py +83 -119
- ai_pipeline_core/deployment/__init__.py +34 -0
- ai_pipeline_core/deployment/base.py +861 -0
- ai_pipeline_core/deployment/contract.py +80 -0
- ai_pipeline_core/deployment/deploy.py +561 -0
- ai_pipeline_core/deployment/helpers.py +97 -0
- ai_pipeline_core/deployment/progress.py +126 -0
- ai_pipeline_core/deployment/remote.py +116 -0
- ai_pipeline_core/docs_generator/__init__.py +54 -0
- ai_pipeline_core/docs_generator/__main__.py +5 -0
- ai_pipeline_core/docs_generator/cli.py +196 -0
- ai_pipeline_core/docs_generator/extractor.py +324 -0
- ai_pipeline_core/docs_generator/guide_builder.py +644 -0
- ai_pipeline_core/docs_generator/trimmer.py +35 -0
- ai_pipeline_core/docs_generator/validator.py +114 -0
- ai_pipeline_core/document_store/__init__.py +13 -0
- ai_pipeline_core/document_store/_summary.py +9 -0
- ai_pipeline_core/document_store/_summary_worker.py +170 -0
- ai_pipeline_core/document_store/clickhouse.py +492 -0
- ai_pipeline_core/document_store/factory.py +38 -0
- ai_pipeline_core/document_store/local.py +312 -0
- ai_pipeline_core/document_store/memory.py +85 -0
- ai_pipeline_core/document_store/protocol.py +68 -0
- ai_pipeline_core/documents/__init__.py +14 -15
- ai_pipeline_core/documents/_context_vars.py +85 -0
- ai_pipeline_core/documents/_hashing.py +52 -0
- ai_pipeline_core/documents/attachment.py +85 -0
- ai_pipeline_core/documents/context.py +128 -0
- ai_pipeline_core/documents/document.py +349 -1062
- ai_pipeline_core/documents/mime_type.py +40 -85
- ai_pipeline_core/documents/utils.py +62 -7
- ai_pipeline_core/exceptions.py +10 -62
- ai_pipeline_core/images/__init__.py +309 -0
- ai_pipeline_core/images/_processing.py +151 -0
- ai_pipeline_core/llm/__init__.py +5 -3
- ai_pipeline_core/llm/ai_messages.py +284 -73
- ai_pipeline_core/llm/client.py +462 -209
- ai_pipeline_core/llm/model_options.py +86 -53
- ai_pipeline_core/llm/model_response.py +187 -241
- ai_pipeline_core/llm/model_types.py +34 -54
- ai_pipeline_core/logging/__init__.py +2 -9
- ai_pipeline_core/logging/logging.yml +1 -1
- ai_pipeline_core/logging/logging_config.py +27 -43
- ai_pipeline_core/logging/logging_mixin.py +17 -51
- ai_pipeline_core/observability/__init__.py +32 -0
- ai_pipeline_core/observability/_debug/__init__.py +30 -0
- ai_pipeline_core/observability/_debug/_auto_summary.py +94 -0
- ai_pipeline_core/observability/_debug/_config.py +95 -0
- ai_pipeline_core/observability/_debug/_content.py +764 -0
- ai_pipeline_core/observability/_debug/_processor.py +98 -0
- ai_pipeline_core/observability/_debug/_summary.py +312 -0
- ai_pipeline_core/observability/_debug/_types.py +75 -0
- ai_pipeline_core/observability/_debug/_writer.py +843 -0
- ai_pipeline_core/observability/_document_tracking.py +146 -0
- ai_pipeline_core/observability/_initialization.py +194 -0
- ai_pipeline_core/observability/_logging_bridge.py +57 -0
- ai_pipeline_core/observability/_summary.py +81 -0
- ai_pipeline_core/observability/_tracking/__init__.py +6 -0
- ai_pipeline_core/observability/_tracking/_client.py +178 -0
- ai_pipeline_core/observability/_tracking/_internal.py +28 -0
- ai_pipeline_core/observability/_tracking/_models.py +138 -0
- ai_pipeline_core/observability/_tracking/_processor.py +158 -0
- ai_pipeline_core/observability/_tracking/_service.py +311 -0
- ai_pipeline_core/observability/_tracking/_writer.py +229 -0
- ai_pipeline_core/observability/tracing.py +640 -0
- ai_pipeline_core/pipeline/__init__.py +10 -0
- ai_pipeline_core/pipeline/decorators.py +915 -0
- ai_pipeline_core/pipeline/options.py +16 -0
- ai_pipeline_core/prompt_manager.py +26 -105
- ai_pipeline_core/settings.py +41 -32
- ai_pipeline_core/testing.py +9 -0
- ai_pipeline_core-0.4.1.dist-info/METADATA +807 -0
- ai_pipeline_core-0.4.1.dist-info/RECORD +76 -0
- {ai_pipeline_core-0.1.12.dist-info → ai_pipeline_core-0.4.1.dist-info}/WHEEL +1 -1
- ai_pipeline_core/documents/document_list.py +0 -240
- ai_pipeline_core/documents/flow_document.py +0 -128
- ai_pipeline_core/documents/task_document.py +0 -133
- ai_pipeline_core/documents/temporary_document.py +0 -95
- ai_pipeline_core/flow/__init__.py +0 -9
- ai_pipeline_core/flow/config.py +0 -314
- ai_pipeline_core/flow/options.py +0 -75
- ai_pipeline_core/pipeline.py +0 -717
- ai_pipeline_core/prefect.py +0 -54
- ai_pipeline_core/simple_runner/__init__.py +0 -24
- ai_pipeline_core/simple_runner/cli.py +0 -255
- ai_pipeline_core/simple_runner/simple_runner.py +0 -385
- ai_pipeline_core/tracing.py +0 -475
- ai_pipeline_core-0.1.12.dist-info/METADATA +0 -450
- ai_pipeline_core-0.1.12.dist-info/RECORD +0 -36
- {ai_pipeline_core-0.1.12.dist-info → ai_pipeline_core-0.4.1.dist-info}/licenses/LICENSE +0 -0
ai_pipeline_core/llm/ai_messages.py

@@ -1,30 +1,46 @@
 """AI message handling for LLM interactions.
 
-@public
-
 Provides AIMessages container for managing conversations with mixed content types
 including text, documents, and model responses.
 """
 
 import base64
 import hashlib
+import io
 import json
+from collections.abc import Callable, Iterable
+from copy import deepcopy
+from typing import Any, SupportsIndex
 
 from openai.types.chat import (
     ChatCompletionContentPartParam,
     ChatCompletionMessageParam,
 )
-from
+from PIL import Image
 
 from ai_pipeline_core.documents import Document
+from ai_pipeline_core.documents.document import get_tiktoken_encoding
+from ai_pipeline_core.documents.mime_type import is_llm_supported_image
+from ai_pipeline_core.logging import get_pipeline_logger
 
 from .model_response import ModelResponse
 
+logger = get_pipeline_logger(__name__)
+
+
+def _ensure_llm_compatible_image(content: bytes, mime_type: str) -> tuple[bytes, str]:
+    """Convert unsupported image formats to PNG for LLM consumption."""
+    if is_llm_supported_image(mime_type):
+        return content, mime_type
+    img = Image.open(io.BytesIO(content))
+    buf = io.BytesIO()
+    img.save(buf, format="PNG")
+    return buf.getvalue(), "image/png"
+
+
 AIMessageType = str | Document | ModelResponse
 """Type for messages in AIMessages container.
 
-@public
-
 Represents the allowed types for conversation messages:
 - str: Plain text messages
 - Document: Structured document content
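The new `_ensure_llm_compatible_image` helper re-encodes anything the LLM gateway cannot accept as PNG before the bytes are embedded as a data URI. A minimal usage sketch, assuming `is_llm_supported_image` accepts the common web formats (the real predicate lives in `ai_pipeline_core.documents.mime_type` and its accepted set may differ):

```python
import io

from PIL import Image


# Hypothetical stand-in for ai_pipeline_core.documents.mime_type.is_llm_supported_image.
def is_llm_supported_image(mime_type: str) -> bool:
    return mime_type in {"image/png", "image/jpeg", "image/webp", "image/gif"}


def ensure_llm_compatible_image(content: bytes, mime_type: str) -> tuple[bytes, str]:
    """Re-encode unsupported image bytes to PNG, mirroring the diffed helper."""
    if is_llm_supported_image(mime_type):
        return content, mime_type
    img = Image.open(io.BytesIO(content))
    buf = io.BytesIO()
    img.save(buf, format="PNG")
    return buf.getvalue(), "image/png"


# Example: a 1x1 BMP becomes a PNG payload; a JPEG passes through untouched.
src = io.BytesIO()
Image.new("RGB", (1, 1)).save(src, format="BMP")
data, mime = ensure_llm_compatible_image(src.getvalue(), "image/bmp")
assert mime == "image/png"
```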
@@ -32,11 +48,9 @@ Represents the allowed types for conversation messages:
 """
 
 
-class AIMessages(list[AIMessageType]):
+class AIMessages(list[AIMessageType]):  # noqa: PLR0904
     """Container for AI conversation messages supporting mixed types.
 
-    @public
-
     This class extends list to manage conversation messages between user
     and AI, supporting text, Document objects, and ModelResponse instances.
     Messages are converted to OpenAI-compatible format for LLM interactions.
@@ -44,28 +58,151 @@ class AIMessages(list[AIMessageType]):
     Conversion Rules:
     - str: Becomes {"role": "user", "content": text}
     - Document: Becomes {"role": "user", "content": document_content}
-      (automatically handles text, images, PDFs based on MIME type
+      (automatically handles text, images, PDFs based on MIME type; attachments
+      are rendered as <attachment> XML blocks)
     - ModelResponse: Becomes {"role": "assistant", "content": response.content}
 
     Note: Document conversion is automatic. Text content becomes user text messages.
-
-
-
-
+
+    VISION/PDF MODEL COMPATIBILITY WARNING:
+    Images require vision-capable models (e.g., gpt-5.1, gemini-3-flash, gemini-3-pro).
+    Non-vision models will raise ValueError when encountering image documents.
+    PDFs require models with document processing support - check your model's capabilities
+    before including PDF documents in messages. Unsupported models may fall back to
+    text extraction or raise errors depending on provider configuration.
+    LiteLLM proxy handles the specific encoding requirements for each provider.
 
     IMPORTANT: Although AIMessages can contain Document entries, the LLM client functions
     expect `messages` to be `AIMessages` or `str`. If you start from a Document or a list
     of Documents, build AIMessages first (e.g., `AIMessages([doc])` or `AIMessages(docs)`).
 
-
-
-
-
-
-
-        >>> prompt = messages.get_last_message_as_str()  # Get the last message as a string
+    CAUTION: AIMessages is a list subclass. Always use list construction (e.g.,
+    `AIMessages(["text"])`) or empty constructor with append (e.g.,
+    `AIMessages(); messages.append("text")`). Never pass raw strings directly to the
+    constructor (`AIMessages("text")`) as this will raise a TypeError to prevent
+    accidental character iteration.
+
     """
 
+    def __init__(self, iterable: Iterable[AIMessageType] | None = None, *, frozen: bool = False):
+        """Initialize AIMessages with optional iterable.
+
+        Args:
+            iterable: Optional iterable of messages (list, tuple, etc.).
+                Must not be a string.
+            frozen: If True, list is immutable from creation.
+
+        Raises:
+            TypeError: If a string is passed directly to the constructor.
+        """
+        if isinstance(iterable, str):
+            raise TypeError(
+                "AIMessages cannot be constructed from a string directly. "
+                "Use AIMessages(['text']) for a single message or "
+                "AIMessages() and then append('text')."
+            )
+        self._frozen = False  # Initialize as unfrozen to allow initial population
+        if iterable is None:
+            super().__init__()
+        else:
+            super().__init__(iterable)
+        self._frozen = frozen  # Set frozen state after initial population
+
+    def freeze(self) -> None:
+        """Permanently freeze the list, preventing modifications.
+
+        Once frozen, the list cannot be unfrozen.
+        """
+        self._frozen = True
+
+    def copy(self) -> "AIMessages":
+        """Create an unfrozen deep copy of the list.
+
+        Returns:
+            New unfrozen AIMessages with deep-copied messages.
+        """
+        copied_messages = deepcopy(list(self))
+        return AIMessages(copied_messages, frozen=False)
+
+    def _check_frozen(self) -> None:
+        """Check if list is frozen and raise if it is.
+
+        Raises:
+            RuntimeError: If the list is frozen.
+        """
+        if self._frozen:
+            raise RuntimeError("Cannot modify frozen AIMessages")
+
+    def append(self, message: AIMessageType) -> None:
+        """Add a message to the end of the list."""
+        self._check_frozen()
+        super().append(message)
+
+    def extend(self, messages: Iterable[AIMessageType]) -> None:
+        """Add multiple messages to the list."""
+        self._check_frozen()
+        super().extend(messages)
+
+    def insert(self, index: SupportsIndex, message: AIMessageType) -> None:
+        """Insert a message at the specified position."""
+        self._check_frozen()
+        super().insert(index, message)
+
+    def __setitem__(
+        self,
+        index: SupportsIndex | slice,
+        value: AIMessageType | Iterable[AIMessageType],
+    ) -> None:
+        """Set item or slice."""
+        self._check_frozen()
+        super().__setitem__(index, value)  # type: ignore[arg-type]
+
+    def __iadd__(self, other: Iterable[AIMessageType]) -> "AIMessages":
+        """In-place addition (+=).
+
+        Returns:
+            This AIMessages instance after modification.
+        """
+        self._check_frozen()
+        return super().__iadd__(other)
+
+    def __delitem__(self, index: SupportsIndex | slice) -> None:
+        """Delete item or slice from list."""
+        self._check_frozen()
+        super().__delitem__(index)
+
+    def pop(self, index: SupportsIndex = -1) -> AIMessageType:
+        """Remove and return item at index.
+
+        Returns:
+            AIMessageType removed from the list.
+        """
+        self._check_frozen()
+        return super().pop(index)
+
+    def remove(self, message: AIMessageType) -> None:
+        """Remove first occurrence of message."""
+        self._check_frozen()
+        super().remove(message)
+
+    def clear(self) -> None:
+        """Remove all items from list."""
+        self._check_frozen()
+        super().clear()
+
+    def reverse(self) -> None:
+        """Reverse list in place."""
+        self._check_frozen()
+        super().reverse()
+
+    def sort(self, *, key: Callable[[AIMessageType], Any] | None = None, reverse: bool = False) -> None:
+        """Sort list in place."""
+        self._check_frozen()
+        if key is None:
+            super().sort(reverse=reverse)  # type: ignore[call-arg]
+        else:
+            super().sort(key=key, reverse=reverse)
+
     def get_last_message(self) -> AIMessageType:
         """Get the last message in the conversation.
 
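Taken together, the constructor guard and the `_check_frozen` hooks give `AIMessages` value-like semantics: build it, freeze it, and take unfrozen deep copies when you need a mutable draft. A sketch of the behavior the new methods imply, assuming the class is exported as `ai_pipeline_core.llm.AIMessages` (import path inferred from the package layout):

```python
from ai_pipeline_core.llm import AIMessages  # import path assumed

messages = AIMessages(["What is in this report?"])
messages.append("Summarize it in one line.")

try:
    AIMessages("oops")  # strings are rejected to prevent per-character iteration
except TypeError:
    pass

messages.freeze()
try:
    messages.append("too late")  # every mutator now raises
except RuntimeError:
    pass

draft = messages.copy()  # deep, unfrozen copy that can be edited again
draft.append("follow-up")
```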
@@ -78,8 +215,6 @@ class AIMessages(list[AIMessageType]):
     def get_last_message_as_str(self) -> str:
         """Get the last message as a string, raising if not a string.
 
-        @public
-
         Returns:
             The last message as a string.
 
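`get_last_message_as_str` keeps the common "grab the latest text turn" path terse while failing loudly on anything else. A short sketch under the same import-path assumption:

```python
from ai_pipeline_core.llm import AIMessages  # import path assumed

messages = AIMessages(["Translate to French:", "Good morning"])
assert messages.get_last_message_as_str() == "Good morning"
# If the last entry were a Document or ModelResponse instead of a str,
# the call would raise rather than coerce it silently.
```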
@@ -107,6 +242,8 @@ class AIMessages(list[AIMessageType]):
 
         Transforms the message list into the format expected by OpenAI API.
         Each message type is converted according to its role and content.
+        Documents are rendered as XML with any attachments included as nested
+        <attachment> blocks.
 
         Returns:
             List of ChatCompletionMessageParam dicts (from openai.types.chat)
@@ -116,26 +253,40 @@ class AIMessages(list[AIMessageType]):
         Raises:
             ValueError: If message type is not supported.
 
-        Example:
-            >>> messages = AIMessages(["Hello", response, "Follow up"])
-            >>> prompt = messages.to_prompt()
-            >>> # Result: [
-            >>> #   {"role": "user", "content": "Hello"},
-            >>> #   {"role": "assistant", "content": "..."},
-            >>> #   {"role": "user", "content": "Follow up"}
-            >>> # ]
         """
         messages: list[ChatCompletionMessageParam] = []
 
         for message in self:
             if isinstance(message, str):
-                messages.append({"role": "user", "content": message})
+                messages.append({"role": "user", "content": [{"type": "text", "text": message}]})
             elif isinstance(message, Document):
                 messages.append({"role": "user", "content": AIMessages.document_to_prompt(message)})
             elif isinstance(message, ModelResponse):  # type: ignore
-
+                # Build base assistant message
+                assistant_message: ChatCompletionMessageParam = {
+                    "role": "assistant",
+                    "content": [{"type": "text", "text": message.content}],
+                }
+
+                # Preserve reasoning_content (Gemini Flash 3+, O1, O3, GPT-5)
+                if reasoning_content := message.reasoning_content:
+                    assistant_message["reasoning_content"] = reasoning_content  # type: ignore[typeddict-item]
+
+                # Preserve thinking_blocks (structured thinking)
+                if hasattr(message.choices[0].message, "thinking_blocks"):
+                    thinking_blocks = getattr(message.choices[0].message, "thinking_blocks", None)
+                    if thinking_blocks:
+                        assistant_message["thinking_blocks"] = thinking_blocks  # type: ignore[typeddict-item]
+
+                # Preserve provider_specific_fields (thought_signatures for Gemini multi-turn)
+                if hasattr(message.choices[0].message, "provider_specific_fields"):
+                    provider_fields = getattr(message.choices[0].message, "provider_specific_fields", None)
+                    if provider_fields:
+                        assistant_message["provider_specific_fields"] = provider_fields  # type: ignore[typeddict-item]
+
+                messages.append(assistant_message)
             else:
-                raise
+                raise TypeError(f"Unsupported message type: {type(message)}")
 
         return messages
 
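The switch from bare strings to content-part lists keeps user and assistant turns structurally uniform. A sketch of what `to_prompt()` now emits for a simple exchange, updating the example removed from the docstring:

```python
from ai_pipeline_core.llm import AIMessages  # import path assumed

messages = AIMessages(["Hello", "Follow up"])
assert messages.to_prompt() == [
    {"role": "user", "content": [{"type": "text", "text": "Hello"}]},
    {"role": "user", "content": [{"type": "text", "text": "Follow up"}]},
]
# An interleaved ModelResponse becomes an assistant turn with the same
# content-part shape, plus reasoning_content / thinking_blocks /
# provider_specific_fields copied over when the provider returned them.
```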
@@ -149,8 +300,8 @@ class AIMessages(list[AIMessageType]):
         for message in self:
             if isinstance(message, Document):
                 serialized_document = message.serialize_model()
-
-                messages.append(json.dumps(
+                filtered_doc = {k: v for k, v in serialized_document.items() if k != "content"}
+                messages.append(json.dumps(filtered_doc, indent=2))
             elif isinstance(message, ModelResponse):
                 messages.append(message.content)
             else:
@@ -171,10 +322,40 @@ class AIMessages(list[AIMessageType]):
             system_prompt = ""
         return hashlib.sha256((system_prompt + json.dumps(self.to_prompt())).encode()).hexdigest()
 
+    @property
+    def approximate_tokens_count(self) -> int:
+        """Approximate tokens count for the messages.
+
+        Uses tiktoken with gpt-4 encoding to estimate total token count
+        across all messages in the conversation.
+
+        Returns:
+            Approximate tokens count for all messages.
+
+        Raises:
+            ValueError: If message contains unsupported type.
+
+        """
+        count = 0
+        enc = get_tiktoken_encoding()
+        for message in self:
+            if isinstance(message, str):
+                count += len(enc.encode(message))
+            elif isinstance(message, Document):
+                count += message.approximate_tokens_count
+            elif isinstance(message, ModelResponse):  # type: ignore
+                count += len(enc.encode(message.content))
+            else:
+                raise TypeError(f"Unsupported message type: {type(message)}")
+        return count
+
     @staticmethod
-    def document_to_prompt(document: Document) -> list[ChatCompletionContentPartParam]:
+    def document_to_prompt(document: Document) -> list[ChatCompletionContentPartParam]:  # noqa: PLR0912, PLR0914
         """Convert a document to prompt format for LLM consumption.
 
+        Renders the document as XML with text/image/PDF content, followed by any
+        attachments as separate <attachment> XML blocks with name and description attributes.
+
         Args:
             document: The document to convert.
 
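Because the new property sums per-message estimates (tiktoken for strings and responses, each `Document`'s own estimate otherwise), it can drive cheap context-budget checks before a call. A sketch under the same import-path assumption, with a hypothetical budget:

```python
from ai_pipeline_core.llm import AIMessages  # import path assumed

messages = AIMessages(["one two three", "four five six seven"])
budget = 8_000  # hypothetical context budget for the target model
if messages.approximate_tokens_count > budget:
    messages = AIMessages(messages[-2:])  # e.g., keep only the latest turns
```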
@@ -184,50 +365,80 @@ class AIMessages(list[AIMessageType]):
         prompt: list[ChatCompletionContentPartParam] = []
 
         # Build the text header
-        description =
-
-        )
-        header_text = (
-            f"<document>\n<id>{document.id}</id>\n<name>{document.name}</name>\n{description}"
-        )
+        description = f"<description>{document.description}</description>\n" if document.description else ""
+        header_text = f"<document>\n<id>{document.id}</id>\n<name>{document.name}</name>\n{description}"
 
         # Handle text documents
         if document.is_text:
             text_content = document.content.decode("utf-8")
-            content_text = f"{header_text}<content>\n{text_content}\n</content>\n
+            content_text = f"{header_text}<content>\n{text_content}\n</content>\n"
             prompt.append({"type": "text", "text": content_text})
-            return prompt
 
-        # Handle
-
-
-
-
+        # Handle binary documents (image/PDF)
+        elif document.is_image or document.is_pdf:
+            prompt.append({"type": "text", "text": f"{header_text}<content>\n"})
+
+            if document.is_image:
+                content_bytes, mime_type = _ensure_llm_compatible_image(document.content, document.mime_type)
+            else:
+                content_bytes, mime_type = document.content, document.mime_type
+            base64_content = base64.b64encode(content_bytes).decode("utf-8")
+            data_uri = f"data:{mime_type};base64,{base64_content}"
+
+            if document.is_pdf:
+                prompt.append({
+                    "type": "file",
+                    "file": {"file_data": data_uri},
+                })
+            else:
+                prompt.append({
+                    "type": "image_url",
+                    "image_url": {"url": data_uri, "detail": "high"},
+                })
+
+            prompt.append({"type": "text", "text": "</content>\n"})
+
+        else:
+            logger.error(f"Document is not a text, image or PDF: {document.name} - {document.mime_type}")
             return []
 
-        #
-
-        "
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        # Render attachments
+        for att in document.attachments:
+            desc_attr = f' description="{att.description}"' if att.description else ""
+            att_open = f'<attachment name="{att.name}"{desc_attr}>\n'
+
+            if att.is_text:
+                prompt.append({"type": "text", "text": f"{att_open}{att.text}\n</attachment>\n"})
+            elif att.is_image or att.is_pdf:
+                prompt.append({"type": "text", "text": att_open})
+
+                if att.is_image:
+                    att_bytes, att_mime = _ensure_llm_compatible_image(att.content, att.mime_type)
+                else:
+                    att_bytes, att_mime = att.content, att.mime_type
+                att_b64 = base64.b64encode(att_bytes).decode("utf-8")
+                att_uri = f"data:{att_mime};base64,{att_b64}"
+
+                if att.is_pdf:
+                    prompt.append({
+                        "type": "file",
+                        "file": {"file_data": att_uri},
+                    })
+                else:
+                    prompt.append({
+                        "type": "image_url",
+                        "image_url": {"url": att_uri, "detail": "high"},
+                    })
+
+                prompt.append({"type": "text", "text": "</attachment>\n"})
+            else:
+                logger.warning(f"Skipping unsupported attachment type: {att.name} - {att.mime_type}")
+
+        # Close document — merge into last text part to preserve JSON structure (and cache key)
+        last = prompt[-1]
+        if last["type"] == "text":
+            prompt[-1] = {"type": "text", "text": last["text"] + "</document>\n"}
+        else:
+            prompt.append({"type": "text", "text": "</document>\n"})
 
         return prompt