ai-pipeline-core 0.1.8__py3-none-any.whl → 0.1.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ai_pipeline_core/__init__.py +86 -4
- ai_pipeline_core/documents/__init__.py +11 -0
- ai_pipeline_core/documents/document.py +1107 -131
- ai_pipeline_core/documents/document_list.py +147 -38
- ai_pipeline_core/documents/flow_document.py +112 -11
- ai_pipeline_core/documents/mime_type.py +173 -15
- ai_pipeline_core/documents/task_document.py +117 -12
- ai_pipeline_core/documents/temporary_document.py +95 -0
- ai_pipeline_core/documents/utils.py +41 -9
- ai_pipeline_core/exceptions.py +47 -11
- ai_pipeline_core/flow/__init__.py +2 -0
- ai_pipeline_core/flow/config.py +250 -23
- ai_pipeline_core/flow/options.py +50 -1
- ai_pipeline_core/llm/__init__.py +6 -0
- ai_pipeline_core/llm/ai_messages.py +125 -27
- ai_pipeline_core/llm/client.py +278 -26
- ai_pipeline_core/llm/model_options.py +130 -1
- ai_pipeline_core/llm/model_response.py +239 -35
- ai_pipeline_core/llm/model_types.py +67 -0
- ai_pipeline_core/logging/__init__.py +13 -0
- ai_pipeline_core/logging/logging_config.py +72 -20
- ai_pipeline_core/logging/logging_mixin.py +38 -32
- ai_pipeline_core/pipeline.py +308 -60
- ai_pipeline_core/prefect.py +48 -1
- ai_pipeline_core/prompt_manager.py +215 -24
- ai_pipeline_core/settings.py +108 -4
- ai_pipeline_core/simple_runner/__init__.py +5 -0
- ai_pipeline_core/simple_runner/cli.py +145 -17
- ai_pipeline_core/simple_runner/simple_runner.py +244 -6
- ai_pipeline_core/tracing.py +232 -30
- ai_pipeline_core-0.1.11.dist-info/METADATA +450 -0
- ai_pipeline_core-0.1.11.dist-info/RECORD +36 -0
- ai_pipeline_core-0.1.8.dist-info/METADATA +0 -558
- ai_pipeline_core-0.1.8.dist-info/RECORD +0 -35
- {ai_pipeline_core-0.1.8.dist-info → ai_pipeline_core-0.1.11.dist-info}/WHEEL +0 -0
- {ai_pipeline_core-0.1.8.dist-info → ai_pipeline_core-0.1.11.dist-info}/licenses/LICENSE +0 -0
|
@@ -1,14 +1,39 @@
|
|
|
1
|
+
"""Document abstraction layer for AI pipeline flows.
|
|
2
|
+
|
|
3
|
+
@public
|
|
4
|
+
|
|
5
|
+
This module provides the core document abstraction for working with various types of data
|
|
6
|
+
in AI pipelines. Documents are immutable Pydantic models that wrap binary content with metadata.
|
|
7
|
+
"""
|
|
8
|
+
|
|
1
9
|
import base64
|
|
2
10
|
import hashlib
|
|
3
11
|
import json
|
|
4
|
-
import re
|
|
5
12
|
from abc import ABC, abstractmethod
|
|
6
13
|
from base64 import b32encode
|
|
7
14
|
from enum import StrEnum
|
|
8
15
|
from functools import cached_property
|
|
9
|
-
from
|
|
16
|
+
from io import BytesIO
|
|
17
|
+
from typing import (
|
|
18
|
+
Any,
|
|
19
|
+
ClassVar,
|
|
20
|
+
Literal,
|
|
21
|
+
Self,
|
|
22
|
+
TypeVar,
|
|
23
|
+
cast,
|
|
24
|
+
final,
|
|
25
|
+
get_args,
|
|
26
|
+
get_origin,
|
|
27
|
+
overload,
|
|
28
|
+
)
|
|
10
29
|
|
|
11
|
-
from pydantic import
|
|
30
|
+
from pydantic import (
|
|
31
|
+
BaseModel,
|
|
32
|
+
ConfigDict,
|
|
33
|
+
ValidationInfo,
|
|
34
|
+
field_serializer,
|
|
35
|
+
field_validator,
|
|
36
|
+
)
|
|
12
37
|
from ruamel.yaml import YAML
|
|
13
38
|
|
|
14
39
|
from ai_pipeline_core.documents.utils import canonical_name_key
|
|
@@ -26,19 +51,133 @@ TModel = TypeVar("TModel", bound=BaseModel)
|
|
|
26
51
|
|
|
27
52
|
|
|
28
53
|
class Document(BaseModel, ABC):
|
|
29
|
-
"""Abstract base class for all documents.
|
|
54
|
+
r"""Abstract base class for all documents in the AI Pipeline Core system.
|
|
55
|
+
|
|
56
|
+
@public
|
|
57
|
+
|
|
58
|
+
Document is the fundamental data abstraction for all content flowing through
|
|
59
|
+
pipelines. It provides automatic encoding, MIME type detection, serialization,
|
|
60
|
+
and validation. All document classes must subclass FlowDocument or TaskDocument
|
|
61
|
+
based on their persistence requirements. TemporaryDocument is a special concrete
|
|
62
|
+
class that can be instantiated directly (not abstract).
|
|
63
|
+
|
|
64
|
+
VALIDATION IS AUTOMATIC - Do not add manual validation!
|
|
65
|
+
Size validation, name validation, and MIME type detection are built-in.
|
|
66
|
+
The framework handles all standard validations internally.
|
|
67
|
+
|
|
68
|
+
# WRONG - These checks already happen automatically:
|
|
69
|
+
if document.size > document.MAX_CONTENT_SIZE:
|
|
70
|
+
raise DocumentSizeError(...) # NO! Already handled
|
|
71
|
+
document.validate_file_name(document.name) # NO! Automatic
|
|
72
|
+
|
|
73
|
+
Best Practices:
|
|
74
|
+
- Use create() classmethod for automatic type conversion (90% of cases)
|
|
75
|
+
- Omit description parameter unless truly needed for metadata
|
|
76
|
+
- When using LLM functions, pass AIMessages or str. Wrap any Document values
|
|
77
|
+
in AIMessages([...]). Do not call .text yourself.
|
|
78
|
+
|
|
79
|
+
Standard Usage:
|
|
80
|
+
>>> # CORRECT - minimal parameters
|
|
81
|
+
>>> doc = MyDocument.create(name="data.json", content={"key": "value"})
|
|
82
|
+
|
|
83
|
+
>>> # AVOID - unnecessary description
|
|
84
|
+
>>> doc = MyDocument.create(
|
|
85
|
+
... name="data.json",
|
|
86
|
+
... content={"key": "value"},
|
|
87
|
+
... description="This is data" # Usually not needed!
|
|
88
|
+
... )
|
|
89
|
+
|
|
90
|
+
Key features:
|
|
91
|
+
- Immutable by default (frozen Pydantic model)
|
|
92
|
+
- Automatic MIME type detection
|
|
93
|
+
- Content size validation
|
|
94
|
+
- SHA256 hashing for deduplication
|
|
95
|
+
- Support for text, JSON, YAML, PDF, and image formats
|
|
96
|
+
- Conversion utilities between different formats
|
|
97
|
+
|
|
98
|
+
Class Variables:
|
|
99
|
+
MAX_CONTENT_SIZE: Maximum allowed content size in bytes (default 25MB)
|
|
100
|
+
|
|
101
|
+
Attributes:
|
|
102
|
+
name: Document filename (validated for security)
|
|
103
|
+
description: Optional human-readable description
|
|
104
|
+
content: Raw document content as bytes
|
|
105
|
+
|
|
106
|
+
Creating Documents:
|
|
107
|
+
**Use the `create` classmethod** for most use cases. It accepts various
|
|
108
|
+
content types (str, dict, list, BaseModel) and converts them automatically.
|
|
109
|
+
Only use __init__ directly when you already have bytes content.
|
|
30
110
|
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
111
|
+
>>> # RECOMMENDED: Use create for automatic conversion
|
|
112
|
+
>>> doc = MyDocument.create(name="data.json", content={"key": "value"})
|
|
113
|
+
>>>
|
|
114
|
+
>>> # Direct constructor: Only for bytes
|
|
115
|
+
>>> doc = MyDocument(name="data.bin", content=b"\x00\x01\x02")
|
|
116
|
+
|
|
117
|
+
Warning:
|
|
118
|
+
- Document subclasses should NOT start with 'Test' prefix (pytest conflict)
|
|
119
|
+
- Cannot instantiate Document directly - must subclass FlowDocument or TaskDocument
|
|
120
|
+
- Cannot add custom fields - only name, description, content are allowed
|
|
121
|
+
- Document is an abstract class and cannot be instantiated directly
|
|
122
|
+
|
|
123
|
+
Metadata Attachment Patterns:
|
|
124
|
+
Since custom fields are not allowed, use these patterns for metadata:
|
|
125
|
+
1. Use the 'description' field for human-readable metadata
|
|
126
|
+
2. Embed metadata in content (e.g., JSON with data + metadata fields)
|
|
127
|
+
3. Create a separate MetadataDocument type to accompany data documents
|
|
128
|
+
4. Use document naming conventions (e.g., "data_v2_2024.json")
|
|
129
|
+
5. Store metadata in flow_options or pass through TraceInfo
|
|
130
|
+
|
|
131
|
+
Example:
|
|
132
|
+
>>> from enum import StrEnum
|
|
133
|
+
>>>
|
|
134
|
+
>>> # Simple document:
|
|
135
|
+
>>> class MyDocument(FlowDocument):
|
|
136
|
+
... pass
|
|
137
|
+
>>>
|
|
138
|
+
>>> # Document with file restrictions:
|
|
139
|
+
>>> class ConfigDocument(FlowDocument):
|
|
140
|
+
... class FILES(StrEnum):
|
|
141
|
+
... CONFIG = "config.yaml"
|
|
142
|
+
... SETTINGS = "settings.json"
|
|
143
|
+
>>>
|
|
144
|
+
>>> # RECOMMENDED: Use create for automatic conversion
|
|
145
|
+
>>> doc = MyDocument.create(name="data.json", content={"key": "value"})
|
|
146
|
+
>>> print(doc.is_text) # True
|
|
147
|
+
>>> data = doc.as_json() # {'key': 'value'}
|
|
148
|
+
"""
|
|
149
|
+
|
|
150
|
+
MAX_CONTENT_SIZE: ClassVar[int] = 25 * 1024 * 1024
|
|
151
|
+
"""Maximum allowed content size in bytes (default 25MB).
|
|
152
|
+
|
|
153
|
+
@public
|
|
34
154
|
"""
|
|
35
155
|
|
|
36
|
-
MAX_CONTENT_SIZE: ClassVar[int] = 25 * 1024 * 1024 # 25MB default
|
|
37
156
|
DESCRIPTION_EXTENSION: ClassVar[str] = ".description.md"
|
|
157
|
+
"""File extension for description files."""
|
|
158
|
+
|
|
38
159
|
MARKDOWN_LIST_SEPARATOR: ClassVar[str] = "\n\n---\n\n"
|
|
160
|
+
"""Separator for markdown list items."""
|
|
39
161
|
|
|
40
162
|
def __init_subclass__(cls, **kwargs: Any) -> None:
|
|
41
|
-
"""Validate subclass
|
|
163
|
+
"""Validate subclass configuration at definition time.
|
|
164
|
+
|
|
165
|
+
Performs several validation checks when a Document subclass is defined:
|
|
166
|
+
1. Prevents class names starting with 'Test' (pytest conflict)
|
|
167
|
+
2. Validates FILES enum if present (must be StrEnum)
|
|
168
|
+
3. Prevents adding custom fields beyond name, description, content
|
|
169
|
+
|
|
170
|
+
Args:
|
|
171
|
+
**kwargs: Additional keyword arguments passed to parent __init_subclass__.
|
|
172
|
+
|
|
173
|
+
Raises:
|
|
174
|
+
TypeError: If subclass violates naming rules, FILES enum requirements,
|
|
175
|
+
or attempts to add extra fields.
|
|
176
|
+
|
|
177
|
+
Note:
|
|
178
|
+
This validation happens at class definition time, not instantiation,
|
|
179
|
+
providing early error detection during development.
|
|
180
|
+
"""
|
|
42
181
|
super().__init_subclass__(**kwargs)
|
|
43
182
|
if cls.__name__.startswith("Test"):
|
|
44
183
|
raise TypeError(
|
|
@@ -46,56 +185,290 @@ class Document(BaseModel, ABC):
|
|
|
46
185
|
"This causes conflicts with pytest test discovery. "
|
|
47
186
|
"Please use a different name (e.g., 'SampleDocument', 'ExampleDocument')."
|
|
48
187
|
)
|
|
188
|
+
if hasattr(cls, "FILES"):
|
|
189
|
+
files = getattr(cls, "FILES")
|
|
190
|
+
if not issubclass(files, StrEnum):
|
|
191
|
+
raise TypeError(
|
|
192
|
+
f"Document subclass '{cls.__name__}'.FILES must be an Enum of string values"
|
|
193
|
+
)
|
|
194
|
+
# Check that the Document's model_fields only contain the allowed fields
|
|
195
|
+
# It prevents AI models from adding additional fields to documents
|
|
196
|
+
allowed = {"name", "description", "content"}
|
|
197
|
+
current = set(getattr(cls, "model_fields", {}).keys())
|
|
198
|
+
extras = current - allowed
|
|
199
|
+
if extras:
|
|
200
|
+
raise TypeError(
|
|
201
|
+
f"Document subclass '{cls.__name__}' cannot declare additional fields: "
|
|
202
|
+
f"{', '.join(sorted(extras))}. Only {', '.join(sorted(allowed))} are allowed."
|
|
203
|
+
)
|
|
204
|
+
|
|
205
|
+
@overload
|
|
206
|
+
@classmethod
|
|
207
|
+
def create(cls, *, name: str, content: bytes, description: str | None = None) -> Self: ...
|
|
208
|
+
|
|
209
|
+
@overload
|
|
210
|
+
@classmethod
|
|
211
|
+
def create(cls, *, name: str, content: str, description: str | None = None) -> Self: ...
|
|
212
|
+
|
|
213
|
+
@overload
|
|
214
|
+
@classmethod
|
|
215
|
+
def create(
|
|
216
|
+
cls, *, name: str, content: dict[str, Any], description: str | None = None
|
|
217
|
+
) -> Self: ...
|
|
218
|
+
|
|
219
|
+
@overload
|
|
220
|
+
@classmethod
|
|
221
|
+
def create(cls, *, name: str, content: list[Any], description: str | None = None) -> Self: ...
|
|
222
|
+
|
|
223
|
+
@overload
|
|
224
|
+
@classmethod
|
|
225
|
+
def create(cls, *, name: str, content: BaseModel, description: str | None = None) -> Self: ...
|
|
226
|
+
|
|
227
|
+
@classmethod
|
|
228
|
+
def create(
|
|
229
|
+
cls,
|
|
230
|
+
*,
|
|
231
|
+
name: str,
|
|
232
|
+
content: str | bytes | dict[str, Any] | list[Any] | BaseModel,
|
|
233
|
+
description: str | None = None,
|
|
234
|
+
) -> Self:
|
|
235
|
+
r"""Create a Document with automatic content type conversion (recommended).
|
|
236
|
+
|
|
237
|
+
@public
|
|
238
|
+
|
|
239
|
+
This is the **recommended way to create documents**. It accepts various
|
|
240
|
+
content types and automatically converts them to bytes based on the file
|
|
241
|
+
extension. Use the `parse` method to reverse this conversion.
|
|
242
|
+
|
|
243
|
+
Best Practice (90% of cases):
|
|
244
|
+
Only provide name and content. The description parameter is RARELY needed.
|
|
245
|
+
|
|
246
|
+
Args:
|
|
247
|
+
name: Document filename (required, keyword-only).
|
|
248
|
+
Extension determines serialization:
|
|
249
|
+
- .json → JSON serialization
|
|
250
|
+
- .yaml/.yml → YAML serialization
|
|
251
|
+
- .md → Markdown list joining (for list[str])
|
|
252
|
+
- Others → UTF-8 encoding (for str)
|
|
253
|
+
content: Document content in various formats (required, keyword-only):
|
|
254
|
+
- bytes: Used directly without conversion
|
|
255
|
+
- str: Encoded to UTF-8 bytes
|
|
256
|
+
- dict[str, Any]: Serialized to JSON (.json) or YAML (.yaml/.yml)
|
|
257
|
+
- list[str]: Joined with separator for .md, else JSON/YAML
|
|
258
|
+
- list[BaseModel]: Serialized to JSON or YAML based on extension
|
|
259
|
+
- BaseModel: Serialized to JSON or YAML based on extension
|
|
260
|
+
description: Optional description - USUALLY OMIT THIS (defaults to None).
|
|
261
|
+
Only use when meaningful metadata helps downstream processing
|
|
262
|
+
|
|
263
|
+
Returns:
|
|
264
|
+
New Document instance with content converted to bytes
|
|
265
|
+
|
|
266
|
+
Raises:
|
|
267
|
+
ValueError: If content type is not supported for the file extension
|
|
268
|
+
DocumentNameError: If filename violates validation rules
|
|
269
|
+
DocumentSizeError: If content exceeds MAX_CONTENT_SIZE
|
|
270
|
+
|
|
271
|
+
Note:
|
|
272
|
+
All conversions are reversible using the `parse` method.
|
|
273
|
+
For example: MyDocument.create(name="data.json", content={"key": "value"}).parse(dict)
|
|
274
|
+
returns the original dictionary {"key": "value"}.
|
|
275
|
+
|
|
276
|
+
Example:
|
|
277
|
+
>>> # CORRECT - no description needed (90% of cases)
|
|
278
|
+
>>> doc = MyDocument.create(name="test.txt", content="Hello World")
|
|
279
|
+
>>> doc.content # b'Hello World'
|
|
280
|
+
>>> doc.parse(str) # "Hello World"
|
|
281
|
+
|
|
282
|
+
>>> # CORRECT - Dictionary to JSON, no description
|
|
283
|
+
>>> doc = MyDocument.create(name="config.json", content={"key": "value"})
|
|
284
|
+
>>> doc.content # b'{"key": "value", ...}'
|
|
285
|
+
>>> doc.parse(dict) # {"key": "value"}
|
|
286
|
+
|
|
287
|
+
>>> # AVOID unless description adds real value
|
|
288
|
+
>>> doc = MyDocument.create(
|
|
289
|
+
... name="config.json",
|
|
290
|
+
... content={"key": "value"},
|
|
291
|
+
... description="Config file" # Usually redundant!
|
|
292
|
+
... )
|
|
293
|
+
|
|
294
|
+
>>> # Pydantic model to YAML
|
|
295
|
+
>>> from pydantic import BaseModel
|
|
296
|
+
>>> class Config(BaseModel):
|
|
297
|
+
... host: str
|
|
298
|
+
... port: int
|
|
299
|
+
>>> config = Config(host="localhost", port=8080)
|
|
300
|
+
>>> doc = MyDocument.create(name="config.yaml", content=config)
|
|
301
|
+
>>> doc.parse(Config) # Returns Config instance
|
|
302
|
+
|
|
303
|
+
>>> # List to Markdown
|
|
304
|
+
>>> items = ["Section 1", "Section 2"]
|
|
305
|
+
>>> doc = MyDocument.create(name="sections.md", content=items)
|
|
306
|
+
>>> doc.parse(list) # ["Section 1", "Section 2"]
|
|
307
|
+
"""
|
|
308
|
+
# Use model_validate to leverage the existing validator logic
|
|
309
|
+
temp = cls.model_validate({"name": name, "content": content, "description": description})
|
|
310
|
+
# Now construct with type-checker-friendly call (bytes only)
|
|
311
|
+
return cls(name=temp.name, content=temp.content, description=temp.description)
|
|
312
|
+
|
|
313
|
+
def __init__(
|
|
314
|
+
self,
|
|
315
|
+
*,
|
|
316
|
+
name: str,
|
|
317
|
+
content: bytes,
|
|
318
|
+
description: str | None = None,
|
|
319
|
+
) -> None:
|
|
320
|
+
"""Initialize a Document instance with raw bytes content.
|
|
321
|
+
|
|
322
|
+
@public
|
|
49
323
|
|
|
50
|
-
|
|
51
|
-
|
|
324
|
+
Important:
|
|
325
|
+
**Most users should use the `create` classmethod instead of __init__.**
|
|
326
|
+
The create method provides automatic content conversion for various types
|
|
327
|
+
(str, dict, list, Pydantic models) while __init__ only accepts bytes.
|
|
328
|
+
|
|
329
|
+
This constructor accepts only bytes content for type safety. It prevents
|
|
330
|
+
direct instantiation of the abstract Document class.
|
|
331
|
+
|
|
332
|
+
Args:
|
|
333
|
+
name: Document filename (required, keyword-only)
|
|
334
|
+
content: Document content as raw bytes (required, keyword-only)
|
|
335
|
+
description: Optional human-readable description (keyword-only)
|
|
336
|
+
|
|
337
|
+
Raises:
|
|
338
|
+
TypeError: If attempting to instantiate Document directly.
|
|
339
|
+
|
|
340
|
+
Example:
|
|
341
|
+
>>> # Direct constructor - only for bytes content:
|
|
342
|
+
>>> doc = MyDocument(name="test.txt", content=b"Hello World")
|
|
343
|
+
>>> doc.content # b'Hello World'
|
|
344
|
+
|
|
345
|
+
>>> # RECOMMENDED: Use create for automatic conversion:
|
|
346
|
+
>>> doc = MyDocument.create(name="text.txt", content="Hello World")
|
|
347
|
+
>>> doc = MyDocument.create(name="data.json", content={"key": "value"})
|
|
348
|
+
>>> doc = MyDocument.create(name="config.yaml", content=my_model)
|
|
349
|
+
>>> doc = MyDocument.create(name="items.md", content=["item1", "item2"])
|
|
350
|
+
|
|
351
|
+
See Also:
|
|
352
|
+
create: Recommended factory method with automatic type conversion
|
|
353
|
+
parse: Method to reverse the conversion done by create
|
|
354
|
+
"""
|
|
52
355
|
if type(self) is Document:
|
|
53
356
|
raise TypeError("Cannot instantiate abstract Document class directly")
|
|
54
|
-
super().__init__(**data)
|
|
55
357
|
|
|
56
|
-
|
|
57
|
-
# This is used to validate the document name.
|
|
58
|
-
FILES: ClassVar[type[StrEnum] | None] = None
|
|
358
|
+
super().__init__(name=name, content=content, description=description)
|
|
59
359
|
|
|
60
360
|
name: str
|
|
61
361
|
description: str | None = None
|
|
62
|
-
content: bytes
|
|
362
|
+
content: bytes # Note: __init__ is typed as bytes-only; create() accepts str/dict/list/BaseModel, which the "before" validator converts to bytes
|
|
63
363
|
|
|
64
364
|
# Pydantic configuration
|
|
65
365
|
model_config = ConfigDict(
|
|
66
|
-
frozen=True,
|
|
366
|
+
frozen=True,
|
|
67
367
|
arbitrary_types_allowed=True,
|
|
368
|
+
extra="forbid",
|
|
68
369
|
)
|
|
69
370
|
|
|
70
371
|
@abstractmethod
|
|
71
|
-
def get_base_type(self) -> Literal["flow", "task"]:
|
|
72
|
-
"""Get the type of the document
|
|
372
|
+
def get_base_type(self) -> Literal["flow", "task", "temporary"]:
|
|
373
|
+
"""Get the base type of the document.
|
|
374
|
+
|
|
375
|
+
Abstract method that must be implemented by all Document subclasses
|
|
376
|
+
to indicate their persistence behavior.
|
|
377
|
+
|
|
378
|
+
Returns:
|
|
379
|
+
One of "flow" (persisted across flow runs), "task" (temporary
|
|
380
|
+
within task execution), or "temporary" (never persisted).
|
|
381
|
+
|
|
382
|
+
Note:
|
|
383
|
+
This method determines document persistence and lifecycle.
|
|
384
|
+
FlowDocument returns "flow", TaskDocument returns "task",
|
|
385
|
+
TemporaryDocument returns "temporary".
|
|
386
|
+
"""
|
|
73
387
|
raise NotImplementedError("Subclasses must implement this method")
|
|
74
388
|
|
|
389
|
+
@final
|
|
75
390
|
@property
|
|
76
|
-
def base_type(self) -> Literal["flow", "task"]:
|
|
77
|
-
"""
|
|
391
|
+
def base_type(self) -> Literal["flow", "task", "temporary"]:
|
|
392
|
+
"""Get the document's base type.
|
|
393
|
+
|
|
394
|
+
Property alias for get_base_type() providing a cleaner API.
|
|
395
|
+
This property cannot be overridden by subclasses.
|
|
396
|
+
|
|
397
|
+
Returns:
|
|
398
|
+
The document's base type: "flow", "task", or "temporary".
|
|
399
|
+
"""
|
|
78
400
|
return self.get_base_type()
|
|
79
401
|
|
|
402
|
+
@final
|
|
80
403
|
@property
|
|
81
404
|
def is_flow(self) -> bool:
|
|
82
|
-
"""Check if
|
|
405
|
+
"""Check if this is a flow document.
|
|
406
|
+
|
|
407
|
+
Flow documents persist across Prefect flow runs and are saved
|
|
408
|
+
to the file system between pipeline steps.
|
|
409
|
+
|
|
410
|
+
Returns:
|
|
411
|
+
True if this is a FlowDocument subclass, False otherwise.
|
|
412
|
+
"""
|
|
83
413
|
return self.get_base_type() == "flow"
|
|
84
414
|
|
|
415
|
+
@final
|
|
85
416
|
@property
|
|
86
417
|
def is_task(self) -> bool:
|
|
87
|
-
"""Check if
|
|
418
|
+
"""Check if this is a task document.
|
|
419
|
+
|
|
420
|
+
Task documents are temporary within Prefect task execution
|
|
421
|
+
and are not persisted between pipeline steps.
|
|
422
|
+
|
|
423
|
+
Returns:
|
|
424
|
+
True if this is a TaskDocument subclass, False otherwise.
|
|
425
|
+
"""
|
|
88
426
|
return self.get_base_type() == "task"
|
|
89
427
|
|
|
428
|
+
@final
|
|
429
|
+
@property
|
|
430
|
+
def is_temporary(self) -> bool:
|
|
431
|
+
"""Check if this is a temporary document.
|
|
432
|
+
|
|
433
|
+
Temporary documents are never persisted and exist only
|
|
434
|
+
during execution.
|
|
435
|
+
|
|
436
|
+
Returns:
|
|
437
|
+
True if this is a TemporaryDocument, False otherwise.
|
|
438
|
+
"""
|
|
439
|
+
return self.get_base_type() == "temporary"
|
|
440
|
+
|
|
441
|
+
@final
|
|
90
442
|
@classmethod
|
|
91
443
|
def get_expected_files(cls) -> list[str] | None:
|
|
444
|
+
"""Get the list of allowed file names for this document class.
|
|
445
|
+
|
|
446
|
+
If the document class defines a FILES enum, returns the list of
|
|
447
|
+
valid file names. Used to restrict documents to specific files.
|
|
448
|
+
|
|
449
|
+
Returns:
|
|
450
|
+
List of allowed file names if FILES enum is defined,
|
|
451
|
+
None if unrestricted.
|
|
452
|
+
|
|
453
|
+
Raises:
|
|
454
|
+
DocumentNameError: If FILES is defined but not a valid StrEnum.
|
|
455
|
+
|
|
456
|
+
Example:
|
|
457
|
+
>>> class ConfigDocument(FlowDocument):
|
|
458
|
+
... class FILES(StrEnum):
|
|
459
|
+
... CONFIG = "config.yaml"
|
|
460
|
+
... SETTINGS = "settings.json"
|
|
461
|
+
>>> ConfigDocument.get_expected_files()
|
|
462
|
+
['config.yaml', 'settings.json']
|
|
92
463
|
"""
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
464
|
+
if not hasattr(cls, "FILES"):
|
|
465
|
+
return None
|
|
466
|
+
files = getattr(cls, "FILES")
|
|
467
|
+
if not files:
|
|
96
468
|
return None
|
|
469
|
+
assert issubclass(files, StrEnum)
|
|
97
470
|
try:
|
|
98
|
-
values = [member.value for member in
|
|
471
|
+
values = [member.value for member in files]
|
|
99
472
|
except TypeError:
|
|
100
473
|
raise DocumentNameError(f"{cls.__name__}.FILES must be an Enum of string values")
|
|
101
474
|
if len(values) == 0:
|
|
@@ -104,32 +477,73 @@ class Document(BaseModel, ABC):
|
|
|
104
477
|
|
|
105
478
|
@classmethod
|
|
106
479
|
def validate_file_name(cls, name: str) -> None:
|
|
107
|
-
"""
|
|
108
|
-
|
|
480
|
+
"""Validate that a file name matches allowed patterns.
|
|
481
|
+
|
|
482
|
+
@public
|
|
483
|
+
|
|
484
|
+
DO NOT OVERRIDE this method if you define a FILES enum!
|
|
485
|
+
The validation is automatic when FILES enum is present.
|
|
486
|
+
|
|
487
|
+
# CORRECT - FILES enum provides automatic validation:
|
|
488
|
+
class MyDocument(FlowDocument):
|
|
489
|
+
class FILES(StrEnum):
|
|
490
|
+
CONFIG = "config.yaml" # Validation happens automatically!
|
|
491
|
+
|
|
492
|
+
# WRONG - Unnecessary override:
|
|
493
|
+
class MyDocument(FlowDocument):
|
|
494
|
+
class FILES(StrEnum):
|
|
495
|
+
CONFIG = "config.yaml"
|
|
496
|
+
|
|
497
|
+
def validate_file_name(cls, name): # DON'T DO THIS!
|
|
498
|
+
pass # Validation already happens via FILES enum
|
|
499
|
+
|
|
500
|
+
Only override for custom validation logic BEYOND FILES enum constraints.
|
|
501
|
+
|
|
502
|
+
Args:
|
|
503
|
+
name: The file name to validate.
|
|
109
504
|
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
equals one of the enum values (exact string match).
|
|
113
|
-
- If `FILES` is None, do nothing.
|
|
505
|
+
Raises:
|
|
506
|
+
DocumentNameError: If the name doesn't match allowed patterns.
|
|
114
507
|
|
|
115
|
-
|
|
116
|
-
|
|
508
|
+
Note:
|
|
509
|
+
- If FILES enum is defined, name must exactly match one of the values
|
|
510
|
+
- If FILES is not defined, any name is allowed
|
|
511
|
+
- Override in subclasses ONLY for custom regex patterns or logic
|
|
117
512
|
"""
|
|
118
|
-
|
|
513
|
+
allowed = cls.get_expected_files()
|
|
514
|
+
if not allowed:
|
|
119
515
|
return
|
|
120
516
|
|
|
121
|
-
try:
|
|
122
|
-
allowed = {str(member.value) for member in cls.FILES} # type: ignore[arg-type]
|
|
123
|
-
except TypeError:
|
|
124
|
-
raise DocumentNameError(f"{cls.__name__}.FILES must be an Enum of string values")
|
|
125
|
-
|
|
126
517
|
if len(allowed) > 0 and name not in allowed:
|
|
127
518
|
allowed_str = ", ".join(sorted(allowed))
|
|
128
519
|
raise DocumentNameError(f"Invalid filename '{name}'. Allowed names: {allowed_str}")
|
|
129
520
|
|
|
130
521
|
@field_validator("name")
|
|
131
522
|
def validate_name(cls, v: str) -> str:
|
|
132
|
-
"""
|
|
523
|
+
r"""Pydantic validator for the document name field.
|
|
524
|
+
|
|
525
|
+
Ensures the document name is secure and follows conventions:
|
|
526
|
+
- No path traversal characters (.., \\, /)
|
|
527
|
+
- Cannot end with .description.md
|
|
528
|
+
- No leading/trailing whitespace
|
|
529
|
+
- Must match FILES enum if defined
|
|
530
|
+
|
|
531
|
+
Performance:
|
|
532
|
+
Validation is O(n) where n is the length of the name.
|
|
533
|
+
FILES enum check is O(m) where m is the number of allowed files
|
|
534
|
+
|
|
535
|
+
Args:
|
|
536
|
+
v: The name value to validate.
|
|
537
|
+
|
|
538
|
+
Returns:
|
|
539
|
+
The validated name.
|
|
540
|
+
|
|
541
|
+
Raises:
|
|
542
|
+
DocumentNameError: If the name violates any validation rules.
|
|
543
|
+
|
|
544
|
+
Note:
|
|
545
|
+
This is called automatically by Pydantic during model construction.
|
|
546
|
+
"""
|
|
133
547
|
if v.endswith(cls.DESCRIPTION_EXTENSION):
|
|
134
548
|
raise DocumentNameError(
|
|
135
549
|
f"Document names cannot end with {cls.DESCRIPTION_EXTENSION}: {v}"
|
|
@@ -145,9 +559,142 @@ class Document(BaseModel, ABC):
|
|
|
145
559
|
|
|
146
560
|
return v
|
|
147
561
|
|
|
148
|
-
@field_validator("content")
|
|
149
|
-
|
|
150
|
-
|
|
562
|
+
@field_validator("content", mode="before")
|
|
563
|
+
@classmethod
|
|
564
|
+
def validate_content(cls, v: Any, info: ValidationInfo) -> bytes:
|
|
565
|
+
"""Pydantic validator that converts various content types to bytes.
|
|
566
|
+
|
|
567
|
+
This validator is called automatically during model construction and
|
|
568
|
+
handles the intelligent type conversion that powers the `create` method.
|
|
569
|
+
It determines the appropriate serialization based on file extension.
|
|
570
|
+
|
|
571
|
+
Conversion Strategy:
|
|
572
|
+
1. bytes → Passthrough (no conversion)
|
|
573
|
+
2. str → UTF-8 encoding
|
|
574
|
+
3. dict/BaseModel + .json → JSON serialization (indented)
|
|
575
|
+
4. dict/BaseModel + .yaml/.yml → YAML serialization
|
|
576
|
+
5. list[str] + .md → Join with markdown separator
|
|
577
|
+
6. list[Any] + .json/.yaml → JSON/YAML array
|
|
578
|
+
7. int/float/bool + .json → JSON primitive
|
|
579
|
+
|
|
580
|
+
Args:
|
|
581
|
+
v: Content to validate (any supported type)
|
|
582
|
+
info: Validation context containing other field values
|
|
583
|
+
|
|
584
|
+
Returns:
|
|
585
|
+
Content converted to bytes
|
|
586
|
+
|
|
587
|
+
Raises:
|
|
588
|
+
DocumentSizeError: If content exceeds MAX_CONTENT_SIZE
|
|
589
|
+
ValueError: If content type unsupported for file extension
|
|
590
|
+
|
|
591
|
+
Note:
|
|
592
|
+
This validator enables create() to accept multiple types while
|
|
593
|
+
ensuring __init__ only receives bytes for type safety.
|
|
594
|
+
"""
|
|
595
|
+
# Get the name from validation context if available
|
|
596
|
+
name = ""
|
|
597
|
+
if hasattr(info, "data") and "name" in info.data:
|
|
598
|
+
name = info.data["name"]
|
|
599
|
+
name_lower = name.lower()
|
|
600
|
+
|
|
601
|
+
# Convert based on content type
|
|
602
|
+
if isinstance(v, bytes):
|
|
603
|
+
pass # Already bytes
|
|
604
|
+
elif isinstance(v, str):
|
|
605
|
+
v = v.encode("utf-8")
|
|
606
|
+
elif isinstance(v, dict):
|
|
607
|
+
# Serialize dict based on extension
|
|
608
|
+
if name_lower.endswith((".yaml", ".yml")):
|
|
609
|
+
# Use YAML format for YAML files
|
|
610
|
+
yaml = YAML()
|
|
611
|
+
stream = BytesIO()
|
|
612
|
+
yaml.dump(v, stream)
|
|
613
|
+
v = stream.getvalue()
|
|
614
|
+
elif name_lower.endswith(".json"):
|
|
615
|
+
# Use JSON for JSON files
|
|
616
|
+
v = json.dumps(v, indent=2).encode("utf-8")
|
|
617
|
+
else:
|
|
618
|
+
# Dict not supported for other file types
|
|
619
|
+
raise ValueError(f"Unsupported content type: {type(v)} for file {name}")
|
|
620
|
+
elif isinstance(v, list):
|
|
621
|
+
# Handle lists based on file extension
|
|
622
|
+
if name_lower.endswith(".md"):
|
|
623
|
+
# For markdown files, join with separator
|
|
624
|
+
if all(isinstance(item, str) for item in v):
|
|
625
|
+
v = cls.MARKDOWN_LIST_SEPARATOR.join(v).encode("utf-8")
|
|
626
|
+
else:
|
|
627
|
+
raise ValueError(
|
|
628
|
+
f"Unsupported content type: mixed-type list for markdown file {name}"
|
|
629
|
+
)
|
|
630
|
+
elif name_lower.endswith((".yaml", ".yml")):
|
|
631
|
+
# Check if it's a list of Pydantic models
|
|
632
|
+
if v and isinstance(v[0], BaseModel):
|
|
633
|
+
# Convert models to dicts first
|
|
634
|
+
v = [item.model_dump(mode="json") for item in v]
|
|
635
|
+
# Use YAML format for YAML files
|
|
636
|
+
yaml = YAML()
|
|
637
|
+
stream = BytesIO()
|
|
638
|
+
yaml.dump(v, stream)
|
|
639
|
+
v = stream.getvalue()
|
|
640
|
+
elif name_lower.endswith(".json"):
|
|
641
|
+
# Check if it's a list of Pydantic models
|
|
642
|
+
if v and isinstance(v[0], BaseModel):
|
|
643
|
+
# Convert models to dicts first
|
|
644
|
+
v = [item.model_dump(mode="json") for item in v]
|
|
645
|
+
# For JSON files, serialize as JSON
|
|
646
|
+
v = json.dumps(v, indent=2).encode("utf-8")
|
|
647
|
+
else:
|
|
648
|
+
# Check if it's a list of BaseModel
|
|
649
|
+
if v and isinstance(v[0], BaseModel):
|
|
650
|
+
raise ValueError("list[BaseModel] requires .json or .yaml extension")
|
|
651
|
+
# List content not supported for other file types
|
|
652
|
+
raise ValueError(f"Unsupported content type: {type(v)} for file {name}")
|
|
653
|
+
elif isinstance(v, BaseModel):
|
|
654
|
+
# Serialize Pydantic models
|
|
655
|
+
if name_lower.endswith((".yaml", ".yml")):
|
|
656
|
+
yaml = YAML()
|
|
657
|
+
stream = BytesIO()
|
|
658
|
+
yaml.dump(v.model_dump(mode="json"), stream)
|
|
659
|
+
v = stream.getvalue()
|
|
660
|
+
else:
|
|
661
|
+
v = json.dumps(v.model_dump(mode="json"), indent=2).encode("utf-8")
|
|
662
|
+
elif isinstance(v, (int, float, bool)):
|
|
663
|
+
# Numbers and booleans: JSON-serialize for .json, string for others
|
|
664
|
+
if name_lower.endswith(".json"):
|
|
665
|
+
v = json.dumps(v).encode("utf-8")
|
|
666
|
+
elif name_lower.endswith((".yaml", ".yml")):
|
|
667
|
+
v = str(v).encode("utf-8")
|
|
668
|
+
elif name_lower.endswith(".txt"):
|
|
669
|
+
v = str(v).encode("utf-8")
|
|
670
|
+
else:
|
|
671
|
+
# For other extensions, convert to string
|
|
672
|
+
v = str(v).encode("utf-8")
|
|
673
|
+
elif v is None:
|
|
674
|
+
# Handle None - only supported for JSON/YAML
|
|
675
|
+
if name_lower.endswith((".json", ".yaml", ".yml")):
|
|
676
|
+
if name_lower.endswith((".yaml", ".yml")):
|
|
677
|
+
v = b"null\n"
|
|
678
|
+
else:
|
|
679
|
+
v = b"null"
|
|
680
|
+
else:
|
|
681
|
+
raise ValueError(f"Unsupported content type: {type(None)} for file {name}")
|
|
682
|
+
else:
|
|
683
|
+
# Try to see if it has model_dump (duck typing for Pydantic-like)
|
|
684
|
+
if hasattr(v, "model_dump"):
|
|
685
|
+
if name_lower.endswith((".yaml", ".yml")):
|
|
686
|
+
yaml = YAML()
|
|
687
|
+
stream = BytesIO()
|
|
688
|
+
yaml.dump(v.model_dump(mode="json"), stream) # type: ignore[attr-defined]
|
|
689
|
+
v = stream.getvalue()
|
|
690
|
+
else:
|
|
691
|
+
v = json.dumps(v.model_dump(mode="json"), indent=2).encode("utf-8") # type: ignore[attr-defined]
|
|
692
|
+
else:
|
|
693
|
+
# Defensive check only: list values are already handled by the list branch above, so this is not expected to trigger
|
|
694
|
+
if name_lower.endswith(".txt") and isinstance(v, list):
|
|
695
|
+
raise ValueError("List content not supported for text files")
|
|
696
|
+
raise ValueError(f"Unsupported content type: {type(v)}")
|
|
697
|
+
|
|
151
698
|
# Check content size limit
|
|
152
699
|
max_size = getattr(cls, "MAX_CONTENT_SIZE", 100 * 1024 * 1024)
|
|
153
700
|
if len(v) > max_size:
|
|
@@ -159,148 +706,537 @@ class Document(BaseModel, ABC):
|
|
|
159
706
|
|
|
160
707
|
@field_serializer("content")
def serialize_content(self, v: bytes) -> str:
    """Render the ``content`` bytes as a string for Pydantic serialization.

    Registered through ``field_serializer("content")``, so Pydantic calls it
    whenever the model is serialized.

    Args:
        v: Raw document bytes.

    Returns:
        ``v`` decoded as UTF-8 when the bytes form valid UTF-8; otherwise
        the base64 encoding of ``v`` as an ASCII string.
    """
    try:
        rendered = v.decode("utf-8")
    except UnicodeDecodeError:
        # Not valid UTF-8, so treat it as binary and emit base64 instead.
        rendered = base64.b64encode(v).decode("ascii")
    return rendered
|
|
168
731
|
|
|
732
|
+
@final
@property
def id(self) -> str:
    """Short, content-derived identifier of the document.

    @public

    The value is a prefix of ``self.sha256``, so it depends on the content
    only: identical content yields an identical ID, and changing the name
    or description does not affect it. Per the project documentation this
    ID accompanies documents handed to LLMs via generate() or
    generate_structured(), so prompts can reference a document by name or ID.

    Returns:
        The first 6 characters of the base32-encoded SHA256 string
        (uppercase base32, not hex).

    Collision Rate:
        6 base32 characters carry 5 bits each, i.e. 30 bits in total.
        By the birthday bound, collisions become likely at roughly
        32K documents. Use ``sha256`` when stronger uniqueness is needed.
    """
    full_digest = self.sha256
    return full_digest[:6]
|
|
173
760
|
|
|
761
|
+
@final
@cached_property
def sha256(self) -> str:
    """Full SHA256 hash of the document content, base32-encoded.

    @public

    The digest of ``self.content`` is computed on first access and then
    cached on the instance by ``cached_property``. It depends on the
    content bytes only.

    Returns:
        A 56-character ASCII string: 52 uppercase base32 symbols
        (alphabet ``A-Z`` and ``2-7``) followed by the ``====`` padding
        that ``base64.b32encode`` appends for a 32-byte digest.

    Why Base32 Instead of Hex:
        - The alphabet is single-case, so the value survives
          case-insensitive file systems and AI interactions where casing
          might be altered.
        - Denser than hex: 5 bits per character instead of 4
          (52 significant characters versus 64 for SHA-256).
        - Unlike base64, it never mixes upper and lower case.

    Note:
        The trailing ``=`` padding is part of the returned value; strip it
        explicitly if the hash must be embedded where ``=`` is reserved.
    """
    digest = hashlib.sha256(self.content).digest()
    # b32encode already emits the uppercase RFC 4648 alphabet, so no
    # additional case normalisation is needed.
    return b32encode(digest).decode("ascii")
|
|
178
790
|
|
|
791
|
+
@final
@property
def size(self) -> int:
    """Length of the document content.

    @public

    Returns:
        Number of bytes in ``self.content``.
    """
    payload = self.content
    return len(payload)
|
|
183
806
|
|
|
184
807
|
@cached_property
def detected_mime_type(self) -> str:
    """MIME type detected for this document.

    Delegates to ``detect_mime_type`` with the content bytes and the file
    name; the result is cached on the instance after the first access.

    Detection strategy, as documented for the helper (not visible here):
        1. 'application/x-empty' for empty content
        2. Extension-based detection for known text formats (preferred)
        3. python-magic content analysis for unknown extensions
        4. Fallback to extension or 'application/octet-stream'

    Returns:
        MIME type string (e.g., "text/plain", "application/json").
    """
    raw_bytes, filename = self.content, self.name
    return detect_mime_type(raw_bytes, filename)
|
|
188
825
|
|
|
189
826
|
@property
def mime_type(self) -> str:
    """MIME type of the document.

    @public

    Primary accessor for MIME information; it simply exposes the cached
    ``detected_mime_type`` value.

    Returns:
        MIME type string (e.g., "text/plain", "application/json").
    """
    detected = self.detected_mime_type
    return detected
|
|
193
843
|
|
|
194
844
|
@property
def is_text(self) -> bool:
    """Whether the document holds text content.

    @public

    Returns:
        The result of ``is_text_mime_type`` applied to ``self.mime_type``.
        Per the project documentation this is True for text/*,
        application/json, application/x-yaml, text/yaml and similar types.

    Note:
        The ``text`` property refuses to decode when this is False.
    """
    current_type = self.mime_type
    return is_text_mime_type(current_type)
|
|
198
859
|
|
|
199
860
|
@property
def is_pdf(self) -> bool:
    """Whether the document is a PDF file.

    @public

    Returns:
        The result of ``is_pdf_mime_type`` applied to ``self.mime_type``
        (documented as True for application/pdf).
    """
    current_type = self.mime_type
    return is_pdf_mime_type(current_type)
|
|
203
874
|
|
|
204
875
|
@property
def is_image(self) -> bool:
    """Whether the document is an image file.

    @public

    Returns:
        The result of ``is_image_mime_type`` applied to ``self.mime_type``
        (documented as True when the MIME type starts with "image/").
    """
    current_type = self.mime_type
    return is_image_mime_type(current_type)
|
|
208
889
|
|
|
209
890
|
@classmethod
def canonical_name(cls) -> str:
    """Canonical name of this document class.

    Delegates to ``canonical_name_key`` with the class itself. Per the
    project documentation the result is a snake_case name derived from the
    class name, used for directory naming and identification.

    Returns:
        The canonical name string.

    Example:
        >>> class UserDataDocument(FlowDocument): ...
        >>> UserDataDocument.canonical_name()
        'user_data'
    """
    key = canonical_name_key(cls)
    return key
|
|
213
906
|
|
|
214
|
-
|
|
215
|
-
|
|
907
|
+
@property
def text(self) -> str:
    """Document content decoded as UTF-8 text.

    @public

    Only available for text documents; check ``is_text`` first.

    Returns:
        ``self.content`` decoded as UTF-8.

    Raises:
        ValueError: If ``is_text`` is False.

    Example:
        >>> doc = MyDocument.create(name="data.txt", content="Hello")
        >>> if doc.is_text:
        ...     print(doc.text)  # "Hello"

        >>> # Binary document raises error:
        >>> binary_doc = MyDocument(name="image.png", content=png_bytes)
        >>> binary_doc.text  # Raises ValueError
    """
    if self.is_text:
        return self.content.decode("utf-8")
    raise ValueError(f"Document is not text: {self.name}")
|
|
219
934
|
|
|
220
935
|
def as_yaml(self) -> Any:
    r"""Parse the document content as YAML.

    @public

    Builds a ``YAML()`` instance with default arguments and loads the
    document's ``text`` with it. The project documentation describes this
    loader as safe by default (no code execution).

    Returns:
        The object produced by the YAML loader: mapping, sequence, str,
        int, float, bool, or None.

    Raises:
        ValueError: If the document is not text-based (raised by ``text``).
        YAMLError: If the content is not valid YAML.

    Example:
        >>> doc = MyDocument(name="simple.yml", content=b"key: value\nitems:\n  - a\n  - b")
        >>> doc.as_yaml()  # {'key': 'value', 'items': ['a', 'b']}
    """
    loader = YAML()
    source_text = self.text
    return loader.load(source_text)  # type: ignore[no-untyped-call, no-any-return]
|
|
223
963
|
|
|
224
964
|
def as_json(self) -> Any:
    """Parse the document content as JSON.

    @public

    Feeds the document's ``text`` to ``json.loads``.

    Returns:
        Parsed JSON data: dict, list, str, int, float, bool, or None.

    Raises:
        ValueError: If the document is not text-based (raised by ``text``).
        JSONDecodeError: If the content is not valid JSON.

    Example:
        >>> doc = MyDocument(name="array.json", content=b'[1, 2, 3]')
        >>> doc.as_json()  # [1, 2, 3]

        >>> bad_doc = MyDocument(name="bad.json", content=b"not json")
        >>> bad_doc.as_json()  # Raises JSONDecodeError
    """
    source_text = self.text
    return json.loads(source_text)
|
|
993
|
+
|
|
994
|
+
@overload
def as_pydantic_model(self, model_type: type[TModel]) -> TModel: ...

@overload
def as_pydantic_model(self, model_type: type[list[TModel]]) -> list[TModel]: ...

def as_pydantic_model(
    self, model_type: type[TModel] | type[list[TModel]]
) -> TModel | list[TModel]:
    """Parse the content and validate it against a Pydantic model.

    @public

    The content is loaded with ``as_yaml`` when ``is_yaml_mime_type``
    accepts the document's MIME type and with ``as_json`` otherwise. The
    loaded data is then validated with ``model_validate``: once for a
    single model, or once per element for ``list[Model]``.

    Args:
        model_type: Either a model class (``Model``) or a parameterized
            list of one (``list[Model]``).

    Returns:
        A validated model instance, or a list of validated instances.

    Raises:
        ValueError: If the document is not text, or ``list[Model]`` was
            requested but the parsed data is not a list.
        ValidationError: If the data does not match the model schema.
        JSONDecodeError/YAMLError: If content parsing fails.

    Example:
        >>> class User(BaseModel):
        ...     name: str
        ...     age: int
        >>> doc = MyDocument.create(name="user.json",
        ...     content={"name": "Alice", "age": 30})
        >>> doc.as_pydantic_model(User).name  # "Alice"
        >>> doc2 = MyDocument.create(name="users.json",
        ...     content=[{"name": "Bob", "age": 25}, {"name": "Eve", "age": 28}])
        >>> len(doc2.as_pydantic_model(list[User]))  # 2
    """
    if is_yaml_mime_type(self.mime_type):
        payload = self.as_yaml()
    else:
        payload = self.as_json()

    if get_origin(model_type) is not list:
        # Plain model class: validate the whole payload in one go.
        model_cls = cast(type[TModel], model_type)
        return model_cls.model_validate(payload)

    # list[Model]: the payload must itself be a list of items to validate.
    if not isinstance(payload, list):
        raise ValueError(f"Expected list data for {model_type}, got {type(payload)}")
    element_type = get_args(model_type)[0]
    validated = [element_type.model_validate(entry) for entry in payload]  # type: ignore[attr-defined]
    return cast(list[TModel], validated)
|
|
232
1057
|
|
|
233
1058
|
def as_markdown_list(self) -> list[str]:
    r"""Split the document text into markdown sections.

    @public

    The text is split on ``self.MARKDOWN_LIST_SEPARATOR`` (documented as
    "\n\n---\n\n"). Nothing is stripped, so whitespace inside each section
    is preserved.

    Returns:
        List of section strings.

    Raises:
        ValueError: If the document is not text-based (raised by ``text``).

    Example:
        >>> content = "Part 1\n\n---\n\nPart 2\n\n---\n\nPart 3"
        >>> doc = MyDocument(name="parts.md", content=content.encode())
        >>> doc.as_markdown_list()  # ['Part 1', 'Part 2', 'Part 3']
    """
    body = self.text
    return body.split(self.MARKDOWN_LIST_SEPARATOR)
|
|
1084
|
+
|
|
1085
|
+
def parse(self, type_: type[Any]) -> Any:
|
|
1086
|
+
r"""Parse document content to original type (reverses create conversion).
|
|
296
1087
|
|
|
297
|
-
|
|
298
|
-
yaml.dump(data, stream)
|
|
299
|
-
content = stream.getvalue()
|
|
300
|
-
return cls.create(name, description, content)
|
|
1088
|
+
@public
|
|
301
1089
|
|
|
1090
|
+
This method reverses the automatic conversion performed by the `create`
|
|
1091
|
+
classmethod. It intelligently parses the bytes content based on the
|
|
1092
|
+
document's file extension and converts to the requested type.
|
|
1093
|
+
|
|
1094
|
+
Designed for roundtrip conversion:
|
|
1095
|
+
>>> original = {"key": "value"}
|
|
1096
|
+
>>> doc = MyDocument.create(name="data.json", content=original)
|
|
1097
|
+
>>> restored = doc.parse(dict)
|
|
1098
|
+
>>> assert restored == original # True
|
|
1099
|
+
|
|
1100
|
+
Args:
|
|
1101
|
+
type_: Target type to parse content into. Supported types:
|
|
1102
|
+
- bytes: Returns raw content (no conversion)
|
|
1103
|
+
- str: Decodes UTF-8 text
|
|
1104
|
+
- dict: Parses JSON (.json) or YAML (.yaml/.yml)
|
|
1105
|
+
- list: Splits markdown (.md) or parses JSON/YAML
|
|
1106
|
+
- BaseModel subclasses: Validates JSON/YAML into model
|
|
1107
|
+
|
|
1108
|
+
Returns:
|
|
1109
|
+
Content parsed to the requested type.
|
|
1110
|
+
|
|
1111
|
+
Raises:
|
|
1112
|
+
ValueError: If type is unsupported or parsing fails.
|
|
1113
|
+
|
|
1114
|
+
Extension Rules:
|
|
1115
|
+
- .json → JSON parsing for dict/list/BaseModel
|
|
1116
|
+
- .yaml/.yml → YAML parsing for dict/list/BaseModel
|
|
1117
|
+
- .md + list → Split by markdown separator
|
|
1118
|
+
- Any + str → UTF-8 decode
|
|
1119
|
+
- Any + bytes → Raw content
|
|
1120
|
+
|
|
1121
|
+
Example:
|
|
1122
|
+
>>> # String content
|
|
1123
|
+
>>> doc = MyDocument(name="test.txt", content=b"Hello")
|
|
1124
|
+
>>> doc.parse(str)
|
|
1125
|
+
'Hello'
|
|
1126
|
+
|
|
1127
|
+
>>> # JSON content
|
|
1128
|
+
>>> doc = MyDocument.create(name="data.json", content={"key": "value"})
|
|
1129
|
+
>>> doc.parse(dict) # Returns {'key': 'value'}
|
|
1130
|
+
|
|
1131
|
+
>>> # Markdown list
|
|
1132
|
+
>>> items = ["Item 1", "Item 2"]
|
|
1133
|
+
>>> content = "\n\n---\n\n".join(items).encode()
|
|
1134
|
+
>>> doc = MyDocument(name="list.md", content=content)
|
|
1135
|
+
>>> doc.parse(list)
|
|
1136
|
+
['Item 1', 'Item 2']
|
|
1137
|
+
"""
|
|
1138
|
+
# Handle basic types
|
|
1139
|
+
if type_ is bytes:
|
|
1140
|
+
return self.content
|
|
1141
|
+
elif type_ is str:
|
|
1142
|
+
# Handle empty content specially
|
|
1143
|
+
if len(self.content) == 0:
|
|
1144
|
+
return ""
|
|
1145
|
+
return self.text
|
|
1146
|
+
|
|
1147
|
+
# Handle structured data based on extension
|
|
1148
|
+
name_lower = self.name.lower()
|
|
1149
|
+
|
|
1150
|
+
# JSON files
|
|
1151
|
+
if name_lower.endswith(".json"):
|
|
1152
|
+
if type_ is dict or type_ is list:
|
|
1153
|
+
result = self.as_json()
|
|
1154
|
+
# Ensure the result is the correct type
|
|
1155
|
+
if type_ is dict and not isinstance(result, dict):
|
|
1156
|
+
raise ValueError(f"Expected dict but got {type(result).__name__}")
|
|
1157
|
+
if type_ is list and not isinstance(result, list):
|
|
1158
|
+
raise ValueError(f"Expected list but got {type(result).__name__}")
|
|
1159
|
+
return result
|
|
1160
|
+
elif issubclass(type_, BaseModel):
|
|
1161
|
+
return self.as_pydantic_model(type_)
|
|
1162
|
+
else:
|
|
1163
|
+
raise ValueError(f"Cannot parse JSON file to type {type_}")
|
|
1164
|
+
|
|
1165
|
+
# YAML files
|
|
1166
|
+
elif name_lower.endswith((".yaml", ".yml")):
|
|
1167
|
+
if type_ is dict or type_ is list:
|
|
1168
|
+
result = self.as_yaml()
|
|
1169
|
+
# Ensure the result is the correct type
|
|
1170
|
+
if type_ is dict and not isinstance(result, dict):
|
|
1171
|
+
raise ValueError(f"Expected dict but got {type(result).__name__}")
|
|
1172
|
+
if type_ is list and not isinstance(result, list):
|
|
1173
|
+
raise ValueError(f"Expected list but got {type(result).__name__}")
|
|
1174
|
+
return result
|
|
1175
|
+
elif issubclass(type_, BaseModel):
|
|
1176
|
+
return self.as_pydantic_model(type_)
|
|
1177
|
+
else:
|
|
1178
|
+
raise ValueError(f"Cannot parse YAML file to type {type_}")
|
|
1179
|
+
|
|
1180
|
+
# Markdown files with lists
|
|
1181
|
+
elif name_lower.endswith(".md") and type_ is list:
|
|
1182
|
+
return self.as_markdown_list()
|
|
1183
|
+
|
|
1184
|
+
# Default: try to return as requested basic type
|
|
1185
|
+
elif type_ is dict or type_ is list:
|
|
1186
|
+
# Try JSON first, then YAML
|
|
1187
|
+
try:
|
|
1188
|
+
result = self.as_json()
|
|
1189
|
+
# Ensure the result is the correct type
|
|
1190
|
+
if type_ is dict and not isinstance(result, dict):
|
|
1191
|
+
raise ValueError(f"Expected dict but got {type(result).__name__}")
|
|
1192
|
+
if type_ is list and not isinstance(result, list):
|
|
1193
|
+
raise ValueError(f"Expected list but got {type(result).__name__}")
|
|
1194
|
+
return result
|
|
1195
|
+
except (json.JSONDecodeError, ValueError):
|
|
1196
|
+
try:
|
|
1197
|
+
result = self.as_yaml()
|
|
1198
|
+
# Ensure the result is the correct type
|
|
1199
|
+
if type_ is dict and not isinstance(result, dict):
|
|
1200
|
+
raise ValueError(f"Expected dict but got {type(result).__name__}")
|
|
1201
|
+
if type_ is list and not isinstance(result, list):
|
|
1202
|
+
raise ValueError(f"Expected list but got {type(result).__name__}")
|
|
1203
|
+
return result
|
|
1204
|
+
except Exception as e:
|
|
1205
|
+
raise ValueError(f"Cannot parse content to {type_}") from e
|
|
1206
|
+
|
|
1207
|
+
raise ValueError(f"Unsupported type {type_} for file {self.name}")
|
|
1208
|
+
|
|
1209
|
+
@final
|
|
302
1210
|
def serialize_model(self) -> dict[str, Any]:
|
|
303
|
-
"""Serialize document to
|
|
1211
|
+
"""Serialize document to dictionary for storage or transmission.
|
|
1212
|
+
|
|
1213
|
+
Creates a complete JSON-serializable representation of the document
|
|
1214
|
+
with all metadata and properly encoded content. Automatically chooses
|
|
1215
|
+
the most appropriate encoding (UTF-8 for text, base64 for binary).
|
|
1216
|
+
|
|
1217
|
+
Returns:
|
|
1218
|
+
Dictionary with the following keys:
|
|
1219
|
+
- name: Document filename (str)
|
|
1220
|
+
- description: Optional description (str | None)
|
|
1221
|
+
- base_type: Persistence type - "flow", "task", or "temporary" (str)
|
|
1222
|
+
- size: Content size in bytes (int)
|
|
1223
|
+
- id: Short hash identifier, first 6 chars of SHA256 (str)
|
|
1224
|
+
- sha256: Full SHA256 hash in base32 encoding (str)
|
|
1225
|
+
- mime_type: Detected MIME type (str)
|
|
1226
|
+
- content: Encoded content (str)
|
|
1227
|
+
- content_encoding: Either "utf-8" or "base64" (str)
|
|
1228
|
+
|
|
1229
|
+
Encoding Strategy:
|
|
1230
|
+
- Text files (text/*, application/json, etc.) → UTF-8 string
|
|
1231
|
+
- Binary files (images, PDFs, etc.) → Base64 string
|
|
1232
|
+
- Invalid UTF-8 in text files → UTF-8 with replacement chars
|
|
1233
|
+
|
|
1234
|
+
Example:
|
|
1235
|
+
>>> doc = MyDocument.create(name="data.json", content={"key": "value"})
|
|
1236
|
+
>>> serialized = doc.serialize_model()
|
|
1237
|
+
>>> serialized["content_encoding"] # "utf-8"
|
|
1238
|
+
>>> serialized["mime_type"] # "application/json"
|
|
1239
|
+
"""
|
|
304
1240
|
result = {
|
|
305
1241
|
"name": self.name,
|
|
306
1242
|
"description": self.description,
|
|
@@ -327,19 +1263,59 @@ class Document(BaseModel, ABC):
|
|
|
327
1263
|
|
|
328
1264
|
return result
|
|
329
1265
|
|
|
1266
|
+
@final
|
|
330
1267
|
@classmethod
|
|
331
1268
|
def from_dict(cls, data: dict[str, Any]) -> Self:
|
|
332
|
-
"""Deserialize document from dictionary.
|
|
1269
|
+
r"""Deserialize document from dictionary (inverse of serialize_model).
|
|
1270
|
+
|
|
1271
|
+
Reconstructs a Document instance from the dictionary format produced
|
|
1272
|
+
by serialize_model(). Automatically handles content decoding based on
|
|
1273
|
+
the content_encoding field.
|
|
1274
|
+
|
|
1275
|
+
Args:
|
|
1276
|
+
data: Dictionary containing serialized document. Required keys:
|
|
1277
|
+
- name: Document filename (str)
|
|
1278
|
+
- content: Encoded content (str or bytes)
|
|
1279
|
+
Optional keys:
|
|
1280
|
+
- description: Document description (str | None)
|
|
1281
|
+
- content_encoding: "utf-8" or "base64" (defaults to "utf-8")
|
|
1282
|
+
|
|
1283
|
+
Returns:
|
|
1284
|
+
New Document instance with restored content.
|
|
1285
|
+
|
|
1286
|
+
Raises:
|
|
1287
|
+
ValueError: If content type is invalid or base64 decoding fails
|
|
1288
|
+
KeyError: If required keys are missing from data dictionary
|
|
1289
|
+
|
|
1290
|
+
Note:
|
|
1291
|
+
Provides roundtrip guarantee with serialize_model().
|
|
1292
|
+
Content and name are preserved exactly.
|
|
1293
|
+
|
|
1294
|
+
Example:
|
|
1295
|
+
>>> data = {
|
|
1296
|
+
... "name": "config.yaml",
|
|
1297
|
+
... "content": "key: value\n",
|
|
1298
|
+
... "content_encoding": "utf-8",
|
|
1299
|
+
... "description": "Config file"
|
|
1300
|
+
... }
|
|
1301
|
+
>>> doc = MyDocument.from_dict(data)
|
|
1302
|
+
"""
|
|
333
1303
|
# Extract content and encoding
|
|
334
|
-
|
|
1304
|
+
content_raw = data.get("content", "")
|
|
335
1305
|
content_encoding = data.get("content_encoding", "utf-8")
|
|
336
1306
|
|
|
337
1307
|
# Decode content based on encoding
|
|
1308
|
+
content: bytes
|
|
338
1309
|
if content_encoding == "base64":
|
|
339
|
-
content
|
|
340
|
-
|
|
1310
|
+
assert isinstance(content_raw, str), "base64 content must be string"
|
|
1311
|
+
content = base64.b64decode(content_raw)
|
|
1312
|
+
elif isinstance(content_raw, str):
|
|
341
1313
|
# Default to UTF-8
|
|
342
|
-
content =
|
|
1314
|
+
content = content_raw.encode("utf-8")
|
|
1315
|
+
elif isinstance(content_raw, bytes):
|
|
1316
|
+
content = content_raw
|
|
1317
|
+
else:
|
|
1318
|
+
raise ValueError(f"Invalid content type: {type(content_raw)}")
|
|
343
1319
|
|
|
344
1320
|
# Create document with the required fields
|
|
345
1321
|
return cls(
|