ai-pipeline-core 0.1.8__py3-none-any.whl → 0.1.11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36):
  1. ai_pipeline_core/__init__.py +86 -4
  2. ai_pipeline_core/documents/__init__.py +11 -0
  3. ai_pipeline_core/documents/document.py +1107 -131
  4. ai_pipeline_core/documents/document_list.py +147 -38
  5. ai_pipeline_core/documents/flow_document.py +112 -11
  6. ai_pipeline_core/documents/mime_type.py +173 -15
  7. ai_pipeline_core/documents/task_document.py +117 -12
  8. ai_pipeline_core/documents/temporary_document.py +95 -0
  9. ai_pipeline_core/documents/utils.py +41 -9
  10. ai_pipeline_core/exceptions.py +47 -11
  11. ai_pipeline_core/flow/__init__.py +2 -0
  12. ai_pipeline_core/flow/config.py +250 -23
  13. ai_pipeline_core/flow/options.py +50 -1
  14. ai_pipeline_core/llm/__init__.py +6 -0
  15. ai_pipeline_core/llm/ai_messages.py +125 -27
  16. ai_pipeline_core/llm/client.py +278 -26
  17. ai_pipeline_core/llm/model_options.py +130 -1
  18. ai_pipeline_core/llm/model_response.py +239 -35
  19. ai_pipeline_core/llm/model_types.py +67 -0
  20. ai_pipeline_core/logging/__init__.py +13 -0
  21. ai_pipeline_core/logging/logging_config.py +72 -20
  22. ai_pipeline_core/logging/logging_mixin.py +38 -32
  23. ai_pipeline_core/pipeline.py +308 -60
  24. ai_pipeline_core/prefect.py +48 -1
  25. ai_pipeline_core/prompt_manager.py +215 -24
  26. ai_pipeline_core/settings.py +108 -4
  27. ai_pipeline_core/simple_runner/__init__.py +5 -0
  28. ai_pipeline_core/simple_runner/cli.py +145 -17
  29. ai_pipeline_core/simple_runner/simple_runner.py +244 -6
  30. ai_pipeline_core/tracing.py +232 -30
  31. ai_pipeline_core-0.1.11.dist-info/METADATA +450 -0
  32. ai_pipeline_core-0.1.11.dist-info/RECORD +36 -0
  33. ai_pipeline_core-0.1.8.dist-info/METADATA +0 -558
  34. ai_pipeline_core-0.1.8.dist-info/RECORD +0 -35
  35. {ai_pipeline_core-0.1.8.dist-info → ai_pipeline_core-0.1.11.dist-info}/WHEEL +0 -0
  36. {ai_pipeline_core-0.1.8.dist-info → ai_pipeline_core-0.1.11.dist-info}/licenses/LICENSE +0 -0
@@ -1,14 +1,39 @@
1
+ """Document abstraction layer for AI pipeline flows.
2
+
3
+ @public
4
+
5
+ This module provides the core document abstraction for working with various types of data
6
+ in AI pipelines. Documents are immutable Pydantic models that wrap binary content with metadata.
7
+ """
8
+
1
9
  import base64
2
10
  import hashlib
3
11
  import json
4
- import re
5
12
  from abc import ABC, abstractmethod
6
13
  from base64 import b32encode
7
14
  from enum import StrEnum
8
15
  from functools import cached_property
9
- from typing import Any, ClassVar, Literal, Self, TypeVar
16
+ from io import BytesIO
17
+ from typing import (
18
+ Any,
19
+ ClassVar,
20
+ Literal,
21
+ Self,
22
+ TypeVar,
23
+ cast,
24
+ final,
25
+ get_args,
26
+ get_origin,
27
+ overload,
28
+ )
10
29
 
11
- from pydantic import BaseModel, ConfigDict, field_serializer, field_validator
30
+ from pydantic import (
31
+ BaseModel,
32
+ ConfigDict,
33
+ ValidationInfo,
34
+ field_serializer,
35
+ field_validator,
36
+ )
12
37
  from ruamel.yaml import YAML
13
38
 
14
39
  from ai_pipeline_core.documents.utils import canonical_name_key
@@ -26,19 +51,133 @@ TModel = TypeVar("TModel", bound=BaseModel)
26
51
 
27
52
 
28
53
  class Document(BaseModel, ABC):
29
- """Abstract base class for all documents.
54
+ r"""Abstract base class for all documents in the AI Pipeline Core system.
55
+
56
+ @public
57
+
58
+ Document is the fundamental data abstraction for all content flowing through
59
+ pipelines. It provides automatic encoding, MIME type detection, serialization,
60
+ and validation. All documents must be subclassed from FlowDocument or TaskDocument
61
+ based on their persistence requirements. TemporaryDocument is a special concrete
62
+ class that can be instantiated directly (not abstract).
63
+
64
+ VALIDATION IS AUTOMATIC - Do not add manual validation!
65
+ Size validation, name validation, and MIME type detection are built-in.
66
+ The framework handles all standard validations internally.
67
+
68
+ # WRONG - These checks already happen automatically:
69
+ if document.size > document.MAX_CONTENT_SIZE:
70
+ raise DocumentSizeError(...) # NO! Already handled
71
+ document.validate_file_name(document.name) # NO! Automatic
72
+
73
+ Best Practices:
74
+ - Use create() classmethod for automatic type conversion (90% of cases)
75
+ - Omit description parameter unless truly needed for metadata
76
+ - When using LLM functions, pass AIMessages or str. Wrap any Document values
77
+ in AIMessages([...]). Do not call .text yourself
78
+
79
+ Standard Usage:
80
+ >>> # CORRECT - minimal parameters
81
+ >>> doc = MyDocument.create(name="data.json", content={"key": "value"})
82
+
83
+ >>> # AVOID - unnecessary description
84
+ >>> doc = MyDocument.create(
85
+ ... name="data.json",
86
+ ... content={"key": "value"},
87
+ ... description="This is data" # Usually not needed!
88
+ ... )
89
+
90
+ Key features:
91
+ - Immutable by default (frozen Pydantic model)
92
+ - Automatic MIME type detection
93
+ - Content size validation
94
+ - SHA256 hashing for deduplication
95
+ - Support for text, JSON, YAML, PDF, and image formats
96
+ - Conversion utilities between different formats
97
+
98
+ Class Variables:
99
+ MAX_CONTENT_SIZE: Maximum allowed content size in bytes (default 25MB)
100
+
101
+ Attributes:
102
+ name: Document filename (validated for security)
103
+ description: Optional human-readable description
104
+ content: Raw document content as bytes
105
+
106
+ Creating Documents:
107
+ **Use the `create` classmethod** for most use cases. It accepts various
108
+ content types (str, dict, list, BaseModel) and converts them automatically.
109
+ Only use __init__ directly when you already have bytes content.
30
110
 
31
- Warning: Document subclasses should NOT start with 'Test' prefix as this
32
- causes conflicts with pytest test discovery. Classes with 'Test' prefix
33
- will be rejected at definition time.
111
+ >>> # RECOMMENDED: Use create for automatic conversion
112
+ >>> doc = MyDocument.create(name="data.json", content={"key": "value"})
113
+ >>>
114
+ >>> # Direct constructor: Only for bytes
115
+ >>> doc = MyDocument(name="data.bin", content=b"\x00\x01\x02")
116
+
117
+ Warning:
118
+ - Document subclasses should NOT start with 'Test' prefix (pytest conflict)
119
+ - Cannot instantiate Document directly - must subclass FlowDocument or TaskDocument
120
+ - Cannot add custom fields - only name, description, content are allowed
121
+ - Document is an abstract class and cannot be instantiated directly
122
+
123
+ Metadata Attachment Patterns:
124
+ Since custom fields are not allowed, use these patterns for metadata:
125
+ 1. Use the 'description' field for human-readable metadata
126
+ 2. Embed metadata in content (e.g., JSON with data + metadata fields)
127
+ 3. Create a separate MetadataDocument type to accompany data documents
128
+ 4. Use document naming conventions (e.g., "data_v2_2024.json")
129
+ 5. Store metadata in flow_options or pass through TraceInfo
130
+
131
+ Example:
132
+ >>> from enum import StrEnum
133
+ >>>
134
+ >>> # Simple document:
135
+ >>> class MyDocument(FlowDocument):
136
+ ... pass
137
+ >>>
138
+ >>> # Document with file restrictions:
139
+ >>> class ConfigDocument(FlowDocument):
140
+ ... class FILES(StrEnum):
141
+ ... CONFIG = "config.yaml"
142
+ ... SETTINGS = "settings.json"
143
+ >>>
144
+ >>> # RECOMMENDED: Use create for automatic conversion
145
+ >>> doc = MyDocument.create(name="data.json", content={"key": "value"})
146
+ >>> print(doc.is_text) # True
147
+ >>> data = doc.as_json() # {'key': 'value'}
148
+ """
149
+
150
+ MAX_CONTENT_SIZE: ClassVar[int] = 25 * 1024 * 1024
151
+ """Maximum allowed content size in bytes (default 25MB).
152
+
153
+ @public
34
154
  """
35
155
 
36
- MAX_CONTENT_SIZE: ClassVar[int] = 25 * 1024 * 1024 # 25MB default
37
156
  DESCRIPTION_EXTENSION: ClassVar[str] = ".description.md"
157
+ """File extension for description files."""
158
+
38
159
  MARKDOWN_LIST_SEPARATOR: ClassVar[str] = "\n\n---\n\n"
160
+ """Separator for markdown list items."""
39
161
 
40
162
  def __init_subclass__(cls, **kwargs: Any) -> None:
41
- """Validate subclass names to prevent pytest conflicts."""
163
+ """Validate subclass configuration at definition time.
164
+
165
+ Performs several validation checks when a Document subclass is defined:
166
+ 1. Prevents class names starting with 'Test' (pytest conflict)
167
+ 2. Validates FILES enum if present (must be StrEnum)
168
+ 3. Prevents adding custom fields beyond name, description, content
169
+
170
+ Args:
171
+ **kwargs: Additional keyword arguments passed to parent __init_subclass__.
172
+
173
+ Raises:
174
+ TypeError: If subclass violates naming rules, FILES enum requirements,
175
+ or attempts to add extra fields.
176
+
177
+ Note:
178
+ This validation happens at class definition time, not instantiation,
179
+ providing early error detection during development.
180
+ """
42
181
  super().__init_subclass__(**kwargs)
43
182
  if cls.__name__.startswith("Test"):
44
183
  raise TypeError(
@@ -46,56 +185,290 @@ class Document(BaseModel, ABC):
46
185
  "This causes conflicts with pytest test discovery. "
47
186
  "Please use a different name (e.g., 'SampleDocument', 'ExampleDocument')."
48
187
  )
188
+ if hasattr(cls, "FILES"):
189
+ files = getattr(cls, "FILES")
190
+ if not issubclass(files, StrEnum):
191
+ raise TypeError(
192
+ f"Document subclass '{cls.__name__}'.FILES must be an Enum of string values"
193
+ )
194
+ # Check that the Document's model_fields only contain the allowed fields
195
+ # It prevents AI models from adding additional fields to documents
196
+ allowed = {"name", "description", "content"}
197
+ current = set(getattr(cls, "model_fields", {}).keys())
198
+ extras = current - allowed
199
+ if extras:
200
+ raise TypeError(
201
+ f"Document subclass '{cls.__name__}' cannot declare additional fields: "
202
+ f"{', '.join(sorted(extras))}. Only {', '.join(sorted(allowed))} are allowed."
203
+ )
204
+
205
+ @overload
206
+ @classmethod
207
+ def create(cls, *, name: str, content: bytes, description: str | None = None) -> Self: ...
208
+
209
+ @overload
210
+ @classmethod
211
+ def create(cls, *, name: str, content: str, description: str | None = None) -> Self: ...
212
+
213
+ @overload
214
+ @classmethod
215
+ def create(
216
+ cls, *, name: str, content: dict[str, Any], description: str | None = None
217
+ ) -> Self: ...
218
+
219
+ @overload
220
+ @classmethod
221
+ def create(cls, *, name: str, content: list[Any], description: str | None = None) -> Self: ...
222
+
223
+ @overload
224
+ @classmethod
225
+ def create(cls, *, name: str, content: BaseModel, description: str | None = None) -> Self: ...
226
+
227
+ @classmethod
228
+ def create(
229
+ cls,
230
+ *,
231
+ name: str,
232
+ content: str | bytes | dict[str, Any] | list[Any] | BaseModel,
233
+ description: str | None = None,
234
+ ) -> Self:
235
+ r"""Create a Document with automatic content type conversion (recommended).
236
+
237
+ @public
238
+
239
+ This is the **recommended way to create documents**. It accepts various
240
+ content types and automatically converts them to bytes based on the file
241
+ extension. Use the `parse` method to reverse this conversion.
242
+
243
+ Best Practice (90% of cases):
244
+ Only provide name and content. The description parameter is RARELY needed.
245
+
246
+ Args:
247
+ name: Document filename (required, keyword-only).
248
+ Extension determines serialization:
249
+ - .json → JSON serialization
250
+ - .yaml/.yml → YAML serialization
251
+ - .md → Markdown list joining (for list[str])
252
+ - Others → UTF-8 encoding (for str)
253
+ content: Document content in various formats (required, keyword-only):
254
+ - bytes: Used directly without conversion
255
+ - str: Encoded to UTF-8 bytes
256
+ - dict[str, Any]: Serialized to JSON (.json) or YAML (.yaml/.yml)
257
+ - list[str]: Joined with separator for .md, else JSON/YAML
258
+ - list[BaseModel]: Serialized to JSON or YAML based on extension
259
+ - BaseModel: Serialized to JSON or YAML based on extension
260
+ description: Optional description - USUALLY OMIT THIS (defaults to None).
261
+ Only use when meaningful metadata helps downstream processing
262
+
263
+ Returns:
264
+ New Document instance with content converted to bytes
265
+
266
+ Raises:
267
+ ValueError: If content type is not supported for the file extension
268
+ DocumentNameError: If filename violates validation rules
269
+ DocumentSizeError: If content exceeds MAX_CONTENT_SIZE
270
+
271
+ Note:
272
+ All conversions are reversible using the `parse` method.
273
+ For example: MyDocument.create(name="data.json", content={"key": "value"}).parse(dict)
274
+ returns the original dictionary {"key": "value"}.
275
+
276
+ Example:
277
+ >>> # CORRECT - no description needed (90% of cases)
278
+ >>> doc = MyDocument.create(name="test.txt", content="Hello World")
279
+ >>> doc.content # b'Hello World'
280
+ >>> doc.parse(str) # "Hello World"
281
+
282
+ >>> # CORRECT - Dictionary to JSON, no description
283
+ >>> doc = MyDocument.create(name="config.json", content={"key": "value"})
284
+ >>> doc.content # b'{"key": "value", ...}'
285
+ >>> doc.parse(dict) # {"key": "value"}
286
+
287
+ >>> # AVOID unless description adds real value
288
+ >>> doc = MyDocument.create(
289
+ ... name="config.json",
290
+ ... content={"key": "value"},
291
+ ... description="Config file" # Usually redundant!
292
+ ... )
293
+
294
+ >>> # Pydantic model to YAML
295
+ >>> from pydantic import BaseModel
296
+ >>> class Config(BaseModel):
297
+ ... host: str
298
+ ... port: int
299
+ >>> config = Config(host="localhost", port=8080)
300
+ >>> doc = MyDocument.create(name="config.yaml", content=config)
301
+ >>> doc.parse(Config) # Returns Config instance
302
+
303
+ >>> # List to Markdown
304
+ >>> items = ["Section 1", "Section 2"]
305
+ >>> doc = MyDocument.create(name="sections.md", content=items)
306
+ >>> doc.parse(list) # ["Section 1", "Section 2"]
307
+ """
308
+ # Use model_validate to leverage the existing validator logic
309
+ temp = cls.model_validate({"name": name, "content": content, "description": description})
310
+ # Now construct with type-checker-friendly call (bytes only)
311
+ return cls(name=temp.name, content=temp.content, description=temp.description)
312
+
313
+ def __init__(
314
+ self,
315
+ *,
316
+ name: str,
317
+ content: bytes,
318
+ description: str | None = None,
319
+ ) -> None:
320
+ """Initialize a Document instance with raw bytes content.
321
+
322
+ @public
49
323
 
50
- def __init__(self, **data: Any) -> None:
51
- """Prevent direct instantiation of abstract Document class."""
324
+ Important:
325
+ **Most users should use the `create` classmethod instead of __init__.**
326
+ The create method provides automatic content conversion for various types
327
+ (str, dict, list, Pydantic models) while __init__ only accepts bytes.
328
+
329
+ This constructor accepts only bytes content for type safety. It prevents
330
+ direct instantiation of the abstract Document class.
331
+
332
+ Args:
333
+ name: Document filename (required, keyword-only)
334
+ content: Document content as raw bytes (required, keyword-only)
335
+ description: Optional human-readable description (keyword-only)
336
+
337
+ Raises:
338
+ TypeError: If attempting to instantiate Document directly.
339
+
340
+ Example:
341
+ >>> # Direct constructor - only for bytes content:
342
+ >>> doc = MyDocument(name="test.txt", content=b"Hello World")
343
+ >>> doc.content # b'Hello World'
344
+
345
+ >>> # RECOMMENDED: Use create for automatic conversion:
346
+ >>> doc = MyDocument.create(name="text.txt", content="Hello World")
347
+ >>> doc = MyDocument.create(name="data.json", content={"key": "value"})
348
+ >>> doc = MyDocument.create(name="config.yaml", content=my_model)
349
+ >>> doc = MyDocument.create(name="items.md", content=["item1", "item2"])
350
+
351
+ See Also:
352
+ create: Recommended factory method with automatic type conversion
353
+ parse: Method to reverse the conversion done by create
354
+ """
52
355
  if type(self) is Document:
53
356
  raise TypeError("Cannot instantiate abstract Document class directly")
54
- super().__init__(**data)
55
357
 
56
- # Optional enum of allowed file names. Subclasses may set this.
57
- # This is used to validate the document name.
58
- FILES: ClassVar[type[StrEnum] | None] = None
358
+ super().__init__(name=name, content=content, description=description)
59
359
 
60
360
  name: str
61
361
  description: str | None = None
62
- content: bytes
362
+ content: bytes # Note: constructor accepts str | bytes, but field stores bytes only
63
363
 
64
364
  # Pydantic configuration
65
365
  model_config = ConfigDict(
66
- frozen=True, # Make documents immutable
366
+ frozen=True,
67
367
  arbitrary_types_allowed=True,
368
+ extra="forbid",
68
369
  )
69
370
 
70
371
  @abstractmethod
71
- def get_base_type(self) -> Literal["flow", "task"]:
72
- """Get the type of the document - must be implemented by subclasses"""
372
+ def get_base_type(self) -> Literal["flow", "task", "temporary"]:
373
+ """Get the base type of the document.
374
+
375
+ Abstract method that must be implemented by all Document subclasses
376
+ to indicate their persistence behavior.
377
+
378
+ Returns:
379
+ One of "flow" (persisted across flow runs), "task" (temporary
380
+ within task execution), or "temporary" (never persisted).
381
+
382
+ Note:
383
+ This method determines document persistence and lifecycle.
384
+ FlowDocument returns "flow", TaskDocument returns "task",
385
+ TemporaryDocument returns "temporary".
386
+ """
73
387
  raise NotImplementedError("Subclasses must implement this method")
74
388
 
389
+ @final
75
390
  @property
76
- def base_type(self) -> Literal["flow", "task"]:
77
- """Alias for document_type for backward compatibility"""
391
+ def base_type(self) -> Literal["flow", "task", "temporary"]:
392
+ """Get the document's base type.
393
+
394
+ Property alias for get_base_type() providing a cleaner API.
395
+ This property cannot be overridden by subclasses.
396
+
397
+ Returns:
398
+ The document's base type: "flow", "task", or "temporary".
399
+ """
78
400
  return self.get_base_type()
79
401
 
402
+ @final
80
403
  @property
81
404
  def is_flow(self) -> bool:
82
- """Check if document is a flow document"""
405
+ """Check if this is a flow document.
406
+
407
+ Flow documents persist across Prefect flow runs and are saved
408
+ to the file system between pipeline steps.
409
+
410
+ Returns:
411
+ True if this is a FlowDocument subclass, False otherwise.
412
+ """
83
413
  return self.get_base_type() == "flow"
84
414
 
415
+ @final
85
416
  @property
86
417
  def is_task(self) -> bool:
87
- """Check if document is a task document"""
418
+ """Check if this is a task document.
419
+
420
+ Task documents are temporary within Prefect task execution
421
+ and are not persisted between pipeline steps.
422
+
423
+ Returns:
424
+ True if this is a TaskDocument subclass, False otherwise.
425
+ """
88
426
  return self.get_base_type() == "task"
89
427
 
428
+ @final
429
+ @property
430
+ def is_temporary(self) -> bool:
431
+ """Check if this is a temporary document.
432
+
433
+ Temporary documents are never persisted and exist only
434
+ during execution.
435
+
436
+ Returns:
437
+ True if this is a TemporaryDocument, False otherwise.
438
+ """
439
+ return self.get_base_type() == "temporary"
440
+
441
+ @final
90
442
  @classmethod
91
443
  def get_expected_files(cls) -> list[str] | None:
444
+ """Get the list of allowed file names for this document class.
445
+
446
+ If the document class defines a FILES enum, returns the list of
447
+ valid file names. Used to restrict documents to specific files.
448
+
449
+ Returns:
450
+ List of allowed file names if FILES enum is defined,
451
+ None if unrestricted.
452
+
453
+ Raises:
454
+ DocumentNameError: If FILES is defined but not a valid StrEnum.
455
+
456
+ Example:
457
+ >>> class ConfigDocument(FlowDocument):
458
+ ... class FILES(StrEnum):
459
+ ... CONFIG = "config.yaml"
460
+ ... SETTINGS = "settings.json"
461
+ >>> ConfigDocument.get_expected_files()
462
+ ['config.yaml', 'settings.json']
92
463
  """
93
- Return the list of allowed file names for this document class, or None if unrestricted.
94
- """
95
- if cls.FILES is None:
464
+ if not hasattr(cls, "FILES"):
465
+ return None
466
+ files = getattr(cls, "FILES")
467
+ if not files:
96
468
  return None
469
+ assert issubclass(files, StrEnum)
97
470
  try:
98
- values = [member.value for member in cls.FILES]
471
+ values = [member.value for member in files]
99
472
  except TypeError:
100
473
  raise DocumentNameError(f"{cls.__name__}.FILES must be an Enum of string values")
101
474
  if len(values) == 0:
@@ -104,32 +477,73 @@ class Document(BaseModel, ABC):
104
477
 
105
478
  @classmethod
106
479
  def validate_file_name(cls, name: str) -> None:
107
- """
108
- Optional file-name validation hook.
480
+ """Validate that a file name matches allowed patterns.
481
+
482
+ @public
483
+
484
+ DO NOT OVERRIDE this method if you define a FILES enum!
485
+ The validation is automatic when FILES enum is present.
486
+
487
+ # CORRECT - FILES enum provides automatic validation:
488
+ class MyDocument(FlowDocument):
489
+ class FILES(StrEnum):
490
+ CONFIG = "config.yaml" # Validation happens automatically!
491
+
492
+ # WRONG - Unnecessary override:
493
+ class MyDocument(FlowDocument):
494
+ class FILES(StrEnum):
495
+ CONFIG = "config.yaml"
496
+
497
+ def validate_file_name(cls, name): # DON'T DO THIS!
498
+ pass # Validation already happens via FILES enum
499
+
500
+ Only override for custom validation logic BEYOND FILES enum constraints.
501
+
502
+ Args:
503
+ name: The file name to validate.
109
504
 
110
- Default behavior:
111
- - If `FILES` enum is defined on the subclass, ensure the **basename** of `name`
112
- equals one of the enum values (exact string match).
113
- - If `FILES` is None, do nothing.
505
+ Raises:
506
+ DocumentNameError: If the name doesn't match allowed patterns.
114
507
 
115
- Override this method in subclasses for custom conventions (regex, prefixes, etc.).
116
- Raise DocumentNameError when invalid.
508
+ Note:
509
+ - If FILES enum is defined, name must exactly match one of the values
510
+ - If FILES is not defined, any name is allowed
511
+ - Override in subclasses ONLY for custom regex patterns or logic
117
512
  """
118
- if cls.FILES is None:
513
+ allowed = cls.get_expected_files()
514
+ if not allowed:
119
515
  return
120
516
 
121
- try:
122
- allowed = {str(member.value) for member in cls.FILES} # type: ignore[arg-type]
123
- except TypeError:
124
- raise DocumentNameError(f"{cls.__name__}.FILES must be an Enum of string values")
125
-
126
517
  if len(allowed) > 0 and name not in allowed:
127
518
  allowed_str = ", ".join(sorted(allowed))
128
519
  raise DocumentNameError(f"Invalid filename '{name}'. Allowed names: {allowed_str}")
129
520
 
130
521
  @field_validator("name")
131
522
  def validate_name(cls, v: str) -> str:
132
- """Validate document name matches expected patterns and is secure"""
523
+ r"""Pydantic validator for the document name field.
524
+
525
+ Ensures the document name is secure and follows conventions:
526
+ - No path traversal characters (.., \\, /)
527
+ - Cannot end with .description.md
528
+ - No leading/trailing whitespace
529
+ - Must match FILES enum if defined
530
+
531
+ Performance:
532
+ Validation is O(n) where n is the length of the name.
533
+ FILES enum check is O(m) where m is the number of allowed files
534
+
535
+ Args:
536
+ v: The name value to validate.
537
+
538
+ Returns:
539
+ The validated name.
540
+
541
+ Raises:
542
+ DocumentNameError: If the name violates any validation rules.
543
+
544
+ Note:
545
+ This is called automatically by Pydantic during model construction.
546
+ """
133
547
  if v.endswith(cls.DESCRIPTION_EXTENSION):
134
548
  raise DocumentNameError(
135
549
  f"Document names cannot end with {cls.DESCRIPTION_EXTENSION}: {v}"
@@ -145,9 +559,142 @@ class Document(BaseModel, ABC):
145
559
 
146
560
  return v
147
561
 
148
- @field_validator("content")
149
- def validate_content(cls, v: bytes) -> bytes:
150
- """Validate content size"""
562
+ @field_validator("content", mode="before")
563
+ @classmethod
564
+ def validate_content(cls, v: Any, info: ValidationInfo) -> bytes:
565
+ """Pydantic validator that converts various content types to bytes.
566
+
567
+ This validator is called automatically during model construction and
568
+ handles the intelligent type conversion that powers the `create` method.
569
+ It determines the appropriate serialization based on file extension.
570
+
571
+ Conversion Strategy:
572
+ 1. bytes → Passthrough (no conversion)
573
+ 2. str → UTF-8 encoding
574
+ 3. dict/BaseModel + .json → JSON serialization (indented)
575
+ 4. dict/BaseModel + .yaml/.yml → YAML serialization
576
+ 5. list[str] + .md → Join with markdown separator
577
+ 6. list[Any] + .json/.yaml → JSON/YAML array
578
+ 7. int/float/bool + .json → JSON primitive
579
+
580
+ Args:
581
+ v: Content to validate (any supported type)
582
+ info: Validation context containing other field values
583
+
584
+ Returns:
585
+ Content converted to bytes
586
+
587
+ Raises:
588
+ DocumentSizeError: If content exceeds MAX_CONTENT_SIZE
589
+ ValueError: If content type unsupported for file extension
590
+
591
+ Note:
592
+ This validator enables create() to accept multiple types while
593
+ ensuring __init__ only receives bytes for type safety.
594
+ """
595
+ # Get the name from validation context if available
596
+ name = ""
597
+ if hasattr(info, "data") and "name" in info.data:
598
+ name = info.data["name"]
599
+ name_lower = name.lower()
600
+
601
+ # Convert based on content type
602
+ if isinstance(v, bytes):
603
+ pass # Already bytes
604
+ elif isinstance(v, str):
605
+ v = v.encode("utf-8")
606
+ elif isinstance(v, dict):
607
+ # Serialize dict based on extension
608
+ if name_lower.endswith((".yaml", ".yml")):
609
+ # Use YAML format for YAML files
610
+ yaml = YAML()
611
+ stream = BytesIO()
612
+ yaml.dump(v, stream)
613
+ v = stream.getvalue()
614
+ elif name_lower.endswith(".json"):
615
+ # Use JSON for JSON files
616
+ v = json.dumps(v, indent=2).encode("utf-8")
617
+ else:
618
+ # Dict not supported for other file types
619
+ raise ValueError(f"Unsupported content type: {type(v)} for file {name}")
620
+ elif isinstance(v, list):
621
+ # Handle lists based on file extension
622
+ if name_lower.endswith(".md"):
623
+ # For markdown files, join with separator
624
+ if all(isinstance(item, str) for item in v):
625
+ v = cls.MARKDOWN_LIST_SEPARATOR.join(v).encode("utf-8")
626
+ else:
627
+ raise ValueError(
628
+ f"Unsupported content type: mixed-type list for markdown file {name}"
629
+ )
630
+ elif name_lower.endswith((".yaml", ".yml")):
631
+ # Check if it's a list of Pydantic models
632
+ if v and isinstance(v[0], BaseModel):
633
+ # Convert models to dicts first
634
+ v = [item.model_dump(mode="json") for item in v]
635
+ # Use YAML format for YAML files
636
+ yaml = YAML()
637
+ stream = BytesIO()
638
+ yaml.dump(v, stream)
639
+ v = stream.getvalue()
640
+ elif name_lower.endswith(".json"):
641
+ # Check if it's a list of Pydantic models
642
+ if v and isinstance(v[0], BaseModel):
643
+ # Convert models to dicts first
644
+ v = [item.model_dump(mode="json") for item in v]
645
+ # For JSON files, serialize as JSON
646
+ v = json.dumps(v, indent=2).encode("utf-8")
647
+ else:
648
+ # Check if it's a list of BaseModel
649
+ if v and isinstance(v[0], BaseModel):
650
+ raise ValueError("list[BaseModel] requires .json or .yaml extension")
651
+ # List content not supported for other file types
652
+ raise ValueError(f"Unsupported content type: {type(v)} for file {name}")
653
+ elif isinstance(v, BaseModel):
654
+ # Serialize Pydantic models
655
+ if name_lower.endswith((".yaml", ".yml")):
656
+ yaml = YAML()
657
+ stream = BytesIO()
658
+ yaml.dump(v.model_dump(mode="json"), stream)
659
+ v = stream.getvalue()
660
+ else:
661
+ v = json.dumps(v.model_dump(mode="json"), indent=2).encode("utf-8")
662
+ elif isinstance(v, (int, float, bool)):
663
+ # Numbers and booleans: JSON-serialize for .json, string for others
664
+ if name_lower.endswith(".json"):
665
+ v = json.dumps(v).encode("utf-8")
666
+ elif name_lower.endswith((".yaml", ".yml")):
667
+ v = str(v).encode("utf-8")
668
+ elif name_lower.endswith(".txt"):
669
+ v = str(v).encode("utf-8")
670
+ else:
671
+ # For other extensions, convert to string
672
+ v = str(v).encode("utf-8")
673
+ elif v is None:
674
+ # Handle None - only supported for JSON/YAML
675
+ if name_lower.endswith((".json", ".yaml", ".yml")):
676
+ if name_lower.endswith((".yaml", ".yml")):
677
+ v = b"null\n"
678
+ else:
679
+ v = b"null"
680
+ else:
681
+ raise ValueError(f"Unsupported content type: {type(None)} for file {name}")
682
+ else:
683
+ # Try to see if it has model_dump (duck typing for Pydantic-like)
684
+ if hasattr(v, "model_dump"):
685
+ if name_lower.endswith((".yaml", ".yml")):
686
+ yaml = YAML()
687
+ stream = BytesIO()
688
+ yaml.dump(v.model_dump(mode="json"), stream) # type: ignore[attr-defined]
689
+ v = stream.getvalue()
690
+ else:
691
+ v = json.dumps(v.model_dump(mode="json"), indent=2).encode("utf-8") # type: ignore[attr-defined]
692
+ else:
693
+ # List non-.json files should raise error
694
+ if name_lower.endswith(".txt") and isinstance(v, list):
695
+ raise ValueError("List content not supported for text files")
696
+ raise ValueError(f"Unsupported content type: {type(v)}")
697
+
151
698
  # Check content size limit
152
699
  max_size = getattr(cls, "MAX_CONTENT_SIZE", 100 * 1024 * 1024)
153
700
  if len(v) > max_size:
@@ -159,148 +706,537 @@ class Document(BaseModel, ABC):
159
706
 
160
707
  @field_serializer("content")
161
708
  def serialize_content(self, v: bytes) -> str:
162
- """Serialize bytes content to string for JSON serialization"""
709
+ """Pydantic serializer for content field.
710
+
711
+ Converts bytes content to string for JSON serialization.
712
+ Attempts UTF-8 decoding first, falls back to base64 encoding
713
+ for binary content.
714
+
715
+ Args:
716
+ v: The content bytes to serialize.
717
+
718
+ Returns:
719
+ UTF-8 decoded string for text content,
720
+ base64-encoded string for binary content.
721
+
722
+ Note:
723
+ This is called automatically by Pydantic during
724
+ model serialization to JSON.
725
+ """
163
726
  try:
164
727
  return v.decode("utf-8")
165
728
  except UnicodeDecodeError:
166
729
  # Fall back to base64 for binary content
167
730
  return base64.b64encode(v).decode("ascii")
168
731
 
732
+ @final
169
733
  @property
170
734
  def id(self) -> str:
171
- """Return the first 6 characters of the SHA256 hash of the content, encoded in base32"""
735
+ """Get a short unique identifier for the document.
736
+
737
+ @public
738
+
739
+ This ID is crucial for LLM interactions. When documents are provided to
740
+ LLMs via generate() or generate_structured(), their IDs are included,
741
+ allowing the LLM to reference documents in prompts by either name or ID.
742
+ The ID is content-based (derived from SHA256 hash of content only),
743
+ so the same content always produces the same ID. Changing the name or
744
+ description does NOT change the ID.
745
+
746
+ Returns:
747
+ 6-character base32-encoded string (uppercase, e.g., "A7B2C3").
748
+ This is the first 6 chars of the full base32 SHA256, NOT hex.
749
+
750
+ Collision Rate:
751
+ With base32 encoding (5 bits per char), 6 chars = 30 bits.
752
+ Expect collisions after ~32K documents (birthday paradox).
753
+ For higher uniqueness requirements, use the full sha256 property.
754
+
755
+ Note:
756
+ While shorter than full SHA256, this provides
757
+ reasonable uniqueness for most use cases.
758
+ """
172
759
  return self.sha256[:6]
173
760
 
761
+ @final
174
762
  @cached_property
175
763
  def sha256(self) -> str:
176
- """Full SHA256 hash of content, encoded in base32"""
764
+ """Get the full SHA256 hash of the document content.
765
+
766
+ @public
767
+
768
+ Computes and caches the SHA256 hash of the content,
769
+ encoded in base32 (uppercase). Used for content
770
+ deduplication and integrity verification.
771
+
772
+ Returns:
773
+ Full SHA256 hash as base32-encoded uppercase string.
774
+
775
+ Why Base32 Instead of Hex:
776
+ - Base32 is case-insensitive, avoiding issues with different file systems
777
+ and AI interactions where casing might be inconsistent
778
+ - More compact than hex (56 chars, including 4 "=" padding chars, vs 64 chars for SHA-256)
779
+ - Contains more information per character than hex (5 bits vs 4 bits)
780
+ - Alphabet (A-Z, 2-7) is safe for URLs without encoding (only the trailing "=" padding is a reserved URL character)
781
+ - Compatible with case-insensitive file systems
782
+ - Avoids confusion in AI interactions where models might change casing
783
+ - Not base64 because we want consistent uppercase for all uses
784
+
785
+ Note:
786
+ This is computed once and cached for performance.
787
+ The hash is deterministic based on content only.
788
+ """
177
789
  return b32encode(hashlib.sha256(self.content).digest()).decode("ascii").upper()
178
790
 
791
+ @final
179
792
  @property
180
793
  def size(self) -> int:
181
- """Size of content in bytes"""
794
+ """Get the size of the document content.
795
+
796
+ @public
797
+
798
+ Returns:
799
+ Size of content in bytes.
800
+
801
+ Note:
802
+ Useful for monitoring document sizes and
803
+ ensuring they stay within limits.
804
+ """
182
805
  return len(self.content)
183
806
 
184
807
  @cached_property
185
808
  def detected_mime_type(self) -> str:
186
- """Detect MIME type from content using python-magic"""
809
+ """Detect the MIME type from document content.
810
+
811
+ Detection strategy (in order):
812
+ 1. Returns 'application/x-empty' for empty content
813
+ 2. Extension-based detection for known text formats (preferred)
814
+ 3. python-magic content analysis for unknown extensions
815
+ 4. Fallback to extension or 'application/octet-stream'
816
+
817
+ Returns:
818
+ MIME type string (e.g., "text/plain", "application/json").
819
+
820
+ Note:
821
+ This is cached after first access. Extension-based detection
822
+ is preferred for text formats to avoid misidentification.
823
+ """
187
824
  return detect_mime_type(self.content, self.name)
188
825
 
189
826
  @property
190
827
  def mime_type(self) -> str:
191
- """Get MIME type - uses content detection with fallback to extension"""
828
+ """Get the document's MIME type.
829
+
830
+ @public
831
+
832
+ Primary property for accessing MIME type information.
833
+ Automatically detects MIME type based on file extension and content.
834
+
835
+ Returns:
836
+ MIME type string (e.g., "text/plain", "application/json").
837
+
838
+ Note:
839
+ MIME type detection uses extension-based detection for known
840
+ text formats and content analysis for binary formats.
841
+ """
192
842
  return self.detected_mime_type
193
843
 
194
844
  @property
195
845
  def is_text(self) -> bool:
196
- """Check if document is text based on MIME type"""
846
+ """Check if document contains text content.
847
+
848
+ @public
849
+
850
+ Returns:
851
+ True if MIME type indicates text content
852
+ (text/*, application/json, application/x-yaml, text/yaml, etc.),
853
+ False otherwise.
854
+
855
+ Note:
856
+ Used to determine if text property can be safely accessed.
857
+ """
197
858
  return is_text_mime_type(self.mime_type)
198
859
 
199
860
  @property
200
861
  def is_pdf(self) -> bool:
201
- """Check if document is PDF"""
862
+ """Check if document is a PDF file.
863
+
864
+ @public
865
+
866
+ Returns:
867
+ True if MIME type is application/pdf, False otherwise.
868
+
869
+ Note:
870
+ PDF documents require special handling and are
871
+ supported by certain LLM models.
872
+ """
202
873
  return is_pdf_mime_type(self.mime_type)
203
874
 
204
875
  @property
205
876
  def is_image(self) -> bool:
206
- """Check if document is an image"""
877
+ """Check if document is an image file.
878
+
879
+ @public
880
+
881
+ Returns:
882
+ True if MIME type starts with "image/", False otherwise.
883
+
884
+ Note:
885
+ Image documents are automatically encoded for
886
+ vision-capable LLM models.
887
+ """
207
888
  return is_image_mime_type(self.mime_type)
208
889
 
209
890
  @classmethod
210
891
  def canonical_name(cls) -> str:
211
- """Get the canonical name of the document"""
892
+ """Get the canonical name for this document class.
893
+
894
+ Returns a standardized snake_case name derived from the
895
+ class name, used for directory naming and identification.
896
+
897
+ Returns:
898
+ Snake_case canonical name.
899
+
900
+ Example:
901
+ >>> class UserDataDocument(FlowDocument): ...
902
+ >>> UserDataDocument.canonical_name()
903
+ 'user_data'
904
+ """
212
905
  return canonical_name_key(cls)
213
906
 
214
- def as_text(self) -> str:
215
- """Parse document as text"""
907
+ @property
908
+ def text(self) -> str:
909
+ """Get document content as UTF-8 text string.
910
+
911
+ @public
912
+
913
+ Decodes the bytes content as UTF-8 text. Only available for
914
+ text-based documents (check is_text property first).
915
+
916
+ Returns:
917
+ UTF-8 decoded string.
918
+
919
+ Raises:
920
+ ValueError: If document is not text (is_text == False).
921
+
922
+ Example:
923
+ >>> doc = MyDocument.create(name="data.txt", content="Hello \u2728")
924
+ >>> if doc.is_text:
925
+ ... print(doc.text) # "Hello \u2728"
926
+
927
+ >>> # Binary document raises error:
928
+ >>> binary_doc = MyDocument(name="image.png", content=png_bytes)
929
+ >>> binary_doc.text # Raises ValueError
930
+ """
216
931
  if not self.is_text:
217
932
  raise ValueError(f"Document is not text: {self.name}")
218
933
  return self.content.decode("utf-8")
219
934
 
220
935
  def as_yaml(self) -> Any:
221
- """Parse document as YAML"""
222
- return YAML().load(self.as_text())
936
+ r"""Parse document content as YAML.
937
+
938
+ @public
939
+
940
+ Parses the document's text content as YAML and returns Python objects.
941
+ Uses ruamel.yaml which is safe by default (no code execution).
942
+
943
+ Returns:
944
+ Parsed YAML data: dict, list, str, int, float, bool, or None.
945
+
946
+ Raises:
947
+ ValueError: If document is not text-based.
948
+ YAMLError: If content is not valid YAML.
949
+
950
+ Example:
951
+ >>> # From dict content
952
+ >>> doc = MyDocument.create(name="config.yaml", content={
953
+ ... "server": {"host": "localhost", "port": 8080}
954
+ ... })
955
+ >>> doc.as_yaml() # {'server': {'host': 'localhost', 'port': 8080}}
956
+
957
+ >>> # From YAML string
958
+ >>> doc2 = MyDocument(name="simple.yml", content=b"key: value\nitems:\n - a\n - b")
959
+ >>> doc2.as_yaml() # {'key': 'value', 'items': ['a', 'b']}
960
+ """
961
+ yaml = YAML()
962
+ return yaml.load(self.text) # type: ignore[no-untyped-call, no-any-return]
223
963
 
224
964
  def as_json(self) -> Any:
225
- """Parse document as JSON"""
226
- return json.loads(self.as_text())
965
+ """Parse document content as JSON.
966
+
967
+ @public
968
+
969
+ Parses the document's text content as JSON and returns Python objects.
970
+ Document must contain valid JSON text.
227
971
 
228
- def as_pydantic_model(self, model_type: type[TModel]) -> TModel:
229
- """Parse document as a pydantic model and return the validated instance"""
972
+ Returns:
973
+ Parsed JSON data: dict, list, str, int, float, bool, or None.
974
+
975
+ Raises:
976
+ ValueError: If document is not text-based.
977
+ JSONDecodeError: If content is not valid JSON.
978
+
979
+ Example:
980
+ >>> # From dict content
981
+ >>> doc = MyDocument.create(name="data.json", content={"key": "value"})
982
+ >>> doc.as_json() # {'key': 'value'}
983
+
984
+ >>> # From JSON string
985
+ >>> doc2 = MyDocument(name="array.json", content=b'[1, 2, 3]')
986
+ >>> doc2.as_json() # [1, 2, 3]
987
+
988
+ >>> # Invalid JSON
989
+ >>> bad_doc = MyDocument(name="bad.json", content=b"not json")
990
+ >>> bad_doc.as_json() # Raises JSONDecodeError
991
+ """
992
+ return json.loads(self.text)
993
+
994
+ @overload
995
+ def as_pydantic_model(self, model_type: type[TModel]) -> TModel: ...
996
+
997
+ @overload
998
+ def as_pydantic_model(self, model_type: type[list[TModel]]) -> list[TModel]: ...
999
+
1000
+ def as_pydantic_model(
1001
+ self, model_type: type[TModel] | type[list[TModel]]
1002
+ ) -> TModel | list[TModel]:
1003
+ """Parse document content as Pydantic model with validation.
1004
+
1005
+ @public
1006
+
1007
+ Parses JSON or YAML content and validates it against a Pydantic model.
1008
+ Automatically detects format based on MIME type. Supports both single
1009
+ models and lists of models.
1010
+
1011
+ Args:
1012
+ model_type: Pydantic model class to validate against.
1013
+ Can be either:
1014
+ - type[Model] for single model
1015
+ - type[list[Model]] for list of models
1016
+
1017
+ Returns:
1018
+ Validated Pydantic model instance or list of instances.
1019
+
1020
+ Raises:
1021
+ ValueError: If document is not text or type mismatch.
1022
+ ValidationError: If data doesn't match model schema.
1023
+ JSONDecodeError/YAMLError: If content parsing fails.
1024
+
1025
+ Example:
1026
+ >>> from pydantic import BaseModel
1027
+ >>>
1028
+ >>> class User(BaseModel):
1029
+ ... name: str
1030
+ ... age: int
1031
+ >>>
1032
+ >>> # Single model
1033
+ >>> doc = MyDocument.create(name="user.json",
1034
+ ... content={"name": "Alice", "age": 30})
1035
+ >>> user = doc.as_pydantic_model(User)
1036
+ >>> print(user.name) # "Alice"
1037
+ >>>
1038
+ >>> # List of models
1039
+ >>> doc2 = MyDocument.create(name="users.json",
1040
+ ... content=[{"name": "Bob", "age": 25}, {"name": "Eve", "age": 28}])
1041
+ >>> users = doc2.as_pydantic_model(list[User])
1042
+ >>> print(len(users)) # 2
1043
+ """
230
1044
  data = self.as_yaml() if is_yaml_mime_type(self.mime_type) else self.as_json()
231
- return model_type.model_validate(data)
1045
+
1046
+ if get_origin(model_type) is list:
1047
+ if not isinstance(data, list):
1048
+ raise ValueError(f"Expected list data for {model_type}, got {type(data)}")
1049
+ item_type = get_args(model_type)[0]
1050
+ # Type guard for list case
1051
+ result_list = [item_type.model_validate(item) for item in data] # type: ignore[attr-defined]
1052
+ return cast(list[TModel], result_list)
1053
+
1054
+ # At this point model_type must be type[TModel], not type[list[TModel]]
1055
+ single_model = cast(type[TModel], model_type)
1056
+ return single_model.model_validate(data)
232
1057
 
233
1058
  def as_markdown_list(self) -> list[str]:
234
- """Parse document as a markdown list"""
235
- return self.as_text().split(self.MARKDOWN_LIST_SEPARATOR)
1059
+ r"""Parse document as markdown-separated list of sections.
236
1060
 
237
- @classmethod
238
- def create(
239
- cls,
240
- name: str,
241
- description: str | None,
242
- content: bytes | str | BaseModel | list[str] | Any,
243
- ) -> Self:
244
- """Create a document from a name, description, and content"""
245
- is_yaml_extension = name.endswith(".yaml") or name.endswith(".yml")
246
- is_json_extension = name.endswith(".json")
247
- is_markdown_extension = name.endswith(".md")
248
- is_str_list = isinstance(content, list) and all(isinstance(item, str) for item in content)
249
- if isinstance(content, bytes):
250
- pass
251
- elif isinstance(content, str):
252
- content = content.encode("utf-8")
253
- elif is_str_list and is_markdown_extension:
254
- return cls.create_as_markdown_list(name, description, content) # type: ignore[arg-type]
255
- elif is_yaml_extension:
256
- return cls.create_as_yaml(name, description, content)
257
- elif is_json_extension:
258
- return cls.create_as_json(name, description, content)
259
- else:
260
- raise ValueError(f"Unsupported content type: {type(content)} for {name}")
1061
+ @public
261
1062
 
262
- return cls(name=name, description=description, content=content)
1063
+ Splits text content using markdown separator ("\n\n---\n\n").
1064
+ Designed for markdown documents with multiple sections.
263
1065
 
264
- @classmethod
265
- def create_as_markdown_list(cls, name: str, description: str | None, items: list[str]) -> Self:
266
- """Create a document from a name, description, and list of strings"""
267
- # remove other list separators (lines that are only the separator + whitespace)
268
- separator = Document.MARKDOWN_LIST_SEPARATOR.strip()
269
- pattern = re.compile(rf"^[ \t]*{re.escape(separator)}[ \t]*(?:\r?\n|$)", flags=re.MULTILINE)
270
- # Normalize CRLF/CR to LF before cleaning to ensure consistent behavior
271
- normalized_items = [re.sub(r"\r\n?", "\n", item) for item in items]
272
- cleaned_items = [pattern.sub("", item) for item in normalized_items]
273
- content = Document.MARKDOWN_LIST_SEPARATOR.join(cleaned_items)
274
- return cls.create(name, description, content)
1066
+ Returns:
1067
+ List of string sections (preserves whitespace within sections).
275
1068
 
276
- @classmethod
277
- def create_as_json(cls, name: str, description: str | None, data: Any) -> Self:
278
- """Create a document from a name, description, and JSON data"""
279
- assert name.endswith(".json"), f"Document name must end with .json: {name}"
280
- if isinstance(data, BaseModel):
281
- data = data.model_dump(mode="json")
282
- content = json.dumps(data, indent=2).encode("utf-8")
283
- return cls.create(name, description, content)
1069
+ Raises:
1070
+ ValueError: If document is not text-based.
284
1071
 
285
- @classmethod
286
- def create_as_yaml(cls, name: str, description: str | None, data: Any) -> Self:
287
- """Create a document from a name, description, and YAML data"""
288
- assert name.endswith(".yaml") or name.endswith(".yml"), (
289
- f"Document name must end with .yaml or .yml: {name}"
290
- )
291
- if isinstance(data, BaseModel):
292
- data = data.model_dump()
293
- yaml = YAML()
294
- yaml.indent(mapping=2, sequence=4, offset=2)
295
- from io import BytesIO
1072
+ Example:
1073
+ >>> # Using create with list
1074
+ >>> sections = ["# Chapter 1\nIntroduction", "# Chapter 2\nDetails"]
1075
+ >>> doc = MyDocument.create(name="book.md", content=sections)
1076
+ >>> doc.as_markdown_list() # Returns original sections
1077
+
1078
+ >>> # Manual creation with separator
1079
+ >>> content = "Part 1\n\n---\n\nPart 2\n\n---\n\nPart 3"
1080
+ >>> doc2 = MyDocument(name="parts.md", content=content.encode())
1081
+ >>> doc2.as_markdown_list() # ['Part 1', 'Part 2', 'Part 3']
1082
+ """
1083
+ return self.text.split(self.MARKDOWN_LIST_SEPARATOR)
1084
+
1085
+ def parse(self, type_: type[Any]) -> Any:
1086
+ r"""Parse document content to original type (reverses create conversion).
296
1087
 
297
- stream = BytesIO()
298
- yaml.dump(data, stream)
299
- content = stream.getvalue()
300
- return cls.create(name, description, content)
1088
+ @public
301
1089
 
1090
+ This method reverses the automatic conversion performed by the `create`
1091
+ classmethod. It intelligently parses the bytes content based on the
1092
+ document's file extension and converts to the requested type.
1093
+
1094
+ Designed for roundtrip conversion:
1095
+ >>> original = {"key": "value"}
1096
+ >>> doc = MyDocument.create(name="data.json", content=original)
1097
+ >>> restored = doc.parse(dict)
1098
+ >>> assert restored == original # True
1099
+
1100
+ Args:
1101
+ type_: Target type to parse content into. Supported types:
1102
+ - bytes: Returns raw content (no conversion)
1103
+ - str: Decodes UTF-8 text
1104
+ - dict: Parses JSON (.json) or YAML (.yaml/.yml)
1105
+ - list: Splits markdown (.md) or parses JSON/YAML
1106
+ - BaseModel subclasses: Validates JSON/YAML into model
1107
+
1108
+ Returns:
1109
+ Content parsed to the requested type.
1110
+
1111
+ Raises:
1112
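+ ValueError: If type is unsupported, the parsed data is not of the requested type, or parsing fails (YAML syntax errors in .yaml/.yml files propagate as YAMLError).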
1113
+
1114
+ Extension Rules:
1115
+ - .json → JSON parsing for dict/list/BaseModel
1116
+ - .yaml/.yml → YAML parsing for dict/list/BaseModel
1117
+ - .md + list → Split by markdown separator
1118
+ - Any + str → UTF-8 decode
1119
+ - Any + bytes → Raw content
1120
+
1121
+ Example:
1122
+ >>> # String content
1123
+ >>> doc = MyDocument(name="test.txt", content=b"Hello")
1124
+ >>> doc.parse(str)
1125
+ 'Hello'
1126
+
1127
+ >>> # JSON content
1128
+ >>> doc = MyDocument.create(name="data.json", content={"key": "value"})
1129
+ >>> doc.parse(dict) # Returns {'key': 'value'}
1130
+
1131
+ >>> # Markdown list
1132
+ >>> items = ["Item 1", "Item 2"]
1133
+ >>> content = "\n\n---\n\n".join(items).encode()
1134
+ >>> doc = MyDocument(name="list.md", content=content)
1135
+ >>> doc.parse(list)
1136
+ ['Item 1', 'Item 2']
1137
+ """
1138
+ # Handle basic types
1139
+ if type_ is bytes:
1140
+ return self.content
1141
+ elif type_ is str:
1142
+ # Handle empty content specially
1143
+ if len(self.content) == 0:
1144
+ return ""
1145
+ return self.text
1146
+
1147
+ # Handle structured data based on extension
1148
+ name_lower = self.name.lower()
1149
+
1150
+ # JSON files
1151
+ if name_lower.endswith(".json"):
1152
+ if type_ is dict or type_ is list:
1153
+ result = self.as_json()
1154
+ # Ensure the result is the correct type
1155
+ if type_ is dict and not isinstance(result, dict):
1156
+ raise ValueError(f"Expected dict but got {type(result).__name__}")
1157
+ if type_ is list and not isinstance(result, list):
1158
+ raise ValueError(f"Expected list but got {type(result).__name__}")
1159
+ return result
1160
+ elif issubclass(type_, BaseModel):
1161
+ return self.as_pydantic_model(type_)
1162
+ else:
1163
+ raise ValueError(f"Cannot parse JSON file to type {type_}")
1164
+
1165
+ # YAML files
1166
+ elif name_lower.endswith((".yaml", ".yml")):
1167
+ if type_ is dict or type_ is list:
1168
+ result = self.as_yaml()
1169
+ # Ensure the result is the correct type
1170
+ if type_ is dict and not isinstance(result, dict):
1171
+ raise ValueError(f"Expected dict but got {type(result).__name__}")
1172
+ if type_ is list and not isinstance(result, list):
1173
+ raise ValueError(f"Expected list but got {type(result).__name__}")
1174
+ return result
1175
+ elif issubclass(type_, BaseModel):
1176
+ return self.as_pydantic_model(type_)
1177
+ else:
1178
+ raise ValueError(f"Cannot parse YAML file to type {type_}")
1179
+
1180
+ # Markdown files with lists
1181
+ elif name_lower.endswith(".md") and type_ is list:
1182
+ return self.as_markdown_list()
1183
+
1184
+ # Default: try to return as requested basic type
1185
+ elif type_ is dict or type_ is list:
1186
+ # Try JSON first, then YAML
1187
+ try:
1188
+ result = self.as_json()
1189
+ # Ensure the result is the correct type
1190
+ if type_ is dict and not isinstance(result, dict):
1191
+ raise ValueError(f"Expected dict but got {type(result).__name__}")
1192
+ if type_ is list and not isinstance(result, list):
1193
+ raise ValueError(f"Expected list but got {type(result).__name__}")
1194
+ return result
1195
+ except (json.JSONDecodeError, ValueError):
1196
+ try:
1197
+ result = self.as_yaml()
1198
+ # Ensure the result is the correct type
1199
+ if type_ is dict and not isinstance(result, dict):
1200
+ raise ValueError(f"Expected dict but got {type(result).__name__}")
1201
+ if type_ is list and not isinstance(result, list):
1202
+ raise ValueError(f"Expected list but got {type(result).__name__}")
1203
+ return result
1204
+ except Exception as e:
1205
+ raise ValueError(f"Cannot parse content to {type_}") from e
1206
+
1207
+ raise ValueError(f"Unsupported type {type_} for file {self.name}")
1208
+
1209
+ @final
302
1210
  def serialize_model(self) -> dict[str, Any]:
303
- """Serialize document to a dictionary with proper encoding."""
1211
+ """Serialize document to dictionary for storage or transmission.
1212
+
1213
+ Creates a complete JSON-serializable representation of the document
1214
+ with all metadata and properly encoded content. Automatically chooses
1215
+ the most appropriate encoding (UTF-8 for text, base64 for binary).
1216
+
1217
+ Returns:
1218
+ Dictionary with the following keys:
1219
+ - name: Document filename (str)
1220
+ - description: Optional description (str | None)
1221
+ - base_type: Persistence type - "flow", "task", or "temporary" (str)
1222
+ - size: Content size in bytes (int)
1223
+ - id: Short hash identifier, first 6 chars of SHA256 (str)
1224
+ - sha256: Full SHA256 hash in base32 encoding (str)
1225
+ - mime_type: Detected MIME type (str)
1226
+ - content: Encoded content (str)
1227
+ - content_encoding: Either "utf-8" or "base64" (str)
1228
+
1229
+ Encoding Strategy:
1230
+ - Text files (text/*, application/json, etc.) → UTF-8 string
1231
+ - Binary files (images, PDFs, etc.) → Base64 string
1232
+ - Invalid UTF-8 in text files → UTF-8 with replacement chars
1233
+
1234
+ Example:
1235
+ >>> doc = MyDocument.create(name="data.json", content={"key": "value"})
1236
+ >>> serialized = doc.serialize_model()
1237
+ >>> serialized["content_encoding"] # "utf-8"
1238
+ >>> serialized["mime_type"] # "application/json"
1239
+ """
304
1240
  result = {
305
1241
  "name": self.name,
306
1242
  "description": self.description,
@@ -327,19 +1263,59 @@ class Document(BaseModel, ABC):
327
1263
 
328
1264
  return result
329
1265
 
1266
+ @final
330
1267
  @classmethod
331
1268
  def from_dict(cls, data: dict[str, Any]) -> Self:
332
- """Deserialize document from dictionary."""
1269
+ r"""Deserialize document from dictionary (inverse of serialize_model).
1270
+
1271
+ Reconstructs a Document instance from the dictionary format produced
1272
+ by serialize_model(). Automatically handles content decoding based on
1273
+ the content_encoding field.
1274
+
1275
+ Args:
1276
+ data: Dictionary containing serialized document. Required keys:
1277
+ - name: Document filename (str)
1278
+ - content: Encoded content (str or bytes)
1279
+ Optional keys:
1280
+ - description: Document description (str | None)
1281
+ - content_encoding: "utf-8" or "base64" (defaults to "utf-8")
1282
+
1283
+ Returns:
1284
+ New Document instance with restored content.
1285
+
1286
+ Raises:
1287
+ ValueError: If content type is invalid or base64 decoding fails
1288
+ KeyError: If required keys are missing from data dictionary
1289
+
1290
+ Note:
1291
+ Provides roundtrip guarantee with serialize_model().
1292
+ Content and name are preserved exactly.
1293
+
1294
+ Example:
1295
+ >>> data = {
1296
+ ... "name": "config.yaml",
1297
+ ... "content": "key: value\n",
1298
+ ... "content_encoding": "utf-8",
1299
+ ... "description": "Config file"
1300
+ ... }
1301
+ >>> doc = MyDocument.from_dict(data)
1302
+ """
333
1303
  # Extract content and encoding
334
- content_str = data.get("content", "")
1304
+ content_raw = data.get("content", "")
335
1305
  content_encoding = data.get("content_encoding", "utf-8")
336
1306
 
337
1307
  # Decode content based on encoding
1308
+ content: bytes
338
1309
  if content_encoding == "base64":
339
- content = base64.b64decode(content_str)
340
- else:
1310
+ assert isinstance(content_raw, str), "base64 content must be string"
1311
+ content = base64.b64decode(content_raw)
1312
+ elif isinstance(content_raw, str):
341
1313
  # Default to UTF-8
342
- content = content_str.encode("utf-8")
1314
+ content = content_raw.encode("utf-8")
1315
+ elif isinstance(content_raw, bytes):
1316
+ content = content_raw
1317
+ else:
1318
+ raise ValueError(f"Invalid content type: {type(content_raw)}")
343
1319
 
344
1320
  # Create document with the required fields
345
1321
  return cls(