ai-pipeline-core 0.1.10__py3-none-any.whl → 0.1.12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36)
  1. ai_pipeline_core/__init__.py +84 -4
  2. ai_pipeline_core/documents/__init__.py +9 -0
  3. ai_pipeline_core/documents/document.py +1044 -152
  4. ai_pipeline_core/documents/document_list.py +147 -38
  5. ai_pipeline_core/documents/flow_document.py +112 -11
  6. ai_pipeline_core/documents/mime_type.py +173 -15
  7. ai_pipeline_core/documents/task_document.py +117 -12
  8. ai_pipeline_core/documents/temporary_document.py +84 -5
  9. ai_pipeline_core/documents/utils.py +41 -9
  10. ai_pipeline_core/exceptions.py +47 -11
  11. ai_pipeline_core/flow/__init__.py +2 -0
  12. ai_pipeline_core/flow/config.py +236 -27
  13. ai_pipeline_core/flow/options.py +50 -1
  14. ai_pipeline_core/llm/__init__.py +6 -0
  15. ai_pipeline_core/llm/ai_messages.py +125 -27
  16. ai_pipeline_core/llm/client.py +278 -26
  17. ai_pipeline_core/llm/model_options.py +130 -1
  18. ai_pipeline_core/llm/model_response.py +239 -35
  19. ai_pipeline_core/llm/model_types.py +67 -0
  20. ai_pipeline_core/logging/__init__.py +13 -0
  21. ai_pipeline_core/logging/logging_config.py +72 -20
  22. ai_pipeline_core/logging/logging_mixin.py +38 -32
  23. ai_pipeline_core/pipeline.py +363 -60
  24. ai_pipeline_core/prefect.py +48 -1
  25. ai_pipeline_core/prompt_manager.py +209 -24
  26. ai_pipeline_core/settings.py +108 -4
  27. ai_pipeline_core/simple_runner/__init__.py +5 -0
  28. ai_pipeline_core/simple_runner/cli.py +96 -11
  29. ai_pipeline_core/simple_runner/simple_runner.py +237 -4
  30. ai_pipeline_core/tracing.py +253 -30
  31. ai_pipeline_core-0.1.12.dist-info/METADATA +450 -0
  32. ai_pipeline_core-0.1.12.dist-info/RECORD +36 -0
  33. ai_pipeline_core-0.1.10.dist-info/METADATA +0 -538
  34. ai_pipeline_core-0.1.10.dist-info/RECORD +0 -36
  35. {ai_pipeline_core-0.1.10.dist-info → ai_pipeline_core-0.1.12.dist-info}/WHEEL +0 -0
  36. {ai_pipeline_core-0.1.10.dist-info → ai_pipeline_core-0.1.12.dist-info}/licenses/LICENSE +0 -0
@@ -1,7 +1,14 @@
1
+ """Document abstraction layer for AI pipeline flows.
2
+
3
+ @public
4
+
5
+ This module provides the core document abstraction for working with various types of data
6
+ in AI pipelines. Documents are immutable Pydantic models that wrap binary content with metadata.
7
+ """
8
+
1
9
  import base64
2
10
  import hashlib
3
11
  import json
4
- import re
5
12
  from abc import ABC, abstractmethod
6
13
  from base64 import b32encode
7
14
  from enum import StrEnum
@@ -20,7 +27,13 @@ from typing import (
20
27
  overload,
21
28
  )
22
29
 
23
- from pydantic import BaseModel, ConfigDict, field_serializer, field_validator
30
+ from pydantic import (
31
+ BaseModel,
32
+ ConfigDict,
33
+ ValidationInfo,
34
+ field_serializer,
35
+ field_validator,
36
+ )
24
37
  from ruamel.yaml import YAML
25
38
 
26
39
  from ai_pipeline_core.documents.utils import canonical_name_key
@@ -35,23 +48,136 @@ from .mime_type import (
35
48
  )
36
49
 
37
50
  TModel = TypeVar("TModel", bound=BaseModel)
38
- ContentInput = bytes | str | BaseModel | list[str] | Any
39
51
 
40
52
 
41
53
  class Document(BaseModel, ABC):
42
- """Abstract base class for all documents.
54
+ r"""Abstract base class for all documents in the AI Pipeline Core system.
55
+
56
+ @public
57
+
58
+ Document is the fundamental data abstraction for all content flowing through
59
+ pipelines. It provides automatic encoding, MIME type detection, serialization,
60
+ and validation. All documents must be subclassed from FlowDocument or TaskDocument
61
+ based on their persistence requirements. TemporaryDocument is a special concrete
62
+ class that can be instantiated directly (not abstract).
63
+
64
+ VALIDATION IS AUTOMATIC - Do not add manual validation!
65
+ Size validation, name validation, and MIME type detection are built-in.
66
+ The framework handles all standard validations internally.
67
+
68
+ # WRONG - These checks already happen automatically:
69
+ if document.size > document.MAX_CONTENT_SIZE:
70
+ raise DocumentSizeError(...) # NO! Already handled
71
+ document.validate_file_name(document.name) # NO! Automatic
72
+
73
+ Best Practices:
74
+ - Use create() classmethod for automatic type conversion (90% of cases)
75
+ - Omit description parameter unless truly needed for metadata
76
+ - When using LLM functions, pass AIMessages or str. Wrap any Document values
77
+ in AIMessages([...]). Do not call .text yourself
78
+
79
+ Standard Usage:
80
+ >>> # CORRECT - minimal parameters
81
+ >>> doc = MyDocument.create(name="data.json", content={"key": "value"})
82
+
83
+ >>> # AVOID - unnecessary description
84
+ >>> doc = MyDocument.create(
85
+ ... name="data.json",
86
+ ... content={"key": "value"},
87
+ ... description="This is data" # Usually not needed!
88
+ ... )
89
+
90
+ Key features:
91
+ - Immutable by default (frozen Pydantic model)
92
+ - Automatic MIME type detection
93
+ - Content size validation
94
+ - SHA256 hashing for deduplication
95
+ - Support for text, JSON, YAML, PDF, and image formats
96
+ - Conversion utilities between different formats
97
+
98
+ Class Variables:
99
+ MAX_CONTENT_SIZE: Maximum allowed content size in bytes (default 25MB)
100
+
101
+ Attributes:
102
+ name: Document filename (validated for security)
103
+ description: Optional human-readable description
104
+ content: Raw document content as bytes
105
+
106
+ Creating Documents:
107
+ **Use the `create` classmethod** for most use cases. It accepts various
108
+ content types (str, dict, list, BaseModel) and converts them automatically.
109
+ Only use __init__ directly when you already have bytes content.
110
+
111
+ >>> # RECOMMENDED: Use create for automatic conversion
112
+ >>> doc = MyDocument.create(name="data.json", content={"key": "value"})
113
+ >>>
114
+ >>> # Direct constructor: Only for bytes
115
+ >>> doc = MyDocument(name="data.bin", content=b"\x00\x01\x02")
116
+
117
+ Warning:
118
+ - Document subclasses should NOT start with 'Test' prefix (pytest conflict)
119
+ - Cannot instantiate Document directly - must subclass FlowDocument or TaskDocument
120
+ - Cannot add custom fields - only name, description, content are allowed
121
+ - Document is an abstract class and cannot be instantiated directly
43
122
 
44
- Warning: Document subclasses should NOT start with 'Test' prefix as this
45
- causes conflicts with pytest test discovery. Classes with 'Test' prefix
46
- will be rejected at definition time.
123
+ Metadata Attachment Patterns:
124
+ Since custom fields are not allowed, use these patterns for metadata:
125
+ 1. Use the 'description' field for human-readable metadata
126
+ 2. Embed metadata in content (e.g., JSON with data + metadata fields)
127
+ 3. Create a separate MetadataDocument type to accompany data documents
128
+ 4. Use document naming conventions (e.g., "data_v2_2024.json")
129
+ 5. Store metadata in flow_options or pass through TraceInfo
130
+
131
+ Example:
132
+ >>> from enum import StrEnum
133
+ >>>
134
+ >>> # Simple document:
135
+ >>> class MyDocument(FlowDocument):
136
+ ... pass
137
+ >>>
138
+ >>> # Document with file restrictions:
139
+ >>> class ConfigDocument(FlowDocument):
140
+ ... class FILES(StrEnum):
141
+ ... CONFIG = "config.yaml"
142
+ ... SETTINGS = "settings.json"
143
+ >>>
144
+ >>> # RECOMMENDED: Use create for automatic conversion
145
+ >>> doc = MyDocument.create(name="data.json", content={"key": "value"})
146
+ >>> print(doc.is_text) # True
147
+ >>> data = doc.as_json() # {'key': 'value'}
148
+ """
149
+
150
+ MAX_CONTENT_SIZE: ClassVar[int] = 25 * 1024 * 1024
151
+ """Maximum allowed content size in bytes (default 25MB).
152
+
153
+ @public
47
154
  """
48
155
 
49
- MAX_CONTENT_SIZE: ClassVar[int] = 25 * 1024 * 1024 # 25MB default
50
156
  DESCRIPTION_EXTENSION: ClassVar[str] = ".description.md"
51
- MARKDOWN_LIST_SEPARATOR: ClassVar[str] = "\n\n---\n\n"
157
+ """File extension for description files."""
158
+
159
+ MARKDOWN_LIST_SEPARATOR: ClassVar[str] = "\n\n-----------------\n\n"
160
+ """Separator for markdown list items."""
52
161
 
53
162
  def __init_subclass__(cls, **kwargs: Any) -> None:
54
- """Validate subclass names to prevent pytest conflicts."""
163
+ """Validate subclass configuration at definition time.
164
+
165
+ Performs several validation checks when a Document subclass is defined:
166
+ 1. Prevents class names starting with 'Test' (pytest conflict)
167
+ 2. Validates FILES enum if present (must be StrEnum)
168
+ 3. Prevents adding custom fields beyond name, description, content
169
+
170
+ Args:
171
+ **kwargs: Additional keyword arguments passed to parent __init_subclass__.
172
+
173
+ Raises:
174
+ TypeError: If subclass violates naming rules, FILES enum requirements,
175
+ or attempts to add extra fields.
176
+
177
+ Note:
178
+ This validation happens at class definition time, not instantiation,
179
+ providing early error detection during development.
180
+ """
55
181
  super().__init_subclass__(**kwargs)
56
182
  if cls.__name__.startswith("Test"):
57
183
  raise TypeError(
@@ -76,15 +202,166 @@ class Document(BaseModel, ABC):
76
202
  f"{', '.join(sorted(extras))}. Only {', '.join(sorted(allowed))} are allowed."
77
203
  )
78
204
 
79
- def __init__(self, **data: Any) -> None:
80
- """Prevent direct instantiation of abstract Document class."""
205
+ @overload
206
+ @classmethod
207
+ def create(cls, *, name: str, content: bytes, description: str | None = None) -> Self: ...
208
+
209
+ @overload
210
+ @classmethod
211
+ def create(cls, *, name: str, content: str, description: str | None = None) -> Self: ...
212
+
213
+ @overload
214
+ @classmethod
215
+ def create(
216
+ cls, *, name: str, content: dict[str, Any], description: str | None = None
217
+ ) -> Self: ...
218
+
219
+ @overload
220
+ @classmethod
221
+ def create(cls, *, name: str, content: list[Any], description: str | None = None) -> Self: ...
222
+
223
+ @overload
224
+ @classmethod
225
+ def create(cls, *, name: str, content: BaseModel, description: str | None = None) -> Self: ...
226
+
227
+ @classmethod
228
+ def create(
229
+ cls,
230
+ *,
231
+ name: str,
232
+ content: str | bytes | dict[str, Any] | list[Any] | BaseModel,
233
+ description: str | None = None,
234
+ ) -> Self:
235
+ r"""Create a Document with automatic content type conversion (recommended).
236
+
237
+ @public
238
+
239
+ This is the **recommended way to create documents**. It accepts various
240
+ content types and automatically converts them to bytes based on the file
241
+ extension. Use the `parse` method to reverse this conversion.
242
+
243
+ Best Practice (90% of cases):
244
+ Only provide name and content. The description parameter is RARELY needed.
245
+
246
+ Args:
247
+ name: Document filename (required, keyword-only).
248
+ Extension determines serialization:
249
+ - .json → JSON serialization
250
+ - .yaml/.yml → YAML serialization
251
+ - .md → Markdown list joining (for list[str])
252
+ - Others → UTF-8 encoding (for str)
253
+ content: Document content in various formats (required, keyword-only):
254
+ - bytes: Used directly without conversion
255
+ - str: Encoded to UTF-8 bytes
256
+ - dict[str, Any]: Serialized to JSON (.json) or YAML (.yaml/.yml)
257
+ - list[str]: Joined with separator for .md (validates no items
258
+ contain separator), else JSON/YAML
259
+ - list[BaseModel]: Serialized to JSON or YAML based on extension
260
+ - BaseModel: Serialized to JSON or YAML based on extension
261
+ description: Optional description - USUALLY OMIT THIS (defaults to None).
262
+ Only use when meaningful metadata helps downstream processing
263
+
264
+ Returns:
265
+ New Document instance with content converted to bytes
266
+
267
+ Raises:
268
+ ValueError: If content type is not supported for the file extension,
269
+ or if markdown list items contain the separator
270
+ DocumentNameError: If filename violates validation rules
271
+ DocumentSizeError: If content exceeds MAX_CONTENT_SIZE
272
+
273
+ Note:
274
+ All conversions are reversible using the `parse` method.
275
+ For example: MyDocument.create(name="data.json", content={"key": "value"}).parse(dict)
276
+ returns the original dictionary {"key": "value"}.
277
+
278
+ Example:
279
+ >>> # CORRECT - no description needed (90% of cases)
280
+ >>> doc = MyDocument.create(name="test.txt", content="Hello World")
281
+ >>> doc.content # b'Hello World'
282
+ >>> doc.parse(str) # "Hello World"
283
+
284
+ >>> # CORRECT - Dictionary to JSON, no description
285
+ >>> doc = MyDocument.create(name="config.json", content={"key": "value"})
286
+ >>> doc.content # b'{"key": "value", ...}'
287
+ >>> doc.parse(dict) # {"key": "value"}
288
+
289
+ >>> # AVOID unless description adds real value
290
+ >>> doc = MyDocument.create(
291
+ ... name="config.json",
292
+ ... content={"key": "value"},
293
+ ... description="Config file" # Usually redundant!
294
+ ... )
295
+
296
+ >>> # Pydantic model to YAML
297
+ >>> from pydantic import BaseModel
298
+ >>> class Config(BaseModel):
299
+ ... host: str
300
+ ... port: int
301
+ >>> config = Config(host="localhost", port=8080)
302
+ >>> doc = MyDocument.create(name="config.yaml", content=config)
303
+ >>> doc.parse(Config) # Returns Config instance
304
+
305
+ >>> # List to Markdown
306
+ >>> items = ["Section 1", "Section 2"]
307
+ >>> doc = MyDocument.create(name="sections.md", content=items)
308
+ >>> doc.parse(list) # ["Section 1", "Section 2"]
309
+ """
310
+ # Use model_validate to leverage the existing validator logic
311
+ temp = cls.model_validate({"name": name, "content": content, "description": description})
312
+ # Now construct with type-checker-friendly call (bytes only)
313
+ return cls(name=temp.name, content=temp.content, description=temp.description)
314
+
315
+ def __init__(
316
+ self,
317
+ *,
318
+ name: str,
319
+ content: bytes,
320
+ description: str | None = None,
321
+ ) -> None:
322
+ """Initialize a Document instance with raw bytes content.
323
+
324
+ @public
325
+
326
+ Important:
327
+ **Most users should use the `create` classmethod instead of __init__.**
328
+ The create method provides automatic content conversion for various types
329
+ (str, dict, list, Pydantic models) while __init__ only accepts bytes.
330
+
331
+ This constructor accepts only bytes content for type safety. It prevents
332
+ direct instantiation of the abstract Document class.
333
+
334
+ Args:
335
+ name: Document filename (required, keyword-only)
336
+ content: Document content as raw bytes (required, keyword-only)
337
+ description: Optional human-readable description (keyword-only)
338
+
339
+ Raises:
340
+ TypeError: If attempting to instantiate Document directly.
341
+
342
+ Example:
343
+ >>> # Direct constructor - only for bytes content:
344
+ >>> doc = MyDocument(name="test.txt", content=b"Hello World")
345
+ >>> doc.content # b'Hello World'
346
+
347
+ >>> # RECOMMENDED: Use create for automatic conversion:
348
+ >>> doc = MyDocument.create(name="text.txt", content="Hello World")
349
+ >>> doc = MyDocument.create(name="data.json", content={"key": "value"})
350
+ >>> doc = MyDocument.create(name="config.yaml", content=my_model)
351
+ >>> doc = MyDocument.create(name="items.md", content=["item1", "item2"])
352
+
353
+ See Also:
354
+ create: Recommended factory method with automatic type conversion
355
+ parse: Method to reverse the conversion done by create
356
+ """
81
357
  if type(self) is Document:
82
358
  raise TypeError("Cannot instantiate abstract Document class directly")
83
- super().__init__(**data)
359
+
360
+ super().__init__(name=name, content=content, description=description)
84
361
 
85
362
  name: str
86
363
  description: str | None = None
87
- content: bytes
364
+ content: bytes # Note: the create() path and the mode="before" validator accept richer input types, but the field itself always stores bytes
88
365
 
89
366
  # Pydantic configuration
90
367
  model_config = ConfigDict(
@@ -95,38 +372,96 @@ class Document(BaseModel, ABC):
95
372
 
96
373
  @abstractmethod
97
374
  def get_base_type(self) -> Literal["flow", "task", "temporary"]:
98
- """Get the type of the document - must be implemented by subclasses"""
375
+ """Get the base type of the document.
376
+
377
+ Abstract method that must be implemented by all Document subclasses
378
+ to indicate their persistence behavior.
379
+
380
+ Returns:
381
+ One of "flow" (persisted across flow runs), "task" (temporary
382
+ within task execution), or "temporary" (never persisted).
383
+
384
+ Note:
385
+ This method determines document persistence and lifecycle.
386
+ FlowDocument returns "flow", TaskDocument returns "task",
387
+ TemporaryDocument returns "temporary".
388
+ """
99
389
  raise NotImplementedError("Subclasses must implement this method")
100
390
 
101
391
  @final
102
392
  @property
103
393
  def base_type(self) -> Literal["flow", "task", "temporary"]:
104
- """Alias for document_type for backward compatibility"""
394
+ """Get the document's base type.
395
+
396
+ Property alias for get_base_type() providing a cleaner API.
397
+ This property cannot be overridden by subclasses.
398
+
399
+ Returns:
400
+ The document's base type: "flow", "task", or "temporary".
401
+ """
105
402
  return self.get_base_type()
106
403
 
107
404
  @final
108
405
  @property
109
406
  def is_flow(self) -> bool:
110
- """Check if document is a flow document"""
407
+ """Check if this is a flow document.
408
+
409
+ Flow documents persist across Prefect flow runs and are saved
410
+ to the file system between pipeline steps.
411
+
412
+ Returns:
413
+ True if this is a FlowDocument subclass, False otherwise.
414
+ """
111
415
  return self.get_base_type() == "flow"
112
416
 
113
417
  @final
114
418
  @property
115
419
  def is_task(self) -> bool:
116
- """Check if document is a task document"""
420
+ """Check if this is a task document.
421
+
422
+ Task documents are temporary within Prefect task execution
423
+ and are not persisted between pipeline steps.
424
+
425
+ Returns:
426
+ True if this is a TaskDocument subclass, False otherwise.
427
+ """
117
428
  return self.get_base_type() == "task"
118
429
 
119
430
  @final
120
431
  @property
121
432
  def is_temporary(self) -> bool:
122
- """Check if document is a temporary document"""
433
+ """Check if this is a temporary document.
434
+
435
+ Temporary documents are never persisted and exist only
436
+ during execution.
437
+
438
+ Returns:
439
+ True if this is a TemporaryDocument, False otherwise.
440
+ """
123
441
  return self.get_base_type() == "temporary"
124
442
 
125
443
  @final
126
444
  @classmethod
127
445
  def get_expected_files(cls) -> list[str] | None:
128
- """
129
- Return the list of allowed file names for this document class, or None if unrestricted.
446
+ """Get the list of allowed file names for this document class.
447
+
448
+ If the document class defines a FILES enum, returns the list of
449
+ valid file names. Used to restrict documents to specific files.
450
+
451
+ Returns:
452
+ List of allowed file names if FILES enum is defined,
453
+ None if unrestricted.
454
+
455
+ Raises:
456
+ DocumentNameError: If FILES is defined but not a valid StrEnum.
457
+
458
+ Example:
459
+ >>> class ConfigDocument(FlowDocument):
460
+ ... class FILES(StrEnum):
461
+ ... CONFIG = "config.yaml"
462
+ ... SETTINGS = "settings.json"
463
+ >>> ConfigDocument.get_expected_files()
464
+ ['config.yaml', 'settings.json']
130
465
  """
131
466
  if not hasattr(cls, "FILES"):
132
467
  return None
@@ -144,16 +479,38 @@ class Document(BaseModel, ABC):
144
479
 
145
480
  @classmethod
146
481
  def validate_file_name(cls, name: str) -> None:
147
- """
148
- Optional file-name validation hook.
482
+ """Validate that a file name matches allowed patterns.
483
+
484
+ @public
485
+
486
+ DO NOT OVERRIDE this method if you define a FILES enum!
487
+ The validation is automatic when FILES enum is present.
488
+
489
+ # CORRECT - FILES enum provides automatic validation:
490
+ class MyDocument(FlowDocument):
491
+ class FILES(StrEnum):
492
+ CONFIG = "config.yaml" # Validation happens automatically!
493
+
494
+ # WRONG - Unnecessary override:
495
+ class MyDocument(FlowDocument):
496
+ class FILES(StrEnum):
497
+ CONFIG = "config.yaml"
149
498
 
150
- Default behavior:
151
- - If `FILES` enum is defined on the subclass, ensure the **basename** of `name`
152
- equals one of the enum values (exact string match).
153
- - If `FILES` is None, do nothing.
499
+ def validate_file_name(cls, name): # DON'T DO THIS!
500
+ pass # Validation already happens via FILES enum
154
501
 
155
- Override this method in subclasses for custom conventions (regex, prefixes, etc.).
156
- Raise DocumentNameError when invalid.
502
+ Only override for custom validation logic BEYOND FILES enum constraints.
503
+
504
+ Args:
505
+ name: The file name to validate.
506
+
507
+ Raises:
508
+ DocumentNameError: If the name doesn't match allowed patterns.
509
+
510
+ Note:
511
+ - If FILES enum is defined, name must exactly match one of the values
512
+ - If FILES is not defined, any name is allowed
513
+ - Override in subclasses ONLY for custom regex patterns or logic
157
514
  """
158
515
  allowed = cls.get_expected_files()
159
516
  if not allowed:
@@ -165,7 +522,30 @@ class Document(BaseModel, ABC):
165
522
 
166
523
  @field_validator("name")
167
524
  def validate_name(cls, v: str) -> str:
168
- """Validate document name matches expected patterns and is secure"""
525
+ r"""Pydantic validator for the document name field.
526
+
527
+ Ensures the document name is secure and follows conventions:
528
+ - No path traversal characters (.., \\, /)
529
+ - Cannot end with .description.md
530
+ - No leading/trailing whitespace
531
+ - Must match FILES enum if defined
532
+
533
+ Performance:
534
+ Validation is O(n) where n is the length of the name.
535
+ FILES enum check is O(m) where m is the number of allowed files
536
+
537
+ Args:
538
+ v: The name value to validate.
539
+
540
+ Returns:
541
+ The validated name.
542
+
543
+ Raises:
544
+ DocumentNameError: If the name violates any validation rules.
545
+
546
+ Note:
547
+ This is called automatically by Pydantic during model construction.
548
+ """
169
549
  if v.endswith(cls.DESCRIPTION_EXTENSION):
170
550
  raise DocumentNameError(
171
551
  f"Document names cannot end with {cls.DESCRIPTION_EXTENSION}: {v}"
@@ -181,9 +561,149 @@ class Document(BaseModel, ABC):
181
561
 
182
562
  return v
183
563
 
184
- @field_validator("content")
185
- def validate_content(cls, v: bytes) -> bytes:
186
- """Validate content size"""
564
+ @field_validator("content", mode="before")
565
+ @classmethod
566
+ def validate_content(cls, v: Any, info: ValidationInfo) -> bytes:
567
+ """Pydantic validator that converts various content types to bytes.
568
+
569
+ This validator is called automatically during model construction and
570
+ handles the intelligent type conversion that powers the `create` method.
571
+ It determines the appropriate serialization based on file extension.
572
+
573
+ Conversion Strategy:
574
+ 1. bytes → Passthrough (no conversion)
575
+ 2. str → UTF-8 encoding
576
+ 3. dict/BaseModel + .json → JSON serialization (indented)
577
+ 4. dict/BaseModel + .yaml/.yml → YAML serialization
578
+ 5. list[str] + .md → Join with markdown separator (validates no items contain separator)
579
+ 6. list[Any] + .json/.yaml → JSON/YAML array
580
+ 7. int/float/bool + .json → JSON primitive
581
+
582
+ Args:
583
+ v: Content to validate (any supported type)
584
+ info: Validation context containing other field values
585
+
586
+ Returns:
587
+ Content converted to bytes
588
+
589
+ Raises:
590
+ DocumentSizeError: If content exceeds MAX_CONTENT_SIZE
591
+ ValueError: If content type unsupported for file extension
592
+
593
+ Note:
594
+ This validator enables create() to accept multiple types while
595
+ ensuring __init__ only receives bytes for type safety.
596
+ """
597
+ # Get the name from validation context if available
598
+ name = ""
599
+ if hasattr(info, "data") and "name" in info.data:
600
+ name = info.data["name"]
601
+ name_lower = name.lower()
602
+
603
+ # Convert based on content type
604
+ if isinstance(v, bytes):
605
+ pass # Already bytes
606
+ elif isinstance(v, str):
607
+ v = v.encode("utf-8")
608
+ elif isinstance(v, dict):
609
+ # Serialize dict based on extension
610
+ if name_lower.endswith((".yaml", ".yml")):
611
+ # Use YAML format for YAML files
612
+ yaml = YAML()
613
+ stream = BytesIO()
614
+ yaml.dump(v, stream)
615
+ v = stream.getvalue()
616
+ elif name_lower.endswith(".json"):
617
+ # Use JSON for JSON files
618
+ v = json.dumps(v, indent=2).encode("utf-8")
619
+ else:
620
+ # Dict not supported for other file types
621
+ raise ValueError(f"Unsupported content type: {type(v)} for file {name}")
622
+ elif isinstance(v, list):
623
+ # Handle lists based on file extension
624
+ if name_lower.endswith(".md"):
625
+ # For markdown files, join with separator
626
+ if all(isinstance(item, str) for item in v):
627
+ # Check that no string contains the separator
628
+ for item in v:
629
+ if cls.MARKDOWN_LIST_SEPARATOR in item:
630
+ raise ValueError(
631
+ f"Markdown list item cannot contain the separator "
632
+ f"'{cls.MARKDOWN_LIST_SEPARATOR}' as it will mess up formatting"
633
+ )
634
+ v = cls.MARKDOWN_LIST_SEPARATOR.join(v).encode("utf-8")
635
+ else:
636
+ raise ValueError(
637
+ f"Unsupported content type: mixed-type list for markdown file {name}"
638
+ )
639
+ elif name_lower.endswith((".yaml", ".yml")):
640
+ # Check if it's a list of Pydantic models
641
+ if v and isinstance(v[0], BaseModel):
642
+ # Convert models to dicts first
643
+ v = [item.model_dump(mode="json") for item in v]
644
+ # Use YAML format for YAML files
645
+ yaml = YAML()
646
+ stream = BytesIO()
647
+ yaml.dump(v, stream)
648
+ v = stream.getvalue()
649
+ elif name_lower.endswith(".json"):
650
+ # Check if it's a list of Pydantic models
651
+ if v and isinstance(v[0], BaseModel):
652
+ # Convert models to dicts first
653
+ v = [item.model_dump(mode="json") for item in v]
654
+ # For JSON files, serialize as JSON
655
+ v = json.dumps(v, indent=2).encode("utf-8")
656
+ else:
657
+ # Check if it's a list of BaseModel
658
+ if v and isinstance(v[0], BaseModel):
659
+ raise ValueError("list[BaseModel] requires .json or .yaml extension")
660
+ # List content not supported for other file types
661
+ raise ValueError(f"Unsupported content type: {type(v)} for file {name}")
662
+ elif isinstance(v, BaseModel):
663
+ # Serialize Pydantic models
664
+ if name_lower.endswith((".yaml", ".yml")):
665
+ yaml = YAML()
666
+ stream = BytesIO()
667
+ yaml.dump(v.model_dump(mode="json"), stream)
668
+ v = stream.getvalue()
669
+ else:
670
+ v = json.dumps(v.model_dump(mode="json"), indent=2).encode("utf-8")
671
+ elif isinstance(v, (int, float, bool)):
672
+ # Numbers and booleans: JSON-serialize for .json, string for others
673
+ if name_lower.endswith(".json"):
674
+ v = json.dumps(v).encode("utf-8")
675
+ elif name_lower.endswith((".yaml", ".yml")):
676
+ v = str(v).encode("utf-8")
677
+ elif name_lower.endswith(".txt"):
678
+ v = str(v).encode("utf-8")
679
+ else:
680
+ # For other extensions, convert to string
681
+ v = str(v).encode("utf-8")
682
+ elif v is None:
683
+ # Handle None - only supported for JSON/YAML
684
+ if name_lower.endswith((".json", ".yaml", ".yml")):
685
+ if name_lower.endswith((".yaml", ".yml")):
686
+ v = b"null\n"
687
+ else:
688
+ v = b"null"
689
+ else:
690
+ raise ValueError(f"Unsupported content type: {type(None)} for file {name}")
691
+ else:
692
+ # Try to see if it has model_dump (duck typing for Pydantic-like)
693
+ if hasattr(v, "model_dump"):
694
+ if name_lower.endswith((".yaml", ".yml")):
695
+ yaml = YAML()
696
+ stream = BytesIO()
697
+ yaml.dump(v.model_dump(mode="json"), stream) # type: ignore[attr-defined]
698
+ v = stream.getvalue()
699
+ else:
700
+ v = json.dumps(v.model_dump(mode="json"), indent=2).encode("utf-8") # type: ignore[attr-defined]
701
+ else:
702
+ # List non-.json files should raise error
703
+ if name_lower.endswith(".txt") and isinstance(v, list):
704
+ raise ValueError("List content not supported for text files")
705
+ raise ValueError(f"Unsupported content type: {type(v)}")
706
+
187
707
  # Check content size limit
188
708
  max_size = getattr(cls, "MAX_CONTENT_SIZE", 100 * 1024 * 1024)
189
709
  if len(v) > max_size:
@@ -195,7 +715,23 @@ class Document(BaseModel, ABC):
195
715
 
196
716
  @field_serializer("content")
197
717
  def serialize_content(self, v: bytes) -> str:
198
- """Serialize bytes content to string for JSON serialization"""
718
+ """Pydantic serializer for content field.
719
+
720
+ Converts bytes content to string for JSON serialization.
721
+ Attempts UTF-8 decoding first, falls back to base64 encoding
722
+ for binary content.
723
+
724
+ Args:
725
+ v: The content bytes to serialize.
726
+
727
+ Returns:
728
+ UTF-8 decoded string for text content,
729
+ base64-encoded string for binary content.
730
+
731
+ Note:
732
+ This is called automatically by Pydantic during
733
+ model serialization to JSON.
734
+ """
199
735
  try:
200
736
  return v.decode("utf-8")
201
737
  except UnicodeDecodeError:
@@ -205,64 +741,264 @@ class Document(BaseModel, ABC):
205
741
  @final
206
742
  @property
207
743
  def id(self) -> str:
208
- """Return the first 6 characters of the SHA256 hash of the content, encoded in base32"""
744
+ """Get a short unique identifier for the document.
745
+
746
+ @public
747
+
748
+ This ID is crucial for LLM interactions. When documents are provided to
749
+ LLMs via generate() or generate_structured(), their IDs are included,
750
+ allowing the LLM to reference documents in prompts by either name or ID.
751
+ The ID is content-based (derived from SHA256 hash of content only),
752
+ so the same content always produces the same ID. Changing the name or
753
+ description does NOT change the ID.
754
+
755
+ Returns:
756
+ 6-character base32-encoded string (uppercase, e.g., "A7B2C9").
757
+ This is the first 6 chars of the full base32 SHA256, NOT hex.
758
+
759
+ Collision Rate:
760
+ With base32 encoding (5 bits per char), 6 chars = 30 bits.
761
+ Expect collisions after ~32K documents (birthday paradox).
762
+ For higher uniqueness requirements, use the full sha256 property.
763
+
764
+ Note:
765
+ While shorter than full SHA256, this provides
766
+ reasonable uniqueness for most use cases.
767
+ """
209
768
  return self.sha256[:6]
210
769
 
211
770
  @final
212
771
  @cached_property
213
772
  def sha256(self) -> str:
214
- """Full SHA256 hash of content, encoded in base32"""
773
+ """Get the full SHA256 hash of the document content.
774
+
775
+ @public
776
+
777
+ Computes and caches the SHA256 hash of the content,
778
+ encoded in base32 (uppercase). Used for content
779
+ deduplication and integrity verification.
780
+
781
+ Returns:
782
+ Full SHA256 hash as base32-encoded uppercase string.
783
+
784
+ Why Base32 Instead of Hex:
785
+ - Base32 is case-insensitive, avoiding issues with different file systems
786
+ and AI interactions where casing might be inconsistent
787
+ - More compact than hex (52 chars vs 64 chars for SHA-256)
788
+ - Contains more information per character than hex (5 bits vs 4 bits)
789
+ - Safe for URLs without encoding
790
+ - Compatible with case-insensitive file systems
791
+ - Avoids confusion in AI interactions where models might change casing
792
+ - Not base64 because we want consistent uppercase for all uses
793
+
794
+ Note:
795
+ This is computed once and cached for performance.
796
+ The hash is deterministic based on content only.
797
+ """
215
798
  return b32encode(hashlib.sha256(self.content).digest()).decode("ascii").upper()
216
799
 
217
800
  @final
218
801
  @property
219
802
  def size(self) -> int:
220
- """Size of content in bytes"""
803
+ """Get the size of the document content.
804
+
805
+ @public
806
+
807
+ Returns:
808
+ Size of content in bytes.
809
+
810
+ Note:
811
+ Useful for monitoring document sizes and
812
+ ensuring they stay within limits.
813
+ """
221
814
  return len(self.content)
222
815
 
223
816
  @cached_property
224
817
  def detected_mime_type(self) -> str:
225
- """Detect MIME type from content using python-magic"""
818
+ """Detect the MIME type from document content.
819
+
820
+ Detection strategy (in order):
821
+ 1. Returns 'application/x-empty' for empty content
822
+ 2. Extension-based detection for known text formats (preferred)
823
+ 3. python-magic content analysis for unknown extensions
824
+ 4. Fallback to extension or 'application/octet-stream'
825
+
826
+ Returns:
827
+ MIME type string (e.g., "text/plain", "application/json").
828
+
829
+ Note:
830
+ This is cached after first access. Extension-based detection
831
+ is preferred for text formats to avoid misidentification.
832
+ """
226
833
  return detect_mime_type(self.content, self.name)
227
834
 
228
835
  @property
229
836
  def mime_type(self) -> str:
230
- """Get MIME type - uses content detection with fallback to extension"""
837
+ """Get the document's MIME type.
838
+
839
+ @public
840
+
841
+ Primary property for accessing MIME type information.
842
+ Automatically detects MIME type based on file extension and content.
843
+
844
+ Returns:
845
+ MIME type string (e.g., "text/plain", "application/json").
846
+
847
+ Note:
848
+ MIME type detection uses extension-based detection for known
849
+ text formats and content analysis for binary formats.
850
+ """
231
851
  return self.detected_mime_type
232
852
 
233
853
  @property
234
854
  def is_text(self) -> bool:
235
- """Check if document is text based on MIME type"""
855
+ """Check if document contains text content.
856
+
857
+ @public
858
+
859
+ Returns:
860
+ True if MIME type indicates text content
861
+ (text/*, application/json, application/x-yaml, text/yaml, etc.),
862
+ False otherwise.
863
+
864
+ Note:
865
+ Used to determine if text property can be safely accessed.
866
+ """
236
867
  return is_text_mime_type(self.mime_type)
237
868
 
238
869
  @property
239
870
  def is_pdf(self) -> bool:
240
- """Check if document is PDF"""
871
+ """Check if document is a PDF file.
872
+
873
+ @public
874
+
875
+ Returns:
876
+ True if MIME type is application/pdf, False otherwise.
877
+
878
+ Note:
879
+ PDF documents require special handling and are
880
+ supported by certain LLM models.
881
+ """
241
882
  return is_pdf_mime_type(self.mime_type)
242
883
 
243
884
  @property
244
885
  def is_image(self) -> bool:
245
- """Check if document is an image"""
886
+ """Check if document is an image file.
887
+
888
+ @public
889
+
890
+ Returns:
891
+ True if MIME type starts with "image/", False otherwise.
892
+
893
+ Note:
894
+ Image documents are automatically encoded for
895
+ vision-capable LLM models.
896
+ """
246
897
  return is_image_mime_type(self.mime_type)
247
898
 
248
899
  @classmethod
249
900
  def canonical_name(cls) -> str:
250
- """Get the canonical name of the document"""
901
+ """Get the canonical name for this document class.
902
+
903
+ Returns a standardized snake_case name derived from the
904
+ class name, used for directory naming and identification.
905
+
906
+ Returns:
907
+ Snake_case canonical name.
908
+
909
+ Example:
910
+ >>> class UserDataDocument(FlowDocument): ...
911
+ >>> UserDataDocument.canonical_name()
912
+ 'user_data'
913
+ """
251
914
  return canonical_name_key(cls)
252
915
 
253
- def as_text(self) -> str:
254
- """Parse document as text"""
916
+ @property
917
+ def text(self) -> str:
918
+ """Get document content as UTF-8 text string.
919
+
920
+ @public
921
+
922
+ Decodes the bytes content as UTF-8 text. Only available for
923
+ text-based documents (check is_text property first).
924
+
925
+ Returns:
926
+ UTF-8 decoded string.
927
+
928
+ Raises:
929
+ ValueError: If document is not text (is_text == False).
930
+
931
+ Example:
932
+ >>> doc = MyDocument.create(name="data.txt", content="Hello \u2728")
933
+ >>> if doc.is_text:
934
+ ... print(doc.text) # "Hello \u2728"
935
+
936
+ >>> # Binary document raises error:
937
+ >>> binary_doc = MyDocument(name="image.png", content=png_bytes)
938
+ >>> binary_doc.text # Raises ValueError
939
+ """
255
940
  if not self.is_text:
256
941
  raise ValueError(f"Document is not text: {self.name}")
257
942
  return self.content.decode("utf-8")
258
943
 
259
944
  def as_yaml(self) -> Any:
260
- """Parse document as YAML"""
261
- return YAML().load(self.as_text())
945
+ r"""Parse document content as YAML.
946
+
947
+ @public
948
+
949
+ Parses the document's text content as YAML and returns Python objects.
950
+ Uses ruamel.yaml which is safe by default (no code execution).
951
+
952
+ Returns:
953
+ Parsed YAML data: dict, list, str, int, float, bool, or None.
954
+
955
+ Raises:
956
+ ValueError: If document is not text-based.
957
+ YAMLError: If content is not valid YAML.
958
+
959
+ Example:
960
+ >>> # From dict content
961
+ >>> doc = MyDocument.create(name="config.yaml", content={
962
+ ... "server": {"host": "localhost", "port": 8080}
963
+ ... })
964
+ >>> doc.as_yaml() # {'server': {'host': 'localhost', 'port': 8080}}
965
+
966
+ >>> # From YAML string
967
+ >>> doc2 = MyDocument(name="simple.yml", content=b"key: value\nitems:\n - a\n - b")
968
+ >>> doc2.as_yaml() # {'key': 'value', 'items': ['a', 'b']}
969
+ """
970
+ yaml = YAML()
971
+ return yaml.load(self.text) # type: ignore[no-untyped-call, no-any-return]
262
972
 
263
973
  def as_json(self) -> Any:
264
- """Parse document as JSON"""
265
- return json.loads(self.as_text())
974
+ """Parse document content as JSON.
975
+
976
+ @public
977
+
978
+ Parses the document's text content as JSON and returns Python objects.
979
+ Document must contain valid JSON text.
980
+
981
+ Returns:
982
+ Parsed JSON data: dict, list, str, int, float, bool, or None.
983
+
984
+ Raises:
985
+ ValueError: If document is not text-based.
986
+ JSONDecodeError: If content is not valid JSON.
987
+
988
+ Example:
989
+ >>> # From dict content
990
+ >>> doc = MyDocument.create(name="data.json", content={"key": "value"})
991
+ >>> doc.as_json() # {'key': 'value'}
992
+
993
+ >>> # From JSON string
994
+ >>> doc2 = MyDocument(name="array.json", content=b'[1, 2, 3]')
995
+ >>> doc2.as_json() # [1, 2, 3]
996
+
997
+ >>> # Invalid JSON
998
+ >>> bad_doc = MyDocument(name="bad.json", content=b"not json")
999
+ >>> bad_doc.as_json() # Raises JSONDecodeError
1000
+ """
1001
+ return json.loads(self.text)
266
1002
 
267
1003
  @overload
268
1004
  def as_pydantic_model(self, model_type: type[TModel]) -> TModel: ...
@@ -273,126 +1009,243 @@ class Document(BaseModel, ABC):
273
1009
  def as_pydantic_model(
274
1010
  self, model_type: type[TModel] | type[list[TModel]]
275
1011
  ) -> TModel | list[TModel]:
276
- """Parse document as a pydantic model and return the validated instance"""
1012
+ """Parse document content as Pydantic model with validation.
1013
+
1014
+ @public
1015
+
1016
+ Parses JSON or YAML content and validates it against a Pydantic model.
1017
+ Automatically detects format based on MIME type. Supports both single
1018
+ models and lists of models.
1019
+
1020
+ Args:
1021
+ model_type: Pydantic model class to validate against.
1022
+ Can be either:
1023
+ - type[Model] for single model
1024
+ - type[list[Model]] for list of models
1025
+
1026
+ Returns:
1027
+ Validated Pydantic model instance or list of instances.
1028
+
1029
+ Raises:
1030
+ ValueError: If document is not text or type mismatch.
1031
+ ValidationError: If data doesn't match model schema.
1032
+ JSONDecodeError/YAMLError: If content parsing fails.
1033
+
1034
+ Example:
1035
+ >>> from pydantic import BaseModel
1036
+ >>>
1037
+ >>> class User(BaseModel):
1038
+ ... name: str
1039
+ ... age: int
1040
+ >>>
1041
+ >>> # Single model
1042
+ >>> doc = MyDocument.create(name="user.json",
1043
+ ... content={"name": "Alice", "age": 30})
1044
+ >>> user = doc.as_pydantic_model(User)
1045
+ >>> print(user.name) # "Alice"
1046
+ >>>
1047
+ >>> # List of models
1048
+ >>> doc2 = MyDocument.create(name="users.json",
1049
+ ... content=[{"name": "Bob", "age": 25}, {"name": "Eve", "age": 28}])
1050
+ >>> users = doc2.as_pydantic_model(list[User])
1051
+ >>> print(len(users)) # 2
1052
+ """
277
1053
  data = self.as_yaml() if is_yaml_mime_type(self.mime_type) else self.as_json()
278
1054
 
279
1055
  if get_origin(model_type) is list:
280
1056
  if not isinstance(data, list):
281
1057
  raise ValueError(f"Expected list data for {model_type}, got {type(data)}")
282
1058
  item_type = get_args(model_type)[0]
283
- return [item_type.model_validate(item) for item in data]
1059
+ # Type guard for list case
1060
+ result_list = [item_type.model_validate(item) for item in data] # type: ignore[attr-defined]
1061
+ return cast(list[TModel], result_list)
284
1062
 
285
1063
  # At this point model_type must be type[TModel], not type[list[TModel]]
286
1064
  single_model = cast(type[TModel], model_type)
287
1065
  return single_model.model_validate(data)
288
1066
 
289
1067
  def as_markdown_list(self) -> list[str]:
290
- """Parse document as a markdown list"""
291
- return self.as_text().split(self.MARKDOWN_LIST_SEPARATOR)
1068
+ r"""Parse document as markdown-separated list of sections.
292
1069
 
293
- @overload
294
- @classmethod
295
- def create(cls, name: str, content: ContentInput, /) -> Self: ...
296
- @overload
297
- @classmethod
298
- def create(cls, name: str, *, content: ContentInput) -> Self: ...
299
- @overload
300
- @classmethod
301
- def create(cls, name: str, description: str | None, content: ContentInput, /) -> Self: ...
302
- @overload
303
- @classmethod
304
- def create(cls, name: str, description: str | None, *, content: ContentInput) -> Self: ...
1070
+ @public
305
1071
 
306
- @classmethod
307
- def create(
308
- cls,
309
- name: str,
310
- description: ContentInput = None,
311
- content: ContentInput = None,
312
- ) -> Self:
313
- """Create a document from a name, description, and content"""
314
- if content is None:
315
- if description is None:
316
- raise ValueError(f"Unsupported content type: {type(content)} for {name}")
317
- content = description
318
- description = None
319
- else:
320
- assert description is None or isinstance(description, str)
321
-
322
- is_yaml_extension = name.endswith(".yaml") or name.endswith(".yml")
323
- is_json_extension = name.endswith(".json")
324
- is_markdown_extension = name.endswith(".md")
325
- is_str_list = isinstance(content, list) and all(isinstance(item, str) for item in content)
326
- if isinstance(content, bytes):
327
- pass
328
- elif isinstance(content, str):
329
- content = content.encode("utf-8")
330
- elif is_str_list and is_markdown_extension:
331
- return cls.create_as_markdown_list(name, description, content) # type: ignore[arg-type]
332
- elif isinstance(content, list) and all(isinstance(item, BaseModel) for item in content):
333
- # Handle list[BaseModel] for JSON/YAML files
334
- if is_yaml_extension:
335
- return cls.create_as_yaml(name, description, content)
336
- elif is_json_extension:
337
- return cls.create_as_json(name, description, content)
338
- else:
339
- raise ValueError(f"list[BaseModel] requires .json or .yaml extension, got {name}")
340
- elif is_yaml_extension:
341
- return cls.create_as_yaml(name, description, content)
342
- elif is_json_extension:
343
- return cls.create_as_json(name, description, content)
344
- else:
345
- raise ValueError(f"Unsupported content type: {type(content)} for {name}")
1072
+ Splits text content using markdown separator ("\n\n-----------------\n\n").
1073
+ Designed for markdown documents with multiple sections.
346
1074
 
347
- return cls(name=name, description=description, content=content)
1075
+ Returns:
1076
+ List of string sections (preserves whitespace within sections).
348
1077
 
349
- @final
350
- @classmethod
351
- def create_as_markdown_list(cls, name: str, description: str | None, items: list[str]) -> Self:
352
- """Create a document from a name, description, and list of strings"""
353
- # remove other list separators (lines that are only the separator + whitespace)
354
- separator = Document.MARKDOWN_LIST_SEPARATOR.strip()
355
- pattern = re.compile(rf"^[ \t]*{re.escape(separator)}[ \t]*(?:\r?\n|$)", flags=re.MULTILINE)
356
- # Normalize CRLF/CR to LF before cleaning to ensure consistent behavior
357
- normalized_items = [re.sub(r"\r\n?", "\n", item) for item in items]
358
- cleaned_items = [pattern.sub("", item) for item in normalized_items]
359
- content = Document.MARKDOWN_LIST_SEPARATOR.join(cleaned_items)
360
- return cls.create(name, description, content)
1078
+ Raises:
1079
+ ValueError: If document is not text-based.
361
1080
 
362
- @final
363
- @classmethod
364
- def create_as_json(cls, name: str, description: str | None, data: Any) -> Self:
365
- """Create a document from a name, description, and JSON data"""
366
- assert name.endswith(".json"), f"Document name must end with .json: {name}"
367
- if isinstance(data, BaseModel):
368
- data = data.model_dump(mode="json")
369
- elif isinstance(data, list) and all(isinstance(item, BaseModel) for item in data):
370
- data = [item.model_dump(mode="json") for item in data]
371
- content = json.dumps(data, indent=2).encode("utf-8")
372
- return cls.create(name, description, content)
1081
+ Example:
1082
+ >>> # Using create with list
1083
+ >>> sections = ["# Chapter 1\nIntroduction", "# Chapter 2\nDetails"]
1084
+ >>> doc = MyDocument.create(name="book.md", content=sections)
1085
+ >>> doc.as_markdown_list() # Returns original sections
373
1086
 
374
- @final
375
- @classmethod
376
- def create_as_yaml(cls, name: str, description: str | None, data: Any) -> Self:
377
- """Create a document from a name, description, and YAML data"""
378
- assert name.endswith(".yaml") or name.endswith(".yml"), (
379
- f"Document name must end with .yaml or .yml: {name}"
380
- )
381
- if isinstance(data, BaseModel):
382
- data = data.model_dump(mode="json")
383
- elif isinstance(data, list) and all(isinstance(item, BaseModel) for item in data):
384
- data = [item.model_dump(mode="json") for item in data]
385
- yaml = YAML()
386
- yaml.indent(mapping=2, sequence=4, offset=2)
1087
+ >>> # Manual creation with separator
1088
+ >>> content = "Part 1\n\n-----------------\n\nPart 2\n\n-----------------\n\nPart 3"
1089
+ >>> doc2 = MyDocument(name="parts.md", content=content.encode())
1090
+ >>> doc2.as_markdown_list() # ['Part 1', 'Part 2', 'Part 3']
1091
+ """
1092
+ return self.text.split(self.MARKDOWN_LIST_SEPARATOR)
1093
+
1094
+ def parse(self, type_: type[Any]) -> Any:
1095
+ r"""Parse document content to original type (reverses create conversion).
1096
+
1097
+ @public
387
1098
 
388
- stream = BytesIO()
389
- yaml.dump(data, stream)
390
- content = stream.getvalue()
391
- return cls.create(name, description, content)
1099
+ This method reverses the automatic conversion performed by the `create`
1100
+ classmethod. It intelligently parses the bytes content based on the
1101
+ document's file extension and converts to the requested type.
1102
+
1103
+ Designed for roundtrip conversion:
1104
+ >>> original = {"key": "value"}
1105
+ >>> doc = MyDocument.create(name="data.json", content=original)
1106
+ >>> restored = doc.parse(dict)
1107
+ >>> assert restored == original # True
1108
+
1109
+ Args:
1110
+ type_: Target type to parse content into. Supported types:
1111
+ - bytes: Returns raw content (no conversion)
1112
+ - str: Decodes UTF-8 text
1113
+ - dict: Parses JSON (.json) or YAML (.yaml/.yml)
1114
+ - list: Splits markdown (.md) or parses JSON/YAML
1115
+ - BaseModel subclasses: Validates JSON/YAML into model
1116
+
1117
+ Returns:
1118
+ Content parsed to the requested type.
1119
+
1120
+ Raises:
1121
+ ValueError: If type is unsupported or parsing fails.
1122
+
1123
+ Extension Rules:
1124
+ - .json → JSON parsing for dict/list/BaseModel
1125
+ - .yaml/.yml → YAML parsing for dict/list/BaseModel
1126
+ - .md + list → Split by markdown separator
1127
+ - Any + str → UTF-8 decode
1128
+ - Any + bytes → Raw content
1129
+
1130
+ Example:
1131
+ >>> # String content
1132
+ >>> doc = MyDocument(name="test.txt", content=b"Hello")
1133
+ >>> doc.parse(str)
1134
+ 'Hello'
1135
+
1136
+ >>> # JSON content
1137
+ >>> doc = MyDocument.create(name="data.json", content={"key": "value"})
1138
+ >>> doc.parse(dict) # Returns {'key': 'value'}
1139
+
1140
+ >>> # Markdown list
1141
+ >>> items = ["Item 1", "Item 2"]
1142
+ >>> content = "\n\n---\n\n".join(items).encode()
1143
+ >>> doc = MyDocument(name="list.md", content=content)
1144
+ >>> doc.parse(list)
1145
+ ['Item 1', 'Item 2']
1146
+ """
1147
+ # Handle basic types
1148
+ if type_ is bytes:
1149
+ return self.content
1150
+ elif type_ is str:
1151
+ # Handle empty content specially
1152
+ if len(self.content) == 0:
1153
+ return ""
1154
+ return self.text
1155
+
1156
+ # Handle structured data based on extension
1157
+ name_lower = self.name.lower()
1158
+
1159
+ # JSON files
1160
+ if name_lower.endswith(".json"):
1161
+ if type_ is dict or type_ is list:
1162
+ result = self.as_json()
1163
+ # Ensure the result is the correct type
1164
+ if type_ is dict and not isinstance(result, dict):
1165
+ raise ValueError(f"Expected dict but got {type(result).__name__}")
1166
+ if type_ is list and not isinstance(result, list):
1167
+ raise ValueError(f"Expected list but got {type(result).__name__}")
1168
+ return result
1169
+ elif issubclass(type_, BaseModel):
1170
+ return self.as_pydantic_model(type_)
1171
+ else:
1172
+ raise ValueError(f"Cannot parse JSON file to type {type_}")
1173
+
1174
+ # YAML files
1175
+ elif name_lower.endswith((".yaml", ".yml")):
1176
+ if type_ is dict or type_ is list:
1177
+ result = self.as_yaml()
1178
+ # Ensure the result is the correct type
1179
+ if type_ is dict and not isinstance(result, dict):
1180
+ raise ValueError(f"Expected dict but got {type(result).__name__}")
1181
+ if type_ is list and not isinstance(result, list):
1182
+ raise ValueError(f"Expected list but got {type(result).__name__}")
1183
+ return result
1184
+ elif issubclass(type_, BaseModel):
1185
+ return self.as_pydantic_model(type_)
1186
+ else:
1187
+ raise ValueError(f"Cannot parse YAML file to type {type_}")
1188
+
1189
+ # Markdown files with lists
1190
+ elif name_lower.endswith(".md") and type_ is list:
1191
+ return self.as_markdown_list()
1192
+
1193
+ # Default: try to return as requested basic type
1194
+ elif type_ is dict or type_ is list:
1195
+ # Try JSON first, then YAML
1196
+ try:
1197
+ result = self.as_json()
1198
+ # Ensure the result is the correct type
1199
+ if type_ is dict and not isinstance(result, dict):
1200
+ raise ValueError(f"Expected dict but got {type(result).__name__}")
1201
+ if type_ is list and not isinstance(result, list):
1202
+ raise ValueError(f"Expected list but got {type(result).__name__}")
1203
+ return result
1204
+ except (json.JSONDecodeError, ValueError):
1205
+ try:
1206
+ result = self.as_yaml()
1207
+ # Ensure the result is the correct type
1208
+ if type_ is dict and not isinstance(result, dict):
1209
+ raise ValueError(f"Expected dict but got {type(result).__name__}")
1210
+ if type_ is list and not isinstance(result, list):
1211
+ raise ValueError(f"Expected list but got {type(result).__name__}")
1212
+ return result
1213
+ except Exception as e:
1214
+ raise ValueError(f"Cannot parse content to {type_}") from e
1215
+
1216
+ raise ValueError(f"Unsupported type {type_} for file {self.name}")
392
1217
 
393
1218
  @final
394
1219
  def serialize_model(self) -> dict[str, Any]:
395
- """Serialize document to a dictionary with proper encoding."""
1220
+ """Serialize document to dictionary for storage or transmission.
1221
+
1222
+ Creates a complete JSON-serializable representation of the document
1223
+ with all metadata and properly encoded content. Automatically chooses
1224
+ the most appropriate encoding (UTF-8 for text, base64 for binary).
1225
+
1226
+ Returns:
1227
+ Dictionary with the following keys:
1228
+ - name: Document filename (str)
1229
+ - description: Optional description (str | None)
1230
+ - base_type: Persistence type - "flow", "task", or "temporary" (str)
1231
+ - size: Content size in bytes (int)
1232
+ - id: Short hash identifier, first 6 chars of SHA256 (str)
1233
+ - sha256: Full SHA256 hash in base32 encoding (str)
1234
+ - mime_type: Detected MIME type (str)
1235
+ - content: Encoded content (str)
1236
+ - content_encoding: Either "utf-8" or "base64" (str)
1237
+
1238
+ Encoding Strategy:
1239
+ - Text files (text/*, application/json, etc.) → UTF-8 string
1240
+ - Binary files (images, PDFs, etc.) → Base64 string
1241
+ - Invalid UTF-8 in text files → UTF-8 with replacement chars
1242
+
1243
+ Example:
1244
+ >>> doc = MyDocument.create(name="data.json", content={"key": "value"})
1245
+ >>> serialized = doc.serialize_model()
1246
+ >>> serialized["content_encoding"] # "utf-8"
1247
+ >>> serialized["mime_type"] # "application/json"
1248
+ """
396
1249
  result = {
397
1250
  "name": self.name,
398
1251
  "description": self.description,
@@ -422,17 +1275,56 @@ class Document(BaseModel, ABC):
422
1275
  @final
423
1276
  @classmethod
424
1277
  def from_dict(cls, data: dict[str, Any]) -> Self:
425
- """Deserialize document from dictionary."""
1278
+ r"""Deserialize document from dictionary (inverse of serialize_model).
1279
+
1280
+ Reconstructs a Document instance from the dictionary format produced
1281
+ by serialize_model(). Automatically handles content decoding based on
1282
+ the content_encoding field.
1283
+
1284
+ Args:
1285
+ data: Dictionary containing serialized document. Required keys:
1286
+ - name: Document filename (str)
1287
+ - content: Encoded content (str or bytes)
1288
+ Optional keys:
1289
+ - description: Document description (str | None)
1290
+ - content_encoding: "utf-8" or "base64" (defaults to "utf-8")
1291
+
1292
+ Returns:
1293
+ New Document instance with restored content.
1294
+
1295
+ Raises:
1296
+ ValueError: If content type is invalid or base64 decoding fails
1297
+ KeyError: If required keys are missing from data dictionary
1298
+
1299
+ Note:
1300
+ Provides roundtrip guarantee with serialize_model().
1301
+ Content and name are preserved exactly.
1302
+
1303
+ Example:
1304
+ >>> data = {
1305
+ ... "name": "config.yaml",
1306
+ ... "content": "key: value\n",
1307
+ ... "content_encoding": "utf-8",
1308
+ ... "description": "Config file"
1309
+ ... }
1310
+ >>> doc = MyDocument.from_dict(data)
1311
+ """
426
1312
  # Extract content and encoding
427
- content_str = data.get("content", "")
1313
+ content_raw = data.get("content", "")
428
1314
  content_encoding = data.get("content_encoding", "utf-8")
429
1315
 
430
1316
  # Decode content based on encoding
1317
+ content: bytes
431
1318
  if content_encoding == "base64":
432
- content = base64.b64decode(content_str)
433
- else:
1319
+ assert isinstance(content_raw, str), "base64 content must be string"
1320
+ content = base64.b64decode(content_raw)
1321
+ elif isinstance(content_raw, str):
434
1322
  # Default to UTF-8
435
- content = content_str.encode("utf-8")
1323
+ content = content_raw.encode("utf-8")
1324
+ elif isinstance(content_raw, bytes):
1325
+ content = content_raw
1326
+ else:
1327
+ raise ValueError(f"Invalid content type: {type(content_raw)}")
436
1328
 
437
1329
  # Create document with the required fields
438
1330
  return cls(