ai-pipeline-core 0.1.12__py3-none-any.whl → 0.4.1__py3-none-any.whl

Files changed (90)
  1. ai_pipeline_core/__init__.py +83 -119
  2. ai_pipeline_core/deployment/__init__.py +34 -0
  3. ai_pipeline_core/deployment/base.py +861 -0
  4. ai_pipeline_core/deployment/contract.py +80 -0
  5. ai_pipeline_core/deployment/deploy.py +561 -0
  6. ai_pipeline_core/deployment/helpers.py +97 -0
  7. ai_pipeline_core/deployment/progress.py +126 -0
  8. ai_pipeline_core/deployment/remote.py +116 -0
  9. ai_pipeline_core/docs_generator/__init__.py +54 -0
  10. ai_pipeline_core/docs_generator/__main__.py +5 -0
  11. ai_pipeline_core/docs_generator/cli.py +196 -0
  12. ai_pipeline_core/docs_generator/extractor.py +324 -0
  13. ai_pipeline_core/docs_generator/guide_builder.py +644 -0
  14. ai_pipeline_core/docs_generator/trimmer.py +35 -0
  15. ai_pipeline_core/docs_generator/validator.py +114 -0
  16. ai_pipeline_core/document_store/__init__.py +13 -0
  17. ai_pipeline_core/document_store/_summary.py +9 -0
  18. ai_pipeline_core/document_store/_summary_worker.py +170 -0
  19. ai_pipeline_core/document_store/clickhouse.py +492 -0
  20. ai_pipeline_core/document_store/factory.py +38 -0
  21. ai_pipeline_core/document_store/local.py +312 -0
  22. ai_pipeline_core/document_store/memory.py +85 -0
  23. ai_pipeline_core/document_store/protocol.py +68 -0
  24. ai_pipeline_core/documents/__init__.py +14 -15
  25. ai_pipeline_core/documents/_context_vars.py +85 -0
  26. ai_pipeline_core/documents/_hashing.py +52 -0
  27. ai_pipeline_core/documents/attachment.py +85 -0
  28. ai_pipeline_core/documents/context.py +128 -0
  29. ai_pipeline_core/documents/document.py +349 -1062
  30. ai_pipeline_core/documents/mime_type.py +40 -85
  31. ai_pipeline_core/documents/utils.py +62 -7
  32. ai_pipeline_core/exceptions.py +10 -62
  33. ai_pipeline_core/images/__init__.py +309 -0
  34. ai_pipeline_core/images/_processing.py +151 -0
  35. ai_pipeline_core/llm/__init__.py +5 -3
  36. ai_pipeline_core/llm/ai_messages.py +284 -73
  37. ai_pipeline_core/llm/client.py +462 -209
  38. ai_pipeline_core/llm/model_options.py +86 -53
  39. ai_pipeline_core/llm/model_response.py +187 -241
  40. ai_pipeline_core/llm/model_types.py +34 -54
  41. ai_pipeline_core/logging/__init__.py +2 -9
  42. ai_pipeline_core/logging/logging.yml +1 -1
  43. ai_pipeline_core/logging/logging_config.py +27 -43
  44. ai_pipeline_core/logging/logging_mixin.py +17 -51
  45. ai_pipeline_core/observability/__init__.py +32 -0
  46. ai_pipeline_core/observability/_debug/__init__.py +30 -0
  47. ai_pipeline_core/observability/_debug/_auto_summary.py +94 -0
  48. ai_pipeline_core/observability/_debug/_config.py +95 -0
  49. ai_pipeline_core/observability/_debug/_content.py +764 -0
  50. ai_pipeline_core/observability/_debug/_processor.py +98 -0
  51. ai_pipeline_core/observability/_debug/_summary.py +312 -0
  52. ai_pipeline_core/observability/_debug/_types.py +75 -0
  53. ai_pipeline_core/observability/_debug/_writer.py +843 -0
  54. ai_pipeline_core/observability/_document_tracking.py +146 -0
  55. ai_pipeline_core/observability/_initialization.py +194 -0
  56. ai_pipeline_core/observability/_logging_bridge.py +57 -0
  57. ai_pipeline_core/observability/_summary.py +81 -0
  58. ai_pipeline_core/observability/_tracking/__init__.py +6 -0
  59. ai_pipeline_core/observability/_tracking/_client.py +178 -0
  60. ai_pipeline_core/observability/_tracking/_internal.py +28 -0
  61. ai_pipeline_core/observability/_tracking/_models.py +138 -0
  62. ai_pipeline_core/observability/_tracking/_processor.py +158 -0
  63. ai_pipeline_core/observability/_tracking/_service.py +311 -0
  64. ai_pipeline_core/observability/_tracking/_writer.py +229 -0
  65. ai_pipeline_core/observability/tracing.py +640 -0
  66. ai_pipeline_core/pipeline/__init__.py +10 -0
  67. ai_pipeline_core/pipeline/decorators.py +915 -0
  68. ai_pipeline_core/pipeline/options.py +16 -0
  69. ai_pipeline_core/prompt_manager.py +26 -105
  70. ai_pipeline_core/settings.py +41 -32
  71. ai_pipeline_core/testing.py +9 -0
  72. ai_pipeline_core-0.4.1.dist-info/METADATA +807 -0
  73. ai_pipeline_core-0.4.1.dist-info/RECORD +76 -0
  74. {ai_pipeline_core-0.1.12.dist-info → ai_pipeline_core-0.4.1.dist-info}/WHEEL +1 -1
  75. ai_pipeline_core/documents/document_list.py +0 -240
  76. ai_pipeline_core/documents/flow_document.py +0 -128
  77. ai_pipeline_core/documents/task_document.py +0 -133
  78. ai_pipeline_core/documents/temporary_document.py +0 -95
  79. ai_pipeline_core/flow/__init__.py +0 -9
  80. ai_pipeline_core/flow/config.py +0 -314
  81. ai_pipeline_core/flow/options.py +0 -75
  82. ai_pipeline_core/pipeline.py +0 -717
  83. ai_pipeline_core/prefect.py +0 -54
  84. ai_pipeline_core/simple_runner/__init__.py +0 -24
  85. ai_pipeline_core/simple_runner/cli.py +0 -255
  86. ai_pipeline_core/simple_runner/simple_runner.py +0 -385
  87. ai_pipeline_core/tracing.py +0 -475
  88. ai_pipeline_core-0.1.12.dist-info/METADATA +0 -450
  89. ai_pipeline_core-0.1.12.dist-info/RECORD +0 -36
  90. {ai_pipeline_core-0.1.12.dist-info → ai_pipeline_core-0.4.1.dist-info}/licenses/LICENSE +0 -0
@@ -1,23 +1,18 @@
 """Document abstraction layer for AI pipeline flows.
 
-@public
-
-This module provides the core document abstraction for working with various types of data
-in AI pipelines. Documents are immutable Pydantic models that wrap binary content with metadata.
+Immutable Pydantic models wrapping binary content with metadata, MIME detection,
+SHA256 hashing, and serialization. All documents must be concrete subclasses of Document.
 """
 
 import base64
-import hashlib
+import functools
 import json
-from abc import ABC, abstractmethod
-from base64 import b32encode
 from enum import StrEnum
 from functools import cached_property
 from io import BytesIO
 from typing import (
     Any,
     ClassVar,
-    Literal,
     Self,
     TypeVar,
     cast,
@@ -27,18 +22,23 @@ from typing import (
     overload,
 )
 
+import tiktoken
 from pydantic import (
     BaseModel,
     ConfigDict,
     ValidationInfo,
     field_serializer,
     field_validator,
+    model_validator,
 )
 from ruamel.yaml import YAML
 
-from ai_pipeline_core.documents.utils import canonical_name_key
+from ai_pipeline_core.documents._context_vars import get_task_context, is_registration_suppressed
+from ai_pipeline_core.documents._hashing import compute_content_sha256, compute_document_sha256
+from ai_pipeline_core.documents.utils import canonical_name_key, is_document_sha256
 from ai_pipeline_core.exceptions import DocumentNameError, DocumentSizeError
 
+from .attachment import Attachment
 from .mime_type import (
     detect_mime_type,
     is_image_mime_type,
@@ -48,136 +48,77 @@ from .mime_type import (
 )
 
 TModel = TypeVar("TModel", bound=BaseModel)
+TDocument = TypeVar("TDocument", bound="Document")
 
+# Registry of canonical_name -> Document subclass for collision detection.
+# Only non-test classes are registered. Test modules (tests.*, conftest, etc.) are skipped.
+_canonical_name_registry: dict[str, type["Document"]] = {}  # nosemgrep: no-mutable-module-globals
 
-class Document(BaseModel, ABC):
-    r"""Abstract base class for all documents in the AI Pipeline Core system.
-
-    @public
-
-    Document is the fundamental data abstraction for all content flowing through
-    pipelines. It provides automatic encoding, MIME type detection, serialization,
-    and validation. All documents must be subclassed from FlowDocument or TaskDocument
-    based on their persistence requirements. TemporaryDocument is a special concrete
-    class that can be instantiated directly (not abstract).
-
-    VALIDATION IS AUTOMATIC - Do not add manual validation!
-    Size validation, name validation, and MIME type detection are built-in.
-    The framework handles all standard validations internally.
-
-    # WRONG - These checks already happen automatically:
-    if document.size > document.MAX_CONTENT_SIZE:
-        raise DocumentSizeError(...)  # NO! Already handled
-    document.validate_file_name(document.name)  # NO! Automatic
-
-    Best Practices:
-    - Use create() classmethod for automatic type conversion (90% of cases)
-    - Omit description parameter unless truly needed for metadata
-    - When using LLM functions, pass AIMessages or str. Wrap any Document values
-      in AIMessages([...]). Do not call .text yourself
-
-    Standard Usage:
-        >>> # CORRECT - minimal parameters
-        >>> doc = MyDocument.create(name="data.json", content={"key": "value"})
-
-        >>> # AVOID - unnecessary description
-        >>> doc = MyDocument.create(
-        ...     name="data.json",
-        ...     content={"key": "value"},
-        ...     description="This is data"  # Usually not needed!
-        ... )
-
-    Key features:
-    - Immutable by default (frozen Pydantic model)
-    - Automatic MIME type detection
-    - Content size validation
-    - SHA256 hashing for deduplication
-    - Support for text, JSON, YAML, PDF, and image formats
-    - Conversion utilities between different formats
-
-    Class Variables:
-        MAX_CONTENT_SIZE: Maximum allowed content size in bytes (default 25MB)
-
-    Attributes:
-        name: Document filename (validated for security)
-        description: Optional human-readable description
-        content: Raw document content as bytes
-
-    Creating Documents:
-        **Use the `create` classmethod** for most use cases. It accepts various
-        content types (str, dict, list, BaseModel) and converts them automatically.
-        Only use __init__ directly when you already have bytes content.
-
-        >>> # RECOMMENDED: Use create for automatic conversion
-        >>> doc = MyDocument.create(name="data.json", content={"key": "value"})
-        >>>
-        >>> # Direct constructor: Only for bytes
-        >>> doc = MyDocument(name="data.bin", content=b"\x00\x01\x02")
-
-    Warning:
-        - Document subclasses should NOT start with 'Test' prefix (pytest conflict)
-        - Cannot instantiate Document directly - must subclass FlowDocument or TaskDocument
-        - Cannot add custom fields - only name, description, content are allowed
-        - Document is an abstract class and cannot be instantiated directly
-
-    Metadata Attachment Patterns:
-        Since custom fields are not allowed, use these patterns for metadata:
-        1. Use the 'description' field for human-readable metadata
-        2. Embed metadata in content (e.g., JSON with data + metadata fields)
-        3. Create a separate MetadataDocument type to accompany data documents
-        4. Use document naming conventions (e.g., "data_v2_2024.json")
-        5. Store metadata in flow_options or pass through TraceInfo
-
-    Example:
-        >>> from enum import StrEnum
-        >>>
-        >>> # Simple document:
-        >>> class MyDocument(FlowDocument):
-        ...     pass
-        >>>
-        >>> # Document with file restrictions:
-        >>> class ConfigDocument(FlowDocument):
-        ...     class FILES(StrEnum):
-        ...         CONFIG = "config.yaml"
-        ...         SETTINGS = "settings.json"
-        >>>
-        >>> # RECOMMENDED: Use create for automatic conversion
-        >>> doc = MyDocument.create(name="data.json", content={"key": "value"})
-        >>> print(doc.is_text)  # True
-        >>> data = doc.as_json()  # {'key': 'value'}
-    """
 
-    MAX_CONTENT_SIZE: ClassVar[int] = 25 * 1024 * 1024
-    """Maximum allowed content size in bytes (default 25MB).
+def _is_test_module(cls: type) -> bool:
+    """Check if a class is defined in a test module (skip collision detection)."""
+    module = getattr(cls, "__module__", "") or ""
+    parts = module.split(".")
+    return any(p == "tests" or p.startswith("test_") or p == "conftest" for p in parts)
 
-    @public
-    """
 
-    DESCRIPTION_EXTENSION: ClassVar[str] = ".description.md"
-    """File extension for description files."""
+@functools.cache
+def get_tiktoken_encoding() -> tiktoken.Encoding:
+    """Lazy-cached tiktoken encoding. Deferred to first use, cached forever."""
+    return tiktoken.encoding_for_model("gpt-4")
 
-    MARKDOWN_LIST_SEPARATOR: ClassVar[str] = "\n\n-----------------\n\n"
-    """Separator for markdown list items."""
 
-    def __init_subclass__(cls, **kwargs: Any) -> None:
-        """Validate subclass configuration at definition time.
+def _serialize_to_json(data: Any) -> bytes:
+    """JSON serialize with 2-space indent."""
+    return json.dumps(data, indent=2).encode("utf-8")
 
-        Performs several validation checks when a Document subclass is defined:
-        1. Prevents class names starting with 'Test' (pytest conflict)
-        2. Validates FILES enum if present (must be StrEnum)
-        3. Prevents adding custom fields beyond name, description, content
 
-        Args:
-            **kwargs: Additional keyword arguments passed to parent __init_subclass__.
+def _serialize_to_yaml(data: Any) -> bytes:
+    """YAML serialize via ruamel."""
+    yaml = YAML()
+    stream = BytesIO()
+    yaml.dump(data, stream)  # pyright: ignore[reportUnknownMemberType]
+    return stream.getvalue()
 
-        Raises:
-            TypeError: If subclass violates naming rules, FILES enum requirements,
-                or attempts to add extra fields.
 
-        Note:
-            This validation happens at class definition time, not instantiation,
-            providing early error detection during development.
-        """
+def _serialize_structured(name: str, data: Any) -> bytes:
+    """Serialize dict/list to JSON or YAML based on file extension."""
+    name_lower = name.lower()
+    if name_lower.endswith((".yaml", ".yml")):
+        return _serialize_to_yaml(data)
+    if name_lower.endswith(".json"):
+        return _serialize_to_json(data)
+    raise ValueError(f"Structured content ({type(data).__name__}) requires .json or .yaml extension, got: {name}")
+
+
+def _convert_content(name: str, content: str | bytes | dict[str, Any] | list[Any] | BaseModel) -> bytes:
+    """Convert any supported content type to bytes. Dispatch by isinstance."""
+    if isinstance(content, bytes):
+        return content
+    if isinstance(content, str):
+        return content.encode("utf-8")
+    if isinstance(content, dict):
+        return _serialize_structured(name, content)
+    if isinstance(content, BaseModel):
+        return _serialize_structured(name, content.model_dump(mode="json"))
+    if isinstance(content, list):  # pyright: ignore[reportUnnecessaryIsInstance]
+        data = [item.model_dump(mode="json") if isinstance(item, BaseModel) else item for item in content]
+        return _serialize_structured(name, data)
+    raise ValueError(f"Unsupported content type: {type(content)}")  # pyright: ignore[reportUnreachable]
+
+
+class Document(BaseModel):
+    """Immutable base class for all pipeline documents. Cannot be instantiated directly — must be subclassed.
+
+    Content is stored as bytes. Use `create()` for automatic conversion from str/dict/list/BaseModel.
+    Use `parse()` to reverse the conversion. Serialization is extension-driven (.json → JSON, .yaml → YAML).
+    """
+
+    MAX_CONTENT_SIZE: ClassVar[int] = 25 * 1024 * 1024
+    """Maximum allowed total size in bytes (default 25MB)."""
+
+    def __init_subclass__(cls, **kwargs: Any) -> None:
+        """Validate subclass at definition time. Cannot start with 'Test', cannot add custom fields."""
         super().__init_subclass__(**kwargs)
         if cls.__name__.startswith("Test"):
             raise TypeError(
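
The conversion pipeline introduced above is now two small, composable layers: _convert_content dispatches on the Python type of the value, then _serialize_structured dispatches on the file extension. A minimal sketch of the resulting behavior, assuming a hypothetical concrete subclass (ReportDocument is illustrative, not part of the package):

    # Hypothetical subclass; any concrete Document subclass behaves the same way.
    class ReportDocument(Document):
        pass

    # dict + .json extension -> indented JSON bytes
    doc = ReportDocument.create(name="report.json", content={"status": "ok"})
    assert doc.content == b'{\n  "status": "ok"\n}'

    # dict + unsupported extension -> ValueError from _serialize_structured
    # ReportDocument.create(name="report.txt", content={"status": "ok"})
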
@@ -186,14 +127,12 @@ class Document(BaseModel, ABC):
                 "Please use a different name (e.g., 'SampleDocument', 'ExampleDocument')."
             )
         if hasattr(cls, "FILES"):
-            files = getattr(cls, "FILES")
+            files: type[StrEnum] = cls.FILES  # pyright: ignore[reportAttributeAccessIssue, reportUnknownMemberType, reportUnknownVariableType]
             if not issubclass(files, StrEnum):
-                raise TypeError(
-                    f"Document subclass '{cls.__name__}'.FILES must be an Enum of string values"
-                )
+                raise TypeError(f"Document subclass '{cls.__name__}'.FILES must be an Enum of string values")
         # Check that the Document's model_fields only contain the allowed fields
         # It prevents AI models from adding additional fields to documents
-        allowed = {"name", "description", "content"}
+        allowed = {"name", "description", "content", "sources", "attachments", "origins"}
         current = set(getattr(cls, "model_fields", {}).keys())
         extras = current - allowed
         if extras:
@@ -202,27 +141,18 @@ class Document(BaseModel, ABC):
                 f"{', '.join(sorted(extras))}. Only {', '.join(sorted(allowed))} are allowed."
             )
 
-    @overload
-    @classmethod
-    def create(cls, *, name: str, content: bytes, description: str | None = None) -> Self: ...
-
-    @overload
-    @classmethod
-    def create(cls, *, name: str, content: str, description: str | None = None) -> Self: ...
-
-    @overload
-    @classmethod
-    def create(
-        cls, *, name: str, content: dict[str, Any], description: str | None = None
-    ) -> Self: ...
-
-    @overload
-    @classmethod
-    def create(cls, *, name: str, content: list[Any], description: str | None = None) -> Self: ...
-
-    @overload
-    @classmethod
-    def create(cls, *, name: str, content: BaseModel, description: str | None = None) -> Self: ...
+        # Canonical name collision detection (production classes only)
+        if not _is_test_module(cls):
+            canonical = canonical_name_key(cls)
+            existing = _canonical_name_registry.get(canonical)
+            if existing is not None and existing is not cls:
+                raise TypeError(
+                    f"Document subclass '{cls.__name__}' (in {cls.__module__}) produces "
+                    f"canonical_name '{canonical}' which collides with existing class "
+                    f"'{existing.__name__}' (in {existing.__module__}). "
+                    f"Rename one of the classes to avoid ambiguity."
+                )
+            _canonical_name_registry[canonical] = cls
 
     @classmethod
     def create(
@@ -231,86 +161,23 @@ class Document(BaseModel, ABC):
         name: str,
         content: str | bytes | dict[str, Any] | list[Any] | BaseModel,
         description: str | None = None,
+        sources: tuple[str, ...] | None = None,
+        origins: tuple[str, ...] | None = None,
+        attachments: tuple[Attachment, ...] | None = None,
     ) -> Self:
-        r"""Create a Document with automatic content type conversion (recommended).
-
-        @public
-
-        This is the **recommended way to create documents**. It accepts various
-        content types and automatically converts them to bytes based on the file
-        extension. Use the `parse` method to reverse this conversion.
-
-        Best Practice (90% of cases):
-            Only provide name and content. The description parameter is RARELY needed.
-
-        Args:
-            name: Document filename (required, keyword-only).
-                Extension determines serialization:
-                - .json → JSON serialization
-                - .yaml/.yml → YAML serialization
-                - .md → Markdown list joining (for list[str])
-                - Others → UTF-8 encoding (for str)
-            content: Document content in various formats (required, keyword-only):
-                - bytes: Used directly without conversion
-                - str: Encoded to UTF-8 bytes
-                - dict[str, Any]: Serialized to JSON (.json) or YAML (.yaml/.yml)
-                - list[str]: Joined with separator for .md (validates no items
-                  contain separator), else JSON/YAML
-                - list[BaseModel]: Serialized to JSON or YAML based on extension
-                - BaseModel: Serialized to JSON or YAML based on extension
-            description: Optional description - USUALLY OMIT THIS (defaults to None).
-                Only use when meaningful metadata helps downstream processing
-
-        Returns:
-            New Document instance with content converted to bytes
-
-        Raises:
-            ValueError: If content type is not supported for the file extension,
-                or if markdown list items contain the separator
-            DocumentNameError: If filename violates validation rules
-            DocumentSizeError: If content exceeds MAX_CONTENT_SIZE
-
-        Note:
-            All conversions are reversible using the `parse` method.
-            For example: MyDocument.create(name="data.json", content={"key": "value"}).parse(dict)
-            returns the original dictionary {"key": "value"}.
-
-        Example:
-            >>> # CORRECT - no description needed (90% of cases)
-            >>> doc = MyDocument.create(name="test.txt", content="Hello World")
-            >>> doc.content  # b'Hello World'
-            >>> doc.parse(str)  # "Hello World"
-
-            >>> # CORRECT - Dictionary to JSON, no description
-            >>> doc = MyDocument.create(name="config.json", content={"key": "value"})
-            >>> doc.content  # b'{"key": "value", ...}'
-            >>> doc.parse(dict)  # {"key": "value"}
-
-            >>> # AVOID unless description adds real value
-            >>> doc = MyDocument.create(
-            ...     name="config.json",
-            ...     content={"key": "value"},
-            ...     description="Config file"  # Usually redundant!
-            ... )
-
-            >>> # Pydantic model to YAML
-            >>> from pydantic import BaseModel
-            >>> class Config(BaseModel):
-            ...     host: str
-            ...     port: int
-            >>> config = Config(host="localhost", port=8080)
-            >>> doc = MyDocument.create(name="config.yaml", content=config)
-            >>> doc.parse(Config)  # Returns Config instance
-
-            >>> # List to Markdown
-            >>> items = ["Section 1", "Section 2"]
-            >>> doc = MyDocument.create(name="sections.md", content=items)
-            >>> doc.parse(list)  # ["Section 1", "Section 2"]
+        """Create a document with automatic content-to-bytes conversion.
+
+        Serialization is extension-driven: .json → JSON, .yaml → YAML, others → UTF-8.
+        Reversible via parse(). Cannot be called on Document directly — must use a subclass.
         """
-        # Use model_validate to leverage the existing validator logic
-        temp = cls.model_validate({"name": name, "content": content, "description": description})
-        # Now construct with type-checker-friendly call (bytes only)
-        return cls(name=temp.name, content=temp.content, description=temp.description)
+        return cls(
+            name=name,
+            content=_convert_content(name, content),
+            description=description,
+            sources=sources,
+            origins=origins,
+            attachments=attachments,
+        )
 
     def __init__(
         self,
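
The rewritten create() body is now a thin wrapper: convert to bytes, then call the constructor. As the new docstring promises, conversions stay reversible through parse(). A short roundtrip sketch using the same hypothetical subclass as above:

    doc = ReportDocument.create(name="data.json", content={"key": "value"})
    assert doc.parse(dict) == {"key": "value"}   # reverses the JSON serialization
    assert doc.parse(bytes) == doc.content       # bytes always pass through untouched
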
@@ -318,50 +185,53 @@ class Document(BaseModel, ABC):
         name: str,
         content: bytes,
         description: str | None = None,
+        sources: tuple[str, ...] | None = None,
+        origins: tuple[str, ...] | None = None,
+        attachments: tuple[Attachment, ...] | None = None,
     ) -> None:
-        """Initialize a Document instance with raw bytes content.
-
-        @public
-
-        Important:
-            **Most users should use the `create` classmethod instead of __init__.**
-            The create method provides automatic content conversion for various types
-            (str, dict, list, Pydantic models) while __init__ only accepts bytes.
-
-        This constructor accepts only bytes content for type safety. It prevents
-        direct instantiation of the abstract Document class.
+        """Initialize with raw bytes content. Most users should use `create()` instead."""
+        if type(self) is Document:
+            raise TypeError("Cannot instantiate Document directly — use a concrete subclass")
 
-        Args:
-            name: Document filename (required, keyword-only)
-            content: Document content as raw bytes (required, keyword-only)
-            description: Optional human-readable description (keyword-only)
+        super().__init__(
+            name=name,
+            content=content,
+            description=description,
+            sources=sources or (),
+            origins=origins or (),
+            attachments=attachments or (),
+        )
 
-        Raises:
-            TypeError: If attempting to instantiate Document directly.
+        # Register with task context for document lifecycle tracking
+        if not is_registration_suppressed():
+            task_ctx = get_task_context()
+            if task_ctx is not None:
+                task_ctx.register_created(self)  # pyright: ignore[reportAttributeAccessIssue, reportUnknownMemberType]
 
-        Example:
-            >>> # Direct constructor - only for bytes content:
-            >>> doc = MyDocument(name="test.txt", content=b"Hello World")
-            >>> doc.content  # b'Hello World'
+    name: str
+    description: str | None = None
+    content: bytes
+    sources: tuple[str, ...] = ()
+    """Content provenance: documents and references this document's content was directly
+    derived from. Can be document SHA256 hashes (for pipeline documents) or external
+    references (URLs, file paths). Answers: 'where did this content come from?'
 
-            >>> # RECOMMENDED: Use create for automatic conversion:
-            >>> doc = MyDocument.create(name="text.txt", content="Hello World")
-            >>> doc = MyDocument.create(name="data.json", content={"key": "value"})
-            >>> doc = MyDocument.create(name="config.yaml", content=my_model)
-            >>> doc = MyDocument.create(name="items.md", content=["item1", "item2"])
+    Example: an analysis document derived from an input document has
+    sources=(input_doc.sha256,). A webpage capture has sources=("https://example.com",)."""
 
-        See Also:
-            create: Recommended factory method with automatic type conversion
-            parse: Method to reverse the conversion done by create
-        """
-        if type(self) is Document:
-            raise TypeError("Cannot instantiate abstract Document class directly")
+    origins: tuple[str, ...] = ()
+    """Causal provenance: documents that caused this document to be created without directly
+    contributing to its content. Always document SHA256 hashes, never arbitrary strings.
+    Answers: 'why does this document exist?'
 
-        super().__init__(name=name, content=content, description=description)
+    Example: a research plan causes 10 webpages to be captured. Each webpage's source is its
+    URL (content provenance), its origin is the research plan (causal provenance — the plan
+    caused the capture but didn't contribute to the webpage's content).
 
-    name: str
-    description: str | None = None
-    content: bytes  # Note: constructor accepts str | bytes, but field stores bytes only
+    A SHA256 hash must not appear in both sources and origins for the same document.
+    Within a pipeline task or flow, all source/origin SHA256 references must point to
+    documents that existed before the task/flow started executing."""
+    attachments: tuple[Attachment, ...] = ()
 
     # Pydantic configuration
     model_config = ConfigDict(
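
The two provenance fields are easy to conflate, and the field docstrings above draw the line at content versus causation. A hedged sketch of the webpage-capture scenario they describe (class and variable names are illustrative, not part of the package):

    page = WebPageDocument.create(
        name="capture.md",
        content=page_text,
        sources=("https://example.com/article",),  # content provenance: where it came from
        origins=(research_plan.sha256,),           # causal provenance: why it was captured
    )
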
@@ -370,148 +240,27 @@ class Document(BaseModel, ABC):
         extra="forbid",
     )
 
-    @abstractmethod
-    def get_base_type(self) -> Literal["flow", "task", "temporary"]:
-        """Get the base type of the document.
-
-        Abstract method that must be implemented by all Document subclasses
-        to indicate their persistence behavior.
-
-        Returns:
-            One of "flow" (persisted across flow runs), "task" (temporary
-            within task execution), or "temporary" (never persisted).
-
-        Note:
-            This method determines document persistence and lifecycle.
-            FlowDocument returns "flow", TaskDocument returns "task",
-            TemporaryDocument returns "temporary".
-        """
-        raise NotImplementedError("Subclasses must implement this method")
-
-    @final
-    @property
-    def base_type(self) -> Literal["flow", "task", "temporary"]:
-        """Get the document's base type.
-
-        Property alias for get_base_type() providing a cleaner API.
-        This property cannot be overridden by subclasses.
-
-        Returns:
-            The document's base type: "flow", "task", or "temporary".
-        """
-        return self.get_base_type()
-
-    @final
-    @property
-    def is_flow(self) -> bool:
-        """Check if this is a flow document.
-
-        Flow documents persist across Prefect flow runs and are saved
-        to the file system between pipeline steps.
-
-        Returns:
-            True if this is a FlowDocument subclass, False otherwise.
-        """
-        return self.get_base_type() == "flow"
-
-    @final
-    @property
-    def is_task(self) -> bool:
-        """Check if this is a task document.
-
-        Task documents are temporary within Prefect task execution
-        and are not persisted between pipeline steps.
-
-        Returns:
-            True if this is a TaskDocument subclass, False otherwise.
-        """
-        return self.get_base_type() == "task"
-
-    @final
-    @property
-    def is_temporary(self) -> bool:
-        """Check if this is a temporary document.
-
-        Temporary documents are never persisted and exist only
-        during execution.
-
-        Returns:
-            True if this is a TemporaryDocument, False otherwise.
-        """
-        return self.get_base_type() == "temporary"
-
     @final
     @classmethod
     def get_expected_files(cls) -> list[str] | None:
-        """Get the list of allowed file names for this document class.
-
-        If the document class defines a FILES enum, returns the list of
-        valid file names. Used to restrict documents to specific files.
-
-        Returns:
-            List of allowed file names if FILES enum is defined,
-            None if unrestricted.
-
-        Raises:
-            DocumentNameError: If FILES is defined but not a valid StrEnum.
-
-        Example:
-            >>> class ConfigDocument(FlowDocument):
-            ...     class FILES(StrEnum):
-            ...         CONFIG = "config.yaml"
-            ...         SETTINGS = "settings.json"
-            >>> ConfigDocument.get_expected_files()
-            ['config.yaml', 'settings.json']
-        """
+        """Return allowed filenames from FILES enum, or None if unrestricted."""
         if not hasattr(cls, "FILES"):
             return None
-        files = getattr(cls, "FILES")
+        files: type[StrEnum] = cls.FILES  # pyright: ignore[reportAttributeAccessIssue, reportUnknownMemberType, reportUnknownVariableType]
         if not files:
             return None
         assert issubclass(files, StrEnum)
         try:
             values = [member.value for member in files]
         except TypeError:
-            raise DocumentNameError(f"{cls.__name__}.FILES must be an Enum of string values")
+            raise DocumentNameError(f"{cls.__name__}.FILES must be an Enum of string values") from None
         if len(values) == 0:
             return None
         return values
 
     @classmethod
     def validate_file_name(cls, name: str) -> None:
-        """Validate that a file name matches allowed patterns.
-
-        @public
-
-        DO NOT OVERRIDE this method if you define a FILES enum!
-        The validation is automatic when FILES enum is present.
-
-        # CORRECT - FILES enum provides automatic validation:
-        class MyDocument(FlowDocument):
-            class FILES(StrEnum):
-                CONFIG = "config.yaml"  # Validation happens automatically!
-
-        # WRONG - Unnecessary override:
-        class MyDocument(FlowDocument):
-            class FILES(StrEnum):
-                CONFIG = "config.yaml"
-
-            def validate_file_name(cls, name):  # DON'T DO THIS!
-                pass  # Validation already happens via FILES enum
-
-        Only override for custom validation logic BEYOND FILES enum constraints.
-
-        Args:
-            name: The file name to validate.
-
-        Raises:
-            DocumentNameError: If the name doesn't match allowed patterns.
-
-        Note:
-            - If FILES enum is defined, name must exactly match one of the values
-            - If FILES is not defined, any name is allowed
-            - Override in subclasses ONLY for custom regex patterns or logic
-        """
+        """Validate filename against FILES enum. Override only for custom validation beyond FILES."""
         allowed = cls.get_expected_files()
         if not allowed:
             return
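
With the long docstrings gone, the FILES contract is easiest to restate by example. Under the new base class this looks roughly like (hypothetical subclass):

    class ConfigDocument(Document):
        class FILES(StrEnum):
            CONFIG = "config.yaml"
            SETTINGS = "settings.json"

    ConfigDocument.get_expected_files()  # ['config.yaml', 'settings.json']
    # Any other filename fails name validation with DocumentNameError:
    # ConfigDocument.create(name="other.yaml", content="x")
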
@@ -521,42 +270,18 @@ class Document(BaseModel, ABC):
             raise DocumentNameError(f"Invalid filename '{name}'. Allowed names: {allowed_str}")
 
     @field_validator("name")
+    @classmethod
     def validate_name(cls, v: str) -> str:
-        r"""Pydantic validator for the document name field.
-
-        Ensures the document name is secure and follows conventions:
-        - No path traversal characters (.., \\, /)
-        - Cannot end with .description.md
-        - No leading/trailing whitespace
-        - Must match FILES enum if defined
-
-        Performance:
-            Validation is O(n) where n is the length of the name.
-            FILES enum check is O(m) where m is the number of allowed files
-
-        Args:
-            v: The name value to validate.
-
-        Returns:
-            The validated name.
-
-        Raises:
-            DocumentNameError: If the name violates any validation rules.
-
-        Note:
-            This is called automatically by Pydantic during model construction.
-        """
-        if v.endswith(cls.DESCRIPTION_EXTENSION):
-            raise DocumentNameError(
-                f"Document names cannot end with {cls.DESCRIPTION_EXTENSION}: {v}"
-            )
-
+        """Reject path traversal, whitespace issues, reserved suffixes. Must match FILES enum if defined."""
         if ".." in v or "\\" in v or "/" in v:
             raise DocumentNameError(f"Invalid filename - contains path traversal characters: {v}")
 
         if not v or v.startswith(" ") or v.endswith(" "):
             raise DocumentNameError(f"Invalid filename format: {v}")
 
+        if v.endswith(".meta.json"):
+            raise DocumentNameError(f"Document names cannot end with .meta.json (reserved): {v}")
+
         cls.validate_file_name(v)
 
         return v
@@ -564,174 +289,58 @@ class Document(BaseModel, ABC):
     @field_validator("content", mode="before")
     @classmethod
     def validate_content(cls, v: Any, info: ValidationInfo) -> bytes:
-        """Pydantic validator that converts various content types to bytes.
-
-        This validator is called automatically during model construction and
-        handles the intelligent type conversion that powers the `create` method.
-        It determines the appropriate serialization based on file extension.
-
-        Conversion Strategy:
-        1. bytes → Passthrough (no conversion)
-        2. str → UTF-8 encoding
-        3. dict/BaseModel + .json → JSON serialization (indented)
-        4. dict/BaseModel + .yaml/.yml → YAML serialization
-        5. list[str] + .md → Join with markdown separator (validates no items contain separator)
-        6. list[Any] + .json/.yaml → JSON/YAML array
-        7. int/float/bool + .json → JSON primitive
-
-        Args:
-            v: Content to validate (any supported type)
-            info: Validation context containing other field values
-
-        Returns:
-            Content converted to bytes
-
-        Raises:
-            DocumentSizeError: If content exceeds MAX_CONTENT_SIZE
-            ValueError: If content type unsupported for file extension
-
-        Note:
-            This validator enables create() to accept multiple types while
-            ensuring __init__ only receives bytes for type safety.
-        """
-        # Get the name from validation context if available
-        name = ""
-        if hasattr(info, "data") and "name" in info.data:
-            name = info.data["name"]
-        name_lower = name.lower()
-
-        # Convert based on content type
-        if isinstance(v, bytes):
-            pass  # Already bytes
-        elif isinstance(v, str):
-            v = v.encode("utf-8")
-        elif isinstance(v, dict):
-            # Serialize dict based on extension
-            if name_lower.endswith((".yaml", ".yml")):
-                # Use YAML format for YAML files
-                yaml = YAML()
-                stream = BytesIO()
-                yaml.dump(v, stream)
-                v = stream.getvalue()
-            elif name_lower.endswith(".json"):
-                # Use JSON for JSON files
-                v = json.dumps(v, indent=2).encode("utf-8")
-            else:
-                # Dict not supported for other file types
-                raise ValueError(f"Unsupported content type: {type(v)} for file {name}")
-        elif isinstance(v, list):
-            # Handle lists based on file extension
-            if name_lower.endswith(".md"):
-                # For markdown files, join with separator
-                if all(isinstance(item, str) for item in v):
-                    # Check that no string contains the separator
-                    for item in v:
-                        if cls.MARKDOWN_LIST_SEPARATOR in item:
-                            raise ValueError(
-                                f"Markdown list item cannot contain the separator "
-                                f"'{cls.MARKDOWN_LIST_SEPARATOR}' as it will mess up formatting"
-                            )
-                    v = cls.MARKDOWN_LIST_SEPARATOR.join(v).encode("utf-8")
-                else:
-                    raise ValueError(
-                        f"Unsupported content type: mixed-type list for markdown file {name}"
-                    )
-            elif name_lower.endswith((".yaml", ".yml")):
-                # Check if it's a list of Pydantic models
-                if v and isinstance(v[0], BaseModel):
-                    # Convert models to dicts first
-                    v = [item.model_dump(mode="json") for item in v]
-                # Use YAML format for YAML files
-                yaml = YAML()
-                stream = BytesIO()
-                yaml.dump(v, stream)
-                v = stream.getvalue()
-            elif name_lower.endswith(".json"):
-                # Check if it's a list of Pydantic models
-                if v and isinstance(v[0], BaseModel):
-                    # Convert models to dicts first
-                    v = [item.model_dump(mode="json") for item in v]
-                # For JSON files, serialize as JSON
-                v = json.dumps(v, indent=2).encode("utf-8")
-            else:
-                # Check if it's a list of BaseModel
-                if v and isinstance(v[0], BaseModel):
-                    raise ValueError("list[BaseModel] requires .json or .yaml extension")
-                # List content not supported for other file types
-                raise ValueError(f"Unsupported content type: {type(v)} for file {name}")
-        elif isinstance(v, BaseModel):
-            # Serialize Pydantic models
-            if name_lower.endswith((".yaml", ".yml")):
-                yaml = YAML()
-                stream = BytesIO()
-                yaml.dump(v.model_dump(mode="json"), stream)
-                v = stream.getvalue()
-            else:
-                v = json.dumps(v.model_dump(mode="json"), indent=2).encode("utf-8")
-        elif isinstance(v, (int, float, bool)):
-            # Numbers and booleans: JSON-serialize for .json, string for others
-            if name_lower.endswith(".json"):
-                v = json.dumps(v).encode("utf-8")
-            elif name_lower.endswith((".yaml", ".yml")):
-                v = str(v).encode("utf-8")
-            elif name_lower.endswith(".txt"):
-                v = str(v).encode("utf-8")
-            else:
-                # For other extensions, convert to string
-                v = str(v).encode("utf-8")
-        elif v is None:
-            # Handle None - only supported for JSON/YAML
-            if name_lower.endswith((".json", ".yaml", ".yml")):
-                if name_lower.endswith((".yaml", ".yml")):
-                    v = b"null\n"
-                else:
-                    v = b"null"
-            else:
-                raise ValueError(f"Unsupported content type: {type(None)} for file {name}")
-        else:
-            # Try to see if it has model_dump (duck typing for Pydantic-like)
-            if hasattr(v, "model_dump"):
-                if name_lower.endswith((".yaml", ".yml")):
-                    yaml = YAML()
-                    stream = BytesIO()
-                    yaml.dump(v.model_dump(mode="json"), stream)  # type: ignore[attr-defined]
-                    v = stream.getvalue()
-                else:
-                    v = json.dumps(v.model_dump(mode="json"), indent=2).encode("utf-8")  # type: ignore[attr-defined]
-            else:
-                # List non-.json files should raise error
-                if name_lower.endswith(".txt") and isinstance(v, list):
-                    raise ValueError("List content not supported for text files")
-                raise ValueError(f"Unsupported content type: {type(v)}")
-
-        # Check content size limit
-        max_size = getattr(cls, "MAX_CONTENT_SIZE", 100 * 1024 * 1024)
-        if len(v) > max_size:
-            raise DocumentSizeError(
-                f"Document size ({len(v)} bytes) exceeds maximum allowed size ({max_size} bytes)"
-            )
-
+        """Convert content to bytes via `_convert_content` if not already bytes. Enforces MAX_CONTENT_SIZE."""
+        if not isinstance(v, bytes):
+            name = info.data.get("name", "") if hasattr(info, "data") else ""
+            v = _convert_content(name, v)
+        if len(v) > cls.MAX_CONTENT_SIZE:
+            raise DocumentSizeError(f"Document size ({len(v)} bytes) exceeds maximum allowed size ({cls.MAX_CONTENT_SIZE} bytes)")
         return v
 
-    @field_serializer("content")
-    def serialize_content(self, v: bytes) -> str:
-        """Pydantic serializer for content field.
+    @field_validator("sources")
+    @classmethod
+    def validate_sources(cls, v: tuple[str, ...]) -> tuple[str, ...]:
+        """Sources must be document SHA256 hashes or URLs."""
+        for src in v:
+            if not is_document_sha256(src) and "://" not in src:
+                raise ValueError(f"Source must be a document SHA256 hash or a URL (containing '://'), got: {src!r}")
+        return v
 
-        Converts bytes content to string for JSON serialization.
-        Attempts UTF-8 decoding first, falls back to base64 encoding
-        for binary content.
+    @field_validator("origins")
+    @classmethod
+    def validate_origins(cls, v: tuple[str, ...]) -> tuple[str, ...]:
+        """Origins must be valid document SHA256 hashes."""
+        for origin in v:
+            if not is_document_sha256(origin):
+                raise ValueError(f"Origin must be a document SHA256 hash, got: {origin}")
+        return v
 
-        Args:
-            v: The content bytes to serialize.
+    @model_validator(mode="after")
+    def validate_no_source_origin_overlap(self) -> Self:
+        """Reject documents where the same SHA256 appears in both sources and origins."""
+        source_sha256s = {src for src in self.sources if is_document_sha256(src)}
+        if source_sha256s:
+            overlap = source_sha256s & set(self.origins)
+            if overlap:
+                sample = next(iter(overlap))
+                raise ValueError(
+                    f"SHA256 hash {sample[:12]}... appears in both sources and origins. "
+                    f"A document reference must be either a source (content provenance) "
+                    f"or an origin (causal provenance), not both."
+                )
+        return self
 
-        Returns:
-            UTF-8 decoded string for text content,
-            base64-encoded string for binary content.
+    @model_validator(mode="after")
+    def validate_total_size(self) -> Self:
+        """Validate that total document size (content + attachments) is within limits."""
+        total = self.size
+        if total > self.MAX_CONTENT_SIZE:
+            raise DocumentSizeError(f"Total document size ({total} bytes) including attachments exceeds maximum allowed size ({self.MAX_CONTENT_SIZE} bytes)")
+        return self
 
-        Note:
-            This is called automatically by Pydantic during
-            model serialization to JSON.
-        """
+    @field_serializer("content")
+    def serialize_content(self, v: bytes) -> str:  # noqa: PLR6301
+        """UTF-8 decode for text, base64 for binary. Called by Pydantic during serialization."""
         try:
             return v.decode("utf-8")
         except UnicodeDecodeError:
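
Taken together, the new validators enforce the provenance rules declared on the fields: sources must be document SHA256 hashes or URLs, origins must be hashes only, and no hash may appear on both sides. Roughly (hypothetical documents; Pydantic surfaces the ValueError as a validation error):

    upstream = ReportDocument.create(name="input.md", content="raw notes")
    ReportDocument.create(
        name="summary.md",
        content="summary",
        sources=(upstream.sha256,),
        origins=(upstream.sha256,),  # rejected: same hash in sources and origins
    )
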
@@ -741,263 +350,89 @@ class Document(BaseModel, ABC):
     @final
     @property
     def id(self) -> str:
-        """Get a short unique identifier for the document.
-
-        @public
-
-        This ID is crucial for LLM interactions. When documents are provided to
-        LLMs via generate() or generate_structured(), their IDs are included,
-        allowing the LLM to reference documents in prompts by either name or ID.
-        The ID is content-based (derived from SHA256 hash of content only),
-        so the same content always produces the same ID. Changing the name or
-        description does NOT change the ID.
-
-        Returns:
-            6-character base32-encoded string (uppercase, e.g., "A7B2C9").
-            This is the first 6 chars of the full base32 SHA256, NOT hex.
-
-        Collision Rate:
-            With base32 encoding (5 bits per char), 6 chars = 30 bits.
-            Expect collisions after ~32K documents (birthday paradox).
-            For higher uniqueness requirements, use the full sha256 property.
-
-        Note:
-            While shorter than full SHA256, this provides
-            reasonable uniqueness for most use cases.
-        """
+        """First 6 chars of sha256. Used as short document identifier in LLM context."""
         return self.sha256[:6]
 
     @final
     @cached_property
     def sha256(self) -> str:
-        """Get the full SHA256 hash of the document content.
-
-        @public
-
-        Computes and caches the SHA256 hash of the content,
-        encoded in base32 (uppercase). Used for content
-        deduplication and integrity verification.
-
-        Returns:
-            Full SHA256 hash as base32-encoded uppercase string.
-
-        Why Base32 Instead of Hex:
-        - Base32 is case-insensitive, avoiding issues with different file systems
-          and AI interactions where casing might be inconsistent
-        - More compact than hex (52 chars vs 64 chars for SHA-256)
-        - Contains more information per character than hex (5 bits vs 4 bits)
-        - Safe for URLs without encoding
-        - Compatible with case-insensitive file systems
-        - Avoids confusion in AI interactions where models might change casing
-        - Not base64 because we want consistent uppercase for all uses
-
-        Note:
-            This is computed once and cached for performance.
-            The hash is deterministic based on content only.
-        """
-        return b32encode(hashlib.sha256(self.content).digest()).decode("ascii").upper()
+        """Full SHA256 identity hash (name + content + attachments). BASE32 encoded, cached."""
+        return compute_document_sha256(self)
+
+    @final
+    @cached_property
+    def content_sha256(self) -> str:
+        """SHA256 hash of raw content bytes only. Used for content deduplication."""
+        return compute_content_sha256(self.content)
 
     @final
     @property
     def size(self) -> int:
-        """Get the size of the document content.
-
-        @public
-
-        Returns:
-            Size of content in bytes.
-
-        Note:
-            Useful for monitoring document sizes and
-            ensuring they stay within limits.
-        """
-        return len(self.content)
+        """Total size of content + attachments in bytes."""
+        return len(self.content) + sum(att.size for att in self.attachments)
 
     @cached_property
-    def detected_mime_type(self) -> str:
-        """Detect the MIME type from document content.
-
-        Detection strategy (in order):
-        1. Returns 'application/x-empty' for empty content
-        2. Extension-based detection for known text formats (preferred)
-        3. python-magic content analysis for unknown extensions
-        4. Fallback to extension or 'application/octet-stream'
-
-        Returns:
-            MIME type string (e.g., "text/plain", "application/json").
-
-        Note:
-            This is cached after first access. Extension-based detection
-            is preferred for text formats to avoid misidentification.
-        """
-        return detect_mime_type(self.content, self.name)
-
-    @property
     def mime_type(self) -> str:
-        """Get the document's MIME type.
-
-        @public
-
-        Primary property for accessing MIME type information.
-        Automatically detects MIME type based on file extension and content.
-
-        Returns:
-            MIME type string (e.g., "text/plain", "application/json").
-
-        Note:
-            MIME type detection uses extension-based detection for known
-            text formats and content analysis for binary formats.
-        """
-        return self.detected_mime_type
+        """Detected MIME type. Extension-based for known formats, content analysis for others. Cached."""
+        return detect_mime_type(self.content, self.name)
 
     @property
     def is_text(self) -> bool:
-        """Check if document contains text content.
-
-        @public
-
-        Returns:
-            True if MIME type indicates text content
-            (text/*, application/json, application/x-yaml, text/yaml, etc.),
-            False otherwise.
-
-        Note:
-            Used to determine if text property can be safely accessed.
-        """
+        """True if MIME type indicates text content."""
         return is_text_mime_type(self.mime_type)
 
     @property
     def is_pdf(self) -> bool:
-        """Check if document is a PDF file.
-
-        @public
-
-        Returns:
-            True if MIME type is application/pdf, False otherwise.
-
-        Note:
-            PDF documents require special handling and are
-            supported by certain LLM models.
-        """
+        """True if MIME type is application/pdf."""
         return is_pdf_mime_type(self.mime_type)
 
     @property
     def is_image(self) -> bool:
-        """Check if document is an image file.
-
-        @public
-
-        Returns:
-            True if MIME type starts with "image/", False otherwise.
-
-        Note:
-            Image documents are automatically encoded for
-            vision-capable LLM models.
-        """
+        """True if MIME type starts with image/."""
         return is_image_mime_type(self.mime_type)
 
     @classmethod
     def canonical_name(cls) -> str:
-        """Get the canonical name for this document class.
-
-        Returns a standardized snake_case name derived from the
-        class name, used for directory naming and identification.
-
-        Returns:
-            Snake_case canonical name.
-
-        Example:
-            >>> class UserDataDocument(FlowDocument): ...
-            >>> UserDataDocument.canonical_name()
-            'user_data'
-        """
+        """Snake_case name derived from class name, used for directory naming."""
         return canonical_name_key(cls)
 
     @property
     def text(self) -> str:
-        """Get document content as UTF-8 text string.
-
-        @public
-
-        Decodes the bytes content as UTF-8 text. Only available for
-        text-based documents (check is_text property first).
-
-        Returns:
-            UTF-8 decoded string.
-
-        Raises:
-            ValueError: If document is not text (is_text == False).
-
-        Example:
-            >>> doc = MyDocument.create(name="data.txt", content="Hello \u2728")
-            >>> if doc.is_text:
-            ...     print(doc.text)  # "Hello \u2728"
-
-            >>> # Binary document raises error:
-            >>> binary_doc = MyDocument(name="image.png", content=png_bytes)
-            >>> binary_doc.text  # Raises ValueError
-        """
+        """Content decoded as UTF-8. Raises ValueError if not text."""
         if not self.is_text:
             raise ValueError(f"Document is not text: {self.name}")
         return self.content.decode("utf-8")
 
-    def as_yaml(self) -> Any:
-        r"""Parse document content as YAML.
-
-        @public
-
-        Parses the document's text content as YAML and returns Python objects.
-        Uses ruamel.yaml which is safe by default (no code execution).
-
-        Returns:
-            Parsed YAML data: dict, list, str, int, float, bool, or None.
-
-        Raises:
-            ValueError: If document is not text-based.
-            YAMLError: If content is not valid YAML.
+    @cached_property
+    def approximate_tokens_count(self) -> int:
+        """Approximate token count (tiktoken gpt-4 encoding). Images=1080, PDFs/other=1024."""
+        enc = get_tiktoken_encoding()
+        if self.is_text:
+            total = len(enc.encode(self.text))
+        elif self.is_image:
+            total = 1080
+        else:
+            total = 1024
+
+        for att in self.attachments:
+            if att.is_image:
+                total += 1080
+            elif att.is_pdf:
+                total += 1024
+            elif att.is_text:
+                total += len(enc.encode(att.text))
+            else:
+                total += 1024
 
-        Example:
-            >>> # From dict content
-            >>> doc = MyDocument.create(name="config.yaml", content={
-            ...     "server": {"host": "localhost", "port": 8080}
-            ... })
-            >>> doc.as_yaml()  # {'server': {'host': 'localhost', 'port': 8080}}
+        return total
 
-            >>> # From YAML string
-            >>> doc2 = MyDocument(name="simple.yml", content=b"key: value\nitems:\n  - a\n  - b")
-            >>> doc2.as_yaml()  # {'key': 'value', 'items': ['a', 'b']}
-        """
+    def as_yaml(self) -> Any:
+        """Parse content as YAML via ruamel.yaml."""
         yaml = YAML()
         return yaml.load(self.text)  # type: ignore[no-untyped-call, no-any-return]
 
     def as_json(self) -> Any:
-        """Parse document content as JSON.
-
-        @public
-
-        Parses the document's text content as JSON and returns Python objects.
-        Document must contain valid JSON text.
-
-        Returns:
-            Parsed JSON data: dict, list, str, int, float, bool, or None.
-
-        Raises:
-            ValueError: If document is not text-based.
-            JSONDecodeError: If content is not valid JSON.
-
-        Example:
-            >>> # From dict content
-            >>> doc = MyDocument.create(name="data.json", content={"key": "value"})
-            >>> doc.as_json()  # {'key': 'value'}
-
-            >>> # From JSON string
-            >>> doc2 = MyDocument(name="array.json", content=b'[1, 2, 3]')
-            >>> doc2.as_json()  # [1, 2, 3]
-
-            >>> # Invalid JSON
-            >>> bad_doc = MyDocument(name="bad.json", content=b"not json")
-            >>> bad_doc.as_json()  # Raises JSONDecodeError
-        """
+        """Parse content as JSON."""
         return json.loads(self.text)
 
     @overload
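
The identity model also shifts in this hunk: sha256 now hashes name, content, and attachments via compute_document_sha256, content_sha256 keeps the old content-only role, and approximate_tokens_count gives a cheap budgeting estimate. Illustrative usage, assuming the hypothetical subclass from earlier:

    doc = ReportDocument.create(name="note.txt", content="hello world")
    doc.id                        # first 6 chars of the base32 identity hash
    doc.content_sha256            # content-only hash, for deduplication
    doc.approximate_tokens_count  # tiktoken count for text; flat 1080/1024 for images/PDFs
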
@@ -1006,50 +441,8 @@ class Document(BaseModel, ABC):
     @overload
     def as_pydantic_model(self, model_type: type[list[TModel]]) -> list[TModel]: ...
 
-    def as_pydantic_model(
-        self, model_type: type[TModel] | type[list[TModel]]
-    ) -> TModel | list[TModel]:
-        """Parse document content as Pydantic model with validation.
-
-        @public
-
-        Parses JSON or YAML content and validates it against a Pydantic model.
-        Automatically detects format based on MIME type. Supports both single
-        models and lists of models.
-
-        Args:
-            model_type: Pydantic model class to validate against.
-                Can be either:
-                - type[Model] for single model
-                - type[list[Model]] for list of models
-
-        Returns:
-            Validated Pydantic model instance or list of instances.
-
-        Raises:
-            ValueError: If document is not text or type mismatch.
-            ValidationError: If data doesn't match model schema.
-            JSONDecodeError/YAMLError: If content parsing fails.
-
-        Example:
-            >>> from pydantic import BaseModel
-            >>>
-            >>> class User(BaseModel):
-            ...     name: str
-            ...     age: int
-            >>>
-            >>> # Single model
-            >>> doc = MyDocument.create(name="user.json",
-            ...     content={"name": "Alice", "age": 30})
-            >>> user = doc.as_pydantic_model(User)
-            >>> print(user.name)  # "Alice"
-            >>>
-            >>> # List of models
-            >>> doc2 = MyDocument.create(name="users.json",
-            ...     content=[{"name": "Bob", "age": 25}, {"name": "Eve", "age": 28}])
-            >>> users = doc2.as_pydantic_model(list[User])
-            >>> print(len(users))  # 2
-        """
+    def as_pydantic_model(self, model_type: type[TModel] | type[list[TModel]]) -> TModel | list[TModel]:
+        """Parse JSON/YAML content and validate against a Pydantic model. Supports single and list types."""
         data = self.as_yaml() if is_yaml_mime_type(self.mime_type) else self.as_json()
 
         if get_origin(model_type) is list:
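
as_pydantic_model keeps its single-or-list contract through the rewrite, just without the long docstring. A compact sketch (hypothetical model and subclass):

    class User(BaseModel):
        name: str

    doc = ReportDocument.create(name="users.json", content=[{"name": "Alice"}])
    users = doc.as_pydantic_model(list[User])  # [User(name='Alice')]
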
@@ -1064,271 +457,165 @@ class Document(BaseModel, ABC):
1064
457
  single_model = cast(type[TModel], model_type)
1065
458
  return single_model.model_validate(data)
1066
459
 
-    def as_markdown_list(self) -> list[str]:
-        r"""Parse document as markdown-separated list of sections.
-
-        @public
-
-        Splits text content using markdown separator ("\n\n-----------------\n\n").
-        Designed for markdown documents with multiple sections.
-
-        Returns:
-            List of string sections (preserves whitespace within sections).
-
-        Raises:
-            ValueError: If document is not text-based.
-
-        Example:
-            >>> # Using create with list
-            >>> sections = ["# Chapter 1\nIntroduction", "# Chapter 2\nDetails"]
-            >>> doc = MyDocument.create(name="book.md", content=sections)
-            >>> doc.as_markdown_list()  # Returns original sections
-
-            >>> # Manual creation with separator
-            >>> content = "Part 1\n\n-----------------\n\nPart 2\n\n-----------------\n\nPart 3"
-            >>> doc2 = MyDocument(name="parts.md", content=content.encode())
-            >>> doc2.as_markdown_list()  # ['Part 1', 'Part 2', 'Part 3']
-        """
-        return self.text.split(self.MARKDOWN_LIST_SEPARATOR)
+    def _parse_structured(self) -> Any:
+        """Parse content as JSON or YAML based on extension. Strict — no guessing."""
+        name_lower = self.name.lower()
+        if name_lower.endswith(".json"):
+            return self.as_json()
+        if name_lower.endswith((".yaml", ".yml")):
+            return self.as_yaml()
+        raise ValueError(f"Cannot parse '{self.name}' as structured data — use .json or .yaml extension")
 
     def parse(self, type_: type[Any]) -> Any:
-        r"""Parse document content to original type (reverses create conversion).
-
-        @public
-
-        This method reverses the automatic conversion performed by the `create`
-        classmethod. It intelligently parses the bytes content based on the
-        document's file extension and converts to the requested type.
-
-        Designed for roundtrip conversion:
-            >>> original = {"key": "value"}
-            >>> doc = MyDocument.create(name="data.json", content=original)
-            >>> restored = doc.parse(dict)
-            >>> assert restored == original  # True
-
-        Args:
-            type_: Target type to parse content into. Supported types:
-                - bytes: Returns raw content (no conversion)
-                - str: Decodes UTF-8 text
-                - dict: Parses JSON (.json) or YAML (.yaml/.yml)
-                - list: Splits markdown (.md) or parses JSON/YAML
-                - BaseModel subclasses: Validates JSON/YAML into model
-
-        Returns:
-            Content parsed to the requested type.
-
-        Raises:
-            ValueError: If type is unsupported or parsing fails.
-
-        Extension Rules:
-            - .json → JSON parsing for dict/list/BaseModel
-            - .yaml/.yml → YAML parsing for dict/list/BaseModel
-            - .md + list → Split by markdown separator
-            - Any + str → UTF-8 decode
-            - Any + bytes → Raw content
-
-        Example:
-            >>> # String content
-            >>> doc = MyDocument(name="test.txt", content=b"Hello")
-            >>> doc.parse(str)
-            'Hello'
-
-            >>> # JSON content
-            >>> doc = MyDocument.create(name="data.json", content={"key": "value"})
-            >>> doc.parse(dict)  # Returns {'key': 'value'}
-
-            >>> # Markdown list
-            >>> items = ["Item 1", "Item 2"]
-            >>> content = "\n\n---\n\n".join(items).encode()
-            >>> doc = MyDocument(name="list.md", content=content)
-            >>> doc.parse(list)
-            ['Item 1', 'Item 2']
-        """
-        # Handle basic types
+        """Parse content to the requested type. Reverses create() conversion. Extension-based dispatch, no guessing."""
         if type_ is bytes:
             return self.content
-        elif type_ is str:
-            # Handle empty content specially
-            if len(self.content) == 0:
-                return ""
-            return self.text
-
-        # Handle structured data based on extension
-        name_lower = self.name.lower()
-
-        # JSON files
-        if name_lower.endswith(".json"):
-            if type_ is dict or type_ is list:
-                result = self.as_json()
-                # Ensure the result is the correct type
-                if type_ is dict and not isinstance(result, dict):
-                    raise ValueError(f"Expected dict but got {type(result).__name__}")
-                if type_ is list and not isinstance(result, list):
-                    raise ValueError(f"Expected list but got {type(result).__name__}")
-                return result
-            elif issubclass(type_, BaseModel):
-                return self.as_pydantic_model(type_)
-            else:
-                raise ValueError(f"Cannot parse JSON file to type {type_}")
-
-        # YAML files
-        elif name_lower.endswith((".yaml", ".yml")):
-            if type_ is dict or type_ is list:
-                result = self.as_yaml()
-                # Ensure the result is the correct type
-                if type_ is dict and not isinstance(result, dict):
-                    raise ValueError(f"Expected dict but got {type(result).__name__}")
-                if type_ is list and not isinstance(result, list):
-                    raise ValueError(f"Expected list but got {type(result).__name__}")
-                return result
-            elif issubclass(type_, BaseModel):
-                return self.as_pydantic_model(type_)
-            else:
-                raise ValueError(f"Cannot parse YAML file to type {type_}")
+        if type_ is str:
+            return self.text if self.content else ""
+        if type_ is dict or type_ is list:
+            data = self._parse_structured()
+            if not isinstance(data, type_):
+                raise ValueError(f"Expected {type_.__name__} but got {type(data).__name__}")
+            return data  # pyright: ignore[reportUnknownVariableType]
+        if isinstance(type_, type) and issubclass(type_, BaseModel):  # pyright: ignore[reportUnnecessaryIsInstance]
+            return self.as_pydantic_model(type_)
+        raise ValueError(f"Unsupported parse type: {type_}")
 
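The round-trip contract from the old docstring carries over to the extension-dispatched rewrite; a sketch, assuming a concrete `MyDocument` subclass (note that `.md` list splitting is gone along with `as_markdown_list`):

```python
original = {"key": "value"}
doc = MyDocument.create(name="data.json", content=original)

assert doc.parse(dict) == original       # .json name routes through as_json()
assert doc.parse(bytes) == doc.content   # bytes: raw content, no conversion
assert doc.parse(str) == doc.text        # str: UTF-8 decode ("" when empty)

doc.parse(set)  # raises ValueError: Unsupported parse type
```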
-        # Markdown files with lists
-        elif name_lower.endswith(".md") and type_ is list:
-            return self.as_markdown_list()
+    @property
+    def source_documents(self) -> tuple[str, ...]:
+        """Document SHA256 hashes from sources (filtered by is_document_sha256)."""
+        return tuple(src for src in self.sources if is_document_sha256(src))
 
-        # Default: try to return as requested basic type
-        elif type_ is dict or type_ is list:
-            # Try JSON first, then YAML
-            try:
-                result = self.as_json()
-                # Ensure the result is the correct type
-                if type_ is dict and not isinstance(result, dict):
-                    raise ValueError(f"Expected dict but got {type(result).__name__}")
-                if type_ is list and not isinstance(result, list):
-                    raise ValueError(f"Expected list but got {type(result).__name__}")
-                return result
-            except (json.JSONDecodeError, ValueError):
-                try:
-                    result = self.as_yaml()
-                    # Ensure the result is the correct type
-                    if type_ is dict and not isinstance(result, dict):
-                        raise ValueError(f"Expected dict but got {type(result).__name__}")
-                    if type_ is list and not isinstance(result, list):
-                        raise ValueError(f"Expected list but got {type(result).__name__}")
-                    return result
-                except Exception as e:
-                    raise ValueError(f"Cannot parse content to {type_}") from e
-
-        raise ValueError(f"Unsupported type {type_} for file {self.name}")
+    @property
+    def source_references(self) -> tuple[str, ...]:
+        """Non-hash reference strings from sources (URLs, file paths, etc.)."""
+        return tuple(src for src in self.sources if not is_document_sha256(src))
+
+    def has_source(self, source: "Document | str") -> bool:
+        """Check if a source (Document or string) is in this document's sources."""
+        if isinstance(source, str):
+            return source in self.sources
+        if isinstance(source, Document):  # type: ignore[misc]
+            return source.sha256 in self.sources
+        raise TypeError(f"Invalid source type: {type(source)}")  # pyright: ignore[reportUnreachable]
 
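A sketch of the new provenance helpers; `parent` and the URL are hypothetical, and the `sources` constructor parameter is inferred from the `from_dict` call shown further below:

```python
doc = MyDocument(
    name="summary.md",
    content=b"...",
    sources=(parent.sha256, "https://example.com/spec"),
)

doc.source_documents    # (parent.sha256,) -- entries recognized as document hashes
doc.source_references   # ("https://example.com/spec",) -- everything else
doc.has_source(parent)  # True: a Document is matched by its sha256
doc.has_source("https://example.com/spec")  # True: strings are matched verbatim
```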
     @final
     def serialize_model(self) -> dict[str, Any]:
-        """Serialize document to dictionary for storage or transmission.
-
-        Creates a complete JSON-serializable representation of the document
-        with all metadata and properly encoded content. Automatically chooses
-        the most appropriate encoding (UTF-8 for text, base64 for binary).
-
-        Returns:
-            Dictionary with the following keys:
-            - name: Document filename (str)
-            - description: Optional description (str | None)
-            - base_type: Persistence type - "flow", "task", or "temporary" (str)
-            - size: Content size in bytes (int)
-            - id: Short hash identifier, first 6 chars of SHA256 (str)
-            - sha256: Full SHA256 hash in base32 encoding (str)
-            - mime_type: Detected MIME type (str)
-            - content: Encoded content (str)
-            - content_encoding: Either "utf-8" or "base64" (str)
-
-        Encoding Strategy:
-            - Text files (text/*, application/json, etc.) → UTF-8 string
-            - Binary files (images, PDFs, etc.) → Base64 string
-            - Invalid UTF-8 in text files → UTF-8 with replacement chars
-
-        Example:
-            >>> doc = MyDocument.create(name="data.json", content={"key": "value"})
-            >>> serialized = doc.serialize_model()
-            >>> serialized["content_encoding"]  # "utf-8"
-            >>> serialized["mime_type"]  # "application/json"
-        """
-        result = {
+        """Serialize to JSON-compatible dict for storage/transmission. Roundtrips with from_dict()."""
+        result: dict[str, Any] = {  # nosemgrep: mutable-field-on-frozen-pydantic-model
             "name": self.name,
             "description": self.description,
-            "base_type": self.get_base_type(),
             "size": self.size,
             "id": self.id,
             "sha256": self.sha256,
+            "content_sha256": self.content_sha256,
             "mime_type": self.mime_type,
+            "sources": list(self.sources),
+            "origins": list(self.origins),
+            "canonical_name": canonical_name_key(self.__class__),
+            "class_name": self.__class__.__name__,
         }
 
-        # Try to encode content as UTF-8, fall back to base64
-        if self.is_text or self.mime_type.startswith("text/"):
+        if self.is_text:
             try:
                 result["content"] = self.content.decode("utf-8")
                 result["content_encoding"] = "utf-8"
             except UnicodeDecodeError:
-                # For text files with encoding issues, use UTF-8 with replacement
                 result["content"] = self.content.decode("utf-8", errors="replace")
                 result["content_encoding"] = "utf-8"
         else:
-            # Binary content - use base64
             result["content"] = base64.b64encode(self.content).decode("ascii")
             result["content_encoding"] = "base64"
 
+        serialized_attachments: list[dict[str, Any]] = []  # nosemgrep: mutable-field-on-frozen-pydantic-model
+        for att in self.attachments:
+            att_data: dict[str, Any] = {"name": att.name, "description": att.description}  # nosemgrep: mutable-field-on-frozen-pydantic-model
+            if att.is_text:
+                att_data["content"] = att.content.decode("utf-8", errors="replace")
+                att_data["content_encoding"] = "utf-8"
+            else:
+                att_data["content"] = base64.b64encode(att.content).decode("ascii")
+                att_data["content_encoding"] = "base64"
+            att_data["mime_type"] = att.mime_type
+            att_data["size"] = att.size
+            serialized_attachments.append(att_data)
+        result["attachments"] = serialized_attachments
+
         return result
 
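The shape described by the deleted docstring mostly still applies; `base_type` is gone, while `content_sha256`, `sources`, `origins`, `canonical_name`, `class_name`, and `attachments` are new. A sketch, assuming a concrete subclass:

```python
doc = MyDocument.create(name="data.json", content={"key": "value"})
serialized = doc.serialize_model()

serialized["content_encoding"]  # "utf-8" for text content, "base64" for binary
serialized["mime_type"]         # "application/json"
serialized["attachments"]       # [] when the document has none
```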
     @final
     @classmethod
     def from_dict(cls, data: dict[str, Any]) -> Self:
-        r"""Deserialize document from dictionary (inverse of serialize_model).
-
-        Reconstructs a Document instance from the dictionary format produced
-        by serialize_model(). Automatically handles content decoding based on
-        the content_encoding field.
-
-        Args:
-            data: Dictionary containing serialized document. Required keys:
-                - name: Document filename (str)
-                - content: Encoded content (str or bytes)
-                Optional keys:
-                - description: Document description (str | None)
-                - content_encoding: "utf-8" or "base64" (defaults to "utf-8")
-
-        Returns:
-            New Document instance with restored content.
-
-        Raises:
-            ValueError: If content type is invalid or base64 decoding fails
-            KeyError: If required keys are missing from data dictionary
-
-        Note:
-            Provides roundtrip guarantee with serialize_model().
-            Content and name are preserved exactly.
-
-        Example:
-            >>> data = {
-            ...     "name": "config.yaml",
-            ...     "content": "key: value\n",
-            ...     "content_encoding": "utf-8",
-            ...     "description": "Config file"
-            ... }
-            >>> doc = MyDocument.from_dict(data)
-        """
-        # Extract content and encoding
+        """Deserialize from dict produced by serialize_model(). Roundtrip guarantee."""
         content_raw = data.get("content", "")
         content_encoding = data.get("content_encoding", "utf-8")
 
-        # Decode content based on encoding
         content: bytes
         if content_encoding == "base64":
-            assert isinstance(content_raw, str), "base64 content must be string"
+            if not isinstance(content_raw, str):
+                raise ValueError("base64 content must be string")
             content = base64.b64decode(content_raw)
         elif isinstance(content_raw, str):
-            # Default to UTF-8
             content = content_raw.encode("utf-8")
         elif isinstance(content_raw, bytes):
             content = content_raw
         else:
             raise ValueError(f"Invalid content type: {type(content_raw)}")
 
-        # Create document with the required fields
+        attachments: tuple[Attachment, ...] | None = None
+        if attachments_raw := data.get("attachments"):
+            att_list: list[Attachment] = []  # nosemgrep: mutable-field-on-frozen-pydantic-model
+            for att_data in attachments_raw:
+                att_content_raw = att_data.get("content", "")
+                if att_data.get("content_encoding") == "base64":
+                    att_content = base64.b64decode(att_content_raw)
+                elif isinstance(att_content_raw, str):
+                    att_content = att_content_raw.encode("utf-8")
+                else:
+                    att_content = att_content_raw
+                att_list.append(Attachment(name=att_data["name"], content=att_content, description=att_data.get("description")))
+            attachments = tuple(att_list)
+
         return cls(
             name=data["name"],
             content=content,
             description=data.get("description"),
+            sources=tuple(data.get("sources") or ()),
+            origins=tuple(data.get("origins") or ()),
+            attachments=attachments,
+        )
+
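The roundtrip guarantee from the deleted docstring, as a sketch with its own example data:

```python
data = {
    "name": "config.yaml",
    "content": "key: value\n",
    "content_encoding": "utf-8",
    "description": "Config file",
}
doc = MyDocument.from_dict(data)

restored = MyDocument.from_dict(doc.serialize_model())
assert restored.content == doc.content  # content preserved exactly
assert restored.name == doc.name        # name preserved exactly
```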
+    @final
+    def model_convert(self, new_type: type[TDocument], *, update: dict[str, Any] | None = None) -> TDocument:
+        """Convert to a different Document subclass with optional field overrides."""
+        try:
+            if not isinstance(new_type, type):  # pyright: ignore[reportUnnecessaryIsInstance]
+                raise TypeError(f"new_type must be a class, got {new_type}")  # pyright: ignore[reportUnreachable]
+            if not issubclass(new_type, Document):  # pyright: ignore[reportUnnecessaryIsInstance]
+                raise TypeError(f"new_type must be a subclass of Document, got {new_type}")  # pyright: ignore[reportUnreachable]
+        except (TypeError, AttributeError) as err:
+            raise TypeError(f"new_type must be a subclass of Document, got {new_type}") from err
+
+        if new_type is Document:
+            raise TypeError("Cannot instantiate Document directly — use a concrete subclass")
+
+        data: dict[str, Any] = {  # nosemgrep: mutable-field-on-frozen-pydantic-model
+            "name": self.name,
+            "content": self.content,
+            "description": self.description,
+            "sources": self.sources,
+            "origins": self.origins,
+            "attachments": self.attachments,
+        }
+
+        if update:
+            data.update(update)
+
+        return new_type(
+            name=data["name"],
+            content=data["content"],
+            description=data.get("description"),
+            sources=data.get("sources"),
+            origins=data.get("origins"),
+            attachments=data.get("attachments"),
         )
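
A sketch of the new converter; `RawDocument` and `ReportDocument` are hypothetical concrete subclasses:

```python
raw = RawDocument(name="notes.md", content=b"# Notes")
report = raw.model_convert(ReportDocument, update={"description": "Promoted to report"})

assert type(report) is ReportDocument
assert report.name == raw.name          # fields carry over by default
assert report.sources == raw.sources    # provenance is preserved

raw.model_convert(Document)  # raises TypeError: Document is abstract
```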