ai-pipeline-core 0.3.3__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (96)
  1. ai_pipeline_core/__init__.py +70 -144
  2. ai_pipeline_core/deployment/__init__.py +6 -18
  3. ai_pipeline_core/deployment/base.py +392 -212
  4. ai_pipeline_core/deployment/contract.py +6 -10
  5. ai_pipeline_core/{utils → deployment}/deploy.py +50 -69
  6. ai_pipeline_core/deployment/helpers.py +16 -17
  7. ai_pipeline_core/{progress.py → deployment/progress.py} +23 -24
  8. ai_pipeline_core/{utils/remote_deployment.py → deployment/remote.py} +11 -14
  9. ai_pipeline_core/docs_generator/__init__.py +54 -0
  10. ai_pipeline_core/docs_generator/__main__.py +5 -0
  11. ai_pipeline_core/docs_generator/cli.py +196 -0
  12. ai_pipeline_core/docs_generator/extractor.py +324 -0
  13. ai_pipeline_core/docs_generator/guide_builder.py +644 -0
  14. ai_pipeline_core/docs_generator/trimmer.py +35 -0
  15. ai_pipeline_core/docs_generator/validator.py +114 -0
  16. ai_pipeline_core/document_store/__init__.py +13 -0
  17. ai_pipeline_core/document_store/_summary.py +9 -0
  18. ai_pipeline_core/document_store/_summary_worker.py +170 -0
  19. ai_pipeline_core/document_store/clickhouse.py +492 -0
  20. ai_pipeline_core/document_store/factory.py +38 -0
  21. ai_pipeline_core/document_store/local.py +312 -0
  22. ai_pipeline_core/document_store/memory.py +85 -0
  23. ai_pipeline_core/document_store/protocol.py +68 -0
  24. ai_pipeline_core/documents/__init__.py +12 -14
  25. ai_pipeline_core/documents/_context_vars.py +85 -0
  26. ai_pipeline_core/documents/_hashing.py +52 -0
  27. ai_pipeline_core/documents/attachment.py +85 -0
  28. ai_pipeline_core/documents/context.py +128 -0
  29. ai_pipeline_core/documents/document.py +318 -1434
  30. ai_pipeline_core/documents/mime_type.py +37 -82
  31. ai_pipeline_core/documents/utils.py +4 -12
  32. ai_pipeline_core/exceptions.py +10 -62
  33. ai_pipeline_core/images/__init__.py +32 -85
  34. ai_pipeline_core/images/_processing.py +5 -11
  35. ai_pipeline_core/llm/__init__.py +6 -4
  36. ai_pipeline_core/llm/ai_messages.py +106 -81
  37. ai_pipeline_core/llm/client.py +267 -158
  38. ai_pipeline_core/llm/model_options.py +12 -84
  39. ai_pipeline_core/llm/model_response.py +53 -99
  40. ai_pipeline_core/llm/model_types.py +8 -23
  41. ai_pipeline_core/logging/__init__.py +2 -7
  42. ai_pipeline_core/logging/logging.yml +1 -1
  43. ai_pipeline_core/logging/logging_config.py +27 -37
  44. ai_pipeline_core/logging/logging_mixin.py +15 -41
  45. ai_pipeline_core/observability/__init__.py +32 -0
  46. ai_pipeline_core/observability/_debug/__init__.py +30 -0
  47. ai_pipeline_core/observability/_debug/_auto_summary.py +94 -0
  48. ai_pipeline_core/{debug/config.py → observability/_debug/_config.py} +11 -7
  49. ai_pipeline_core/{debug/content.py → observability/_debug/_content.py} +134 -75
  50. ai_pipeline_core/{debug/processor.py → observability/_debug/_processor.py} +16 -17
  51. ai_pipeline_core/{debug/summary.py → observability/_debug/_summary.py} +113 -37
  52. ai_pipeline_core/observability/_debug/_types.py +75 -0
  53. ai_pipeline_core/{debug/writer.py → observability/_debug/_writer.py} +126 -196
  54. ai_pipeline_core/observability/_document_tracking.py +146 -0
  55. ai_pipeline_core/observability/_initialization.py +194 -0
  56. ai_pipeline_core/observability/_logging_bridge.py +57 -0
  57. ai_pipeline_core/observability/_summary.py +81 -0
  58. ai_pipeline_core/observability/_tracking/__init__.py +6 -0
  59. ai_pipeline_core/observability/_tracking/_client.py +178 -0
  60. ai_pipeline_core/observability/_tracking/_internal.py +28 -0
  61. ai_pipeline_core/observability/_tracking/_models.py +138 -0
  62. ai_pipeline_core/observability/_tracking/_processor.py +158 -0
  63. ai_pipeline_core/observability/_tracking/_service.py +311 -0
  64. ai_pipeline_core/observability/_tracking/_writer.py +229 -0
  65. ai_pipeline_core/{tracing.py → observability/tracing.py} +139 -335
  66. ai_pipeline_core/pipeline/__init__.py +10 -0
  67. ai_pipeline_core/pipeline/decorators.py +915 -0
  68. ai_pipeline_core/pipeline/options.py +16 -0
  69. ai_pipeline_core/prompt_manager.py +16 -102
  70. ai_pipeline_core/settings.py +26 -31
  71. ai_pipeline_core/testing.py +9 -0
  72. ai_pipeline_core-0.4.0.dist-info/METADATA +807 -0
  73. ai_pipeline_core-0.4.0.dist-info/RECORD +76 -0
  74. ai_pipeline_core/debug/__init__.py +0 -26
  75. ai_pipeline_core/documents/document_list.py +0 -420
  76. ai_pipeline_core/documents/flow_document.py +0 -112
  77. ai_pipeline_core/documents/task_document.py +0 -117
  78. ai_pipeline_core/documents/temporary_document.py +0 -74
  79. ai_pipeline_core/flow/__init__.py +0 -9
  80. ai_pipeline_core/flow/config.py +0 -494
  81. ai_pipeline_core/flow/options.py +0 -75
  82. ai_pipeline_core/pipeline.py +0 -718
  83. ai_pipeline_core/prefect.py +0 -63
  84. ai_pipeline_core/prompt_builder/__init__.py +0 -5
  85. ai_pipeline_core/prompt_builder/documents_prompt.jinja2 +0 -23
  86. ai_pipeline_core/prompt_builder/global_cache.py +0 -78
  87. ai_pipeline_core/prompt_builder/new_core_documents_prompt.jinja2 +0 -6
  88. ai_pipeline_core/prompt_builder/prompt_builder.py +0 -253
  89. ai_pipeline_core/prompt_builder/system_prompt.jinja2 +0 -41
  90. ai_pipeline_core/storage/__init__.py +0 -8
  91. ai_pipeline_core/storage/storage.py +0 -628
  92. ai_pipeline_core/utils/__init__.py +0 -8
  93. ai_pipeline_core-0.3.3.dist-info/METADATA +0 -569
  94. ai_pipeline_core-0.3.3.dist-info/RECORD +0 -57
  95. {ai_pipeline_core-0.3.3.dist-info → ai_pipeline_core-0.4.0.dist-info}/WHEEL +0 -0
  96. {ai_pipeline_core-0.3.3.dist-info → ai_pipeline_core-0.4.0.dist-info}/licenses/LICENSE +0 -0
@@ -1,25 +1,18 @@
  """Document abstraction layer for AI pipeline flows.
 
- @public
-
- This module provides the core document abstraction for working with various types of data
- in AI pipelines. Documents are immutable Pydantic models that wrap binary content with metadata.
+ Immutable Pydantic models wrapping binary content with metadata, MIME detection,
+ SHA256 hashing, and serialization. All documents must be concrete subclasses of Document.
  """
 
- from __future__ import annotations
-
  import base64
- import hashlib
+ import functools
  import json
- from abc import ABC, abstractmethod
- from base64 import b32encode
  from enum import StrEnum
  from functools import cached_property
  from io import BytesIO
  from typing import (
      Any,
      ClassVar,
-     Literal,
      Self,
      TypeVar,
      cast,
@@ -33,16 +26,19 @@ import tiktoken
  from pydantic import (
      BaseModel,
      ConfigDict,
-     Field,
      ValidationInfo,
      field_serializer,
      field_validator,
+     model_validator,
  )
  from ruamel.yaml import YAML
 
+ from ai_pipeline_core.documents._context_vars import get_task_context, is_registration_suppressed
+ from ai_pipeline_core.documents._hashing import compute_content_sha256, compute_document_sha256
  from ai_pipeline_core.documents.utils import canonical_name_key, is_document_sha256
  from ai_pipeline_core.exceptions import DocumentNameError, DocumentSizeError
 
+ from .attachment import Attachment
  from .mime_type import (
      detect_mime_type,
      is_image_mime_type,
@@ -54,223 +50,75 @@ from .mime_type import (
  TModel = TypeVar("TModel", bound=BaseModel)
  TDocument = TypeVar("TDocument", bound="Document")
 
+ # Registry of canonical_name -> Document subclass for collision detection.
+ # Only non-test classes are registered. Test modules (tests.*, conftest, etc.) are skipped.
+ _canonical_name_registry: dict[str, type["Document"]] = {}  # nosemgrep: no-mutable-module-globals
 
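As a quick orientation for readers of this diff, here is a minimal usage sketch of the 0.4.0 surface described above. The ReportDocument class and file names are invented for illustration, and the import path simply mirrors the module shown in this diff (ai_pipeline_core/documents/document.py):

    from ai_pipeline_core.documents.document import Document

    # Documents must be concrete subclasses; Document itself cannot be instantiated.
    class ReportDocument(Document):
        pass

    # Extension-driven serialization: dict content for a .json name becomes indented JSON bytes.
    doc = ReportDocument.create(name="report.json", content={"status": "ok", "items": [1, 2, 3]})
    assert doc.content.startswith(b"{")
    assert doc.as_json() == {"status": "ok", "items": [1, 2, 3]}

    # Plain strings are UTF-8 encoded as-is.
    note = ReportDocument.create(name="note.txt", content="hello")
    assert note.content == b"hello"

    # The base class is guarded: instantiating Document directly raises TypeError.
    try:
        Document(name="raw.bin", content=b"\x00")
    except TypeError:
        pass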
- class Document(BaseModel, ABC):
59
- r"""Abstract base class for all documents in the AI Pipeline Core system.
60
-
61
- @public
62
-
63
- Document is the fundamental data abstraction for all content flowing through
64
- pipelines. It provides automatic encoding, MIME type detection, serialization,
65
- and validation. All documents must be subclassed from FlowDocument or TaskDocument
66
- based on their persistence requirements.
67
-
68
- VALIDATION IS AUTOMATIC - Do not add manual validation!
69
- Size validation, name validation, and MIME type detection are built-in.
70
- The framework handles all standard validations internally.
71
-
72
- # WRONG - These checks already happen automatically:
73
- if document.size > document.MAX_CONTENT_SIZE:
74
- raise DocumentSizeError(...) # NO! Already handled
75
- document.validate_file_name(document.name) # NO! Automatic
76
-
77
- Best Practices:
78
- - Use create() classmethod for automatic type conversion (default preferred)
79
- - Omit description parameter unless truly needed for metadata
80
- - When using LLM functions, pass AIMessages or str. Wrap any Document values
81
- in AIMessages([...]). Do not call .text yourself
82
-
83
- Standard Usage:
84
- >>> # CORRECT - minimal parameters
85
- >>> doc = MyDocument.create(name="data.json", content={"key": "value"})
86
-
87
- >>> # AVOID - unnecessary description
88
- >>> doc = MyDocument.create(
89
- ... name="data.json",
90
- ... content={"key": "value"},
91
- ... description="This is data" # Usually not needed!
92
- ... )
93
-
94
- Key features:
95
- - Immutable by default (frozen Pydantic model)
96
- - Automatic MIME type detection
97
- - Content size validation
98
- - SHA256 hashing for deduplication
99
- - Support for text, JSON, YAML, PDF, and image formats
100
- - Conversion utilities between different formats
101
- - Source provenance tracking via sources field
102
- - Document type conversion via model_convert() method
103
- - Standard Pydantic model_copy() for same-type copying
104
-
105
- Class Variables:
106
- MAX_CONTENT_SIZE: Maximum allowed content size in bytes (default 25MB)
107
-
108
- Attributes:
109
- name: Document filename (validated for security)
110
- description: Optional human-readable description
111
- content: Raw document content as bytes
112
- sources: List of source references tracking document provenance
113
-
114
- Creating Documents:
115
- **Use the `create` classmethod** for most use cases. It accepts various
116
- content types (str, dict, list, BaseModel) and converts them automatically.
117
- Only use __init__ directly when you already have bytes content.
118
-
119
- >>> # RECOMMENDED: Use create for automatic conversion
120
- >>> doc = MyDocument.create(name="data.json", content={"key": "value"})
121
- >>>
122
- >>> # Direct constructor: Only for bytes
123
- >>> doc = MyDocument(name="data.bin", content=b"\x00\x01\x02")
124
-
125
- Warning:
126
- - Document subclasses should NOT start with 'Test' prefix (pytest conflict)
127
- - Cannot instantiate Document directly - must subclass FlowDocument or TaskDocument
128
- - Cannot add custom fields - only name, description, content, sources are allowed
129
- - Document is an abstract class and cannot be instantiated directly
130
-
131
- Metadata Attachment Patterns:
132
- Since custom fields are not allowed, use these patterns for metadata:
133
- 1. Use the 'description' field for human-readable metadata
134
- 2. Embed metadata in content (e.g., JSON with data + metadata fields)
135
- 3. Create a separate MetadataDocument type to accompany data documents
136
- 4. Use document naming conventions (e.g., "data_v2_2024.json")
137
- 5. Store metadata in flow_options
138
-
139
- FILES Enum Best Practice:
140
- When defining a FILES enum, NEVER use magic strings to reference files.
141
- Always use the enum values to maintain type safety and refactorability.
142
-
143
- WRONG - Magic strings/numbers:
144
- doc = ConfigDocument.create(name="config.yaml", content=data) # NO!
145
- doc = docs.get_by("settings.json") # NO! Magic string
146
- files = ["config.yaml", "settings.json"] # NO! Magic strings
147
-
148
- CORRECT - Use enum references:
149
- doc = ConfigDocument.create(
150
- name=ConfigDocument.FILES.CONFIG, # YES! Type-safe
151
- content=data
152
- )
153
- doc = docs.get_by(ConfigDocument.FILES.SETTINGS) # YES!
154
- files = [
155
- ConfigDocument.FILES.CONFIG,
156
- ConfigDocument.FILES.SETTINGS
157
- ] # YES! Refactorable
158
-
159
- Pydantic Model Interaction:
160
- Documents provide DIRECT support for Pydantic models. Use the built-in
161
- methods instead of manual JSON conversion.
162
-
163
- WRONG - Manual JSON conversion:
164
- # Don't do this - manual JSON handling
165
- json_str = doc.text
166
- json_data = json.loads(json_str)
167
- model = MyModel(**json_data) # NO! Use as_pydantic_model
168
-
169
- # Don't do this - manual serialization
170
- json_str = model.model_dump_json()
171
- doc = MyDocument.create(name="data.json", content=json_str) # NO!
172
-
173
- CORRECT - Direct Pydantic interaction:
174
- # Reading Pydantic model from document
175
- model = doc.as_pydantic_model(MyModel) # Direct conversion
176
- models = doc.as_pydantic_model(list[MyModel]) # List support
177
-
178
- # Creating document from Pydantic model
179
- doc = MyDocument.create(
180
- name="data.json",
181
- content=model # Direct BaseModel support
182
- )
183
57
 
184
- # Round-trip is seamless
185
- original_model = MyModel(field="value")
186
- doc = MyDocument.create(name="data.json", content=original_model)
187
- restored = doc.as_pydantic_model(MyModel)
188
- assert restored == original_model # Perfect round-trip
189
-
190
- Example:
191
- >>> from enum import StrEnum
192
- >>> from pydantic import BaseModel
193
- >>>
194
- >>> # Simple document:
195
- >>> class MyDocument(FlowDocument):
196
- ... pass
197
- >>>
198
- >>> # Document with file restrictions:
199
- >>> class ConfigDocument(FlowDocument):
200
- ... class FILES(StrEnum):
201
- ... CONFIG = "config.yaml"
202
- ... SETTINGS = "settings.json"
203
- >>>
204
- >>> # CORRECT FILES usage - no magic strings:
205
- >>> doc = ConfigDocument.create(
206
- ... name=ConfigDocument.FILES.CONFIG, # Use enum
207
- ... content={"key": "value"}
208
- ... )
209
- >>>
210
- >>> # CORRECT Pydantic usage:
211
- >>> class Config(BaseModel):
212
- ... key: str
213
- >>>
214
- >>> # Direct creation from Pydantic model
215
- >>> config_model = Config(key="value")
216
- >>> doc = MyDocument.create(name="data.json", content=config_model)
217
- >>>
218
- >>> # Direct extraction to Pydantic model
219
- >>> restored = doc.as_pydantic_model(Config)
220
- >>> print(restored.key) # "value"
221
- >>>
222
- >>> # Track document provenance with sources
223
- >>> source_doc = MyDocument.create(name="input.txt", content="raw data")
224
- >>> processed = MyDocument.create(
225
- ... name="output.txt",
226
- ... content="processed data",
227
- ... sources=[source_doc.sha256] # Reference source document
228
- ... )
229
- >>> processed.has_source(source_doc) # True
230
- >>>
231
- >>> # Document copying and type conversion:
232
- >>> # Standard Pydantic model_copy (doesn't validate updates)
233
- >>> copied = doc.model_copy(update={"name": "new_name.json"})
234
- >>> # Type conversion with validation via model_convert
235
- >>> task_doc = MyTaskDoc.create(name="temp.json", content={"data": "value"})
236
- >>> flow_doc = task_doc.model_convert(MyFlowDoc) # Convert to FlowDocument
237
- >>> flow_doc.is_flow # True
238
- """
+ def _is_test_module(cls: type) -> bool:
+     """Check if a class is defined in a test module (skip collision detection)."""
+     module = getattr(cls, "__module__", "") or ""
+     parts = module.split(".")
+     return any(p == "tests" or p.startswith("test_") or p == "conftest" for p in parts)
 
-     MAX_CONTENT_SIZE: ClassVar[int] = 25 * 1024 * 1024
-     """Maximum allowed content size in bytes (default 25MB).
 
-     @public
-     """
+ @functools.cache
+ def get_tiktoken_encoding() -> tiktoken.Encoding:
+     """Lazy-cached tiktoken encoding. Deferred to first use, cached forever."""
+     return tiktoken.encoding_for_model("gpt-4")
 
-     DESCRIPTION_EXTENSION: ClassVar[str] = ".description.md"
-     """File extension for description files."""
 
-     SOURCES_EXTENSION: ClassVar[str] = ".sources.json"
-     """File extension for sources metadata files."""
+ def _serialize_to_json(data: Any) -> bytes:
+     """JSON serialize with 2-space indent."""
+     return json.dumps(data, indent=2).encode("utf-8")
 
-     MARKDOWN_LIST_SEPARATOR: ClassVar[str] = "\n\n-----------------\n\n"
-     """Separator for markdown list items."""
 
-     def __init_subclass__(cls, **kwargs: Any) -> None:
-         """Validate subclass configuration at definition time.
+ def _serialize_to_yaml(data: Any) -> bytes:
+     """YAML serialize via ruamel."""
+     yaml = YAML()
+     stream = BytesIO()
+     yaml.dump(data, stream)  # pyright: ignore[reportUnknownMemberType]
+     return stream.getvalue()
 
-         Performs several validation checks when a Document subclass is defined:
-         1. Prevents class names starting with 'Test' (pytest conflict)
-         2. Validates FILES enum if present (must be StrEnum)
-         3. Prevents adding custom fields beyond name, description, content
 
-         Args:
-             **kwargs: Additional keyword arguments passed to parent __init_subclass__.
+ def _serialize_structured(name: str, data: Any) -> bytes:
+     """Serialize dict/list to JSON or YAML based on file extension."""
+     name_lower = name.lower()
+     if name_lower.endswith((".yaml", ".yml")):
+         return _serialize_to_yaml(data)
+     if name_lower.endswith(".json"):
+         return _serialize_to_json(data)
+     raise ValueError(f"Structured content ({type(data).__name__}) requires .json or .yaml extension, got: {name}")
 
-         Raises:
-             TypeError: If subclass violates naming rules, FILES enum requirements,
-                 or attempts to add extra fields.
 
-         Note:
-             This validation happens at class definition time, not instantiation,
-             providing early error detection during development.
-         """
+ def _convert_content(name: str, content: str | bytes | dict[str, Any] | list[Any] | BaseModel) -> bytes:
+     """Convert any supported content type to bytes. Dispatch by isinstance."""
+     if isinstance(content, bytes):
+         return content
+     if isinstance(content, str):
+         return content.encode("utf-8")
+     if isinstance(content, dict):
+         return _serialize_structured(name, content)
+     if isinstance(content, BaseModel):
+         return _serialize_structured(name, content.model_dump(mode="json"))
+     if isinstance(content, list):  # pyright: ignore[reportUnnecessaryIsInstance]
+         data = [item.model_dump(mode="json") if isinstance(item, BaseModel) else item for item in content]
+         return _serialize_structured(name, data)
+     raise ValueError(f"Unsupported content type: {type(content)}")  # pyright: ignore[reportUnreachable]
+
+
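The helpers above fully determine how create() turns content into bytes. A small illustrative sketch of their behaviour, inferred from the code in this hunk (they are module-private, so calling them directly is for demonstration only):

    _convert_content("data.json", {"a": 1})    # b'{\n  "a": 1\n}' via json.dumps(indent=2)
    _convert_content("data.yaml", ["x", "y"])  # YAML bytes via ruamel
    _convert_content("notes.txt", "hello")     # b'hello' (plain UTF-8 encoding)
    _convert_content("blob.bin", b"\x00\x01")  # bytes pass through unchanged

    # Structured content needs a .json/.yaml/.yml name; anything else is rejected.
    try:
        _convert_content("data.txt", {"a": 1})
    except ValueError as exc:
        print(exc)  # Structured content (dict) requires .json or .yaml extension, got: data.txt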
+ class Document(BaseModel):
111
+ """Immutable base class for all pipeline documents. Cannot be instantiated directly — must be subclassed.
112
+
113
+ Content is stored as bytes. Use `create()` for automatic conversion from str/dict/list/BaseModel.
114
+ Use `parse()` to reverse the conversion. Serialization is extension-driven (.json → JSON, .yaml → YAML).
115
+ """
116
+
117
+ MAX_CONTENT_SIZE: ClassVar[int] = 25 * 1024 * 1024
118
+ """Maximum allowed total size in bytes (default 25MB)."""
119
+
120
+ def __init_subclass__(cls, **kwargs: Any) -> None:
121
+ """Validate subclass at definition time. Cannot start with 'Test', cannot add custom fields."""
274
122
  super().__init_subclass__(**kwargs)
275
123
  if cls.__name__.startswith("Test"):
276
124
  raise TypeError(
@@ -279,14 +127,12 @@ class Document(BaseModel, ABC):
279
127
  "Please use a different name (e.g., 'SampleDocument', 'ExampleDocument')."
280
128
  )
281
129
  if hasattr(cls, "FILES"):
282
- files = getattr(cls, "FILES")
130
+ files: type[StrEnum] = cls.FILES # pyright: ignore[reportAttributeAccessIssue, reportUnknownMemberType, reportUnknownVariableType]
283
131
  if not issubclass(files, StrEnum):
284
- raise TypeError(
285
- f"Document subclass '{cls.__name__}'.FILES must be an Enum of string values"
286
- )
132
+ raise TypeError(f"Document subclass '{cls.__name__}'.FILES must be an Enum of string values")
287
133
  # Check that the Document's model_fields only contain the allowed fields
288
134
  # It prevents AI models from adding additional fields to documents
289
- allowed = {"name", "description", "content", "sources"}
135
+ allowed = {"name", "description", "content", "sources", "attachments", "origins"}
290
136
  current = set(getattr(cls, "model_fields", {}).keys())
291
137
  extras = current - allowed
292
138
  if extras:
@@ -295,60 +141,18 @@ class Document(BaseModel, ABC):
295
141
  f"{', '.join(sorted(extras))}. Only {', '.join(sorted(allowed))} are allowed."
296
142
  )
297
143
 
298
- @overload
299
- @classmethod
300
- def create(
301
- cls,
302
- *,
303
- name: str,
304
- content: bytes,
305
- description: str | None = None,
306
- sources: list[str] | None = None,
307
- ) -> Self: ...
308
-
309
- @overload
310
- @classmethod
311
- def create(
312
- cls,
313
- *,
314
- name: str,
315
- content: str,
316
- description: str | None = None,
317
- sources: list[str] | None = None,
318
- ) -> Self: ...
319
-
320
- @overload
321
- @classmethod
322
- def create(
323
- cls,
324
- *,
325
- name: str,
326
- content: dict[str, Any],
327
- description: str | None = None,
328
- sources: list[str] | None = None,
329
- ) -> Self: ...
330
-
331
- @overload
332
- @classmethod
333
- def create(
334
- cls,
335
- *,
336
- name: str,
337
- content: list[Any],
338
- description: str | None = None,
339
- sources: list[str] | None = None,
340
- ) -> Self: ...
341
-
342
- @overload
343
- @classmethod
344
- def create(
345
- cls,
346
- *,
347
- name: str,
348
- content: BaseModel,
349
- description: str | None = None,
350
- sources: list[str] | None = None,
351
- ) -> Self: ...
144
+ # Canonical name collision detection (production classes only)
145
+ if not _is_test_module(cls):
146
+ canonical = canonical_name_key(cls)
147
+ existing = _canonical_name_registry.get(canonical)
148
+ if existing is not None and existing is not cls:
149
+ raise TypeError(
150
+ f"Document subclass '{cls.__name__}' (in {cls.__module__}) produces "
151
+ f"canonical_name '{canonical}' which collides with existing class "
152
+ f"'{existing.__name__}' (in {existing.__module__}). "
153
+ f"Rename one of the classes to avoid ambiguity."
154
+ )
155
+ _canonical_name_registry[canonical] = cls
352
156
 
353
157
  @classmethod
354
158
  def create(
@@ -357,111 +161,22 @@ class Document(BaseModel, ABC):
357
161
  name: str,
358
162
  content: str | bytes | dict[str, Any] | list[Any] | BaseModel,
359
163
  description: str | None = None,
360
- sources: list[str] | None = None,
164
+ sources: tuple[str, ...] | None = None,
165
+ origins: tuple[str, ...] | None = None,
166
+ attachments: tuple[Attachment, ...] | None = None,
361
167
  ) -> Self:
362
- r"""Create a Document with automatic content type conversion (recommended).
363
-
364
- @public
365
-
366
- This is the **recommended way to create documents**. It accepts various
367
- content types and automatically converts them to bytes based on the file
368
- extension. Use the `parse` method to reverse this conversion.
369
-
370
- Best Practice (by default, unless instructed otherwise):
371
- Only provide name and content. The description parameter is RARELY needed.
372
-
373
- Args:
374
- name: Document filename (required, keyword-only).
375
- Extension determines serialization:
376
- - .json → JSON serialization
377
- - .yaml/.yml → YAML serialization
378
- - .md → Markdown list joining (for list[str])
379
- - Others → UTF-8 encoding (for str)
380
- content: Document content in various formats (required, keyword-only):
381
- - bytes: Used directly without conversion
382
- - str: Encoded to UTF-8 bytes
383
- - dict[str, Any]: Serialized to JSON (.json) or YAML (.yaml/.yml)
384
- - list[str]: Joined automatically for .md (validates format compatibility),
385
- else JSON/YAML
386
- - list[BaseModel]: Serialized to JSON or YAML based on extension
387
- - BaseModel: Serialized to JSON or YAML based on extension
388
- description: Optional description - USUALLY OMIT THIS (defaults to None).
389
- Only use when meaningful metadata helps downstream processing
390
- sources: Optional list of source strings (document SHA256 hashes or references).
391
- Used to track what sources contributed to creating this document.
392
- Can contain document SHA256 hashes (for referencing other documents)
393
- or arbitrary reference strings (URLs, file paths, descriptions).
394
- Defaults to empty list
395
-
396
- Returns:
397
- New Document instance with content converted to bytes
398
-
399
- Raises:
400
- ValueError: If content type is not supported for the file extension,
401
- or if markdown list format is incompatible
402
- DocumentNameError: If filename violates validation rules
403
- DocumentSizeError: If content exceeds MAX_CONTENT_SIZE
404
-
405
- Note:
406
- All conversions are reversible using the `parse` method.
407
- For example: MyDocument.create(name="data.json", content={"key": "value"}).parse(dict)
408
- returns the original dictionary {"key": "value"}.
409
-
410
- Example:
411
- >>> # CORRECT - no description needed (by default, unless instructed otherwise)
412
- >>> doc = MyDocument.create(name="test.txt", content="Hello World")
413
- >>> doc.content # b'Hello World'
414
- >>> doc.parse(str) # "Hello World"
415
-
416
- >>> # CORRECT - Dictionary to JSON, no description
417
- >>> doc = MyDocument.create(name="config.json", content={"key": "value"})
418
- >>> doc.content # b'{"key": "value", ...}'
419
- >>> doc.parse(dict) # {"key": "value"}
420
-
421
- >>> # AVOID unless description adds real value
422
- >>> doc = MyDocument.create(
423
- ... name="config.json",
424
- ... content={"key": "value"},
425
- ... description="Config file" # Usually redundant!
426
- ... )
427
-
428
- >>> # Pydantic model to YAML
429
- >>> from pydantic import BaseModel
430
- >>> class Config(BaseModel):
431
- ... host: str
432
- ... port: int
433
- >>> config = Config(host="localhost", port=8080)
434
- >>> doc = MyDocument.create(name="config.yaml", content=config)
435
- >>> doc.parse(Config) # Returns Config instance
436
-
437
- >>> # List to Markdown
438
- >>> items = ["Section 1", "Section 2"]
439
- >>> doc = MyDocument.create(name="sections.md", content=items)
440
- >>> doc.parse(list) # ["Section 1", "Section 2"]
441
-
442
- >>> # Document with sources for provenance tracking
443
- >>> source_doc = MyDocument.create(name="source.txt", content="original")
444
- >>> derived = MyDocument.create(
445
- ... name="result.txt",
446
- ... content="processed",
447
- ... sources=[source_doc.sha256, "https://api.example.com/data"]
448
- ... )
449
- >>> derived.get_source_documents() # [source_doc.sha256]
450
- >>> derived.get_source_references() # ["https://api.example.com/data"]
168
+ """Create a document with automatic content-to-bytes conversion.
169
+
170
+ Serialization is extension-driven: .json → JSON, .yaml → YAML, others → UTF-8.
171
+ Reversible via parse(). Cannot be called on Document directly — must use a subclass.
451
172
  """
452
- # Use model_validate to leverage the existing validator logic
453
- temp = cls.model_validate({
454
- "name": name,
455
- "content": content,
456
- "description": description,
457
- "sources": sources,
458
- })
459
- # Now construct with type-checker-friendly call (bytes only)
460
173
  return cls(
461
- name=temp.name,
462
- content=temp.content,
463
- description=temp.description,
464
- sources=temp.sources,
174
+ name=name,
175
+ content=_convert_content(name, content),
176
+ description=description,
177
+ sources=sources,
178
+ origins=origins,
179
+ attachments=attachments,
465
180
  )
466
181
 
467
182
  def __init__(
@@ -470,61 +185,53 @@ class Document(BaseModel, ABC):
470
185
  name: str,
471
186
  content: bytes,
472
187
  description: str | None = None,
473
- sources: list[str] | None = None,
188
+ sources: tuple[str, ...] | None = None,
189
+ origins: tuple[str, ...] | None = None,
190
+ attachments: tuple[Attachment, ...] | None = None,
474
191
  ) -> None:
475
- """Initialize a Document instance with raw bytes content.
476
-
477
- @public
478
-
479
- Important:
480
- **Most users should use the `create` classmethod instead of __init__.**
481
- The create method provides automatic content conversion for various types
482
- (str, dict, list, Pydantic models) while __init__ only accepts bytes.
483
-
484
- This constructor accepts only bytes content for type safety. It prevents
485
- direct instantiation of the abstract Document class.
486
-
487
- Args:
488
- name: Document filename (required, keyword-only)
489
- content: Document content as raw bytes (required, keyword-only)
490
- description: Optional human-readable description (keyword-only)
491
- sources: Optional list of source strings for provenance tracking.
492
- Can contain document SHA256 hashes (for referencing other documents)
493
- or arbitrary reference strings (URLs, file paths, descriptions).
494
- Defaults to empty list
495
-
496
- Raises:
497
- TypeError: If attempting to instantiate Document directly.
498
-
499
- Example:
500
- >>> # Direct constructor - only for bytes content:
501
- >>> doc = MyDocument(name="test.txt", content=b"Hello World")
502
- >>> doc.content # b'Hello World'
503
-
504
- >>> # RECOMMENDED: Use create for automatic conversion:
505
- >>> doc = MyDocument.create(name="text.txt", content="Hello World")
506
- >>> doc = MyDocument.create(name="data.json", content={"key": "value"})
507
- >>> doc = MyDocument.create(name="config.yaml", content=my_model)
508
- >>> doc = MyDocument.create(name="items.md", content=["item1", "item2"])
509
- """
192
+ """Initialize with raw bytes content. Most users should use `create()` instead."""
510
193
  if type(self) is Document:
511
- raise TypeError("Cannot instantiate abstract Document class directly")
194
+ raise TypeError("Cannot instantiate Document directly — use a concrete subclass")
512
195
 
513
- # Only pass sources if not None to let Pydantic's default_factory handle it
514
- if sources is not None:
515
- super().__init__(name=name, content=content, description=description, sources=sources)
516
- else:
517
- super().__init__(name=name, content=content, description=description)
196
+ super().__init__(
197
+ name=name,
198
+ content=content,
199
+ description=description,
200
+ sources=sources or (),
201
+ origins=origins or (),
202
+ attachments=attachments or (),
203
+ )
204
+
205
+ # Register with task context for document lifecycle tracking
206
+ if not is_registration_suppressed():
207
+ task_ctx = get_task_context()
208
+ if task_ctx is not None:
209
+ task_ctx.register_created(self) # pyright: ignore[reportAttributeAccessIssue, reportUnknownMemberType]
 
      name: str
      description: str | None = None
-     content: bytes  # Note: constructor accepts str | bytes, but field stores bytes only
-     sources: list[str] = Field(
-         default_factory=list,
-         description="List of source references for tracking document provenance. "
-         "Can contain document SHA256 hashes (for referencing other documents) "
-         "or arbitrary reference strings (URLs, file paths, descriptions)",
-     )
+     content: bytes
+     sources: tuple[str, ...] = ()
+     """Content provenance: documents and references this document's content was directly
+     derived from. Can be document SHA256 hashes (for pipeline documents) or external
+     references (URLs, file paths). Answers: 'where did this content come from?'
+
+     Example: an analysis document derived from an input document has
+     sources=(input_doc.sha256,). A webpage capture has sources=("https://example.com",)."""
+
+     origins: tuple[str, ...] = ()
+     """Causal provenance: documents that caused this document to be created without directly
+     contributing to its content. Always document SHA256 hashes, never arbitrary strings.
+     Answers: 'why does this document exist?'
+
+     Example: a research plan causes 10 webpages to be captured. Each webpage's source is its
+     URL (content provenance), its origin is the research plan (causal provenance — the plan
+     caused the capture but didn't contribute to the webpage's content).
+
+     A SHA256 hash must not appear in both sources and origins for the same document.
+     Within a pipeline task or flow, all source/origin SHA256 references must point to
+     documents that existed before the task/flow started executing."""
+     attachments: tuple[Attachment, ...] = ()
 
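To make the sources/origins split concrete, a hedged sketch following the field docstrings above (PlanDocument, PageDocument, and the URL are placeholders, not part of the package):

    class PlanDocument(Document):
        pass

    class PageDocument(Document):
        pass

    plan = PlanDocument.create(name="plan.json", content={"queries": ["pricing"]})

    # The page's bytes came from the URL (source); it exists because of the plan (origin).
    page = PageDocument.create(
        name="pricing.html",
        content=b"<html>...</html>",
        sources=("https://example.com/pricing",),
        origins=(plan.sha256,),
    )

    # The validators below enforce the shapes: sources entries must be a document SHA256
    # or contain "://", origins must be document SHA256 hashes, and the same hash may not
    # appear in both tuples.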
  # Pydantic configuration
530
237
  model_config = ConfigDict(
@@ -533,145 +240,27 @@ class Document(BaseModel, ABC):
533
240
  extra="forbid",
534
241
  )
535
242
 
536
- @abstractmethod
537
- def get_base_type(self) -> Literal["flow", "task", "temporary"]:
538
- """Get the base type of the document.
539
-
540
- Abstract method that must be implemented by all Document subclasses
541
- to indicate their persistence behavior.
542
-
543
- Returns:
544
- One of "flow" (persisted across flow runs), "task" (temporary
545
- within task execution), or "temporary" (never persisted).
546
-
547
- Note:
548
- This method determines document persistence and lifecycle.
549
- FlowDocument returns "flow", TaskDocument returns "task".
550
- """
551
- raise NotImplementedError("Subclasses must implement this method")
552
-
553
- @final
554
- @property
555
- def base_type(self) -> Literal["flow", "task", "temporary"]:
556
- """Get the document's base type.
557
-
558
- Property alias for get_base_type() providing a cleaner API.
559
- This property cannot be overridden by subclasses.
560
-
561
- Returns:
562
- The document's base type: "flow", "task", or "temporary".
563
- """
564
- return self.get_base_type()
565
-
566
- @final
567
- @property
568
- def is_flow(self) -> bool:
569
- """Check if this is a flow document.
570
-
571
- Flow documents persist across Prefect flow runs and are saved
572
- to the file system between pipeline steps.
573
-
574
- Returns:
575
- True if this is a FlowDocument subclass, False otherwise.
576
- """
577
- return self.get_base_type() == "flow"
578
-
579
- @final
580
- @property
581
- def is_task(self) -> bool:
582
- """Check if this is a task document.
583
-
584
- Task documents are temporary within Prefect task execution
585
- and are not persisted between pipeline steps.
586
-
587
- Returns:
588
- True if this is a TaskDocument subclass, False otherwise.
589
- """
590
- return self.get_base_type() == "task"
591
-
592
- @final
593
- @property
594
- def is_temporary(self) -> bool:
595
- """Check if this is a temporary document.
596
-
597
- Temporary documents are never persisted and exist only
598
- during execution.
599
-
600
- Returns:
601
- True if this document is temporary, False otherwise.
602
- """
603
- return self.get_base_type() == "temporary"
604
-
605
243
  @final
606
244
  @classmethod
607
245
  def get_expected_files(cls) -> list[str] | None:
608
- """Get the list of allowed file names for this document class.
609
-
610
- If the document class defines a FILES enum, returns the list of
611
- valid file names. Used to restrict documents to specific files.
612
-
613
- Returns:
614
- List of allowed file names if FILES enum is defined,
615
- None if unrestricted.
616
-
617
- Raises:
618
- DocumentNameError: If FILES is defined but not a valid StrEnum.
619
-
620
- Example:
621
- >>> class ConfigDocument(FlowDocument):
622
- ... class FILES(StrEnum):
623
- ... CONFIG = "config.yaml"
624
- ... SETTINGS = "settings.json"
625
- >>> ConfigDocument.get_expected_files()
626
- ['config.yaml', 'settings.json']
627
- """
246
+ """Return allowed filenames from FILES enum, or None if unrestricted."""
628
247
  if not hasattr(cls, "FILES"):
629
248
  return None
630
- files = getattr(cls, "FILES")
249
+ files: type[StrEnum] = cls.FILES # pyright: ignore[reportAttributeAccessIssue, reportUnknownMemberType, reportUnknownVariableType]
631
250
  if not files:
632
251
  return None
633
252
  assert issubclass(files, StrEnum)
634
253
  try:
635
254
  values = [member.value for member in files]
636
255
  except TypeError:
637
- raise DocumentNameError(f"{cls.__name__}.FILES must be an Enum of string values")
256
+ raise DocumentNameError(f"{cls.__name__}.FILES must be an Enum of string values") from None
638
257
  if len(values) == 0:
639
258
  return None
640
259
  return values
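For illustration, a subclass restricted to fixed file names via a FILES enum, mirroring the pattern this method supports (ConfigDocument is a made-up example):

    from enum import StrEnum

    class ConfigDocument(Document):
        class FILES(StrEnum):
            CONFIG = "config.yaml"
            SETTINGS = "settings.json"

    ConfigDocument.get_expected_files()  # ['config.yaml', 'settings.json']

    # Names are checked against the enum; other names raise DocumentNameError.
    ConfigDocument.create(name=ConfigDocument.FILES.CONFIG, content={"key": "value"})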
641
260
 
642
261
  @classmethod
643
262
  def validate_file_name(cls, name: str) -> None:
644
- """Validate that a file name matches allowed patterns.
645
-
646
- DO NOT OVERRIDE this method if you define a FILES enum!
647
- The validation is automatic when FILES enum is present.
648
-
649
- # CORRECT - FILES enum provides automatic validation:
650
- class MyDocument(FlowDocument):
651
- class FILES(StrEnum):
652
- CONFIG = "config.yaml" # Validation happens automatically!
653
-
654
- # WRONG - Unnecessary override:
655
- class MyDocument(FlowDocument):
656
- class FILES(StrEnum):
657
- CONFIG = "config.yaml"
658
-
659
- def validate_file_name(cls, name): # DON'T DO THIS!
660
- pass # Validation already happens via FILES enum
661
-
662
- Only override for custom validation logic BEYOND FILES enum constraints.
663
-
664
- Args:
665
- name: The file name to validate.
666
-
667
- Raises:
668
- DocumentNameError: If the name doesn't match allowed patterns.
669
-
670
- Note:
671
- - If FILES enum is defined, name must exactly match one of the values
672
- - If FILES is not defined, any name is allowed
673
- - Override in subclasses ONLY for custom regex patterns or logic
674
- """
263
+ """Validate filename against FILES enum. Override only for custom validation beyond FILES."""
675
264
  allowed = cls.get_expected_files()
676
265
  if not allowed:
677
266
  return
@@ -681,45 +270,18 @@ class Document(BaseModel, ABC):
681
270
  raise DocumentNameError(f"Invalid filename '{name}'. Allowed names: {allowed_str}")
682
271
 
683
272
  @field_validator("name")
273
+ @classmethod
684
274
  def validate_name(cls, v: str) -> str:
685
- r"""Pydantic validator for the document name field.
686
-
687
- Ensures the document name is secure and follows conventions:
688
- - No path traversal characters (.., \\, /)
689
- - Cannot end with .description.md or .sources.json
690
- - No leading/trailing whitespace
691
- - Must match FILES enum if defined
692
-
693
- Performance:
694
- Validation is O(n) where n is the length of the name.
695
- FILES enum check is O(m) where m is the number of allowed files
696
-
697
- Args:
698
- v: The name value to validate.
699
-
700
- Returns:
701
- The validated name.
702
-
703
- Raises:
704
- DocumentNameError: If the name violates any validation rules.
705
-
706
- Note:
707
- This is called automatically by Pydantic during model construction.
708
- """
709
- if v.endswith(cls.DESCRIPTION_EXTENSION):
710
- raise DocumentNameError(
711
- f"Document names cannot end with {cls.DESCRIPTION_EXTENSION}: {v}"
712
- )
713
-
714
- if v.endswith(cls.SOURCES_EXTENSION):
715
- raise DocumentNameError(f"Document names cannot end with {cls.SOURCES_EXTENSION}: {v}")
716
-
275
+ """Reject path traversal, whitespace issues, reserved suffixes. Must match FILES enum if defined."""
717
276
  if ".." in v or "\\" in v or "/" in v:
718
277
  raise DocumentNameError(f"Invalid filename - contains path traversal characters: {v}")
719
278
 
720
279
  if not v or v.startswith(" ") or v.endswith(" "):
721
280
  raise DocumentNameError(f"Invalid filename format: {v}")
722
281
 
282
+ if v.endswith(".meta.json"):
283
+ raise DocumentNameError(f"Document names cannot end with .meta.json (reserved): {v}")
284
+
723
285
  cls.validate_file_name(v)
724
286
 
725
287
  return v
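A brief sketch of the name rules enforced here, assuming DocumentNameError surfaces as the package documents (NoteDocument is an invented example; the except clause also catches pydantic's ValueError-based wrapping just in case):

    class NoteDocument(Document):
        pass

    NoteDocument.create(name="notes.md", content="ok")  # accepted

    for bad in ("../escape.txt", "dir/notes.md", " padded.txt", "state.meta.json"):
        try:
            NoteDocument.create(name=bad, content="x")
        except (DocumentNameError, ValueError):
            pass  # traversal, separators, stray whitespace, and the reserved .meta.json suffix are rejected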
@@ -727,174 +289,58 @@ class Document(BaseModel, ABC):
727
289
  @field_validator("content", mode="before")
728
290
  @classmethod
729
291
  def validate_content(cls, v: Any, info: ValidationInfo) -> bytes:
730
- """Pydantic validator that converts various content types to bytes.
731
-
732
- This validator is called automatically during model construction and
733
- handles the intelligent type conversion that powers the `create` method.
734
- It determines the appropriate serialization based on file extension.
735
-
736
- Conversion Strategy:
737
- 1. bytes → Passthrough (no conversion)
738
- 2. str → UTF-8 encoding
739
- 3. dict/BaseModel + .json → JSON serialization (indented)
740
- 4. dict/BaseModel + .yaml/.yml → YAML serialization
741
- 5. list[str] + .md → Join with markdown sections (validates format compatibility)
742
- 6. list[Any] + .json/.yaml → JSON/YAML array
743
- 7. int/float/bool + .json → JSON primitive
744
-
745
- Args:
746
- v: Content to validate (any supported type)
747
- info: Validation context containing other field values
748
-
749
- Returns:
750
- Content converted to bytes
751
-
752
- Raises:
753
- DocumentSizeError: If content exceeds MAX_CONTENT_SIZE
754
- ValueError: If content type unsupported for file extension
755
-
756
- Note:
757
- This validator enables create() to accept multiple types while
758
- ensuring __init__ only receives bytes for type safety.
759
- """
760
- # Get the name from validation context if available
761
- name = ""
762
- if hasattr(info, "data") and "name" in info.data:
763
- name = info.data["name"]
764
- name_lower = name.lower()
765
-
766
- # Convert based on content type
767
- if isinstance(v, bytes):
768
- pass # Already bytes
769
- elif isinstance(v, str):
770
- v = v.encode("utf-8")
771
- elif isinstance(v, dict):
772
- # Serialize dict based on extension
773
- if name_lower.endswith((".yaml", ".yml")):
774
- # Use YAML format for YAML files
775
- yaml = YAML()
776
- stream = BytesIO()
777
- yaml.dump(v, stream)
778
- v = stream.getvalue()
779
- elif name_lower.endswith(".json"):
780
- # Use JSON for JSON files
781
- v = json.dumps(v, indent=2).encode("utf-8")
782
- else:
783
- # Dict not supported for other file types
784
- raise ValueError(f"Unsupported content type: {type(v)} for file {name}")
785
- elif isinstance(v, list):
786
- # Handle lists based on file extension
787
- if name_lower.endswith(".md"):
788
- # For markdown files, join with separator
789
- if all(isinstance(item, str) for item in v):
790
- # Check that no string contains the separator
791
- for item in v:
792
- if cls.MARKDOWN_LIST_SEPARATOR in item:
793
- raise ValueError(
794
- f"Markdown list item cannot contain the separator "
795
- f"'{cls.MARKDOWN_LIST_SEPARATOR}' as it will mess up formatting"
796
- )
797
- v = cls.MARKDOWN_LIST_SEPARATOR.join(v).encode("utf-8")
798
- else:
799
- raise ValueError(
800
- f"Unsupported content type: mixed-type list for markdown file {name}"
801
- )
802
- elif name_lower.endswith((".yaml", ".yml")):
803
- # Check if it's a list of Pydantic models
804
- if v and isinstance(v[0], BaseModel):
805
- # Convert models to dicts first
806
- v = [item.model_dump(mode="json") for item in v]
807
- # Use YAML format for YAML files
808
- yaml = YAML()
809
- stream = BytesIO()
810
- yaml.dump(v, stream)
811
- v = stream.getvalue()
812
- elif name_lower.endswith(".json"):
813
- # Check if it's a list of Pydantic models
814
- if v and isinstance(v[0], BaseModel):
815
- # Convert models to dicts first
816
- v = [item.model_dump(mode="json") for item in v]
817
- # For JSON files, serialize as JSON
818
- v = json.dumps(v, indent=2).encode("utf-8")
819
- else:
820
- # Check if it's a list of BaseModel
821
- if v and isinstance(v[0], BaseModel):
822
- raise ValueError("list[BaseModel] requires .json or .yaml extension")
823
- # List content not supported for other file types
824
- raise ValueError(f"Unsupported content type: {type(v)} for file {name}")
825
- elif isinstance(v, BaseModel):
826
- # Serialize Pydantic models
827
- if name_lower.endswith((".yaml", ".yml")):
828
- yaml = YAML()
829
- stream = BytesIO()
830
- yaml.dump(v.model_dump(mode="json"), stream)
831
- v = stream.getvalue()
832
- else:
833
- v = json.dumps(v.model_dump(mode="json"), indent=2).encode("utf-8")
834
- elif isinstance(v, (int, float, bool)):
835
- # Numbers and booleans: JSON-serialize for .json, string for others
836
- if name_lower.endswith(".json"):
837
- v = json.dumps(v).encode("utf-8")
838
- elif name_lower.endswith((".yaml", ".yml")):
839
- v = str(v).encode("utf-8")
840
- elif name_lower.endswith(".txt"):
841
- v = str(v).encode("utf-8")
842
- else:
843
- # For other extensions, convert to string
844
- v = str(v).encode("utf-8")
845
- elif v is None:
846
- # Handle None - only supported for JSON/YAML
847
- if name_lower.endswith((".json", ".yaml", ".yml")):
848
- if name_lower.endswith((".yaml", ".yml")):
849
- v = b"null\n"
850
- else:
851
- v = b"null"
852
- else:
853
- raise ValueError(f"Unsupported content type: {type(None)} for file {name}")
854
- else:
855
- # Try to see if it has model_dump (duck typing for Pydantic-like)
856
- if hasattr(v, "model_dump"):
857
- if name_lower.endswith((".yaml", ".yml")):
858
- yaml = YAML()
859
- stream = BytesIO()
860
- yaml.dump(v.model_dump(mode="json"), stream) # type: ignore[attr-defined]
861
- v = stream.getvalue()
862
- else:
863
- v = json.dumps(v.model_dump(mode="json"), indent=2).encode("utf-8") # type: ignore[attr-defined]
864
- else:
865
- # List non-.json files should raise error
866
- if name_lower.endswith(".txt") and isinstance(v, list):
867
- raise ValueError("List content not supported for text files")
868
- raise ValueError(f"Unsupported content type: {type(v)}")
869
-
870
- # Check content size limit
871
- max_size = getattr(cls, "MAX_CONTENT_SIZE", 100 * 1024 * 1024)
872
- if len(v) > max_size:
873
- raise DocumentSizeError(
874
- f"Document size ({len(v)} bytes) exceeds maximum allowed size ({max_size} bytes)"
875
- )
876
-
292
+ """Convert content to bytes via `_convert_content` if not already bytes. Enforces MAX_CONTENT_SIZE."""
293
+ if not isinstance(v, bytes):
294
+ name = info.data.get("name", "") if hasattr(info, "data") else ""
295
+ v = _convert_content(name, v)
296
+ if len(v) > cls.MAX_CONTENT_SIZE:
297
+ raise DocumentSizeError(f"Document size ({len(v)} bytes) exceeds maximum allowed size ({cls.MAX_CONTENT_SIZE} bytes)")
877
298
  return v
878
299
 
879
- @field_serializer("content")
880
- def serialize_content(self, v: bytes) -> str:
881
- """Pydantic serializer for content field.
300
+ @field_validator("sources")
301
+ @classmethod
302
+ def validate_sources(cls, v: tuple[str, ...]) -> tuple[str, ...]:
303
+ """Sources must be document SHA256 hashes or URLs."""
304
+ for src in v:
305
+ if not is_document_sha256(src) and "://" not in src:
306
+ raise ValueError(f"Source must be a document SHA256 hash or a URL (containing '://'), got: {src!r}")
307
+ return v
882
308
 
883
- Converts bytes content to string for JSON serialization.
884
- Attempts UTF-8 decoding first, falls back to base64 encoding
885
- for binary content.
309
+ @field_validator("origins")
310
+ @classmethod
311
+ def validate_origins(cls, v: tuple[str, ...]) -> tuple[str, ...]:
312
+ """Origins must be valid document SHA256 hashes."""
313
+ for origin in v:
314
+ if not is_document_sha256(origin):
315
+ raise ValueError(f"Origin must be a document SHA256 hash, got: {origin}")
316
+ return v
886
317
 
887
- Args:
888
- v: The content bytes to serialize.
318
+ @model_validator(mode="after")
319
+ def validate_no_source_origin_overlap(self) -> Self:
320
+ """Reject documents where the same SHA256 appears in both sources and origins."""
321
+ source_sha256s = {src for src in self.sources if is_document_sha256(src)}
322
+ if source_sha256s:
323
+ overlap = source_sha256s & set(self.origins)
324
+ if overlap:
325
+ sample = next(iter(overlap))
326
+ raise ValueError(
327
+ f"SHA256 hash {sample[:12]}... appears in both sources and origins. "
328
+ f"A document reference must be either a source (content provenance) "
329
+ f"or an origin (causal provenance), not both."
330
+ )
331
+ return self
889
332
 
890
- Returns:
891
- UTF-8 decoded string for text content,
892
- base64-encoded string for binary content.
333
+ @model_validator(mode="after")
334
+ def validate_total_size(self) -> Self:
335
+ """Validate that total document size (content + attachments) is within limits."""
336
+ total = self.size
337
+ if total > self.MAX_CONTENT_SIZE:
338
+ raise DocumentSizeError(f"Total document size ({total} bytes) including attachments exceeds maximum allowed size ({self.MAX_CONTENT_SIZE} bytes)")
339
+ return self
893
340
 
894
- Note:
895
- This is called automatically by Pydantic during
896
- model serialization to JSON.
897
- """
341
+ @field_serializer("content")
342
+ def serialize_content(self, v: bytes) -> str: # noqa: PLR6301
343
+ """UTF-8 decode for text, base64 for binary. Called by Pydantic during serialization."""
898
344
  try:
899
345
  return v.decode("utf-8")
900
346
  except UnicodeDecodeError:
@@ -904,281 +350,89 @@ class Document(BaseModel, ABC):
904
350
  @final
905
351
  @property
906
352
  def id(self) -> str:
907
- """Get a short unique identifier for the document.
908
-
909
- @public
910
-
911
- This ID is crucial for LLM interactions. When documents are provided to
912
- LLMs via generate() or generate_structured(), their IDs are included,
913
- allowing the LLM to reference documents in prompts by either name or ID.
914
- The ID is content-based (derived from SHA256 hash of content only),
915
- so the same content always produces the same ID. Changing the name or
916
- description does NOT change the ID.
917
-
918
- Returns:
919
- 6-character base32-encoded string (uppercase, e.g., "A7B2C9").
920
- This is the first 6 chars of the full base32 SHA256, NOT hex.
921
-
922
- Collision Rate:
923
- With base32 encoding (5 bits per char), 6 chars = 30 bits.
924
- Expect collisions after ~32K documents (birthday paradox).
925
- For higher uniqueness requirements, use the full sha256 property.
926
-
927
- Note:
928
- While shorter than full SHA256, this provides
929
- reasonable uniqueness for most use cases.
930
- """
353
+ """First 6 chars of sha256. Used as short document identifier in LLM context."""
931
354
  return self.sha256[:6]
932
355
 
933
356
  @final
934
357
  @cached_property
935
358
  def sha256(self) -> str:
936
- """Get the full SHA256 hash of the document content.
937
-
938
- @public
939
-
940
- Computes and caches the SHA256 hash of the content,
941
- encoded in base32 (uppercase). Used for content
942
- deduplication and integrity verification.
943
-
944
- Returns:
945
- Full SHA256 hash as base32-encoded uppercase string.
946
-
947
- Why Base32 Instead of Hex:
948
- - Base32 is case-insensitive, avoiding issues with different file systems
949
- and AI interactions where casing might be inconsistent
950
- - More compact than hex (52 chars vs 64 chars for SHA-256)
951
- - Contains more information per character than hex (5 bits vs 4 bits)
952
- - Safe for URLs without encoding
953
- - Compatible with case-insensitive file systems
954
- - Avoids confusion in AI interactions where models might change casing
955
- - Not base64 because we want consistent uppercase for all uses
956
-
957
- Note:
958
- This is computed once and cached for performance.
959
- The hash is deterministic based on content only.
960
- """
961
- return b32encode(hashlib.sha256(self.content).digest()).decode("ascii").upper().rstrip("=")
359
+ """Full SHA256 identity hash (name + content + attachments). BASE32 encoded, cached."""
360
+ return compute_document_sha256(self)
361
+
362
+ @final
363
+ @cached_property
364
+ def content_sha256(self) -> str:
365
+ """SHA256 hash of raw content bytes only. Used for content deduplication."""
366
+ return compute_content_sha256(self.content)
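A short sketch of the identity-vs-content distinction these two properties introduce (DraftDocument is an invented name; behaviour follows the docstrings above):

    class DraftDocument(Document):
        pass

    a = DraftDocument.create(name="a.txt", content="same text")
    b = DraftDocument.create(name="b.txt", content="same text")

    assert a.content_sha256 == b.content_sha256  # same bytes, same content hash
    assert a.sha256 != b.sha256                  # identity hash also covers name (and attachments)
    assert a.id == a.sha256[:6]                  # the short id is just the hash prefix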
962
367
 
963
368
  @final
964
369
  @property
965
370
  def size(self) -> int:
966
- """Get the size of the document content.
967
-
968
- @public
969
-
970
- Returns:
971
- Size of content in bytes.
972
-
973
- Note:
974
- Useful for monitoring document sizes and
975
- ensuring they stay within limits.
976
- """
977
- return len(self.content)
371
+ """Total size of content + attachments in bytes."""
372
+ return len(self.content) + sum(att.size for att in self.attachments)
978
373
 
979
374
  @cached_property
980
- def detected_mime_type(self) -> str:
981
- """Detect the MIME type from document content.
982
-
983
- Detection strategy (in order):
984
- 1. Returns 'text/plain' for empty content
985
- 2. Extension-based detection for known text formats (preferred)
986
- 3. python-magic content analysis for unknown extensions
987
- 4. Fallback to extension or 'application/octet-stream'
988
-
989
- Returns:
990
- MIME type string (e.g., "text/plain", "application/json").
991
-
992
- Note:
993
- This is cached after first access. Extension-based detection
994
- is preferred for text formats to avoid misidentification.
995
- """
996
- return detect_mime_type(self.content, self.name)
997
-
998
- @property
999
375
  def mime_type(self) -> str:
1000
- """Get the document's MIME type.
1001
-
1002
- @public
1003
-
1004
- Primary property for accessing MIME type information.
1005
- Automatically detects MIME type based on file extension and content.
1006
-
1007
- Returns:
1008
- MIME type string (e.g., "text/plain", "application/json").
1009
-
1010
- Note:
1011
- MIME type detection uses extension-based detection for known
1012
- text formats and content analysis for binary formats.
1013
- """
1014
- return self.detected_mime_type
376
+ """Detected MIME type. Extension-based for known formats, content analysis for others. Cached."""
377
+ return detect_mime_type(self.content, self.name)
1015
378
 
1016
379
  @property
1017
380
  def is_text(self) -> bool:
1018
- """Check if document contains text content.
1019
-
1020
- @public
1021
-
1022
- Returns:
1023
- True if MIME type indicates text content
1024
- (text/*, application/json, application/x-yaml, text/yaml, etc.),
1025
- False otherwise.
1026
-
1027
- Note:
1028
- Used to determine if text property can be safely accessed.
1029
- """
381
+ """True if MIME type indicates text content."""
1030
382
  return is_text_mime_type(self.mime_type)
1031
383
 
1032
384
  @property
1033
385
  def is_pdf(self) -> bool:
1034
- """Check if document is a PDF file.
1035
-
1036
- @public
1037
-
1038
- Returns:
1039
- True if MIME type is application/pdf, False otherwise.
1040
-
1041
- Note:
1042
- PDF documents require special handling and are
1043
- supported by certain LLM models.
1044
- """
386
+ """True if MIME type is application/pdf."""
1045
387
  return is_pdf_mime_type(self.mime_type)
1046
388
 
1047
389
  @property
1048
390
  def is_image(self) -> bool:
1049
- """Check if document is an image file.
1050
-
1051
- @public
1052
-
1053
- Returns:
1054
- True if MIME type starts with "image/", False otherwise.
1055
-
1056
- Note:
1057
- Image documents are automatically encoded for
1058
- vision-capable LLM models.
1059
- """
391
+ """True if MIME type starts with image/."""
1060
392
  return is_image_mime_type(self.mime_type)
1061
393
 
1062
394
  @classmethod
1063
395
  def canonical_name(cls) -> str:
1064
- """Get the canonical name for this document class.
1065
-
1066
- Returns a standardized snake_case name derived from the
1067
- class name, used for directory naming and identification.
1068
-
1069
- Returns:
1070
- Snake_case canonical name.
1071
-
1072
- Example:
1073
- >>> class UserDataDocument(FlowDocument): ...
1074
- >>> UserDataDocument.canonical_name()
1075
- 'user_data'
1076
- """
396
+ """Snake_case name derived from class name, used for directory naming."""
1077
397
  return canonical_name_key(cls)
1078
398
 
  @property
  def text(self) -> str:
- """Get document content as UTF-8 text string.
-
- @public
-
- Decodes the bytes content as UTF-8 text. Only available for
- text-based documents (check is_text property first).
-
- Returns:
- UTF-8 decoded string.
-
- Raises:
- ValueError: If document is not text (is_text == False).
-
- Example:
- >>> doc = MyDocument.create(name="data.txt", content="Hello \u2728")
- >>> if doc.is_text:
- ... print(doc.text) # "Hello \u2728"
-
- >>> # Binary document raises error:
- >>> binary_doc = MyDocument(name="image.png", content=png_bytes)
- >>> binary_doc.text # Raises ValueError
- """
+ """Content decoded as UTF-8. Raises ValueError if not text."""
  if not self.is_text:
  raise ValueError(f"Document is not text: {self.name}")
  return self.content.decode("utf-8")
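
A minimal usage sketch of the MIME and text helpers above, assuming a hypothetical concrete subclass MyDocument and an assumed import path (both illustrative, not taken from this diff):

    from ai_pipeline_core.documents import FlowDocument  # import path assumed

    class MyDocument(FlowDocument):
        """Hypothetical concrete subclass used in the sketches below."""

    doc = MyDocument(name="notes.txt", content=b"hello world")
    print(doc.mime_type)   # extension-based detection, e.g. "text/plain"
    if doc.is_text:        # guard before .text to avoid ValueError
        print(doc.text)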
 
- @property
+ @cached_property
  def approximate_tokens_count(self) -> int:
- """Approximate tokens count for the document content.
-
- @public
-
- Uses tiktoken with gpt-4 encoding to estimate token count.
- For text documents, encodes the actual text. For non-text
- documents (images, PDFs, etc.), returns a fixed estimate of 1024 tokens.
-
- Returns:
- Approximate number of tokens for this document.
-
- Example:
- >>> doc = MyDocument.create(name="data.txt", content="Hello world")
- >>> doc.approximate_tokens_count # ~2 tokens
- """
+ """Approximate token count (tiktoken gpt-4 encoding). Images=1080, PDFs/other=1024."""
+ enc = get_tiktoken_encoding()
  if self.is_text:
- return len(tiktoken.encoding_for_model("gpt-4").encode(self.text))
+ total = len(enc.encode(self.text))
+ elif self.is_image:
+ total = 1080
  else:
- return 1024 # Fixed estimate for non-text documents
-
- def as_yaml(self) -> Any:
- r"""Parse document content as YAML.
-
- Parses the document's text content as YAML and returns Python objects.
- Uses ruamel.yaml which is safe by default (no code execution).
-
- Returns:
- Parsed YAML data: dict, list, str, int, float, bool, or None.
-
- Raises:
- ValueError: If document is not text-based.
- YAMLError: If content is not valid YAML.
+ total = 1024
+
+ for att in self.attachments:
+ if att.is_image:
+ total += 1080
+ elif att.is_pdf:
+ total += 1024
+ elif att.is_text:
+ total += len(enc.encode(att.text))
+ else:
+ total += 1024

- Example:
- >>> # From dict content
- >>> doc = MyDocument.create(name="config.yaml", content={
- ... "server": {"host": "localhost", "port": 8080}
- ... })
- >>> doc.as_yaml() # {'server': {'host': 'localhost', 'port': 8080}}
+ return total
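
A rough sketch of how the new approximate_tokens_count could be used to budget documents for a prompt; the 8000-token budget and the selection loop are illustrative assumptions, while the per-type estimates (tiktoken for text, 1080 for images, 1024 for other binaries) mirror the code above. MyDocument is the hypothetical subclass from the earlier sketch.

    docs = [MyDocument(name=f"part{i}.txt", content=b"some text") for i in range(3)]
    budget = 8000
    selected, used = [], 0
    for d in docs:
        cost = d.approximate_tokens_count  # cached per document
        if used + cost > budget:
            break
        selected.append(d)
        used += cost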
 
- >>> # From YAML string
- >>> doc2 = MyDocument(name="simple.yml", content=b"key: value\nitems:\n - a\n - b")
- >>> doc2.as_yaml() # {'key': 'value', 'items': ['a', 'b']}
- """
+ def as_yaml(self) -> Any:
+ """Parse content as YAML via ruamel.yaml."""
  yaml = YAML()
  return yaml.load(self.text) # type: ignore[no-untyped-call, no-any-return]
 
  def as_json(self) -> Any:
- """Parse document content as JSON.
-
- Parses the document's text content as JSON and returns Python objects.
- Document must contain valid JSON text.
-
- Returns:
- Parsed JSON data: dict, list, str, int, float, bool, or None.
-
- Raises:
- ValueError: If document is not text-based.
- JSONDecodeError: If content is not valid JSON.
-
- Example:
- >>> # From dict content
- >>> doc = MyDocument.create(name="data.json", content={"key": "value"})
- >>> doc.as_json() # {'key': 'value'}
-
- >>> # From JSON string
- >>> doc2 = MyDocument(name="array.json", content=b'[1, 2, 3]')
- >>> doc2.as_json() # [1, 2, 3]
-
- >>> # Invalid JSON
- >>> bad_doc = MyDocument(name="bad.json", content=b"not json")
- >>> bad_doc.as_json() # Raises JSONDecodeError
- """
+ """Parse content as JSON."""
  return json.loads(self.text)
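
Usage sketch for as_json/as_yaml, restating the examples from the removed docstrings with the hypothetical MyDocument subclass from above:

    doc = MyDocument(name="array.json", content=b"[1, 2, 3]")
    assert doc.as_json() == [1, 2, 3]

    cfg = MyDocument(name="simple.yml", content=b"key: value\nitems:\n  - a\n  - b")
    assert cfg.as_yaml() == {"key": "value", "items": ["a", "b"]}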
 
  @overload
@@ -1187,50 +441,8 @@ class Document(BaseModel, ABC):
  @overload
  def as_pydantic_model(self, model_type: type[list[TModel]]) -> list[TModel]: ...
 
- def as_pydantic_model(
- self, model_type: type[TModel] | type[list[TModel]]
- ) -> TModel | list[TModel]:
- """Parse document content as Pydantic model with validation.
-
- @public
-
- Parses JSON or YAML content and validates it against a Pydantic model.
- Automatically detects format based on MIME type. Supports both single
- models and lists of models.
-
- Args:
- model_type: Pydantic model class to validate against.
- Can be either:
- - type[Model] for single model
- - type[list[Model]] for list of models
-
- Returns:
- Validated Pydantic model instance or list of instances.
-
- Raises:
- ValueError: If document is not text or type mismatch.
- ValidationError: If data doesn't match model schema.
- JSONDecodeError/YAMLError: If content parsing fails.
-
- Example:
- >>> from pydantic import BaseModel
- >>>
- >>> class User(BaseModel):
- ... name: str
- ... age: int
- >>>
- >>> # Single model
- >>> doc = MyDocument.create(name="user.json",
- ... content={"name": "Alice", "age": 30})
- >>> user = doc.as_pydantic_model(User)
- >>> print(user.name) # "Alice"
- >>>
- >>> # List of models
- >>> doc2 = MyDocument.create(name="users.json",
- ... content=[{"name": "Bob", "age": 25}, {"name": "Eve", "age": 28}])
- >>> users = doc2.as_pydantic_model(list[User])
- >>> print(len(users)) # 2
- """
+ def as_pydantic_model(self, model_type: type[TModel] | type[list[TModel]]) -> TModel | list[TModel]:
+ """Parse JSON/YAML content and validate against a Pydantic model. Supports single and list types."""
  data = self.as_yaml() if is_yaml_mime_type(self.mime_type) else self.as_json()
 
  if get_origin(model_type) is list:
@@ -1245,493 +457,165 @@ class Document(BaseModel, ABC):
  single_model = cast(type[TModel], model_type)
  return single_model.model_validate(data)
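
A sketch of as_pydantic_model with a single model and a list of models, adapted from the removed docstring example; User is illustrative and MyDocument is the hypothetical subclass from above:

    from pydantic import BaseModel

    class User(BaseModel):
        name: str
        age: int

    one = MyDocument(name="user.json", content=b'{"name": "Alice", "age": 30}')
    user = one.as_pydantic_model(User)          # -> User(name='Alice', age=30)

    many = MyDocument(name="users.json", content=b'[{"name": "Bob", "age": 25}]')
    users = many.as_pydantic_model(list[User])  # -> [User(name='Bob', age=25)]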
 
- def as_markdown_list(self) -> list[str]:
- r"""Parse document as markdown-separated list of sections.
-
- @public
-
- Splits text content automatically using markdown section separators.
- Designed for markdown documents with multiple sections.
-
- Returns:
- List of string sections (preserves whitespace within sections).
-
- Raises:
- ValueError: If document is not text-based.
-
- Example:
- >>> # Using create with list
- >>> sections = ["# Chapter 1\nIntroduction", "# Chapter 2\nDetails"]
- >>> doc = MyDocument.create(name="book.md", content=sections)
- >>> doc.as_markdown_list() # Returns original sections
-
- >>> # Round-trip conversion works automatically
- >>> sections = ["Part 1", "Part 2", "Part 3"]
- >>> doc2 = MyDocument.create(name="parts.md", content=sections)
- >>> doc2.as_markdown_list() # ['Part 1', 'Part 2', 'Part 3']
- """
- return self.text.split(self.MARKDOWN_LIST_SEPARATOR)
+ def _parse_structured(self) -> Any:
+ """Parse content as JSON or YAML based on extension. Strict — no guessing."""
+ name_lower = self.name.lower()
+ if name_lower.endswith(".json"):
+ return self.as_json()
+ if name_lower.endswith((".yaml", ".yml")):
+ return self.as_yaml()
+ raise ValueError(f"Cannot parse '{self.name}' as structured data — use .json or .yaml extension")
 
  def parse(self, type_: type[Any]) -> Any:
- r"""Parse document content to original type (reverses create conversion).
-
- @public
-
- This method reverses the automatic conversion performed by the `create`
- classmethod. It intelligently parses the bytes content based on the
- document's file extension and converts to the requested type.
-
- Designed for roundtrip conversion:
- >>> original = {"key": "value"}
- >>> doc = MyDocument.create(name="data.json", content=original)
- >>> restored = doc.parse(dict)
- >>> assert restored == original # True
-
- Args:
- type_: Target type to parse content into. Supported types:
- - bytes: Returns raw content (no conversion)
- - str: Decodes UTF-8 text
- - dict: Parses JSON (.json) or YAML (.yaml/.yml)
- - list: Splits markdown (.md) or parses JSON/YAML
- - BaseModel subclasses: Validates JSON/YAML into model
-
- Returns:
- Content parsed to the requested type.
-
- Raises:
- ValueError: If type is unsupported or parsing fails.
-
- Extension Rules:
- - .json → JSON parsing for dict/list/BaseModel
- - .yaml/.yml → YAML parsing for dict/list/BaseModel
- - .md + list → Split automatically into sections
- - Any + str → UTF-8 decode
- - Any + bytes → Raw content
-
- Example:
- >>> # String content
- >>> doc = MyDocument(name="test.txt", content=b"Hello")
- >>> doc.parse(str)
- 'Hello'
-
- >>> # JSON content
- >>> doc = MyDocument.create(name="data.json", content={"key": "value"})
- >>> doc.parse(dict) # Returns {'key': 'value'}
-
- >>> # Markdown list
- >>> items = ["Item 1", "Item 2"]
- >>> doc = MyDocument.create(name="list.md", content=items)
- >>> doc.parse(list)
- ['Item 1', 'Item 2']
- """
- # Handle basic types
+ """Parse content to the requested type. Reverses create() conversion. Extension-based dispatch, no guessing."""
  if type_ is bytes:
  return self.content
- elif type_ is str:
- # Handle empty content specially
- if len(self.content) == 0:
- return ""
- return self.text
-
- # Handle structured data based on extension
- name_lower = self.name.lower()
+ if type_ is str:
+ return self.text if self.content else ""
+ if type_ is dict or type_ is list:
+ data = self._parse_structured()
+ if not isinstance(data, type_):
+ raise ValueError(f"Expected {type_.__name__} but got {type(data).__name__}")
+ return data # pyright: ignore[reportUnknownVariableType]
+ if isinstance(type_, type) and issubclass(type_, BaseModel): # pyright: ignore[reportUnnecessaryIsInstance]
+ return self.as_pydantic_model(type_)
+ raise ValueError(f"Unsupported parse type: {type_}")
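
Roundtrip sketch for parse(), which now dispatches strictly on the file extension via _parse_structured; MyDocument and the values are illustrative, and create() is the classmethod referenced by the original docstrings:

    doc = MyDocument.create(name="data.json", content={"key": "value"})
    assert doc.parse(dict) == {"key": "value"}
    assert doc.parse(str) == doc.text
    assert doc.parse(bytes) == doc.content

    plain = MyDocument(name="data.txt", content=b"{}")
    try:
        plain.parse(dict)  # no .json/.yaml extension, so this raises ValueError
    except ValueError:
        pass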
 
- # JSON files
- if name_lower.endswith(".json"):
- if type_ is dict or type_ is list:
- result = self.as_json()
- # Ensure the result is the correct type
- if type_ is dict and not isinstance(result, dict):
- raise ValueError(f"Expected dict but got {type(result).__name__}")
- if type_ is list and not isinstance(result, list):
- raise ValueError(f"Expected list but got {type(result).__name__}")
- return result
- elif issubclass(type_, BaseModel):
- return self.as_pydantic_model(type_)
- else:
- raise ValueError(f"Cannot parse JSON file to type {type_}")
-
- # YAML files
- elif name_lower.endswith((".yaml", ".yml")):
- if type_ is dict or type_ is list:
- result = self.as_yaml()
- # Ensure the result is the correct type
- if type_ is dict and not isinstance(result, dict):
- raise ValueError(f"Expected dict but got {type(result).__name__}")
- if type_ is list and not isinstance(result, list):
- raise ValueError(f"Expected list but got {type(result).__name__}")
- return result
- elif issubclass(type_, BaseModel):
- return self.as_pydantic_model(type_)
- else:
- raise ValueError(f"Cannot parse YAML file to type {type_}")
+ @property
+ def source_documents(self) -> tuple[str, ...]:
+ """Document SHA256 hashes from sources (filtered by is_document_sha256)."""
+ return tuple(src for src in self.sources if is_document_sha256(src))
 
- # Markdown files with lists
- elif name_lower.endswith(".md") and type_ is list:
- return self.as_markdown_list()
+ @property
+ def source_references(self) -> tuple[str, ...]:
+ """Non-hash reference strings from sources (URLs, file paths, etc.)."""
+ return tuple(src for src in self.sources if not is_document_sha256(src))
 
- # Default: try to return as requested basic type
- elif type_ is dict or type_ is list:
- # Try JSON first, then YAML
- try:
- result = self.as_json()
- # Ensure the result is the correct type
- if type_ is dict and not isinstance(result, dict):
- raise ValueError(f"Expected dict but got {type(result).__name__}")
- if type_ is list and not isinstance(result, list):
- raise ValueError(f"Expected list but got {type(result).__name__}")
- return result
- except (json.JSONDecodeError, ValueError):
- try:
- result = self.as_yaml()
- # Ensure the result is the correct type
- if type_ is dict and not isinstance(result, dict):
- raise ValueError(f"Expected dict but got {type(result).__name__}")
- if type_ is list and not isinstance(result, list):
- raise ValueError(f"Expected list but got {type(result).__name__}")
- return result
- except Exception as e:
- raise ValueError(f"Cannot parse content to {type_}") from e
-
- raise ValueError(f"Unsupported type {type_} for file {self.name}")
-
- def get_source_documents(self) -> list[str]:
- """Get list of document SHA256 hashes referenced as sources.
-
- Retrieves all document references from this document's sources list,
- filtering for valid SHA256 hashes that reference other documents.
- This is useful for building dependency graphs and tracking document
- lineage in processing pipelines.
-
- Returns:
- List of SHA256 hashes (base32 encoded) for documents referenced
- as sources. Each hash uniquely identifies another document that
- contributed to creating this one.
-
- Example:
- >>> # Create a derived document from multiple sources
- >>> source1 = MyDocument.create(name="data1.txt", content="First")
- >>> source2 = MyDocument.create(name="data2.txt", content="Second")
- >>>
- >>> merged = MyDocument.create(
- ... name="merged.txt",
- ... content="Combined data",
- ... sources=[source1.sha256, source2.sha256, "https://api.example.com"]
- ... )
- >>>
- >>> # Get only document references (not URLs)
- >>> doc_refs = merged.get_source_documents()
- >>> print(doc_refs) # [source1.sha256, source2.sha256]
- >>>
- >>> # Check if specific document is a source
- >>> if source1.sha256 in doc_refs:
- ... print("Document derived from source1")
- """
- return [src for src in self.sources if is_document_sha256(src)]
-
- def get_source_references(self) -> list[str]:
- """Get list of arbitrary reference strings from sources.
-
- Retrieves all non-document references from this document's sources list.
- These are typically URLs, file paths, API endpoints, or descriptive strings
- that indicate where the document's content originated from, but are not
- references to other documents in the pipeline.
-
- Returns:
- List of reference strings that are not document SHA256 hashes.
- Can include URLs, file paths, API endpoints, dataset names,
- or any other string that provides source context.
-
- Example:
- >>> # Create document with mixed source types
- >>> doc = MyDocument.create(
- ... name="report.txt",
- ... content="Analysis results",
- ... sources=[
- ... other_doc.sha256, # Document reference
- ... "https://api.example.com/data", # API URL
- ... "dataset:customer-2024", # Dataset identifier
- ... "/path/to/source.csv", # File path
- ... ]
- ... )
- >>>
- >>> # Get only non-document references
- >>> refs = doc.get_source_references()
- >>> print(refs)
- >>> # ["https://api.example.com/data", "dataset:customer-2024", "/path/to/source.csv"]
- >>>
- >>> # Use for attribution or debugging
- >>> for ref in refs:
- ... print(f"Data sourced from: {ref}")
- """
- return [src for src in self.sources if not is_document_sha256(src)]
-
- def has_source(self, source: Document | str) -> bool:
- """Check if a specific source is tracked for this document.
-
- Verifies whether a given source (document or reference string) is
- included in this document's sources list. Useful for dependency
- checking, lineage verification, and conditional processing based
- on document origins.
-
- Args:
- source: Source to check for. Can be:
- - Document: Checks if document's SHA256 is in sources
- - str: Checks if exact string is in sources (hash or reference)
-
- Returns:
- True if the source is tracked in this document's sources,
- False otherwise.
-
- Raises:
- TypeError: If source is not a Document or string.
-
- Example:
- >>> # Check if document was derived from specific source
- >>> source_doc = MyDocument.create(name="original.txt", content="Data")
- >>> api_url = "https://api.example.com/data"
- >>>
- >>> derived = MyDocument.create(
- ... name="processed.txt",
- ... content="Processed data",
- ... sources=[source_doc.sha256, api_url]
- ... )
- >>>
- >>> # Check document source
- >>> if derived.has_source(source_doc):
- ... print("Derived from source_doc")
- >>>
- >>> # Check string reference
- >>> if derived.has_source(api_url):
- ... print("Data from API")
- >>>
- >>> # Check by SHA256 directly
- >>> if derived.has_source(source_doc.sha256):
- ... print("Has specific hash")
- """
+ def has_source(self, source: "Document | str") -> bool:
+ """Check if a source (Document or string) is in this document's sources."""
  if isinstance(source, str):
- # Direct string comparison
  return source in self.sources
- elif isinstance(source, Document): # type: ignore[misc]
- # Check if document's SHA256 is in sources
+ if isinstance(source, Document): # type: ignore[misc]
  return source.sha256 in self.sources
- else:
- raise TypeError(f"Invalid source type: {type(source)}")
+ raise TypeError(f"Invalid source type: {type(source)}") # pyright: ignore[reportUnreachable]
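
Lineage sketch using the new source_documents and source_references properties together with has_source, adapted from the removed get_source_* docstrings (MyDocument and the URL are illustrative):

    src = MyDocument.create(name="data1.txt", content="First")
    merged = MyDocument.create(
        name="merged.txt",
        content="Combined data",
        sources=[src.sha256, "https://api.example.com/data"],
    )
    assert merged.source_documents == (src.sha256,)
    assert merged.source_references == ("https://api.example.com/data",)
    assert merged.has_source(src)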
 
  @final
  def serialize_model(self) -> dict[str, Any]:
- """Serialize document to dictionary for storage or transmission.
-
- Creates a complete JSON-serializable representation of the document
- with all metadata and properly encoded content. Automatically chooses
- the most appropriate encoding (UTF-8 for text, base64 for binary).
-
- Returns:
- Dictionary with the following keys:
- - name: Document filename (str)
- - description: Optional description (str | None)
- - base_type: Persistence type - "flow", "task", or "temporary" (str)
- - size: Content size in bytes (int)
- - id: Short hash identifier, first 6 chars of SHA256 (str)
- - sha256: Full SHA256 hash in base32 encoding without padding (str)
- - mime_type: Detected MIME type (str)
- - sources: List of source strings (list[dict])
- - canonical_name: Canonical snake_case name for debug tracing (str)
- - class_name: Name of the actual document class for debug tracing (str)
- - content: Encoded content (str)
- - content_encoding: Either "utf-8" or "base64" (str)
-
- Encoding Strategy:
- - Text files (text/*, application/json, etc.) → UTF-8 string
- - Binary files (images, PDFs, etc.) → Base64 string
- - Invalid UTF-8 in text files → UTF-8 with replacement chars
-
- Example:
- >>> doc = MyDocument.create(name="data.json", content={"key": "value"})
- >>> serialized = doc.serialize_model()
- >>> serialized["content_encoding"] # "utf-8"
- >>> serialized["mime_type"] # "application/json"
- """
- result = {
+ """Serialize to JSON-compatible dict for storage/transmission. Roundtrips with from_dict()."""
+ result: dict[str, Any] = { # nosemgrep: mutable-field-on-frozen-pydantic-model
  "name": self.name,
  "description": self.description,
- "base_type": self.get_base_type(),
  "size": self.size,
  "id": self.id,
  "sha256": self.sha256,
+ "content_sha256": self.content_sha256,
  "mime_type": self.mime_type,
- "sources": self.sources,
+ "sources": list(self.sources),
+ "origins": list(self.origins),
  "canonical_name": canonical_name_key(self.__class__),
  "class_name": self.__class__.__name__,
  }

- # Try to encode content as UTF-8, fall back to base64
  if self.is_text:
  try:
  result["content"] = self.content.decode("utf-8")
  result["content_encoding"] = "utf-8"
  except UnicodeDecodeError:
- # For text files with encoding issues, use UTF-8 with replacement
  result["content"] = self.content.decode("utf-8", errors="replace")
  result["content_encoding"] = "utf-8"
  else:
- # Binary content - use base64
  result["content"] = base64.b64encode(self.content).decode("ascii")
  result["content_encoding"] = "base64"
 
+ serialized_attachments: list[dict[str, Any]] = [] # nosemgrep: mutable-field-on-frozen-pydantic-model
+ for att in self.attachments:
+ att_data: dict[str, Any] = {"name": att.name, "description": att.description} # nosemgrep: mutable-field-on-frozen-pydantic-model
+ if att.is_text:
+ att_data["content"] = att.content.decode("utf-8", errors="replace")
+ att_data["content_encoding"] = "utf-8"
+ else:
+ att_data["content"] = base64.b64encode(att.content).decode("ascii")
+ att_data["content_encoding"] = "base64"
+ att_data["mime_type"] = att.mime_type
+ att_data["size"] = att.size
+ serialized_attachments.append(att_data)
+ result["attachments"] = serialized_attachments
+
  return result
 
  @final
  @classmethod
  def from_dict(cls, data: dict[str, Any]) -> Self:
- r"""Deserialize document from dictionary (inverse of serialize_model).
-
- Reconstructs a Document instance from the dictionary format produced
- by serialize_model(). Automatically handles content decoding based on
- the content_encoding field.
-
- Args:
- data: Dictionary containing serialized document. Required keys:
- - name: Document filename (str)
- - content: Encoded content (str or bytes)
- Optional keys:
- - description: Document description (str | None)
- - content_encoding: "utf-8" or "base64" (defaults to "utf-8")
- - sources: List of source strings
-
- Returns:
- New Document instance with restored content.
-
- Raises:
- ValueError: If content type is invalid or base64 decoding fails
- KeyError: If required keys are missing from data dictionary
-
- Note:
- Provides roundtrip guarantee with serialize_model().
- Content and name are preserved exactly.
-
- Example:
- >>> data = {
- ... "name": "config.yaml",
- ... "content": "key: value\n",
- ... "content_encoding": "utf-8",
- ... "description": "Config file"
- ... }
- >>> doc = MyDocument.from_dict(data)
- """
- # Extract content and encoding
+ """Deserialize from dict produced by serialize_model(). Roundtrip guarantee."""
  content_raw = data.get("content", "")
  content_encoding = data.get("content_encoding", "utf-8")

- # Decode content based on encoding
  content: bytes
  if content_encoding == "base64":
- assert isinstance(content_raw, str), "base64 content must be string"
+ if not isinstance(content_raw, str):
+ raise ValueError("base64 content must be string")
  content = base64.b64decode(content_raw)
  elif isinstance(content_raw, str):
- # Default to UTF-8
  content = content_raw.encode("utf-8")
  elif isinstance(content_raw, bytes):
  content = content_raw
  else:
  raise ValueError(f"Invalid content type: {type(content_raw)}")
+ attachments: tuple[Attachment, ...] | None = None
566
+ if attachments_raw := data.get("attachments"):
567
+ att_list: list[Attachment] = [] # nosemgrep: mutable-field-on-frozen-pydantic-model
568
+ for att_data in attachments_raw:
569
+ att_content_raw = att_data.get("content", "")
570
+ if att_data.get("content_encoding") == "base64":
571
+ att_content = base64.b64decode(att_content_raw)
572
+ elif isinstance(att_content_raw, str):
573
+ att_content = att_content_raw.encode("utf-8")
574
+ else:
575
+ att_content = att_content_raw
576
+ att_list.append(Attachment(name=att_data["name"], content=att_content, description=att_data.get("description")))
577
+ attachments = tuple(att_list)
578
+
1639
579
  return cls(
1640
580
  name=data["name"],
1641
581
  content=content,
1642
582
  description=data.get("description"),
1643
- sources=data.get("sources", []),
583
+ sources=tuple(data.get("sources") or ()),
584
+ origins=tuple(data.get("origins") or ()),
585
+ attachments=attachments,
1644
586
  )
1645
587
 
1646
588
  @final
1647
- def model_convert(
1648
- self,
1649
- new_type: type[TDocument],
1650
- *,
1651
- update: dict[str, Any] | None = None,
1652
- deep: bool = False,
1653
- ) -> TDocument:
1654
- """Convert document to a different Document type with optional updates.
1655
-
1656
- @public
1657
-
1658
- Creates a new document of a different type, preserving all attributes
1659
- while allowing updates. This is useful for converting between document
1660
- types (e.g., TaskDocument to FlowDocument) while maintaining data integrity.
1661
-
1662
- Args:
1663
- new_type: Target Document class for conversion. Must be a concrete
1664
- subclass of Document (not abstract classes like Document,
1665
- FlowDocument, or TaskDocument).
1666
- update: Dictionary of attributes to update. Supports any attributes
1667
- that the Document constructor accepts (name, content,
1668
- description, sources).
1669
- deep: Whether to perform a deep copy of mutable attributes.
1670
-
1671
- Returns:
1672
- New Document instance of the specified type.
1673
-
1674
- Raises:
1675
- TypeError: If new_type is not a subclass of Document, is an abstract
1676
- class, or if update contains invalid attributes.
1677
- DocumentNameError: If the name violates the target type's FILES enum.
1678
- DocumentSizeError: If content exceeds MAX_CONTENT_SIZE.
1679
-
1680
- Example:
1681
- >>> # Convert TaskDocument to FlowDocument
1682
- >>> task_doc = MyTaskDoc.create(name="temp.json", content={"data": "value"})
1683
- >>> flow_doc = task_doc.model_convert(MyFlowDoc)
1684
- >>> assert flow_doc.is_flow
1685
- >>> assert flow_doc.content == task_doc.content
1686
- >>>
1687
- >>> # Convert with updates
1688
- >>> updated = task_doc.model_convert(
1689
- ... MyFlowDoc,
1690
- ... update={"name": "permanent.json", "description": "Converted"}
1691
- ... )
1692
- >>>
1693
- >>> # Track document lineage
1694
- >>> derived = doc.model_convert(
1695
- ... ProcessedDoc,
1696
- ... update={"sources": [doc.sha256]}
1697
- ... )
1698
- """
1699
- # Validate new_type
589
+ def model_convert(self, new_type: type[TDocument], *, update: dict[str, Any] | None = None) -> TDocument:
590
+ """Convert to a different Document subclass with optional field overrides."""
1700
591
  try:
1701
- # Use a runtime check to ensure it's a class
1702
- if not isinstance(new_type, type): # type: ignore[reportIncompatibleArgumentType]
1703
- raise TypeError(f"new_type must be a class, got {new_type}")
1704
- if not issubclass(new_type, Document): # type: ignore[reportIncompatibleArgumentType]
1705
- raise TypeError(f"new_type must be a subclass of Document, got {new_type}")
1706
- except (TypeError, AttributeError):
1707
- # Not a class at all
1708
- raise TypeError(f"new_type must be a subclass of Document, got {new_type}")
1709
-
1710
- # Check for abstract classes by name (avoid circular imports)
1711
- class_name = new_type.__name__
1712
- if class_name == "Document":
1713
- raise TypeError("Cannot instantiate abstract Document class directly")
1714
- if class_name == "FlowDocument":
1715
- raise TypeError("Cannot instantiate abstract FlowDocument class directly")
1716
- if class_name == "TaskDocument":
1717
- raise TypeError("Cannot instantiate abstract TaskDocument class directly")
1718
-
1719
- # Get current document data with proper typing
1720
- data: dict[str, Any] = {
592
+ if not isinstance(new_type, type): # pyright: ignore[reportUnnecessaryIsInstance]
593
+ raise TypeError(f"new_type must be a class, got {new_type}") # pyright: ignore[reportUnreachable]
594
+ if not issubclass(new_type, Document): # pyright: ignore[reportUnnecessaryIsInstance]
595
+ raise TypeError(f"new_type must be a subclass of Document, got {new_type}") # pyright: ignore[reportUnreachable]
596
+ except (TypeError, AttributeError) as err:
597
+ raise TypeError(f"new_type must be a subclass of Document, got {new_type}") from err
598
+
599
+ if new_type is Document:
600
+ raise TypeError("Cannot instantiate Document directly — use a concrete subclass")
601
+
602
+ data: dict[str, Any] = { # nosemgrep: mutable-field-on-frozen-pydantic-model
1721
603
  "name": self.name,
1722
604
  "content": self.content,
1723
605
  "description": self.description,
1724
- "sources": self.sources.copy() if deep else self.sources,
606
+ "sources": self.sources,
607
+ "origins": self.origins,
608
+ "attachments": self.attachments,
1725
609
  }
1726
610
 
1727
- # Apply updates if provided
1728
611
  if update:
1729
612
  data.update(update)
1730
613
 
1731
- # Create new document of target type
1732
614
  return new_type(
1733
615
  name=data["name"],
1734
616
  content=data["content"],
1735
617
  description=data.get("description"),
1736
- sources=data.get("sources", []),
618
+ sources=data.get("sources"),
619
+ origins=data.get("origins"),
620
+ attachments=data.get("attachments"),
1737
621
  )
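
Conversion sketch for the slimmed-down model_convert (the deep parameter is gone); MyTaskDoc and MyFlowDoc are hypothetical concrete subclasses, as in the removed docstring example:

    task_doc = MyTaskDoc.create(name="temp.json", content={"data": "value"})
    flow_doc = task_doc.model_convert(MyFlowDoc)
    assert flow_doc.content == task_doc.content

    renamed = task_doc.model_convert(MyFlowDoc, update={"name": "permanent.json"})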