ai-pipeline-core 0.3.3-py3-none-any.whl → 0.4.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ai_pipeline_core/__init__.py +70 -144
- ai_pipeline_core/deployment/__init__.py +6 -18
- ai_pipeline_core/deployment/base.py +392 -212
- ai_pipeline_core/deployment/contract.py +6 -10
- ai_pipeline_core/{utils → deployment}/deploy.py +50 -69
- ai_pipeline_core/deployment/helpers.py +16 -17
- ai_pipeline_core/{progress.py → deployment/progress.py} +23 -24
- ai_pipeline_core/{utils/remote_deployment.py → deployment/remote.py} +11 -14
- ai_pipeline_core/docs_generator/__init__.py +54 -0
- ai_pipeline_core/docs_generator/__main__.py +5 -0
- ai_pipeline_core/docs_generator/cli.py +196 -0
- ai_pipeline_core/docs_generator/extractor.py +324 -0
- ai_pipeline_core/docs_generator/guide_builder.py +644 -0
- ai_pipeline_core/docs_generator/trimmer.py +35 -0
- ai_pipeline_core/docs_generator/validator.py +114 -0
- ai_pipeline_core/document_store/__init__.py +13 -0
- ai_pipeline_core/document_store/_summary.py +9 -0
- ai_pipeline_core/document_store/_summary_worker.py +170 -0
- ai_pipeline_core/document_store/clickhouse.py +492 -0
- ai_pipeline_core/document_store/factory.py +38 -0
- ai_pipeline_core/document_store/local.py +312 -0
- ai_pipeline_core/document_store/memory.py +85 -0
- ai_pipeline_core/document_store/protocol.py +68 -0
- ai_pipeline_core/documents/__init__.py +12 -14
- ai_pipeline_core/documents/_context_vars.py +85 -0
- ai_pipeline_core/documents/_hashing.py +52 -0
- ai_pipeline_core/documents/attachment.py +85 -0
- ai_pipeline_core/documents/context.py +128 -0
- ai_pipeline_core/documents/document.py +318 -1434
- ai_pipeline_core/documents/mime_type.py +37 -82
- ai_pipeline_core/documents/utils.py +4 -12
- ai_pipeline_core/exceptions.py +10 -62
- ai_pipeline_core/images/__init__.py +32 -85
- ai_pipeline_core/images/_processing.py +5 -11
- ai_pipeline_core/llm/__init__.py +6 -4
- ai_pipeline_core/llm/ai_messages.py +106 -81
- ai_pipeline_core/llm/client.py +267 -158
- ai_pipeline_core/llm/model_options.py +12 -84
- ai_pipeline_core/llm/model_response.py +53 -99
- ai_pipeline_core/llm/model_types.py +8 -23
- ai_pipeline_core/logging/__init__.py +2 -7
- ai_pipeline_core/logging/logging.yml +1 -1
- ai_pipeline_core/logging/logging_config.py +27 -37
- ai_pipeline_core/logging/logging_mixin.py +15 -41
- ai_pipeline_core/observability/__init__.py +32 -0
- ai_pipeline_core/observability/_debug/__init__.py +30 -0
- ai_pipeline_core/observability/_debug/_auto_summary.py +94 -0
- ai_pipeline_core/{debug/config.py → observability/_debug/_config.py} +11 -7
- ai_pipeline_core/{debug/content.py → observability/_debug/_content.py} +134 -75
- ai_pipeline_core/{debug/processor.py → observability/_debug/_processor.py} +16 -17
- ai_pipeline_core/{debug/summary.py → observability/_debug/_summary.py} +113 -37
- ai_pipeline_core/observability/_debug/_types.py +75 -0
- ai_pipeline_core/{debug/writer.py → observability/_debug/_writer.py} +126 -196
- ai_pipeline_core/observability/_document_tracking.py +146 -0
- ai_pipeline_core/observability/_initialization.py +194 -0
- ai_pipeline_core/observability/_logging_bridge.py +57 -0
- ai_pipeline_core/observability/_summary.py +81 -0
- ai_pipeline_core/observability/_tracking/__init__.py +6 -0
- ai_pipeline_core/observability/_tracking/_client.py +178 -0
- ai_pipeline_core/observability/_tracking/_internal.py +28 -0
- ai_pipeline_core/observability/_tracking/_models.py +138 -0
- ai_pipeline_core/observability/_tracking/_processor.py +158 -0
- ai_pipeline_core/observability/_tracking/_service.py +311 -0
- ai_pipeline_core/observability/_tracking/_writer.py +229 -0
- ai_pipeline_core/{tracing.py → observability/tracing.py} +139 -335
- ai_pipeline_core/pipeline/__init__.py +10 -0
- ai_pipeline_core/pipeline/decorators.py +915 -0
- ai_pipeline_core/pipeline/options.py +16 -0
- ai_pipeline_core/prompt_manager.py +16 -102
- ai_pipeline_core/settings.py +26 -31
- ai_pipeline_core/testing.py +9 -0
- ai_pipeline_core-0.4.0.dist-info/METADATA +807 -0
- ai_pipeline_core-0.4.0.dist-info/RECORD +76 -0
- ai_pipeline_core/debug/__init__.py +0 -26
- ai_pipeline_core/documents/document_list.py +0 -420
- ai_pipeline_core/documents/flow_document.py +0 -112
- ai_pipeline_core/documents/task_document.py +0 -117
- ai_pipeline_core/documents/temporary_document.py +0 -74
- ai_pipeline_core/flow/__init__.py +0 -9
- ai_pipeline_core/flow/config.py +0 -494
- ai_pipeline_core/flow/options.py +0 -75
- ai_pipeline_core/pipeline.py +0 -718
- ai_pipeline_core/prefect.py +0 -63
- ai_pipeline_core/prompt_builder/__init__.py +0 -5
- ai_pipeline_core/prompt_builder/documents_prompt.jinja2 +0 -23
- ai_pipeline_core/prompt_builder/global_cache.py +0 -78
- ai_pipeline_core/prompt_builder/new_core_documents_prompt.jinja2 +0 -6
- ai_pipeline_core/prompt_builder/prompt_builder.py +0 -253
- ai_pipeline_core/prompt_builder/system_prompt.jinja2 +0 -41
- ai_pipeline_core/storage/__init__.py +0 -8
- ai_pipeline_core/storage/storage.py +0 -628
- ai_pipeline_core/utils/__init__.py +0 -8
- ai_pipeline_core-0.3.3.dist-info/METADATA +0 -569
- ai_pipeline_core-0.3.3.dist-info/RECORD +0 -57
- {ai_pipeline_core-0.3.3.dist-info → ai_pipeline_core-0.4.0.dist-info}/WHEEL +0 -0
- {ai_pipeline_core-0.3.3.dist-info → ai_pipeline_core-0.4.0.dist-info}/licenses/LICENSE +0 -0
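The largest change in the diff below is the rewrite of `ai_pipeline_core/documents/document.py`: the abstract `FlowDocument`/`TaskDocument` split is gone, `Document` becomes a directly subclassable concrete base, and two new provenance fields (`origins`, `attachments`) join `sources`. A minimal sketch of the new API, assuming `Document` is still re-exported from `ai_pipeline_core.documents` and using a hypothetical `ReportDocument` subclass:

```python
from ai_pipeline_core.documents import Document


class ReportDocument(Document):
    """Hypothetical concrete subclass; Document itself cannot be instantiated."""


# create() converts content to bytes based on the file extension (.json -> JSON).
source = ReportDocument.create(name="input.json", content={"raw": "data"})

# sources = content provenance (document SHA256 hashes or URLs);
# origins = causal provenance (SHA256 hashes only); the validators
# reject any hash that appears in both.
derived = ReportDocument.create(
    name="summary.json",
    content={"summary": "derived from input"},
    sources=(source.sha256, "https://example.com/data"),
)

print(derived.id)              # first 6 chars of the BASE32-encoded sha256
print(derived.content_sha256)  # new in 0.4.0: hash of the raw content bytes only
```

Per the validators added in the diff, the full `sha256` now covers name, content, and attachments, while `content_sha256` hashes content bytes alone for deduplication.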
|
@@ -1,25 +1,18 @@
|
|
|
1
1
|
"""Document abstraction layer for AI pipeline flows.
|
|
2
2
|
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
This module provides the core document abstraction for working with various types of data
|
|
6
|
-
in AI pipelines. Documents are immutable Pydantic models that wrap binary content with metadata.
|
|
3
|
+
Immutable Pydantic models wrapping binary content with metadata, MIME detection,
|
|
4
|
+
SHA256 hashing, and serialization. All documents must be concrete subclasses of Document.
|
|
7
5
|
"""
|
|
8
6
|
|
|
9
|
-
from __future__ import annotations
|
|
10
|
-
|
|
11
7
|
import base64
|
|
12
|
-
import
|
|
8
|
+
import functools
|
|
13
9
|
import json
|
|
14
|
-
from abc import ABC, abstractmethod
|
|
15
|
-
from base64 import b32encode
|
|
16
10
|
from enum import StrEnum
|
|
17
11
|
from functools import cached_property
|
|
18
12
|
from io import BytesIO
|
|
19
13
|
from typing import (
|
|
20
14
|
Any,
|
|
21
15
|
ClassVar,
|
|
22
|
-
Literal,
|
|
23
16
|
Self,
|
|
24
17
|
TypeVar,
|
|
25
18
|
cast,
|
|
@@ -33,16 +26,19 @@ import tiktoken
|
|
|
33
26
|
from pydantic import (
|
|
34
27
|
BaseModel,
|
|
35
28
|
ConfigDict,
|
|
36
|
-
Field,
|
|
37
29
|
ValidationInfo,
|
|
38
30
|
field_serializer,
|
|
39
31
|
field_validator,
|
|
32
|
+
model_validator,
|
|
40
33
|
)
|
|
41
34
|
from ruamel.yaml import YAML
|
|
42
35
|
|
|
36
|
+
from ai_pipeline_core.documents._context_vars import get_task_context, is_registration_suppressed
|
|
37
|
+
from ai_pipeline_core.documents._hashing import compute_content_sha256, compute_document_sha256
|
|
43
38
|
from ai_pipeline_core.documents.utils import canonical_name_key, is_document_sha256
|
|
44
39
|
from ai_pipeline_core.exceptions import DocumentNameError, DocumentSizeError
|
|
45
40
|
|
|
41
|
+
from .attachment import Attachment
|
|
46
42
|
from .mime_type import (
|
|
47
43
|
detect_mime_type,
|
|
48
44
|
is_image_mime_type,
|
|
@@ -54,223 +50,75 @@ from .mime_type import (
|
|
|
54
50
|
TModel = TypeVar("TModel", bound=BaseModel)
|
|
55
51
|
TDocument = TypeVar("TDocument", bound="Document")
|
|
56
52
|
|
|
53
|
+
# Registry of canonical_name -> Document subclass for collision detection.
|
|
54
|
+
# Only non-test classes are registered. Test modules (tests.*, conftest, etc.) are skipped.
|
|
55
|
+
_canonical_name_registry: dict[str, type["Document"]] = {} # nosemgrep: no-mutable-module-globals
|
|
57
56
|
|
|
58
|
-
class Document(BaseModel, ABC):
|
|
59
|
-
r"""Abstract base class for all documents in the AI Pipeline Core system.
|
|
60
|
-
|
|
61
|
-
@public
|
|
62
|
-
|
|
63
|
-
Document is the fundamental data abstraction for all content flowing through
|
|
64
|
-
pipelines. It provides automatic encoding, MIME type detection, serialization,
|
|
65
|
-
and validation. All documents must be subclassed from FlowDocument or TaskDocument
|
|
66
|
-
based on their persistence requirements.
|
|
67
|
-
|
|
68
|
-
VALIDATION IS AUTOMATIC - Do not add manual validation!
|
|
69
|
-
Size validation, name validation, and MIME type detection are built-in.
|
|
70
|
-
The framework handles all standard validations internally.
|
|
71
|
-
|
|
72
|
-
# WRONG - These checks already happen automatically:
|
|
73
|
-
if document.size > document.MAX_CONTENT_SIZE:
|
|
74
|
-
raise DocumentSizeError(...) # NO! Already handled
|
|
75
|
-
document.validate_file_name(document.name) # NO! Automatic
|
|
76
|
-
|
|
77
|
-
Best Practices:
|
|
78
|
-
- Use create() classmethod for automatic type conversion (default preferred)
|
|
79
|
-
- Omit description parameter unless truly needed for metadata
|
|
80
|
-
- When using LLM functions, pass AIMessages or str. Wrap any Document values
|
|
81
|
-
in AIMessages([...]). Do not call .text yourself
|
|
82
|
-
|
|
83
|
-
Standard Usage:
|
|
84
|
-
>>> # CORRECT - minimal parameters
|
|
85
|
-
>>> doc = MyDocument.create(name="data.json", content={"key": "value"})
|
|
86
|
-
|
|
87
|
-
>>> # AVOID - unnecessary description
|
|
88
|
-
>>> doc = MyDocument.create(
|
|
89
|
-
... name="data.json",
|
|
90
|
-
... content={"key": "value"},
|
|
91
|
-
... description="This is data" # Usually not needed!
|
|
92
|
-
... )
|
|
93
|
-
|
|
94
|
-
Key features:
|
|
95
|
-
- Immutable by default (frozen Pydantic model)
|
|
96
|
-
- Automatic MIME type detection
|
|
97
|
-
- Content size validation
|
|
98
|
-
- SHA256 hashing for deduplication
|
|
99
|
-
- Support for text, JSON, YAML, PDF, and image formats
|
|
100
|
-
- Conversion utilities between different formats
|
|
101
|
-
- Source provenance tracking via sources field
|
|
102
|
-
- Document type conversion via model_convert() method
|
|
103
|
-
- Standard Pydantic model_copy() for same-type copying
|
|
104
|
-
|
|
105
|
-
Class Variables:
|
|
106
|
-
MAX_CONTENT_SIZE: Maximum allowed content size in bytes (default 25MB)
|
|
107
|
-
|
|
108
|
-
Attributes:
|
|
109
|
-
name: Document filename (validated for security)
|
|
110
|
-
description: Optional human-readable description
|
|
111
|
-
content: Raw document content as bytes
|
|
112
|
-
sources: List of source references tracking document provenance
|
|
113
|
-
|
|
114
|
-
Creating Documents:
|
|
115
|
-
**Use the `create` classmethod** for most use cases. It accepts various
|
|
116
|
-
content types (str, dict, list, BaseModel) and converts them automatically.
|
|
117
|
-
Only use __init__ directly when you already have bytes content.
|
|
118
|
-
|
|
119
|
-
>>> # RECOMMENDED: Use create for automatic conversion
|
|
120
|
-
>>> doc = MyDocument.create(name="data.json", content={"key": "value"})
|
|
121
|
-
>>>
|
|
122
|
-
>>> # Direct constructor: Only for bytes
|
|
123
|
-
>>> doc = MyDocument(name="data.bin", content=b"\x00\x01\x02")
|
|
124
|
-
|
|
125
|
-
Warning:
|
|
126
|
-
- Document subclasses should NOT start with 'Test' prefix (pytest conflict)
|
|
127
|
-
- Cannot instantiate Document directly - must subclass FlowDocument or TaskDocument
|
|
128
|
-
- Cannot add custom fields - only name, description, content, sources are allowed
|
|
129
|
-
- Document is an abstract class and cannot be instantiated directly
|
|
130
|
-
|
|
131
|
-
Metadata Attachment Patterns:
|
|
132
|
-
Since custom fields are not allowed, use these patterns for metadata:
|
|
133
|
-
1. Use the 'description' field for human-readable metadata
|
|
134
|
-
2. Embed metadata in content (e.g., JSON with data + metadata fields)
|
|
135
|
-
3. Create a separate MetadataDocument type to accompany data documents
|
|
136
|
-
4. Use document naming conventions (e.g., "data_v2_2024.json")
|
|
137
|
-
5. Store metadata in flow_options
|
|
138
|
-
|
|
139
|
-
FILES Enum Best Practice:
|
|
140
|
-
When defining a FILES enum, NEVER use magic strings to reference files.
|
|
141
|
-
Always use the enum values to maintain type safety and refactorability.
|
|
142
|
-
|
|
143
|
-
WRONG - Magic strings/numbers:
|
|
144
|
-
doc = ConfigDocument.create(name="config.yaml", content=data) # NO!
|
|
145
|
-
doc = docs.get_by("settings.json") # NO! Magic string
|
|
146
|
-
files = ["config.yaml", "settings.json"] # NO! Magic strings
|
|
147
|
-
|
|
148
|
-
CORRECT - Use enum references:
|
|
149
|
-
doc = ConfigDocument.create(
|
|
150
|
-
name=ConfigDocument.FILES.CONFIG, # YES! Type-safe
|
|
151
|
-
content=data
|
|
152
|
-
)
|
|
153
|
-
doc = docs.get_by(ConfigDocument.FILES.SETTINGS) # YES!
|
|
154
|
-
files = [
|
|
155
|
-
ConfigDocument.FILES.CONFIG,
|
|
156
|
-
ConfigDocument.FILES.SETTINGS
|
|
157
|
-
] # YES! Refactorable
|
|
158
|
-
|
|
159
|
-
Pydantic Model Interaction:
|
|
160
|
-
Documents provide DIRECT support for Pydantic models. Use the built-in
|
|
161
|
-
methods instead of manual JSON conversion.
|
|
162
|
-
|
|
163
|
-
WRONG - Manual JSON conversion:
|
|
164
|
-
# Don't do this - manual JSON handling
|
|
165
|
-
json_str = doc.text
|
|
166
|
-
json_data = json.loads(json_str)
|
|
167
|
-
model = MyModel(**json_data) # NO! Use as_pydantic_model
|
|
168
|
-
|
|
169
|
-
# Don't do this - manual serialization
|
|
170
|
-
json_str = model.model_dump_json()
|
|
171
|
-
doc = MyDocument.create(name="data.json", content=json_str) # NO!
|
|
172
|
-
|
|
173
|
-
CORRECT - Direct Pydantic interaction:
|
|
174
|
-
# Reading Pydantic model from document
|
|
175
|
-
model = doc.as_pydantic_model(MyModel) # Direct conversion
|
|
176
|
-
models = doc.as_pydantic_model(list[MyModel]) # List support
|
|
177
|
-
|
|
178
|
-
# Creating document from Pydantic model
|
|
179
|
-
doc = MyDocument.create(
|
|
180
|
-
name="data.json",
|
|
181
|
-
content=model # Direct BaseModel support
|
|
182
|
-
)
|
|
183
57
|
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
Example:
|
|
191
|
-
>>> from enum import StrEnum
|
|
192
|
-
>>> from pydantic import BaseModel
|
|
193
|
-
>>>
|
|
194
|
-
>>> # Simple document:
|
|
195
|
-
>>> class MyDocument(FlowDocument):
|
|
196
|
-
... pass
|
|
197
|
-
>>>
|
|
198
|
-
>>> # Document with file restrictions:
|
|
199
|
-
>>> class ConfigDocument(FlowDocument):
|
|
200
|
-
... class FILES(StrEnum):
|
|
201
|
-
... CONFIG = "config.yaml"
|
|
202
|
-
... SETTINGS = "settings.json"
|
|
203
|
-
>>>
|
|
204
|
-
>>> # CORRECT FILES usage - no magic strings:
|
|
205
|
-
>>> doc = ConfigDocument.create(
|
|
206
|
-
... name=ConfigDocument.FILES.CONFIG, # Use enum
|
|
207
|
-
... content={"key": "value"}
|
|
208
|
-
... )
|
|
209
|
-
>>>
|
|
210
|
-
>>> # CORRECT Pydantic usage:
|
|
211
|
-
>>> class Config(BaseModel):
|
|
212
|
-
... key: str
|
|
213
|
-
>>>
|
|
214
|
-
>>> # Direct creation from Pydantic model
|
|
215
|
-
>>> config_model = Config(key="value")
|
|
216
|
-
>>> doc = MyDocument.create(name="data.json", content=config_model)
|
|
217
|
-
>>>
|
|
218
|
-
>>> # Direct extraction to Pydantic model
|
|
219
|
-
>>> restored = doc.as_pydantic_model(Config)
|
|
220
|
-
>>> print(restored.key) # "value"
|
|
221
|
-
>>>
|
|
222
|
-
>>> # Track document provenance with sources
|
|
223
|
-
>>> source_doc = MyDocument.create(name="input.txt", content="raw data")
|
|
224
|
-
>>> processed = MyDocument.create(
|
|
225
|
-
... name="output.txt",
|
|
226
|
-
... content="processed data",
|
|
227
|
-
... sources=[source_doc.sha256] # Reference source document
|
|
228
|
-
... )
|
|
229
|
-
>>> processed.has_source(source_doc) # True
|
|
230
|
-
>>>
|
|
231
|
-
>>> # Document copying and type conversion:
|
|
232
|
-
>>> # Standard Pydantic model_copy (doesn't validate updates)
|
|
233
|
-
>>> copied = doc.model_copy(update={"name": "new_name.json"})
|
|
234
|
-
>>> # Type conversion with validation via model_convert
|
|
235
|
-
>>> task_doc = MyTaskDoc.create(name="temp.json", content={"data": "value"})
|
|
236
|
-
>>> flow_doc = task_doc.model_convert(MyFlowDoc) # Convert to FlowDocument
|
|
237
|
-
>>> flow_doc.is_flow # True
|
|
238
|
-
"""
|
|
58
|
+
def _is_test_module(cls: type) -> bool:
|
|
59
|
+
"""Check if a class is defined in a test module (skip collision detection)."""
|
|
60
|
+
module = getattr(cls, "__module__", "") or ""
|
|
61
|
+
parts = module.split(".")
|
|
62
|
+
return any(p == "tests" or p.startswith("test_") or p == "conftest" for p in parts)
|
|
239
63
|
|
|
240
|
-
MAX_CONTENT_SIZE: ClassVar[int] = 25 * 1024 * 1024
|
|
241
|
-
"""Maximum allowed content size in bytes (default 25MB).
|
|
242
64
|
|
|
243
|
-
|
|
244
|
-
|
|
65
|
+
@functools.cache
|
|
66
|
+
def get_tiktoken_encoding() -> tiktoken.Encoding:
|
|
67
|
+
"""Lazy-cached tiktoken encoding. Deferred to first use, cached forever."""
|
|
68
|
+
return tiktoken.encoding_for_model("gpt-4")
|
|
245
69
|
|
|
246
|
-
DESCRIPTION_EXTENSION: ClassVar[str] = ".description.md"
|
|
247
|
-
"""File extension for description files."""
|
|
248
70
|
|
|
249
|
-
|
|
250
|
-
"""
|
|
71
|
+
def _serialize_to_json(data: Any) -> bytes:
|
|
72
|
+
"""JSON serialize with 2-space indent."""
|
|
73
|
+
return json.dumps(data, indent=2).encode("utf-8")
|
|
251
74
|
|
|
252
|
-
MARKDOWN_LIST_SEPARATOR: ClassVar[str] = "\n\n-----------------\n\n"
|
|
253
|
-
"""Separator for markdown list items."""
|
|
254
75
|
|
|
255
|
-
|
|
256
|
-
|
|
76
|
+
def _serialize_to_yaml(data: Any) -> bytes:
|
|
77
|
+
"""YAML serialize via ruamel."""
|
|
78
|
+
yaml = YAML()
|
|
79
|
+
stream = BytesIO()
|
|
80
|
+
yaml.dump(data, stream) # pyright: ignore[reportUnknownMemberType]
|
|
81
|
+
return stream.getvalue()
|
|
257
82
|
|
|
258
|
-
Performs several validation checks when a Document subclass is defined:
|
|
259
|
-
1. Prevents class names starting with 'Test' (pytest conflict)
|
|
260
|
-
2. Validates FILES enum if present (must be StrEnum)
|
|
261
|
-
3. Prevents adding custom fields beyond name, description, content
|
|
262
83
|
|
|
263
|
-
|
|
264
|
-
|
|
84
|
+
def _serialize_structured(name: str, data: Any) -> bytes:
|
|
85
|
+
"""Serialize dict/list to JSON or YAML based on file extension."""
|
|
86
|
+
name_lower = name.lower()
|
|
87
|
+
if name_lower.endswith((".yaml", ".yml")):
|
|
88
|
+
return _serialize_to_yaml(data)
|
|
89
|
+
if name_lower.endswith(".json"):
|
|
90
|
+
return _serialize_to_json(data)
|
|
91
|
+
raise ValueError(f"Structured content ({type(data).__name__}) requires .json or .yaml extension, got: {name}")
|
|
265
92
|
|
|
266
|
-
Raises:
|
|
267
|
-
TypeError: If subclass violates naming rules, FILES enum requirements,
|
|
268
|
-
or attempts to add extra fields.
|
|
269
93
|
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
|
|
94
|
+
def _convert_content(name: str, content: str | bytes | dict[str, Any] | list[Any] | BaseModel) -> bytes:
|
|
95
|
+
"""Convert any supported content type to bytes. Dispatch by isinstance."""
|
|
96
|
+
if isinstance(content, bytes):
|
|
97
|
+
return content
|
|
98
|
+
if isinstance(content, str):
|
|
99
|
+
return content.encode("utf-8")
|
|
100
|
+
if isinstance(content, dict):
|
|
101
|
+
return _serialize_structured(name, content)
|
|
102
|
+
if isinstance(content, BaseModel):
|
|
103
|
+
return _serialize_structured(name, content.model_dump(mode="json"))
|
|
104
|
+
if isinstance(content, list): # pyright: ignore[reportUnnecessaryIsInstance]
|
|
105
|
+
data = [item.model_dump(mode="json") if isinstance(item, BaseModel) else item for item in content]
|
|
106
|
+
return _serialize_structured(name, data)
|
|
107
|
+
raise ValueError(f"Unsupported content type: {type(content)}") # pyright: ignore[reportUnreachable]
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
class Document(BaseModel):
|
|
111
|
+
"""Immutable base class for all pipeline documents. Cannot be instantiated directly — must be subclassed.
|
|
112
|
+
|
|
113
|
+
Content is stored as bytes. Use `create()` for automatic conversion from str/dict/list/BaseModel.
|
|
114
|
+
Use `parse()` to reverse the conversion. Serialization is extension-driven (.json → JSON, .yaml → YAML).
|
|
115
|
+
"""
|
|
116
|
+
|
|
117
|
+
MAX_CONTENT_SIZE: ClassVar[int] = 25 * 1024 * 1024
|
|
118
|
+
"""Maximum allowed total size in bytes (default 25MB)."""
|
|
119
|
+
|
|
120
|
+
def __init_subclass__(cls, **kwargs: Any) -> None:
|
|
121
|
+
"""Validate subclass at definition time. Cannot start with 'Test', cannot add custom fields."""
|
|
274
122
|
super().__init_subclass__(**kwargs)
|
|
275
123
|
if cls.__name__.startswith("Test"):
|
|
276
124
|
raise TypeError(
|
|
@@ -279,14 +127,12 @@ class Document(BaseModel, ABC):
|
|
|
279
127
|
"Please use a different name (e.g., 'SampleDocument', 'ExampleDocument')."
|
|
280
128
|
)
|
|
281
129
|
if hasattr(cls, "FILES"):
|
|
282
|
-
files =
|
|
130
|
+
files: type[StrEnum] = cls.FILES # pyright: ignore[reportAttributeAccessIssue, reportUnknownMemberType, reportUnknownVariableType]
|
|
283
131
|
if not issubclass(files, StrEnum):
|
|
284
|
-
raise TypeError(
|
|
285
|
-
f"Document subclass '{cls.__name__}'.FILES must be an Enum of string values"
|
|
286
|
-
)
|
|
132
|
+
raise TypeError(f"Document subclass '{cls.__name__}'.FILES must be an Enum of string values")
|
|
287
133
|
# Check that the Document's model_fields only contain the allowed fields
|
|
288
134
|
# It prevents AI models from adding additional fields to documents
|
|
289
|
-
allowed = {"name", "description", "content", "sources"}
|
|
135
|
+
allowed = {"name", "description", "content", "sources", "attachments", "origins"}
|
|
290
136
|
current = set(getattr(cls, "model_fields", {}).keys())
|
|
291
137
|
extras = current - allowed
|
|
292
138
|
if extras:
|
|
@@ -295,60 +141,18 @@ class Document(BaseModel, ABC):
|
|
|
295
141
|
f"{', '.join(sorted(extras))}. Only {', '.join(sorted(allowed))} are allowed."
|
|
296
142
|
)
|
|
297
143
|
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
|
|
301
|
-
|
|
302
|
-
|
|
303
|
-
|
|
304
|
-
|
|
305
|
-
|
|
306
|
-
|
|
307
|
-
|
|
308
|
-
|
|
309
|
-
|
|
310
|
-
@classmethod
|
|
311
|
-
def create(
|
|
312
|
-
cls,
|
|
313
|
-
*,
|
|
314
|
-
name: str,
|
|
315
|
-
content: str,
|
|
316
|
-
description: str | None = None,
|
|
317
|
-
sources: list[str] | None = None,
|
|
318
|
-
) -> Self: ...
|
|
319
|
-
|
|
320
|
-
@overload
|
|
321
|
-
@classmethod
|
|
322
|
-
def create(
|
|
323
|
-
cls,
|
|
324
|
-
*,
|
|
325
|
-
name: str,
|
|
326
|
-
content: dict[str, Any],
|
|
327
|
-
description: str | None = None,
|
|
328
|
-
sources: list[str] | None = None,
|
|
329
|
-
) -> Self: ...
|
|
330
|
-
|
|
331
|
-
@overload
|
|
332
|
-
@classmethod
|
|
333
|
-
def create(
|
|
334
|
-
cls,
|
|
335
|
-
*,
|
|
336
|
-
name: str,
|
|
337
|
-
content: list[Any],
|
|
338
|
-
description: str | None = None,
|
|
339
|
-
sources: list[str] | None = None,
|
|
340
|
-
) -> Self: ...
|
|
341
|
-
|
|
342
|
-
@overload
|
|
343
|
-
@classmethod
|
|
344
|
-
def create(
|
|
345
|
-
cls,
|
|
346
|
-
*,
|
|
347
|
-
name: str,
|
|
348
|
-
content: BaseModel,
|
|
349
|
-
description: str | None = None,
|
|
350
|
-
sources: list[str] | None = None,
|
|
351
|
-
) -> Self: ...
|
|
144
|
+
# Canonical name collision detection (production classes only)
|
|
145
|
+
if not _is_test_module(cls):
|
|
146
|
+
canonical = canonical_name_key(cls)
|
|
147
|
+
existing = _canonical_name_registry.get(canonical)
|
|
148
|
+
if existing is not None and existing is not cls:
|
|
149
|
+
raise TypeError(
|
|
150
|
+
f"Document subclass '{cls.__name__}' (in {cls.__module__}) produces "
|
|
151
|
+
f"canonical_name '{canonical}' which collides with existing class "
|
|
152
|
+
f"'{existing.__name__}' (in {existing.__module__}). "
|
|
153
|
+
f"Rename one of the classes to avoid ambiguity."
|
|
154
|
+
)
|
|
155
|
+
_canonical_name_registry[canonical] = cls
|
|
352
156
|
|
|
353
157
|
@classmethod
|
|
354
158
|
def create(
|
|
@@ -357,111 +161,22 @@ class Document(BaseModel, ABC):
|
|
|
357
161
|
name: str,
|
|
358
162
|
content: str | bytes | dict[str, Any] | list[Any] | BaseModel,
|
|
359
163
|
description: str | None = None,
|
|
360
|
-
sources:
|
|
164
|
+
sources: tuple[str, ...] | None = None,
|
|
165
|
+
origins: tuple[str, ...] | None = None,
|
|
166
|
+
attachments: tuple[Attachment, ...] | None = None,
|
|
361
167
|
) -> Self:
|
|
362
|
-
|
|
363
|
-
|
|
364
|
-
|
|
365
|
-
|
|
366
|
-
This is the **recommended way to create documents**. It accepts various
|
|
367
|
-
content types and automatically converts them to bytes based on the file
|
|
368
|
-
extension. Use the `parse` method to reverse this conversion.
|
|
369
|
-
|
|
370
|
-
Best Practice (by default, unless instructed otherwise):
|
|
371
|
-
Only provide name and content. The description parameter is RARELY needed.
|
|
372
|
-
|
|
373
|
-
Args:
|
|
374
|
-
name: Document filename (required, keyword-only).
|
|
375
|
-
Extension determines serialization:
|
|
376
|
-
- .json → JSON serialization
|
|
377
|
-
- .yaml/.yml → YAML serialization
|
|
378
|
-
- .md → Markdown list joining (for list[str])
|
|
379
|
-
- Others → UTF-8 encoding (for str)
|
|
380
|
-
content: Document content in various formats (required, keyword-only):
|
|
381
|
-
- bytes: Used directly without conversion
|
|
382
|
-
- str: Encoded to UTF-8 bytes
|
|
383
|
-
- dict[str, Any]: Serialized to JSON (.json) or YAML (.yaml/.yml)
|
|
384
|
-
- list[str]: Joined automatically for .md (validates format compatibility),
|
|
385
|
-
else JSON/YAML
|
|
386
|
-
- list[BaseModel]: Serialized to JSON or YAML based on extension
|
|
387
|
-
- BaseModel: Serialized to JSON or YAML based on extension
|
|
388
|
-
description: Optional description - USUALLY OMIT THIS (defaults to None).
|
|
389
|
-
Only use when meaningful metadata helps downstream processing
|
|
390
|
-
sources: Optional list of source strings (document SHA256 hashes or references).
|
|
391
|
-
Used to track what sources contributed to creating this document.
|
|
392
|
-
Can contain document SHA256 hashes (for referencing other documents)
|
|
393
|
-
or arbitrary reference strings (URLs, file paths, descriptions).
|
|
394
|
-
Defaults to empty list
|
|
395
|
-
|
|
396
|
-
Returns:
|
|
397
|
-
New Document instance with content converted to bytes
|
|
398
|
-
|
|
399
|
-
Raises:
|
|
400
|
-
ValueError: If content type is not supported for the file extension,
|
|
401
|
-
or if markdown list format is incompatible
|
|
402
|
-
DocumentNameError: If filename violates validation rules
|
|
403
|
-
DocumentSizeError: If content exceeds MAX_CONTENT_SIZE
|
|
404
|
-
|
|
405
|
-
Note:
|
|
406
|
-
All conversions are reversible using the `parse` method.
|
|
407
|
-
For example: MyDocument.create(name="data.json", content={"key": "value"}).parse(dict)
|
|
408
|
-
returns the original dictionary {"key": "value"}.
|
|
409
|
-
|
|
410
|
-
Example:
|
|
411
|
-
>>> # CORRECT - no description needed (by default, unless instructed otherwise)
|
|
412
|
-
>>> doc = MyDocument.create(name="test.txt", content="Hello World")
|
|
413
|
-
>>> doc.content # b'Hello World'
|
|
414
|
-
>>> doc.parse(str) # "Hello World"
|
|
415
|
-
|
|
416
|
-
>>> # CORRECT - Dictionary to JSON, no description
|
|
417
|
-
>>> doc = MyDocument.create(name="config.json", content={"key": "value"})
|
|
418
|
-
>>> doc.content # b'{"key": "value", ...}'
|
|
419
|
-
>>> doc.parse(dict) # {"key": "value"}
|
|
420
|
-
|
|
421
|
-
>>> # AVOID unless description adds real value
|
|
422
|
-
>>> doc = MyDocument.create(
|
|
423
|
-
... name="config.json",
|
|
424
|
-
... content={"key": "value"},
|
|
425
|
-
... description="Config file" # Usually redundant!
|
|
426
|
-
... )
|
|
427
|
-
|
|
428
|
-
>>> # Pydantic model to YAML
|
|
429
|
-
>>> from pydantic import BaseModel
|
|
430
|
-
>>> class Config(BaseModel):
|
|
431
|
-
... host: str
|
|
432
|
-
... port: int
|
|
433
|
-
>>> config = Config(host="localhost", port=8080)
|
|
434
|
-
>>> doc = MyDocument.create(name="config.yaml", content=config)
|
|
435
|
-
>>> doc.parse(Config) # Returns Config instance
|
|
436
|
-
|
|
437
|
-
>>> # List to Markdown
|
|
438
|
-
>>> items = ["Section 1", "Section 2"]
|
|
439
|
-
>>> doc = MyDocument.create(name="sections.md", content=items)
|
|
440
|
-
>>> doc.parse(list) # ["Section 1", "Section 2"]
|
|
441
|
-
|
|
442
|
-
>>> # Document with sources for provenance tracking
|
|
443
|
-
>>> source_doc = MyDocument.create(name="source.txt", content="original")
|
|
444
|
-
>>> derived = MyDocument.create(
|
|
445
|
-
... name="result.txt",
|
|
446
|
-
... content="processed",
|
|
447
|
-
... sources=[source_doc.sha256, "https://api.example.com/data"]
|
|
448
|
-
... )
|
|
449
|
-
>>> derived.get_source_documents() # [source_doc.sha256]
|
|
450
|
-
>>> derived.get_source_references() # ["https://api.example.com/data"]
|
|
168
|
+
"""Create a document with automatic content-to-bytes conversion.
|
|
169
|
+
|
|
170
|
+
Serialization is extension-driven: .json → JSON, .yaml → YAML, others → UTF-8.
|
|
171
|
+
Reversible via parse(). Cannot be called on Document directly — must use a subclass.
|
|
451
172
|
"""
|
|
452
|
-
# Use model_validate to leverage the existing validator logic
|
|
453
|
-
temp = cls.model_validate({
|
|
454
|
-
"name": name,
|
|
455
|
-
"content": content,
|
|
456
|
-
"description": description,
|
|
457
|
-
"sources": sources,
|
|
458
|
-
})
|
|
459
|
-
# Now construct with type-checker-friendly call (bytes only)
|
|
460
173
|
return cls(
|
|
461
|
-
name=
|
|
462
|
-
content=
|
|
463
|
-
description=
|
|
464
|
-
sources=
|
|
174
|
+
name=name,
|
|
175
|
+
content=_convert_content(name, content),
|
|
176
|
+
description=description,
|
|
177
|
+
sources=sources,
|
|
178
|
+
origins=origins,
|
|
179
|
+
attachments=attachments,
|
|
465
180
|
)
|
|
466
181
|
|
|
467
182
|
def __init__(
|
|
@@ -470,61 +185,53 @@ class Document(BaseModel, ABC):
|
|
|
470
185
|
name: str,
|
|
471
186
|
content: bytes,
|
|
472
187
|
description: str | None = None,
|
|
473
|
-
sources:
|
|
188
|
+
sources: tuple[str, ...] | None = None,
|
|
189
|
+
origins: tuple[str, ...] | None = None,
|
|
190
|
+
attachments: tuple[Attachment, ...] | None = None,
|
|
474
191
|
) -> None:
|
|
475
|
-
"""Initialize
|
|
476
|
-
|
|
477
|
-
@public
|
|
478
|
-
|
|
479
|
-
Important:
|
|
480
|
-
**Most users should use the `create` classmethod instead of __init__.**
|
|
481
|
-
The create method provides automatic content conversion for various types
|
|
482
|
-
(str, dict, list, Pydantic models) while __init__ only accepts bytes.
|
|
483
|
-
|
|
484
|
-
This constructor accepts only bytes content for type safety. It prevents
|
|
485
|
-
direct instantiation of the abstract Document class.
|
|
486
|
-
|
|
487
|
-
Args:
|
|
488
|
-
name: Document filename (required, keyword-only)
|
|
489
|
-
content: Document content as raw bytes (required, keyword-only)
|
|
490
|
-
description: Optional human-readable description (keyword-only)
|
|
491
|
-
sources: Optional list of source strings for provenance tracking.
|
|
492
|
-
Can contain document SHA256 hashes (for referencing other documents)
|
|
493
|
-
or arbitrary reference strings (URLs, file paths, descriptions).
|
|
494
|
-
Defaults to empty list
|
|
495
|
-
|
|
496
|
-
Raises:
|
|
497
|
-
TypeError: If attempting to instantiate Document directly.
|
|
498
|
-
|
|
499
|
-
Example:
|
|
500
|
-
>>> # Direct constructor - only for bytes content:
|
|
501
|
-
>>> doc = MyDocument(name="test.txt", content=b"Hello World")
|
|
502
|
-
>>> doc.content # b'Hello World'
|
|
503
|
-
|
|
504
|
-
>>> # RECOMMENDED: Use create for automatic conversion:
|
|
505
|
-
>>> doc = MyDocument.create(name="text.txt", content="Hello World")
|
|
506
|
-
>>> doc = MyDocument.create(name="data.json", content={"key": "value"})
|
|
507
|
-
>>> doc = MyDocument.create(name="config.yaml", content=my_model)
|
|
508
|
-
>>> doc = MyDocument.create(name="items.md", content=["item1", "item2"])
|
|
509
|
-
"""
|
|
192
|
+
"""Initialize with raw bytes content. Most users should use `create()` instead."""
|
|
510
193
|
if type(self) is Document:
|
|
511
|
-
raise TypeError("Cannot instantiate
|
|
194
|
+
raise TypeError("Cannot instantiate Document directly — use a concrete subclass")
|
|
512
195
|
|
|
513
|
-
|
|
514
|
-
|
|
515
|
-
|
|
516
|
-
|
|
517
|
-
|
|
196
|
+
super().__init__(
|
|
197
|
+
name=name,
|
|
198
|
+
content=content,
|
|
199
|
+
description=description,
|
|
200
|
+
sources=sources or (),
|
|
201
|
+
origins=origins or (),
|
|
202
|
+
attachments=attachments or (),
|
|
203
|
+
)
|
|
204
|
+
|
|
205
|
+
# Register with task context for document lifecycle tracking
|
|
206
|
+
if not is_registration_suppressed():
|
|
207
|
+
task_ctx = get_task_context()
|
|
208
|
+
if task_ctx is not None:
|
|
209
|
+
task_ctx.register_created(self) # pyright: ignore[reportAttributeAccessIssue, reportUnknownMemberType]
|
|
518
210
|
|
|
519
211
|
name: str
|
|
520
212
|
description: str | None = None
|
|
521
|
-
content: bytes
|
|
522
|
-
sources:
|
|
523
|
-
|
|
524
|
-
|
|
525
|
-
|
|
526
|
-
|
|
527
|
-
|
|
213
|
+
content: bytes
|
|
214
|
+
sources: tuple[str, ...] = ()
|
|
215
|
+
"""Content provenance: documents and references this document's content was directly
|
|
216
|
+
derived from. Can be document SHA256 hashes (for pipeline documents) or external
|
|
217
|
+
references (URLs, file paths). Answers: 'where did this content come from?'
|
|
218
|
+
|
|
219
|
+
Example: an analysis document derived from an input document has
|
|
220
|
+
sources=(input_doc.sha256,). A webpage capture has sources=("https://example.com",)."""
|
|
221
|
+
|
|
222
|
+
origins: tuple[str, ...] = ()
|
|
223
|
+
"""Causal provenance: documents that caused this document to be created without directly
|
|
224
|
+
contributing to its content. Always document SHA256 hashes, never arbitrary strings.
|
|
225
|
+
Answers: 'why does this document exist?'
|
|
226
|
+
|
|
227
|
+
Example: a research plan causes 10 webpages to be captured. Each webpage's source is its
|
|
228
|
+
URL (content provenance), its origin is the research plan (causal provenance — the plan
|
|
229
|
+
caused the capture but didn't contribute to the webpage's content).
|
|
230
|
+
|
|
231
|
+
A SHA256 hash must not appear in both sources and origins for the same document.
|
|
232
|
+
Within a pipeline task or flow, all source/origin SHA256 references must point to
|
|
233
|
+
documents that existed before the task/flow started executing."""
|
|
234
|
+
attachments: tuple[Attachment, ...] = ()
|
|
528
235
|
|
|
529
236
|
# Pydantic configuration
|
|
530
237
|
model_config = ConfigDict(
|
|
@@ -533,145 +240,27 @@ class Document(BaseModel, ABC):
|
|
|
533
240
|
extra="forbid",
|
|
534
241
|
)
|
|
535
242
|
|
|
536
|
-
@abstractmethod
|
|
537
|
-
def get_base_type(self) -> Literal["flow", "task", "temporary"]:
|
|
538
|
-
"""Get the base type of the document.
|
|
539
|
-
|
|
540
|
-
Abstract method that must be implemented by all Document subclasses
|
|
541
|
-
to indicate their persistence behavior.
|
|
542
|
-
|
|
543
|
-
Returns:
|
|
544
|
-
One of "flow" (persisted across flow runs), "task" (temporary
|
|
545
|
-
within task execution), or "temporary" (never persisted).
|
|
546
|
-
|
|
547
|
-
Note:
|
|
548
|
-
This method determines document persistence and lifecycle.
|
|
549
|
-
FlowDocument returns "flow", TaskDocument returns "task".
|
|
550
|
-
"""
|
|
551
|
-
raise NotImplementedError("Subclasses must implement this method")
|
|
552
|
-
|
|
553
|
-
@final
|
|
554
|
-
@property
|
|
555
|
-
def base_type(self) -> Literal["flow", "task", "temporary"]:
|
|
556
|
-
"""Get the document's base type.
|
|
557
|
-
|
|
558
|
-
Property alias for get_base_type() providing a cleaner API.
|
|
559
|
-
This property cannot be overridden by subclasses.
|
|
560
|
-
|
|
561
|
-
Returns:
|
|
562
|
-
The document's base type: "flow", "task", or "temporary".
|
|
563
|
-
"""
|
|
564
|
-
return self.get_base_type()
|
|
565
|
-
|
|
566
|
-
@final
|
|
567
|
-
@property
|
|
568
|
-
def is_flow(self) -> bool:
|
|
569
|
-
"""Check if this is a flow document.
|
|
570
|
-
|
|
571
|
-
Flow documents persist across Prefect flow runs and are saved
|
|
572
|
-
to the file system between pipeline steps.
|
|
573
|
-
|
|
574
|
-
Returns:
|
|
575
|
-
True if this is a FlowDocument subclass, False otherwise.
|
|
576
|
-
"""
|
|
577
|
-
return self.get_base_type() == "flow"
|
|
578
|
-
|
|
579
|
-
@final
|
|
580
|
-
@property
|
|
581
|
-
def is_task(self) -> bool:
|
|
582
|
-
"""Check if this is a task document.
|
|
583
|
-
|
|
584
|
-
Task documents are temporary within Prefect task execution
|
|
585
|
-
and are not persisted between pipeline steps.
|
|
586
|
-
|
|
587
|
-
Returns:
|
|
588
|
-
True if this is a TaskDocument subclass, False otherwise.
|
|
589
|
-
"""
|
|
590
|
-
return self.get_base_type() == "task"
|
|
591
|
-
|
|
592
|
-
@final
|
|
593
|
-
@property
|
|
594
|
-
def is_temporary(self) -> bool:
|
|
595
|
-
"""Check if this is a temporary document.
|
|
596
|
-
|
|
597
|
-
Temporary documents are never persisted and exist only
|
|
598
|
-
during execution.
|
|
599
|
-
|
|
600
|
-
Returns:
|
|
601
|
-
True if this document is temporary, False otherwise.
|
|
602
|
-
"""
|
|
603
|
-
return self.get_base_type() == "temporary"
|
|
604
|
-
|
|
605
243
|
@final
|
|
606
244
|
@classmethod
|
|
607
245
|
def get_expected_files(cls) -> list[str] | None:
|
|
608
|
-
"""
|
|
609
|
-
|
|
610
|
-
If the document class defines a FILES enum, returns the list of
|
|
611
|
-
valid file names. Used to restrict documents to specific files.
|
|
612
|
-
|
|
613
|
-
Returns:
|
|
614
|
-
List of allowed file names if FILES enum is defined,
|
|
615
|
-
None if unrestricted.
|
|
616
|
-
|
|
617
|
-
Raises:
|
|
618
|
-
DocumentNameError: If FILES is defined but not a valid StrEnum.
|
|
619
|
-
|
|
620
|
-
Example:
|
|
621
|
-
>>> class ConfigDocument(FlowDocument):
|
|
622
|
-
... class FILES(StrEnum):
|
|
623
|
-
... CONFIG = "config.yaml"
|
|
624
|
-
... SETTINGS = "settings.json"
|
|
625
|
-
>>> ConfigDocument.get_expected_files()
|
|
626
|
-
['config.yaml', 'settings.json']
|
|
627
|
-
"""
|
|
246
|
+
"""Return allowed filenames from FILES enum, or None if unrestricted."""
|
|
628
247
|
if not hasattr(cls, "FILES"):
|
|
629
248
|
return None
|
|
630
|
-
files =
|
|
249
|
+
files: type[StrEnum] = cls.FILES # pyright: ignore[reportAttributeAccessIssue, reportUnknownMemberType, reportUnknownVariableType]
|
|
631
250
|
if not files:
|
|
632
251
|
return None
|
|
633
252
|
assert issubclass(files, StrEnum)
|
|
634
253
|
try:
|
|
635
254
|
values = [member.value for member in files]
|
|
636
255
|
except TypeError:
|
|
637
|
-
raise DocumentNameError(f"{cls.__name__}.FILES must be an Enum of string values")
|
|
256
|
+
raise DocumentNameError(f"{cls.__name__}.FILES must be an Enum of string values") from None
|
|
638
257
|
if len(values) == 0:
|
|
639
258
|
return None
|
|
640
259
|
return values
|
|
641
260
|
|
|
642
261
|
@classmethod
|
|
643
262
|
def validate_file_name(cls, name: str) -> None:
|
|
644
|
-
"""Validate
|
|
645
|
-
|
|
646
|
-
DO NOT OVERRIDE this method if you define a FILES enum!
|
|
647
|
-
The validation is automatic when FILES enum is present.
|
|
648
|
-
|
|
649
|
-
# CORRECT - FILES enum provides automatic validation:
|
|
650
|
-
class MyDocument(FlowDocument):
|
|
651
|
-
class FILES(StrEnum):
|
|
652
|
-
CONFIG = "config.yaml" # Validation happens automatically!
|
|
653
|
-
|
|
654
|
-
# WRONG - Unnecessary override:
|
|
655
|
-
class MyDocument(FlowDocument):
|
|
656
|
-
class FILES(StrEnum):
|
|
657
|
-
CONFIG = "config.yaml"
|
|
658
|
-
|
|
659
|
-
def validate_file_name(cls, name): # DON'T DO THIS!
|
|
660
|
-
pass # Validation already happens via FILES enum
|
|
661
|
-
|
|
662
|
-
Only override for custom validation logic BEYOND FILES enum constraints.
|
|
663
|
-
|
|
664
|
-
Args:
|
|
665
|
-
name: The file name to validate.
|
|
666
|
-
|
|
667
|
-
Raises:
|
|
668
|
-
DocumentNameError: If the name doesn't match allowed patterns.
|
|
669
|
-
|
|
670
|
-
Note:
|
|
671
|
-
- If FILES enum is defined, name must exactly match one of the values
|
|
672
|
-
- If FILES is not defined, any name is allowed
|
|
673
|
-
- Override in subclasses ONLY for custom regex patterns or logic
|
|
674
|
-
"""
|
|
263
|
+
"""Validate filename against FILES enum. Override only for custom validation beyond FILES."""
|
|
675
264
|
allowed = cls.get_expected_files()
|
|
676
265
|
if not allowed:
|
|
677
266
|
return
|
|
@@ -681,45 +270,18 @@ class Document(BaseModel, ABC):
|
|
|
681
270
|
raise DocumentNameError(f"Invalid filename '{name}'. Allowed names: {allowed_str}")
|
|
682
271
|
|
|
683
272
|
@field_validator("name")
|
|
273
|
+
@classmethod
|
|
684
274
|
def validate_name(cls, v: str) -> str:
|
|
685
|
-
|
|
686
|
-
|
|
687
|
-
Ensures the document name is secure and follows conventions:
|
|
688
|
-
- No path traversal characters (.., \\, /)
|
|
689
|
-
- Cannot end with .description.md or .sources.json
|
|
690
|
-
- No leading/trailing whitespace
|
|
691
|
-
- Must match FILES enum if defined
|
|
692
|
-
|
|
693
|
-
Performance:
|
|
694
|
-
Validation is O(n) where n is the length of the name.
|
|
695
|
-
FILES enum check is O(m) where m is the number of allowed files
|
|
696
|
-
|
|
697
|
-
Args:
|
|
698
|
-
v: The name value to validate.
|
|
699
|
-
|
|
700
|
-
Returns:
|
|
701
|
-
The validated name.
|
|
702
|
-
|
|
703
|
-
Raises:
|
|
704
|
-
DocumentNameError: If the name violates any validation rules.
|
|
705
|
-
|
|
706
|
-
Note:
|
|
707
|
-
This is called automatically by Pydantic during model construction.
|
|
708
|
-
"""
|
|
709
|
-
if v.endswith(cls.DESCRIPTION_EXTENSION):
|
|
710
|
-
raise DocumentNameError(
|
|
711
|
-
f"Document names cannot end with {cls.DESCRIPTION_EXTENSION}: {v}"
|
|
712
|
-
)
|
|
713
|
-
|
|
714
|
-
if v.endswith(cls.SOURCES_EXTENSION):
|
|
715
|
-
raise DocumentNameError(f"Document names cannot end with {cls.SOURCES_EXTENSION}: {v}")
|
|
716
|
-
|
|
275
|
+
"""Reject path traversal, whitespace issues, reserved suffixes. Must match FILES enum if defined."""
|
|
717
276
|
if ".." in v or "\\" in v or "/" in v:
|
|
718
277
|
raise DocumentNameError(f"Invalid filename - contains path traversal characters: {v}")
|
|
719
278
|
|
|
720
279
|
if not v or v.startswith(" ") or v.endswith(" "):
|
|
721
280
|
raise DocumentNameError(f"Invalid filename format: {v}")
|
|
722
281
|
|
|
282
|
+
if v.endswith(".meta.json"):
|
|
283
|
+
raise DocumentNameError(f"Document names cannot end with .meta.json (reserved): {v}")
|
|
284
|
+
|
|
723
285
|
cls.validate_file_name(v)
|
|
724
286
|
|
|
725
287
|
return v
|
|
@@ -727,174 +289,58 @@ class Document(BaseModel, ABC):
|
|
|
727
289
|
@field_validator("content", mode="before")
|
|
728
290
|
@classmethod
|
|
729
291
|
def validate_content(cls, v: Any, info: ValidationInfo) -> bytes:
|
|
730
|
-
"""
|
|
731
|
-
|
|
732
|
-
|
|
733
|
-
|
|
734
|
-
|
|
735
|
-
|
|
736
|
-
Conversion Strategy:
|
|
737
|
-
1. bytes → Passthrough (no conversion)
|
|
738
|
-
2. str → UTF-8 encoding
|
|
739
|
-
3. dict/BaseModel + .json → JSON serialization (indented)
|
|
740
|
-
4. dict/BaseModel + .yaml/.yml → YAML serialization
|
|
741
|
-
5. list[str] + .md → Join with markdown sections (validates format compatibility)
|
|
742
|
-
6. list[Any] + .json/.yaml → JSON/YAML array
|
|
743
|
-
7. int/float/bool + .json → JSON primitive
|
|
744
|
-
|
|
745
|
-
Args:
|
|
746
|
-
v: Content to validate (any supported type)
|
|
747
|
-
info: Validation context containing other field values
|
|
748
|
-
|
|
749
|
-
Returns:
|
|
750
|
-
Content converted to bytes
|
|
751
|
-
|
|
752
|
-
Raises:
|
|
753
|
-
DocumentSizeError: If content exceeds MAX_CONTENT_SIZE
|
|
754
|
-
ValueError: If content type unsupported for file extension
|
|
755
|
-
|
|
756
|
-
Note:
|
|
757
|
-
This validator enables create() to accept multiple types while
|
|
758
|
-
ensuring __init__ only receives bytes for type safety.
|
|
759
|
-
"""
|
|
760
|
-
# Get the name from validation context if available
|
|
761
|
-
name = ""
|
|
762
|
-
if hasattr(info, "data") and "name" in info.data:
|
|
763
|
-
name = info.data["name"]
|
|
764
|
-
name_lower = name.lower()
|
|
765
|
-
|
|
766
|
-
# Convert based on content type
|
|
767
|
-
if isinstance(v, bytes):
|
|
768
|
-
pass # Already bytes
|
|
769
|
-
elif isinstance(v, str):
|
|
770
|
-
v = v.encode("utf-8")
|
|
771
|
-
elif isinstance(v, dict):
|
|
772
|
-
# Serialize dict based on extension
|
|
773
|
-
if name_lower.endswith((".yaml", ".yml")):
|
|
774
|
-
# Use YAML format for YAML files
|
|
775
|
-
yaml = YAML()
|
|
776
|
-
stream = BytesIO()
|
|
777
|
-
yaml.dump(v, stream)
|
|
778
|
-
v = stream.getvalue()
|
|
779
|
-
elif name_lower.endswith(".json"):
|
|
780
|
-
# Use JSON for JSON files
|
|
781
|
-
v = json.dumps(v, indent=2).encode("utf-8")
|
|
782
|
-
else:
|
|
783
|
-
# Dict not supported for other file types
|
|
784
|
-
raise ValueError(f"Unsupported content type: {type(v)} for file {name}")
|
|
785
|
-
elif isinstance(v, list):
|
|
786
|
-
# Handle lists based on file extension
|
|
787
|
-
if name_lower.endswith(".md"):
|
|
788
|
-
# For markdown files, join with separator
|
|
789
|
-
if all(isinstance(item, str) for item in v):
|
|
790
|
-
# Check that no string contains the separator
|
|
791
|
-
for item in v:
|
|
792
|
-
if cls.MARKDOWN_LIST_SEPARATOR in item:
|
|
793
|
-
raise ValueError(
|
|
794
|
-
f"Markdown list item cannot contain the separator "
|
|
795
|
-
f"'{cls.MARKDOWN_LIST_SEPARATOR}' as it will mess up formatting"
|
|
796
|
-
)
|
|
797
|
-
v = cls.MARKDOWN_LIST_SEPARATOR.join(v).encode("utf-8")
|
|
798
|
-
else:
|
|
799
|
-
raise ValueError(
|
|
800
|
-
f"Unsupported content type: mixed-type list for markdown file {name}"
|
|
801
|
-
)
|
|
802
|
-
elif name_lower.endswith((".yaml", ".yml")):
|
|
803
|
-
# Check if it's a list of Pydantic models
|
|
804
|
-
if v and isinstance(v[0], BaseModel):
|
|
805
|
-
# Convert models to dicts first
|
|
806
|
-
v = [item.model_dump(mode="json") for item in v]
|
|
807
|
-
# Use YAML format for YAML files
|
|
808
|
-
yaml = YAML()
|
|
809
|
-
stream = BytesIO()
|
|
810
|
-
yaml.dump(v, stream)
|
|
811
|
-
v = stream.getvalue()
|
|
812
|
-
elif name_lower.endswith(".json"):
|
|
813
|
-
# Check if it's a list of Pydantic models
|
|
814
|
-
if v and isinstance(v[0], BaseModel):
|
|
815
|
-
# Convert models to dicts first
|
|
816
|
-
v = [item.model_dump(mode="json") for item in v]
|
|
817
|
-
# For JSON files, serialize as JSON
|
|
818
|
-
v = json.dumps(v, indent=2).encode("utf-8")
|
|
819
|
-
else:
|
|
820
|
-
# Check if it's a list of BaseModel
|
|
821
|
-
if v and isinstance(v[0], BaseModel):
|
|
822
|
-
raise ValueError("list[BaseModel] requires .json or .yaml extension")
|
|
823
|
-
# List content not supported for other file types
|
|
824
|
-
raise ValueError(f"Unsupported content type: {type(v)} for file {name}")
|
|
825
|
-
elif isinstance(v, BaseModel):
|
|
826
|
-
# Serialize Pydantic models
|
|
827
|
-
if name_lower.endswith((".yaml", ".yml")):
|
|
828
|
-
yaml = YAML()
|
|
829
|
-
stream = BytesIO()
|
|
830
|
-
yaml.dump(v.model_dump(mode="json"), stream)
|
|
831
|
-
v = stream.getvalue()
|
|
832
|
-
else:
|
|
833
|
-
v = json.dumps(v.model_dump(mode="json"), indent=2).encode("utf-8")
|
|
834
|
-
elif isinstance(v, (int, float, bool)):
|
|
835
|
-
# Numbers and booleans: JSON-serialize for .json, string for others
|
|
836
|
-
if name_lower.endswith(".json"):
|
|
837
|
-
v = json.dumps(v).encode("utf-8")
|
|
838
|
-
elif name_lower.endswith((".yaml", ".yml")):
|
|
839
|
-
v = str(v).encode("utf-8")
|
|
840
|
-
elif name_lower.endswith(".txt"):
|
|
841
|
-
v = str(v).encode("utf-8")
|
|
842
|
-
else:
|
|
843
|
-
# For other extensions, convert to string
|
|
844
|
-
v = str(v).encode("utf-8")
|
|
845
|
-
elif v is None:
|
|
846
|
-
# Handle None - only supported for JSON/YAML
|
|
847
|
-
if name_lower.endswith((".json", ".yaml", ".yml")):
|
|
848
|
-
if name_lower.endswith((".yaml", ".yml")):
|
|
849
|
-
v = b"null\n"
|
|
850
|
-
else:
|
|
851
|
-
v = b"null"
|
|
852
|
-
else:
|
|
853
|
-
raise ValueError(f"Unsupported content type: {type(None)} for file {name}")
|
|
854
|
-
else:
|
|
855
|
-
# Try to see if it has model_dump (duck typing for Pydantic-like)
|
|
856
|
-
if hasattr(v, "model_dump"):
|
|
857
|
-
if name_lower.endswith((".yaml", ".yml")):
|
|
858
|
-
yaml = YAML()
|
|
859
|
-
stream = BytesIO()
|
|
860
|
-
yaml.dump(v.model_dump(mode="json"), stream) # type: ignore[attr-defined]
|
|
861
|
-
v = stream.getvalue()
|
|
862
|
-
else:
|
|
863
|
-
v = json.dumps(v.model_dump(mode="json"), indent=2).encode("utf-8") # type: ignore[attr-defined]
|
|
864
|
-
else:
|
|
865
|
-
# List non-.json files should raise error
|
|
866
|
-
if name_lower.endswith(".txt") and isinstance(v, list):
|
|
867
|
-
raise ValueError("List content not supported for text files")
|
|
868
|
-
raise ValueError(f"Unsupported content type: {type(v)}")
|
|
869
|
-
|
|
870
|
-
# Check content size limit
|
|
871
|
-
max_size = getattr(cls, "MAX_CONTENT_SIZE", 100 * 1024 * 1024)
|
|
872
|
-
if len(v) > max_size:
|
|
873
|
-
raise DocumentSizeError(
|
|
874
|
-
f"Document size ({len(v)} bytes) exceeds maximum allowed size ({max_size} bytes)"
|
|
875
|
-
)
|
|
876
|
-
|
|
292
|
+
"""Convert content to bytes via `_convert_content` if not already bytes. Enforces MAX_CONTENT_SIZE."""
|
|
293
|
+
if not isinstance(v, bytes):
|
|
294
|
+
name = info.data.get("name", "") if hasattr(info, "data") else ""
|
|
295
|
+
v = _convert_content(name, v)
|
|
296
|
+
if len(v) > cls.MAX_CONTENT_SIZE:
|
|
297
|
+
raise DocumentSizeError(f"Document size ({len(v)} bytes) exceeds maximum allowed size ({cls.MAX_CONTENT_SIZE} bytes)")
|
|
877
298
|
return v
|
|
878
299
|
|
|
879
|
-
@
|
|
880
|
-
|
|
881
|
-
|
|
300
|
+
@field_validator("sources")
|
|
301
|
+
@classmethod
|
|
302
|
+
def validate_sources(cls, v: tuple[str, ...]) -> tuple[str, ...]:
|
|
303
|
+
"""Sources must be document SHA256 hashes or URLs."""
|
|
304
|
+
for src in v:
|
|
305
|
+
if not is_document_sha256(src) and "://" not in src:
|
|
306
|
+
raise ValueError(f"Source must be a document SHA256 hash or a URL (containing '://'), got: {src!r}")
|
|
307
|
+
return v
|
|
882
308
|
|
|
883
|
-
|
|
884
|
-
|
|
885
|
-
|
|
309
|
+
@field_validator("origins")
|
|
310
|
+
@classmethod
|
|
311
|
+
def validate_origins(cls, v: tuple[str, ...]) -> tuple[str, ...]:
|
|
312
|
+
"""Origins must be valid document SHA256 hashes."""
|
|
313
|
+
for origin in v:
|
|
314
|
+
if not is_document_sha256(origin):
|
|
315
|
+
raise ValueError(f"Origin must be a document SHA256 hash, got: {origin}")
|
|
316
|
+
return v
|
|
886
317
|
|
|
887
|
-
|
|
888
|
-
|
|
318
|
+
@model_validator(mode="after")
|
|
319
|
+
def validate_no_source_origin_overlap(self) -> Self:
|
|
320
|
+
"""Reject documents where the same SHA256 appears in both sources and origins."""
|
|
321
|
+
source_sha256s = {src for src in self.sources if is_document_sha256(src)}
|
|
322
|
+
if source_sha256s:
|
|
323
|
+
overlap = source_sha256s & set(self.origins)
|
|
324
|
+
if overlap:
|
|
325
|
+
sample = next(iter(overlap))
|
|
326
|
+
raise ValueError(
|
|
327
|
+
f"SHA256 hash {sample[:12]}... appears in both sources and origins. "
|
|
328
|
+
f"A document reference must be either a source (content provenance) "
|
|
329
|
+
f"or an origin (causal provenance), not both."
|
|
330
|
+
)
|
|
331
|
+
return self
|
|
889
332
|
|
|
890
|
-
|
|
891
|
-
|
|
892
|
-
|
|
333
|
+
@model_validator(mode="after")
|
|
334
|
+
def validate_total_size(self) -> Self:
|
|
335
|
+
"""Validate that total document size (content + attachments) is within limits."""
|
|
336
|
+
total = self.size
|
|
337
|
+
if total > self.MAX_CONTENT_SIZE:
|
|
338
|
+
raise DocumentSizeError(f"Total document size ({total} bytes) including attachments exceeds maximum allowed size ({self.MAX_CONTENT_SIZE} bytes)")
|
|
339
|
+
return self
|
|
893
340
|
|
|
894
|
-
|
|
895
|
-
|
|
896
|
-
|
|
897
|
-
"""
|
|
341
|
+
@field_serializer("content")
|
|
342
|
+
def serialize_content(self, v: bytes) -> str: # noqa: PLR6301
|
|
343
|
+
"""UTF-8 decode for text, base64 for binary. Called by Pydantic during serialization."""
|
|
898
344
|
try:
|
|
899
345
|
return v.decode("utf-8")
|
|
900
346
|
except UnicodeDecodeError:
|
|
@@ -904,281 +350,89 @@ class Document(BaseModel, ABC):
|
|
|
904
350
|
@final
|
|
905
351
|
@property
|
|
906
352
|
def id(self) -> str:
|
|
907
|
-
"""
|
|
908
|
-
|
|
909
|
-
@public
|
|
910
|
-
|
|
911
|
-
This ID is crucial for LLM interactions. When documents are provided to
|
|
912
|
-
LLMs via generate() or generate_structured(), their IDs are included,
|
|
913
|
-
allowing the LLM to reference documents in prompts by either name or ID.
|
|
914
|
-
The ID is content-based (derived from SHA256 hash of content only),
|
|
915
|
-
so the same content always produces the same ID. Changing the name or
|
|
916
|
-
description does NOT change the ID.
|
|
917
|
-
|
|
918
|
-
Returns:
|
|
919
|
-
6-character base32-encoded string (uppercase, e.g., "A7B2C9").
|
|
920
|
-
This is the first 6 chars of the full base32 SHA256, NOT hex.
|
|
921
|
-
|
|
922
|
-
Collision Rate:
|
|
923
|
-
With base32 encoding (5 bits per char), 6 chars = 30 bits.
|
|
924
|
-
Expect collisions after ~32K documents (birthday paradox).
|
|
925
|
-
For higher uniqueness requirements, use the full sha256 property.
|
|
926
|
-
|
|
927
|
-
Note:
|
|
928
|
-
While shorter than full SHA256, this provides
|
|
929
|
-
reasonable uniqueness for most use cases.
|
|
930
|
-
"""
|
|
353
|
+
"""First 6 chars of sha256. Used as short document identifier in LLM context."""
|
|
931
354
|
return self.sha256[:6]
|
|
932
355
|
|
|
933
356
|
@final
|
|
934
357
|
@cached_property
|
|
935
358
|
def sha256(self) -> str:
|
|
936
|
-
"""
|
|
937
|
-
|
|
938
|
-
|
|
939
|
-
|
|
940
|
-
|
|
941
|
-
|
|
942
|
-
|
|
943
|
-
|
|
944
|
-
Returns:
|
|
945
|
-
Full SHA256 hash as base32-encoded uppercase string.
|
|
946
|
-
|
|
947
|
-
Why Base32 Instead of Hex:
|
|
948
|
-
- Base32 is case-insensitive, avoiding issues with different file systems
|
|
949
|
-
and AI interactions where casing might be inconsistent
|
|
950
|
-
- More compact than hex (52 chars vs 64 chars for SHA-256)
|
|
951
|
-
- Contains more information per character than hex (5 bits vs 4 bits)
|
|
952
|
-
- Safe for URLs without encoding
|
|
953
|
-
- Compatible with case-insensitive file systems
|
|
954
|
-
- Avoids confusion in AI interactions where models might change casing
|
|
955
|
-
- Not base64 because we want consistent uppercase for all uses
|
|
956
|
-
|
|
957
|
-
Note:
|
|
958
|
-
This is computed once and cached for performance.
|
|
959
|
-
The hash is deterministic based on content only.
|
|
960
|
-
"""
|
|
961
|
-
return b32encode(hashlib.sha256(self.content).digest()).decode("ascii").upper().rstrip("=")
|
|
359
|
+
"""Full SHA256 identity hash (name + content + attachments). BASE32 encoded, cached."""
|
|
360
|
+
return compute_document_sha256(self)
|
|
361
|
+
|
|
362
|
+
@final
|
|
363
|
+
@cached_property
|
|
364
|
+
def content_sha256(self) -> str:
|
|
365
|
+
"""SHA256 hash of raw content bytes only. Used for content deduplication."""
|
|
366
|
+
return compute_content_sha256(self.content)
|
|
962
367
|
|
|
963
368
|
@final
|
|
964
369
|
@property
|
|
965
370
|
def size(self) -> int:
|
|
966
|
-
"""
|
|
967
|
-
|
|
968
|
-
@public
|
|
969
|
-
|
|
970
|
-
Returns:
|
|
971
|
-
Size of content in bytes.
|
|
972
|
-
|
|
973
|
-
Note:
|
|
974
|
-
Useful for monitoring document sizes and
|
|
975
|
-
ensuring they stay within limits.
|
|
976
|
-
"""
|
|
977
|
-
return len(self.content)
|
|
371
|
+
"""Total size of content + attachments in bytes."""
|
|
372
|
+
return len(self.content) + sum(att.size for att in self.attachments)

     @cached_property
-    def detected_mime_type(self) -> str:
-        """Detect the MIME type from document content.
-
-        Detection strategy (in order):
-        1. Returns 'text/plain' for empty content
-        2. Extension-based detection for known text formats (preferred)
-        3. python-magic content analysis for unknown extensions
-        4. Fallback to extension or 'application/octet-stream'
-
-        Returns:
-            MIME type string (e.g., "text/plain", "application/json").
-
-        Note:
-            This is cached after first access. Extension-based detection
-            is preferred for text formats to avoid misidentification.
-        """
-        return detect_mime_type(self.content, self.name)
-
-    @property
     def mime_type(self) -> str:
-        """
-
-        @public
-
-        Primary property for accessing MIME type information.
-        Automatically detects MIME type based on file extension and content.
-
-        Returns:
-            MIME type string (e.g., "text/plain", "application/json").
-
-        Note:
-            MIME type detection uses extension-based detection for known
-            text formats and content analysis for binary formats.
-        """
-        return self.detected_mime_type
+        """Detected MIME type. Extension-based for known formats, content analysis for others. Cached."""
+        return detect_mime_type(self.content, self.name)

     @property
     def is_text(self) -> bool:
-        """
-
-        @public
-
-        Returns:
-            True if MIME type indicates text content
-            (text/*, application/json, application/x-yaml, text/yaml, etc.),
-            False otherwise.
-
-        Note:
-            Used to determine if text property can be safely accessed.
-        """
+        """True if MIME type indicates text content."""
         return is_text_mime_type(self.mime_type)

     @property
     def is_pdf(self) -> bool:
-        """
-
-        @public
-
-        Returns:
-            True if MIME type is application/pdf, False otherwise.
-
-        Note:
-            PDF documents require special handling and are
-            supported by certain LLM models.
-        """
+        """True if MIME type is application/pdf."""
         return is_pdf_mime_type(self.mime_type)

     @property
     def is_image(self) -> bool:
-        """
-
-        @public
-
-        Returns:
-            True if MIME type starts with "image/", False otherwise.
-
-        Note:
-            Image documents are automatically encoded for
-            vision-capable LLM models.
-        """
+        """True if MIME type starts with image/."""
         return is_image_mime_type(self.mime_type)

     @classmethod
     def canonical_name(cls) -> str:
-        """
-
-        Returns a standardized snake_case name derived from the
-        class name, used for directory naming and identification.
-
-        Returns:
-            Snake_case canonical name.
-
-        Example:
-            >>> class UserDataDocument(FlowDocument): ...
-            >>> UserDataDocument.canonical_name()
-            'user_data'
-        """
+        """Snake_case name derived from class name, used for directory naming."""
         return canonical_name_key(cls)

     @property
     def text(self) -> str:
-        """
-
-        @public
-
-        Decodes the bytes content as UTF-8 text. Only available for
-        text-based documents (check is_text property first).
-
-        Returns:
-            UTF-8 decoded string.
-
-        Raises:
-            ValueError: If document is not text (is_text == False).
-
-        Example:
-            >>> doc = MyDocument.create(name="data.txt", content="Hello ✨")
-            >>> if doc.is_text:
-            ...     print(doc.text)  # "Hello ✨"
-
-            >>> # Binary document raises error:
-            >>> binary_doc = MyDocument(name="image.png", content=png_bytes)
-            >>> binary_doc.text  # Raises ValueError
-        """
+        """Content decoded as UTF-8. Raises ValueError if not text."""
         if not self.is_text:
             raise ValueError(f"Document is not text: {self.name}")
         return self.content.decode("utf-8")
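The usage example lost with the `text` docstring, restated as a sketch (hypothetical `MyDocument`; the guard-then-access pattern is unchanged from 0.3.3):

```python
doc = MyDocument.create(name="data.txt", content="Hello ✨")
if doc.is_text:          # guard before touching .text
    print(doc.text)      # "Hello ✨"

png_bytes = b"\x89PNG\r\n\x1a\n"  # any binary payload
binary_doc = MyDocument(name="image.png", content=png_bytes)
binary_doc.text          # raises ValueError: Document is not text
```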

-    @property
+    @cached_property
     def approximate_tokens_count(self) -> int:
-        """Approximate
-
-        @public
-
-        Uses tiktoken with gpt-4 encoding to estimate token count.
-        For text documents, encodes the actual text. For non-text
-        documents (images, PDFs, etc.), returns a fixed estimate of 1024 tokens.
-
-        Returns:
-            Approximate number of tokens for this document.
-
-        Example:
-            >>> doc = MyDocument.create(name="data.txt", content="Hello world")
-            >>> doc.approximate_tokens_count  # ~2 tokens
-        """
+        """Approximate token count (tiktoken gpt-4 encoding). Images=1080, PDFs/other=1024."""
+        enc = get_tiktoken_encoding()
         if self.is_text:
-
+            total = len(enc.encode(self.text))
+        elif self.is_image:
+            total = 1080
         else:
-
-        Raises:
-            ValueError: If document is not text-based.
-            YAMLError: If content is not valid YAML.
+            total = 1024
+
+        for att in self.attachments:
+            if att.is_image:
+                total += 1080
+            elif att.is_pdf:
+                total += 1024
+            elif att.is_text:
+                total += len(enc.encode(att.text))
+            else:
+                total += 1024

-
-        >>> # From dict content
-        >>> doc = MyDocument.create(name="config.yaml", content={
-        ...     "server": {"host": "localhost", "port": 8080}
-        ... })
-        >>> doc.as_yaml()  # {'server': {'host': 'localhost', 'port': 8080}}
+        return total
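Worked numbers for the new accounting (constants taken from the hunk above; hypothetical `MyDocument`):

```python
doc = MyDocument.create(name="data.txt", content="Hello world")
doc.approximate_tokens_count   # ~2: len(enc.encode("Hello world"))

# Text body plus one image attachment:
#   total = len(enc.encode(text)) + 1080
# A bare PDF document with no attachments: total = 1024
```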

-
-        >>> doc2.as_yaml()  # {'key': 'value', 'items': ['a', 'b']}
-        """
+    def as_yaml(self) -> Any:
+        """Parse content as YAML via ruamel.yaml."""
         yaml = YAML()
         return yaml.load(self.text)  # type: ignore[no-untyped-call, no-any-return]

     def as_json(self) -> Any:
-        """Parse
-
-        Parses the document's text content as JSON and returns Python objects.
-        Document must contain valid JSON text.
-
-        Returns:
-            Parsed JSON data: dict, list, str, int, float, bool, or None.
-
-        Raises:
-            ValueError: If document is not text-based.
-            JSONDecodeError: If content is not valid JSON.
-
-        Example:
-            >>> # From dict content
-            >>> doc = MyDocument.create(name="data.json", content={"key": "value"})
-            >>> doc.as_json()  # {'key': 'value'}
-
-            >>> # From JSON string
-            >>> doc2 = MyDocument(name="array.json", content=b'[1, 2, 3]')
-            >>> doc2.as_json()  # [1, 2, 3]
-
-            >>> # Invalid JSON
-            >>> bad_doc = MyDocument(name="bad.json", content=b"not json")
-            >>> bad_doc.as_json()  # Raises JSONDecodeError
-        """
+        """Parse content as JSON."""
         return json.loads(self.text)
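The parsing examples that lived in the deleted docstrings, condensed into one sketch (hypothetical `MyDocument`):

```python
doc = MyDocument.create(name="data.json", content={"key": "value"})
doc.as_json()   # {'key': 'value'}

doc2 = MyDocument(name="array.json", content=b"[1, 2, 3]")
doc2.as_json()  # [1, 2, 3]

cfg = MyDocument.create(name="config.yaml", content={"server": {"port": 8080}})
cfg.as_yaml()   # {'server': {'port': 8080}}
```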

     @overload
@@ -1187,50 +441,8 @@ class Document(BaseModel, ABC):
     @overload
     def as_pydantic_model(self, model_type: type[list[TModel]]) -> list[TModel]: ...

-    def as_pydantic_model(
-
-    ) -> TModel | list[TModel]:
-        """Parse document content as Pydantic model with validation.
-
-        @public
-
-        Parses JSON or YAML content and validates it against a Pydantic model.
-        Automatically detects format based on MIME type. Supports both single
-        models and lists of models.
-
-        Args:
-            model_type: Pydantic model class to validate against.
-                Can be either:
-                - type[Model] for single model
-                - type[list[Model]] for list of models
-
-        Returns:
-            Validated Pydantic model instance or list of instances.
-
-        Raises:
-            ValueError: If document is not text or type mismatch.
-            ValidationError: If data doesn't match model schema.
-            JSONDecodeError/YAMLError: If content parsing fails.
-
-        Example:
-            >>> from pydantic import BaseModel
-            >>>
-            >>> class User(BaseModel):
-            ...     name: str
-            ...     age: int
-            >>>
-            >>> # Single model
-            >>> doc = MyDocument.create(name="user.json",
-            ...     content={"name": "Alice", "age": 30})
-            >>> user = doc.as_pydantic_model(User)
-            >>> print(user.name)  # "Alice"
-            >>>
-            >>> # List of models
-            >>> doc2 = MyDocument.create(name="users.json",
-            ...     content=[{"name": "Bob", "age": 25}, {"name": "Eve", "age": 28}])
-            >>> users = doc2.as_pydantic_model(list[User])
-            >>> print(len(users))  # 2
-        """
+    def as_pydantic_model(self, model_type: type[TModel] | type[list[TModel]]) -> TModel | list[TModel]:
+        """Parse JSON/YAML content and validate against a Pydantic model. Supports single and list types."""
         data = self.as_yaml() if is_yaml_mime_type(self.mime_type) else self.as_json()

         if get_origin(model_type) is list:
@@ -1245,493 +457,165 @@ class Document(BaseModel, ABC):
         single_model = cast(type[TModel], model_type)
         return single_model.model_validate(data)
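The removed validation example, kept as a sketch (hypothetical `MyDocument`):

```python
from pydantic import BaseModel

class User(BaseModel):
    name: str
    age: int

doc = MyDocument.create(name="user.json", content={"name": "Alice", "age": 30})
user = doc.as_pydantic_model(User)          # User(name='Alice', age=30)

docs = MyDocument.create(name="users.json",
                         content=[{"name": "Bob", "age": 25}])
users = docs.as_pydantic_model(list[User])  # [User(name='Bob', age=25)]
```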

-    def as_markdown_list(self) -> list[str]:
-
-        Returns:
-            List of string sections (preserves whitespace within sections).
-
-        Raises:
-            ValueError: If document is not text-based.
-
-        Example:
-            >>> # Using create with list
-            >>> sections = ["# Chapter 1\nIntroduction", "# Chapter 2\nDetails"]
-            >>> doc = MyDocument.create(name="book.md", content=sections)
-            >>> doc.as_markdown_list()  # Returns original sections
-
-            >>> # Round-trip conversion works automatically
-            >>> sections = ["Part 1", "Part 2", "Part 3"]
-            >>> doc2 = MyDocument.create(name="parts.md", content=sections)
-            >>> doc2.as_markdown_list()  # ['Part 1', 'Part 2', 'Part 3']
-        """
-        return self.text.split(self.MARKDOWN_LIST_SEPARATOR)
+    def _parse_structured(self) -> Any:
+        """Parse content as JSON or YAML based on extension. Strict — no guessing."""
+        name_lower = self.name.lower()
+        if name_lower.endswith(".json"):
+            return self.as_json()
+        if name_lower.endswith((".yaml", ".yml")):
+            return self.as_yaml()
+        raise ValueError(f"Cannot parse '{self.name}' as structured data — use .json or .yaml extension")

     def parse(self, type_: type[Any]) -> Any:
-        """
-
-        @public
-
-        This method reverses the automatic conversion performed by the `create`
-        classmethod. It intelligently parses the bytes content based on the
-        document's file extension and converts to the requested type.
-
-        Designed for roundtrip conversion:
-            >>> original = {"key": "value"}
-            >>> doc = MyDocument.create(name="data.json", content=original)
-            >>> restored = doc.parse(dict)
-            >>> assert restored == original  # True
-
-        Args:
-            type_: Target type to parse content into. Supported types:
-                - bytes: Returns raw content (no conversion)
-                - str: Decodes UTF-8 text
-                - dict: Parses JSON (.json) or YAML (.yaml/.yml)
-                - list: Splits markdown (.md) or parses JSON/YAML
-                - BaseModel subclasses: Validates JSON/YAML into model
-
-        Returns:
-            Content parsed to the requested type.
-
-        Raises:
-            ValueError: If type is unsupported or parsing fails.
-
-        Extension Rules:
-            - .json → JSON parsing for dict/list/BaseModel
-            - .yaml/.yml → YAML parsing for dict/list/BaseModel
-            - .md + list → Split automatically into sections
-            - Any + str → UTF-8 decode
-            - Any + bytes → Raw content
-
-        Example:
-            >>> # String content
-            >>> doc = MyDocument(name="test.txt", content=b"Hello")
-            >>> doc.parse(str)
-            'Hello'
-
-            >>> # JSON content
-            >>> doc = MyDocument.create(name="data.json", content={"key": "value"})
-            >>> doc.parse(dict)  # Returns {'key': 'value'}
-
-            >>> # Markdown list
-            >>> items = ["Item 1", "Item 2"]
-            >>> doc = MyDocument.create(name="list.md", content=items)
-            >>> doc.parse(list)
-            ['Item 1', 'Item 2']
-        """
-        # Handle basic types
+        """Parse content to the requested type. Reverses create() conversion. Extension-based dispatch, no guessing."""
         if type_ is bytes:
             return self.content
-
+        if type_ is str:
+            return self.text if self.content else ""
+        if type_ is dict or type_ is list:
+            data = self._parse_structured()
+            if not isinstance(data, type_):
+                raise ValueError(f"Expected {type_.__name__} but got {type(data).__name__}")
+            return data  # pyright: ignore[reportUnknownVariableType]
+        if isinstance(type_, type) and issubclass(type_, BaseModel):  # pyright: ignore[reportUnnecessaryIsInstance]
+            return self.as_pydantic_model(type_)
+        raise ValueError(f"Unsupported parse type: {type_}")
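Two behavioral changes land here: old `parse()` silently tried JSON and then fell back to YAML for unrecognized extensions, while the new `_parse_structured()` dispatches strictly on `.json`/`.yaml`/`.yml` and refuses to guess; and markdown-list splitting (`as_markdown_list`, the old `.md` + `list` rule) is removed entirely. A sketch (hypothetical `MyDocument`):

```python
doc = MyDocument.create(name="data.json", content={"key": "value"})
assert doc.parse(dict) == {"key": "value"}  # round-trips create()

txt = MyDocument(name="notes.txt", content=b"Hello")
txt.parse(str)    # 'Hello'
txt.parse(bytes)  # b'Hello'
txt.parse(dict)   # ValueError: use .json or .yaml extension
```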

-
-                # Ensure the result is the correct type
-                if type_ is dict and not isinstance(result, dict):
-                    raise ValueError(f"Expected dict but got {type(result).__name__}")
-                if type_ is list and not isinstance(result, list):
-                    raise ValueError(f"Expected list but got {type(result).__name__}")
-                return result
-            elif issubclass(type_, BaseModel):
-                return self.as_pydantic_model(type_)
-            else:
-                raise ValueError(f"Cannot parse JSON file to type {type_}")
-
-        # YAML files
-        elif name_lower.endswith((".yaml", ".yml")):
-            if type_ is dict or type_ is list:
-                result = self.as_yaml()
-                # Ensure the result is the correct type
-                if type_ is dict and not isinstance(result, dict):
-                    raise ValueError(f"Expected dict but got {type(result).__name__}")
-                if type_ is list and not isinstance(result, list):
-                    raise ValueError(f"Expected list but got {type(result).__name__}")
-                return result
-            elif issubclass(type_, BaseModel):
-                return self.as_pydantic_model(type_)
-            else:
-                raise ValueError(f"Cannot parse YAML file to type {type_}")
+    @property
+    def source_documents(self) -> tuple[str, ...]:
+        """Document SHA256 hashes from sources (filtered by is_document_sha256)."""
+        return tuple(src for src in self.sources if is_document_sha256(src))

+    @property
+    def source_references(self) -> tuple[str, ...]:
+        """Non-hash reference strings from sources (URLs, file paths, etc.)."""
+        return tuple(src for src in self.sources if not is_document_sha256(src))
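`get_source_documents()`/`get_source_references()` become the `source_documents`/`source_references` properties and now return tuples instead of lists. The removed docstring example, adapted (hypothetical `MyDocument`):

```python
source1 = MyDocument.create(name="data1.txt", content="First")
source2 = MyDocument.create(name="data2.txt", content="Second")
merged = MyDocument.create(
    name="merged.txt",
    content="Combined data",
    sources=[source1.sha256, source2.sha256, "https://api.example.com"],
)
merged.source_documents     # (source1.sha256, source2.sha256)
merged.source_references    # ('https://api.example.com',)
merged.has_source(source1)  # True
```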

-        # Try JSON first, then YAML
-        try:
-            result = self.as_json()
-            # Ensure the result is the correct type
-            if type_ is dict and not isinstance(result, dict):
-                raise ValueError(f"Expected dict but got {type(result).__name__}")
-            if type_ is list and not isinstance(result, list):
-                raise ValueError(f"Expected list but got {type(result).__name__}")
-            return result
-        except (json.JSONDecodeError, ValueError):
-            try:
-                result = self.as_yaml()
-                # Ensure the result is the correct type
-                if type_ is dict and not isinstance(result, dict):
-                    raise ValueError(f"Expected dict but got {type(result).__name__}")
-                if type_ is list and not isinstance(result, list):
-                    raise ValueError(f"Expected list but got {type(result).__name__}")
-                return result
-            except Exception as e:
-                raise ValueError(f"Cannot parse content to {type_}") from e
-
-        raise ValueError(f"Unsupported type {type_} for file {self.name}")
-
-    def get_source_documents(self) -> list[str]:
-        """Get list of document SHA256 hashes referenced as sources.
-
-        Retrieves all document references from this document's sources list,
-        filtering for valid SHA256 hashes that reference other documents.
-        This is useful for building dependency graphs and tracking document
-        lineage in processing pipelines.
-
-        Returns:
-            List of SHA256 hashes (base32 encoded) for documents referenced
-            as sources. Each hash uniquely identifies another document that
-            contributed to creating this one.
-
-        Example:
-            >>> # Create a derived document from multiple sources
-            >>> source1 = MyDocument.create(name="data1.txt", content="First")
-            >>> source2 = MyDocument.create(name="data2.txt", content="Second")
-            >>>
-            >>> merged = MyDocument.create(
-            ...     name="merged.txt",
-            ...     content="Combined data",
-            ...     sources=[source1.sha256, source2.sha256, "https://api.example.com"]
-            ... )
-            >>>
-            >>> # Get only document references (not URLs)
-            >>> doc_refs = merged.get_source_documents()
-            >>> print(doc_refs)  # [source1.sha256, source2.sha256]
-            >>>
-            >>> # Check if specific document is a source
-            >>> if source1.sha256 in doc_refs:
-            ...     print("Document derived from source1")
-        """
-        return [src for src in self.sources if is_document_sha256(src)]
-
-    def get_source_references(self) -> list[str]:
-        """Get list of arbitrary reference strings from sources.
-
-        Retrieves all non-document references from this document's sources list.
-        These are typically URLs, file paths, API endpoints, or descriptive strings
-        that indicate where the document's content originated from, but are not
-        references to other documents in the pipeline.
-
-        Returns:
-            List of reference strings that are not document SHA256 hashes.
-            Can include URLs, file paths, API endpoints, dataset names,
-            or any other string that provides source context.
-
-        Example:
-            >>> # Create document with mixed source types
-            >>> doc = MyDocument.create(
-            ...     name="report.txt",
-            ...     content="Analysis results",
-            ...     sources=[
-            ...         other_doc.sha256,  # Document reference
-            ...         "https://api.example.com/data",  # API URL
-            ...         "dataset:customer-2024",  # Dataset identifier
-            ...         "/path/to/source.csv",  # File path
-            ...     ]
-            ... )
-            >>>
-            >>> # Get only non-document references
-            >>> refs = doc.get_source_references()
-            >>> print(refs)
-            >>> # ["https://api.example.com/data", "dataset:customer-2024", "/path/to/source.csv"]
-            >>>
-            >>> # Use for attribution or debugging
-            >>> for ref in refs:
-            ...     print(f"Data sourced from: {ref}")
-        """
-        return [src for src in self.sources if not is_document_sha256(src)]
-
-    def has_source(self, source: Document | str) -> bool:
-        """Check if a specific source is tracked for this document.
-
-        Verifies whether a given source (document or reference string) is
-        included in this document's sources list. Useful for dependency
-        checking, lineage verification, and conditional processing based
-        on document origins.
-
-        Args:
-            source: Source to check for. Can be:
-                - Document: Checks if document's SHA256 is in sources
-                - str: Checks if exact string is in sources (hash or reference)
-
-        Returns:
-            True if the source is tracked in this document's sources,
-            False otherwise.
-
-        Raises:
-            TypeError: If source is not a Document or string.
-
-        Example:
-            >>> # Check if document was derived from specific source
-            >>> source_doc = MyDocument.create(name="original.txt", content="Data")
-            >>> api_url = "https://api.example.com/data"
-            >>>
-            >>> derived = MyDocument.create(
-            ...     name="processed.txt",
-            ...     content="Processed data",
-            ...     sources=[source_doc.sha256, api_url]
-            ... )
-            >>>
-            >>> # Check document source
-            >>> if derived.has_source(source_doc):
-            ...     print("Derived from source_doc")
-            >>>
-            >>> # Check string reference
-            >>> if derived.has_source(api_url):
-            ...     print("Data from API")
-            >>>
-            >>> # Check by SHA256 directly
-            >>> if derived.has_source(source_doc.sha256):
-            ...     print("Has specific hash")
-        """
+    def has_source(self, source: "Document | str") -> bool:
+        """Check if a source (Document or string) is in this document's sources."""
         if isinstance(source, str):
-            # Direct string comparison
             return source in self.sources
-
-        # Check if document's SHA256 is in sources
+        if isinstance(source, Document):  # type: ignore[misc]
             return source.sha256 in self.sources
-
-        raise TypeError(f"Invalid source type: {type(source)}")
+        raise TypeError(f"Invalid source type: {type(source)}")  # pyright: ignore[reportUnreachable]

     @final
     def serialize_model(self) -> dict[str, Any]:
-        """Serialize
-
-        Creates a complete JSON-serializable representation of the document
-        with all metadata and properly encoded content. Automatically chooses
-        the most appropriate encoding (UTF-8 for text, base64 for binary).
-
-        Returns:
-            Dictionary with the following keys:
-            - name: Document filename (str)
-            - description: Optional description (str | None)
-            - base_type: Persistence type - "flow", "task", or "temporary" (str)
-            - size: Content size in bytes (int)
-            - id: Short hash identifier, first 6 chars of SHA256 (str)
-            - sha256: Full SHA256 hash in base32 encoding without padding (str)
-            - mime_type: Detected MIME type (str)
-            - sources: List of source strings (list[dict])
-            - canonical_name: Canonical snake_case name for debug tracing (str)
-            - class_name: Name of the actual document class for debug tracing (str)
-            - content: Encoded content (str)
-            - content_encoding: Either "utf-8" or "base64" (str)
-
-        Encoding Strategy:
-            - Text files (text/*, application/json, etc.) → UTF-8 string
-            - Binary files (images, PDFs, etc.) → Base64 string
-            - Invalid UTF-8 in text files → UTF-8 with replacement chars
-
-        Example:
-            >>> doc = MyDocument.create(name="data.json", content={"key": "value"})
-            >>> serialized = doc.serialize_model()
-            >>> serialized["content_encoding"]  # "utf-8"
-            >>> serialized["mime_type"]  # "application/json"
-        """
-        result = {
+        """Serialize to JSON-compatible dict for storage/transmission. Roundtrips with from_dict()."""
+        result: dict[str, Any] = {  # nosemgrep: mutable-field-on-frozen-pydantic-model
             "name": self.name,
             "description": self.description,
-            "base_type": self.get_base_type(),
             "size": self.size,
             "id": self.id,
             "sha256": self.sha256,
+            "content_sha256": self.content_sha256,
             "mime_type": self.mime_type,
-            "sources": self.sources,
+            "sources": list(self.sources),
+            "origins": list(self.origins),
             "canonical_name": canonical_name_key(self.__class__),
             "class_name": self.__class__.__name__,
         }

-        # Try to encode content as UTF-8, fall back to base64
         if self.is_text:
             try:
                 result["content"] = self.content.decode("utf-8")
                 result["content_encoding"] = "utf-8"
             except UnicodeDecodeError:
-                # For text files with encoding issues, use UTF-8 with replacement
                 result["content"] = self.content.decode("utf-8", errors="replace")
                 result["content_encoding"] = "utf-8"
         else:
-            # Binary content - use base64
             result["content"] = base64.b64encode(self.content).decode("ascii")
             result["content_encoding"] = "base64"

+        serialized_attachments: list[dict[str, Any]] = []  # nosemgrep: mutable-field-on-frozen-pydantic-model
+        for att in self.attachments:
+            att_data: dict[str, Any] = {"name": att.name, "description": att.description}  # nosemgrep: mutable-field-on-frozen-pydantic-model
+            if att.is_text:
+                att_data["content"] = att.content.decode("utf-8", errors="replace")
+                att_data["content_encoding"] = "utf-8"
+            else:
+                att_data["content"] = base64.b64encode(att.content).decode("ascii")
+                att_data["content_encoding"] = "base64"
+            att_data["mime_type"] = att.mime_type
+            att_data["size"] = att.size
+            serialized_attachments.append(att_data)
+        result["attachments"] = serialized_attachments
+
         return result
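Shape of the 0.4.0 payload, per the hunk above (values illustrative, not verified output; note `base_type` is gone, while `content_sha256`, `origins`, and `attachments` are new):

```python
doc.serialize_model()
# {"name": "data.json", "description": None,
#  "size": 16, "id": "A7B2C9",
#  "sha256": "...", "content_sha256": "...",
#  "mime_type": "application/json",
#  "sources": [...], "origins": [...],
#  "canonical_name": "my_document", "class_name": "MyDocument",
#  "content": '{"key": "value"}', "content_encoding": "utf-8",
#  "attachments": [{"name": ..., "description": ..., "content": ...,
#                   "content_encoding": ..., "mime_type": ..., "size": ...}]}
```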

     @final
     @classmethod
     def from_dict(cls, data: dict[str, Any]) -> Self:
-        """
-
-        Reconstructs a Document instance from the dictionary format produced
-        by serialize_model(). Automatically handles content decoding based on
-        the content_encoding field.
-
-        Args:
-            data: Dictionary containing serialized document. Required keys:
-                - name: Document filename (str)
-                - content: Encoded content (str or bytes)
-                Optional keys:
-                - description: Document description (str | None)
-                - content_encoding: "utf-8" or "base64" (defaults to "utf-8")
-                - sources: List of source strings
-
-        Returns:
-            New Document instance with restored content.
-
-        Raises:
-            ValueError: If content type is invalid or base64 decoding fails
-            KeyError: If required keys are missing from data dictionary
-
-        Note:
-            Provides roundtrip guarantee with serialize_model().
-            Content and name are preserved exactly.
-
-        Example:
-            >>> data = {
-            ...     "name": "config.yaml",
-            ...     "content": "key: value\n",
-            ...     "content_encoding": "utf-8",
-            ...     "description": "Config file"
-            ... }
-            >>> doc = MyDocument.from_dict(data)
-        """
-        # Extract content and encoding
+        """Deserialize from dict produced by serialize_model(). Roundtrip guarantee."""
         content_raw = data.get("content", "")
         content_encoding = data.get("content_encoding", "utf-8")

-        # Decode content based on encoding
         content: bytes
         if content_encoding == "base64":
-
+            if not isinstance(content_raw, str):
+                raise ValueError("base64 content must be string")
             content = base64.b64decode(content_raw)
         elif isinstance(content_raw, str):
-            # Default to UTF-8
             content = content_raw.encode("utf-8")
         elif isinstance(content_raw, bytes):
             content = content_raw
         else:
             raise ValueError(f"Invalid content type: {type(content_raw)}")

+        attachments: tuple[Attachment, ...] | None = None
+        if attachments_raw := data.get("attachments"):
+            att_list: list[Attachment] = []  # nosemgrep: mutable-field-on-frozen-pydantic-model
+            for att_data in attachments_raw:
+                att_content_raw = att_data.get("content", "")
+                if att_data.get("content_encoding") == "base64":
+                    att_content = base64.b64decode(att_content_raw)
+                elif isinstance(att_content_raw, str):
+                    att_content = att_content_raw.encode("utf-8")
+                else:
+                    att_content = att_content_raw
+                att_list.append(Attachment(name=att_data["name"], content=att_content, description=att_data.get("description")))
+            attachments = tuple(att_list)
+
         return cls(
             name=data["name"],
             content=content,
             description=data.get("description"),
-            sources=data.get("sources"),
+            sources=tuple(data.get("sources") or ()),
+            origins=tuple(data.get("origins") or ()),
+            attachments=attachments,
         )
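Round-trip sketch (both sides' docstrings promise the guarantee; hypothetical `MyDocument`):

```python
payload = doc.serialize_model()
restored = MyDocument.from_dict(payload)
# name, content, sources, origins, and attachments all survive the trip
assert restored.sha256 == doc.sha256
```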

     @final
-    def model_convert(
-
-        new_type: type[TDocument],
-        *,
-        update: dict[str, Any] | None = None,
-        deep: bool = False,
-    ) -> TDocument:
-        """Convert document to a different Document type with optional updates.
-
-        @public
-
-        Creates a new document of a different type, preserving all attributes
-        while allowing updates. This is useful for converting between document
-        types (e.g., TaskDocument to FlowDocument) while maintaining data integrity.
-
-        Args:
-            new_type: Target Document class for conversion. Must be a concrete
-                subclass of Document (not abstract classes like Document,
-                FlowDocument, or TaskDocument).
-            update: Dictionary of attributes to update. Supports any attributes
-                that the Document constructor accepts (name, content,
-                description, sources).
-            deep: Whether to perform a deep copy of mutable attributes.
-
-        Returns:
-            New Document instance of the specified type.
-
-        Raises:
-            TypeError: If new_type is not a subclass of Document, is an abstract
-                class, or if update contains invalid attributes.
-            DocumentNameError: If the name violates the target type's FILES enum.
-            DocumentSizeError: If content exceeds MAX_CONTENT_SIZE.
-
-        Example:
-            >>> # Convert TaskDocument to FlowDocument
-            >>> task_doc = MyTaskDoc.create(name="temp.json", content={"data": "value"})
-            >>> flow_doc = task_doc.model_convert(MyFlowDoc)
-            >>> assert flow_doc.is_flow
-            >>> assert flow_doc.content == task_doc.content
-            >>>
-            >>> # Convert with updates
-            >>> updated = task_doc.model_convert(
-            ...     MyFlowDoc,
-            ...     update={"name": "permanent.json", "description": "Converted"}
-            ... )
-            >>>
-            >>> # Track document lineage
-            >>> derived = doc.model_convert(
-            ...     ProcessedDoc,
-            ...     update={"sources": [doc.sha256]}
-            ... )
-        """
-        # Validate new_type
+    def model_convert(self, new_type: type[TDocument], *, update: dict[str, Any] | None = None) -> TDocument:
+        """Convert to a different Document subclass with optional field overrides."""
         try:
-
-            if class_name == "Document":
-                raise TypeError("Cannot instantiate abstract Document class directly")
-            if class_name == "FlowDocument":
-                raise TypeError("Cannot instantiate abstract FlowDocument class directly")
-            if class_name == "TaskDocument":
-                raise TypeError("Cannot instantiate abstract TaskDocument class directly")
-
-            # Get current document data with proper typing
-            data: dict[str, Any] = {
+            if not isinstance(new_type, type):  # pyright: ignore[reportUnnecessaryIsInstance]
+                raise TypeError(f"new_type must be a class, got {new_type}")  # pyright: ignore[reportUnreachable]
+            if not issubclass(new_type, Document):  # pyright: ignore[reportUnnecessaryIsInstance]
+                raise TypeError(f"new_type must be a subclass of Document, got {new_type}")  # pyright: ignore[reportUnreachable]
+        except (TypeError, AttributeError) as err:
+            raise TypeError(f"new_type must be a subclass of Document, got {new_type}") from err
+
+        if new_type is Document:
+            raise TypeError("Cannot instantiate Document directly — use a concrete subclass")
+
+        data: dict[str, Any] = {  # nosemgrep: mutable-field-on-frozen-pydantic-model
             "name": self.name,
             "content": self.content,
             "description": self.description,
-            "sources": self.sources
+            "sources": self.sources,
+            "origins": self.origins,
+            "attachments": self.attachments,
         }

-        # Apply updates if provided
         if update:
             data.update(update)

-        # Create new document of target type
         return new_type(
             name=data["name"],
             content=data["content"],
             description=data.get("description"),
-            sources=data.get("sources"),
+            sources=data.get("sources"),
+            origins=data.get("origins"),
+            attachments=data.get("attachments"),
         )
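The conversion example from the removed docstring, trimmed to the new signature (the `deep` flag is gone; `MyTaskDoc`/`MyFlowDoc` are hypothetical concrete subclasses):

```python
task_doc = MyTaskDoc.create(name="temp.json", content={"data": "value"})
flow_doc = task_doc.model_convert(MyFlowDoc)   # same content, new type

updated = task_doc.model_convert(
    MyFlowDoc, update={"name": "permanent.json", "description": "Converted"}
)

# Track lineage by recording the original's identity hash as a source.
derived = task_doc.model_convert(MyFlowDoc, update={"sources": [task_doc.sha256]})
```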