ai-pipeline-core 0.1.13__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ai_pipeline_core/__init__.py +25 -14
- ai_pipeline_core/documents/__init__.py +2 -1
- ai_pipeline_core/documents/document.py +317 -49
- ai_pipeline_core/documents/document_list.py +136 -33
- ai_pipeline_core/documents/flow_document.py +8 -29
- ai_pipeline_core/documents/task_document.py +6 -27
- ai_pipeline_core/documents/temporary_document.py +6 -27
- ai_pipeline_core/documents/utils.py +64 -1
- ai_pipeline_core/flow/config.py +174 -5
- ai_pipeline_core/flow/options.py +2 -2
- ai_pipeline_core/llm/__init__.py +6 -1
- ai_pipeline_core/llm/ai_messages.py +14 -7
- ai_pipeline_core/llm/client.py +143 -55
- ai_pipeline_core/llm/model_options.py +20 -5
- ai_pipeline_core/llm/model_response.py +77 -29
- ai_pipeline_core/llm/model_types.py +38 -40
- ai_pipeline_core/logging/__init__.py +0 -2
- ai_pipeline_core/logging/logging_config.py +0 -6
- ai_pipeline_core/logging/logging_mixin.py +2 -10
- ai_pipeline_core/pipeline.py +68 -65
- ai_pipeline_core/prefect.py +12 -3
- ai_pipeline_core/prompt_manager.py +6 -7
- ai_pipeline_core/settings.py +13 -5
- ai_pipeline_core/simple_runner/__init__.py +1 -11
- ai_pipeline_core/simple_runner/cli.py +13 -12
- ai_pipeline_core/simple_runner/simple_runner.py +34 -172
- ai_pipeline_core/storage/__init__.py +8 -0
- ai_pipeline_core/storage/storage.py +628 -0
- ai_pipeline_core/tracing.py +110 -26
- {ai_pipeline_core-0.1.13.dist-info → ai_pipeline_core-0.2.0.dist-info}/METADATA +60 -23
- ai_pipeline_core-0.2.0.dist-info/RECORD +38 -0
- ai_pipeline_core-0.1.13.dist-info/RECORD +0 -36
- {ai_pipeline_core-0.1.13.dist-info → ai_pipeline_core-0.2.0.dist-info}/WHEEL +0 -0
- {ai_pipeline_core-0.1.13.dist-info → ai_pipeline_core-0.2.0.dist-info}/licenses/LICENSE +0 -0
|
@@ -6,6 +6,8 @@ This module provides the core document abstraction for working with various type
|
|
|
6
6
|
in AI pipelines. Documents are immutable Pydantic models that wrap binary content with metadata.
|
|
7
7
|
"""
|
|
8
8
|
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
9
11
|
import base64
|
|
10
12
|
import hashlib
|
|
11
13
|
import json
|
|
@@ -30,13 +32,14 @@ from typing import (
|
|
|
30
32
|
from pydantic import (
|
|
31
33
|
BaseModel,
|
|
32
34
|
ConfigDict,
|
|
35
|
+
Field,
|
|
33
36
|
ValidationInfo,
|
|
34
37
|
field_serializer,
|
|
35
38
|
field_validator,
|
|
36
39
|
)
|
|
37
40
|
from ruamel.yaml import YAML
|
|
38
41
|
|
|
39
|
-
from ai_pipeline_core.documents.utils import canonical_name_key
|
|
42
|
+
from ai_pipeline_core.documents.utils import canonical_name_key, is_document_sha256
|
|
40
43
|
from ai_pipeline_core.exceptions import DocumentNameError, DocumentSizeError
|
|
41
44
|
|
|
42
45
|
from .mime_type import (
|
|
@@ -58,8 +61,7 @@ class Document(BaseModel, ABC):
|
|
|
58
61
|
Document is the fundamental data abstraction for all content flowing through
|
|
59
62
|
pipelines. It provides automatic encoding, MIME type detection, serialization,
|
|
60
63
|
and validation. All documents must be subclassed from FlowDocument or TaskDocument
|
|
61
|
-
based on their persistence requirements.
|
|
62
|
-
class that can be instantiated directly (not abstract).
|
|
64
|
+
based on their persistence requirements.
|
|
63
65
|
|
|
64
66
|
VALIDATION IS AUTOMATIC - Do not add manual validation!
|
|
65
67
|
Size validation, name validation, and MIME type detection are built-in.
|
|
@@ -71,7 +73,7 @@ class Document(BaseModel, ABC):
|
|
|
71
73
|
document.validate_file_name(document.name) # NO! Automatic
|
|
72
74
|
|
|
73
75
|
Best Practices:
|
|
74
|
-
- Use create() classmethod for automatic type conversion (
|
|
76
|
+
- Use create() classmethod for automatic type conversion (default preferred)
|
|
75
77
|
- Omit description parameter unless truly needed for metadata
|
|
76
78
|
- When using LLM functions, pass AIMessages or str. Wrap any Document values
|
|
77
79
|
in AIMessages([...]). Do not call .text yourself
|
|
@@ -94,6 +96,7 @@ class Document(BaseModel, ABC):
|
|
|
94
96
|
- SHA256 hashing for deduplication
|
|
95
97
|
- Support for text, JSON, YAML, PDF, and image formats
|
|
96
98
|
- Conversion utilities between different formats
|
|
99
|
+
- Source provenance tracking via sources field
|
|
97
100
|
|
|
98
101
|
Class Variables:
|
|
99
102
|
MAX_CONTENT_SIZE: Maximum allowed content size in bytes (default 25MB)
|
|
@@ -102,6 +105,7 @@ class Document(BaseModel, ABC):
|
|
|
102
105
|
name: Document filename (validated for security)
|
|
103
106
|
description: Optional human-readable description
|
|
104
107
|
content: Raw document content as bytes
|
|
108
|
+
sources: List of source references tracking document provenance
|
|
105
109
|
|
|
106
110
|
Creating Documents:
|
|
107
111
|
**Use the `create` classmethod** for most use cases. It accepts various
|
|
@@ -117,7 +121,7 @@ class Document(BaseModel, ABC):
|
|
|
117
121
|
Warning:
|
|
118
122
|
- Document subclasses should NOT start with 'Test' prefix (pytest conflict)
|
|
119
123
|
- Cannot instantiate Document directly - must subclass FlowDocument or TaskDocument
|
|
120
|
-
- Cannot add custom fields - only name, description, content are allowed
|
|
124
|
+
- Cannot add custom fields - only name, description, content, sources are allowed
|
|
121
125
|
- Document is an abstract class and cannot be instantiated directly
|
|
122
126
|
|
|
123
127
|
Metadata Attachment Patterns:
|
|
@@ -126,10 +130,62 @@ class Document(BaseModel, ABC):
|
|
|
126
130
|
2. Embed metadata in content (e.g., JSON with data + metadata fields)
|
|
127
131
|
3. Create a separate MetadataDocument type to accompany data documents
|
|
128
132
|
4. Use document naming conventions (e.g., "data_v2_2024.json")
|
|
129
|
-
5. Store metadata in flow_options
|
|
133
|
+
5. Store metadata in flow_options
|
|
134
|
+
|
|
135
|
+
FILES Enum Best Practice:
|
|
136
|
+
When defining a FILES enum, NEVER use magic strings to reference files.
|
|
137
|
+
Always use the enum values to maintain type safety and refactorability.
|
|
138
|
+
|
|
139
|
+
WRONG - Magic strings/numbers:
|
|
140
|
+
doc = ConfigDocument.create(name="config.yaml", content=data) # NO!
|
|
141
|
+
doc = docs.get_by("settings.json") # NO! Magic string
|
|
142
|
+
files = ["config.yaml", "settings.json"] # NO! Magic strings
|
|
143
|
+
|
|
144
|
+
CORRECT - Use enum references:
|
|
145
|
+
doc = ConfigDocument.create(
|
|
146
|
+
name=ConfigDocument.FILES.CONFIG, # YES! Type-safe
|
|
147
|
+
content=data
|
|
148
|
+
)
|
|
149
|
+
doc = docs.get_by(ConfigDocument.FILES.SETTINGS) # YES!
|
|
150
|
+
files = [
|
|
151
|
+
ConfigDocument.FILES.CONFIG,
|
|
152
|
+
ConfigDocument.FILES.SETTINGS
|
|
153
|
+
] # YES! Refactorable
|
|
154
|
+
|
|
155
|
+
Pydantic Model Interaction:
|
|
156
|
+
Documents provide DIRECT support for Pydantic models. Use the built-in
|
|
157
|
+
methods instead of manual JSON conversion.
|
|
158
|
+
|
|
159
|
+
WRONG - Manual JSON conversion:
|
|
160
|
+
# Don't do this - manual JSON handling
|
|
161
|
+
json_str = doc.text
|
|
162
|
+
json_data = json.loads(json_str)
|
|
163
|
+
model = MyModel(**json_data) # NO! Use as_pydantic_model
|
|
164
|
+
|
|
165
|
+
# Don't do this - manual serialization
|
|
166
|
+
json_str = model.model_dump_json()
|
|
167
|
+
doc = MyDocument.create(name="data.json", content=json_str) # NO!
|
|
168
|
+
|
|
169
|
+
CORRECT - Direct Pydantic interaction:
|
|
170
|
+
# Reading Pydantic model from document
|
|
171
|
+
model = doc.as_pydantic_model(MyModel) # Direct conversion
|
|
172
|
+
models = doc.as_pydantic_model(list[MyModel]) # List support
|
|
173
|
+
|
|
174
|
+
# Creating document from Pydantic model
|
|
175
|
+
doc = MyDocument.create(
|
|
176
|
+
name="data.json",
|
|
177
|
+
content=model # Direct BaseModel support
|
|
178
|
+
)
|
|
179
|
+
|
|
180
|
+
# Round-trip is seamless
|
|
181
|
+
original_model = MyModel(field="value")
|
|
182
|
+
doc = MyDocument.create(name="data.json", content=original_model)
|
|
183
|
+
restored = doc.as_pydantic_model(MyModel)
|
|
184
|
+
assert restored == original_model # Perfect round-trip
|
|
130
185
|
|
|
131
186
|
Example:
|
|
132
187
|
>>> from enum import StrEnum
|
|
188
|
+
>>> from pydantic import BaseModel
|
|
133
189
|
>>>
|
|
134
190
|
>>> # Simple document:
|
|
135
191
|
>>> class MyDocument(FlowDocument):
|
|
@@ -141,10 +197,32 @@ class Document(BaseModel, ABC):
|
|
|
141
197
|
... CONFIG = "config.yaml"
|
|
142
198
|
... SETTINGS = "settings.json"
|
|
143
199
|
>>>
|
|
144
|
-
>>> #
|
|
145
|
-
>>> doc =
|
|
146
|
-
|
|
147
|
-
|
|
200
|
+
>>> # CORRECT FILES usage - no magic strings:
|
|
201
|
+
>>> doc = ConfigDocument.create(
|
|
202
|
+
... name=ConfigDocument.FILES.CONFIG, # Use enum
|
|
203
|
+
... content={"key": "value"}
|
|
204
|
+
... )
|
|
205
|
+
>>>
|
|
206
|
+
>>> # CORRECT Pydantic usage:
|
|
207
|
+
>>> class Config(BaseModel):
|
|
208
|
+
... key: str
|
|
209
|
+
>>>
|
|
210
|
+
>>> # Direct creation from Pydantic model
|
|
211
|
+
>>> config_model = Config(key="value")
|
|
212
|
+
>>> doc = MyDocument.create(name="data.json", content=config_model)
|
|
213
|
+
>>>
|
|
214
|
+
>>> # Direct extraction to Pydantic model
|
|
215
|
+
>>> restored = doc.as_pydantic_model(Config)
|
|
216
|
+
>>> print(restored.key) # "value"
|
|
217
|
+
>>>
|
|
218
|
+
>>> # Track document provenance with sources
|
|
219
|
+
>>> source_doc = MyDocument.create(name="input.txt", content="raw data")
|
|
220
|
+
>>> processed = MyDocument.create(
|
|
221
|
+
... name="output.txt",
|
|
222
|
+
... content="processed data",
|
|
223
|
+
... sources=[source_doc.sha256] # Reference source document
|
|
224
|
+
... )
|
|
225
|
+
>>> processed.has_source(source_doc) # True
|
|
148
226
|
"""
|
|
149
227
|
|
|
150
228
|
MAX_CONTENT_SIZE: ClassVar[int] = 25 * 1024 * 1024
|
|
@@ -156,6 +234,9 @@ class Document(BaseModel, ABC):
|
|
|
156
234
|
DESCRIPTION_EXTENSION: ClassVar[str] = ".description.md"
|
|
157
235
|
"""File extension for description files."""
|
|
158
236
|
|
|
237
|
+
SOURCES_EXTENSION: ClassVar[str] = ".sources.json"
|
|
238
|
+
"""File extension for sources metadata files."""
|
|
239
|
+
|
|
159
240
|
MARKDOWN_LIST_SEPARATOR: ClassVar[str] = "\n\n-----------------\n\n"
|
|
160
241
|
"""Separator for markdown list items."""
|
|
161
242
|
|
|
@@ -193,7 +274,7 @@ class Document(BaseModel, ABC):
|
|
|
193
274
|
)
|
|
194
275
|
# Check that the Document's model_fields only contain the allowed fields
|
|
195
276
|
# It prevents AI models from adding additional fields to documents
|
|
196
|
-
allowed = {"name", "description", "content"}
|
|
277
|
+
allowed = {"name", "description", "content", "sources"}
|
|
197
278
|
current = set(getattr(cls, "model_fields", {}).keys())
|
|
198
279
|
extras = current - allowed
|
|
199
280
|
if extras:
|
|
@@ -204,25 +285,58 @@ class Document(BaseModel, ABC):
|
|
|
204
285
|
|
|
205
286
|
@overload
|
|
206
287
|
@classmethod
|
|
207
|
-
def create(
|
|
288
|
+
def create(
|
|
289
|
+
cls,
|
|
290
|
+
*,
|
|
291
|
+
name: str,
|
|
292
|
+
content: bytes,
|
|
293
|
+
description: str | None = None,
|
|
294
|
+
sources: list[str] = [],
|
|
295
|
+
) -> Self: ...
|
|
208
296
|
|
|
209
297
|
@overload
|
|
210
298
|
@classmethod
|
|
211
|
-
def create(
|
|
299
|
+
def create(
|
|
300
|
+
cls,
|
|
301
|
+
*,
|
|
302
|
+
name: str,
|
|
303
|
+
content: str,
|
|
304
|
+
description: str | None = None,
|
|
305
|
+
sources: list[str] = [],
|
|
306
|
+
) -> Self: ...
|
|
212
307
|
|
|
213
308
|
@overload
|
|
214
309
|
@classmethod
|
|
215
310
|
def create(
|
|
216
|
-
cls,
|
|
311
|
+
cls,
|
|
312
|
+
*,
|
|
313
|
+
name: str,
|
|
314
|
+
content: dict[str, Any],
|
|
315
|
+
description: str | None = None,
|
|
316
|
+
sources: list[str] = [],
|
|
217
317
|
) -> Self: ...
|
|
218
318
|
|
|
219
319
|
@overload
|
|
220
320
|
@classmethod
|
|
221
|
-
def create(
|
|
321
|
+
def create(
|
|
322
|
+
cls,
|
|
323
|
+
*,
|
|
324
|
+
name: str,
|
|
325
|
+
content: list[Any],
|
|
326
|
+
description: str | None = None,
|
|
327
|
+
sources: list[str] = [],
|
|
328
|
+
) -> Self: ...
|
|
222
329
|
|
|
223
330
|
@overload
|
|
224
331
|
@classmethod
|
|
225
|
-
def create(
|
|
332
|
+
def create(
|
|
333
|
+
cls,
|
|
334
|
+
*,
|
|
335
|
+
name: str,
|
|
336
|
+
content: BaseModel,
|
|
337
|
+
description: str | None = None,
|
|
338
|
+
sources: list[str] = [],
|
|
339
|
+
) -> Self: ...
|
|
226
340
|
|
|
227
341
|
@classmethod
|
|
228
342
|
def create(
|
|
@@ -231,6 +345,7 @@ class Document(BaseModel, ABC):
|
|
|
231
345
|
name: str,
|
|
232
346
|
content: str | bytes | dict[str, Any] | list[Any] | BaseModel,
|
|
233
347
|
description: str | None = None,
|
|
348
|
+
sources: list[str] = [],
|
|
234
349
|
) -> Self:
|
|
235
350
|
r"""Create a Document with automatic content type conversion (recommended).
|
|
236
351
|
|
|
@@ -240,7 +355,7 @@ class Document(BaseModel, ABC):
|
|
|
240
355
|
content types and automatically converts them to bytes based on the file
|
|
241
356
|
extension. Use the `parse` method to reverse this conversion.
|
|
242
357
|
|
|
243
|
-
Best Practice (
|
|
358
|
+
Best Practice (by default, unless instructed otherwise):
|
|
244
359
|
Only provide name and content. The description parameter is RARELY needed.
|
|
245
360
|
|
|
246
361
|
Args:
|
|
@@ -254,19 +369,24 @@ class Document(BaseModel, ABC):
|
|
|
254
369
|
- bytes: Used directly without conversion
|
|
255
370
|
- str: Encoded to UTF-8 bytes
|
|
256
371
|
- dict[str, Any]: Serialized to JSON (.json) or YAML (.yaml/.yml)
|
|
257
|
-
- list[str]: Joined
|
|
258
|
-
|
|
372
|
+
- list[str]: Joined automatically for .md (validates format compatibility),
|
|
373
|
+
else JSON/YAML
|
|
259
374
|
- list[BaseModel]: Serialized to JSON or YAML based on extension
|
|
260
375
|
- BaseModel: Serialized to JSON or YAML based on extension
|
|
261
376
|
description: Optional description - USUALLY OMIT THIS (defaults to None).
|
|
262
377
|
Only use when meaningful metadata helps downstream processing
|
|
378
|
+
sources: Optional list of source strings (document SHA256 hashes or references).
|
|
379
|
+
Used to track what sources contributed to creating this document.
|
|
380
|
+
Can contain document SHA256 hashes (for referencing other documents)
|
|
381
|
+
or arbitrary reference strings (URLs, file paths, descriptions).
|
|
382
|
+
Defaults to empty list
|
|
263
383
|
|
|
264
384
|
Returns:
|
|
265
385
|
New Document instance with content converted to bytes
|
|
266
386
|
|
|
267
387
|
Raises:
|
|
268
388
|
ValueError: If content type is not supported for the file extension,
|
|
269
|
-
or if markdown list
|
|
389
|
+
or if markdown list format is incompatible
|
|
270
390
|
DocumentNameError: If filename violates validation rules
|
|
271
391
|
DocumentSizeError: If content exceeds MAX_CONTENT_SIZE
|
|
272
392
|
|
|
@@ -276,7 +396,7 @@ class Document(BaseModel, ABC):
|
|
|
276
396
|
returns the original dictionary {"key": "value"}.
|
|
277
397
|
|
|
278
398
|
Example:
|
|
279
|
-
>>> # CORRECT - no description needed (
|
|
399
|
+
>>> # CORRECT - no description needed (by default, unless instructed otherwise)
|
|
280
400
|
>>> doc = MyDocument.create(name="test.txt", content="Hello World")
|
|
281
401
|
>>> doc.content # b'Hello World'
|
|
282
402
|
>>> doc.parse(str) # "Hello World"
|
|
@@ -306,11 +426,31 @@ class Document(BaseModel, ABC):
|
|
|
306
426
|
>>> items = ["Section 1", "Section 2"]
|
|
307
427
|
>>> doc = MyDocument.create(name="sections.md", content=items)
|
|
308
428
|
>>> doc.parse(list) # ["Section 1", "Section 2"]
|
|
429
|
+
|
|
430
|
+
>>> # Document with sources for provenance tracking
|
|
431
|
+
>>> source_doc = MyDocument.create(name="source.txt", content="original")
|
|
432
|
+
>>> derived = MyDocument.create(
|
|
433
|
+
... name="result.txt",
|
|
434
|
+
... content="processed",
|
|
435
|
+
... sources=[source_doc.sha256, "https://api.example.com/data"]
|
|
436
|
+
... )
|
|
437
|
+
>>> derived.get_source_documents() # [source_doc.sha256]
|
|
438
|
+
>>> derived.get_source_references() # ["https://api.example.com/data"]
|
|
309
439
|
"""
|
|
310
440
|
# Use model_validate to leverage the existing validator logic
|
|
311
|
-
temp = cls.model_validate({
|
|
441
|
+
temp = cls.model_validate({
|
|
442
|
+
"name": name,
|
|
443
|
+
"content": content,
|
|
444
|
+
"description": description,
|
|
445
|
+
"sources": sources,
|
|
446
|
+
})
|
|
312
447
|
# Now construct with type-checker-friendly call (bytes only)
|
|
313
|
-
return cls(
|
|
448
|
+
return cls(
|
|
449
|
+
name=temp.name,
|
|
450
|
+
content=temp.content,
|
|
451
|
+
description=temp.description,
|
|
452
|
+
sources=temp.sources,
|
|
453
|
+
)
|
|
314
454
|
|
|
315
455
|
def __init__(
|
|
316
456
|
self,
|
|
@@ -318,6 +458,7 @@ class Document(BaseModel, ABC):
|
|
|
318
458
|
name: str,
|
|
319
459
|
content: bytes,
|
|
320
460
|
description: str | None = None,
|
|
461
|
+
sources: list[str] = [],
|
|
321
462
|
) -> None:
|
|
322
463
|
"""Initialize a Document instance with raw bytes content.
|
|
323
464
|
|
|
@@ -335,6 +476,10 @@ class Document(BaseModel, ABC):
|
|
|
335
476
|
name: Document filename (required, keyword-only)
|
|
336
477
|
content: Document content as raw bytes (required, keyword-only)
|
|
337
478
|
description: Optional human-readable description (keyword-only)
|
|
479
|
+
sources: Optional list of source strings for provenance tracking.
|
|
480
|
+
Can contain document SHA256 hashes (for referencing other documents)
|
|
481
|
+
or arbitrary reference strings (URLs, file paths, descriptions).
|
|
482
|
+
Defaults to empty list
|
|
338
483
|
|
|
339
484
|
Raises:
|
|
340
485
|
TypeError: If attempting to instantiate Document directly.
|
|
@@ -349,19 +494,21 @@ class Document(BaseModel, ABC):
|
|
|
349
494
|
>>> doc = MyDocument.create(name="data.json", content={"key": "value"})
|
|
350
495
|
>>> doc = MyDocument.create(name="config.yaml", content=my_model)
|
|
351
496
|
>>> doc = MyDocument.create(name="items.md", content=["item1", "item2"])
|
|
352
|
-
|
|
353
|
-
See Also:
|
|
354
|
-
create: Recommended factory method with automatic type conversion
|
|
355
|
-
parse: Method to reverse the conversion done by create
|
|
356
497
|
"""
|
|
357
498
|
if type(self) is Document:
|
|
358
499
|
raise TypeError("Cannot instantiate abstract Document class directly")
|
|
359
500
|
|
|
360
|
-
super().__init__(name=name, content=content, description=description)
|
|
501
|
+
super().__init__(name=name, content=content, description=description, sources=sources)
|
|
361
502
|
|
|
362
503
|
name: str
|
|
363
504
|
description: str | None = None
|
|
364
505
|
content: bytes # Note: constructor accepts str | bytes, but field stores bytes only
|
|
506
|
+
sources: list[str] = Field(
|
|
507
|
+
default_factory=list,
|
|
508
|
+
description="List of source references for tracking document provenance. "
|
|
509
|
+
"Can contain document SHA256 hashes (for referencing other documents) "
|
|
510
|
+
"or arbitrary reference strings (URLs, file paths, descriptions)",
|
|
511
|
+
)
|
|
365
512
|
|
|
366
513
|
# Pydantic configuration
|
|
367
514
|
model_config = ConfigDict(
|
|
@@ -383,8 +530,7 @@ class Document(BaseModel, ABC):
|
|
|
383
530
|
|
|
384
531
|
Note:
|
|
385
532
|
This method determines document persistence and lifecycle.
|
|
386
|
-
FlowDocument returns "flow", TaskDocument returns "task"
|
|
387
|
-
TemporaryDocument returns "temporary".
|
|
533
|
+
FlowDocument returns "flow", TaskDocument returns "task".
|
|
388
534
|
"""
|
|
389
535
|
raise NotImplementedError("Subclasses must implement this method")
|
|
390
536
|
|
|
@@ -436,7 +582,7 @@ class Document(BaseModel, ABC):
|
|
|
436
582
|
during execution.
|
|
437
583
|
|
|
438
584
|
Returns:
|
|
439
|
-
True if this is
|
|
585
|
+
True if this document is temporary, False otherwise.
|
|
440
586
|
"""
|
|
441
587
|
return self.get_base_type() == "temporary"
|
|
442
588
|
|
|
@@ -481,8 +627,6 @@ class Document(BaseModel, ABC):
|
|
|
481
627
|
def validate_file_name(cls, name: str) -> None:
|
|
482
628
|
"""Validate that a file name matches allowed patterns.
|
|
483
629
|
|
|
484
|
-
@public
|
|
485
|
-
|
|
486
630
|
DO NOT OVERRIDE this method if you define a FILES enum!
|
|
487
631
|
The validation is automatic when FILES enum is present.
|
|
488
632
|
|
|
@@ -526,7 +670,7 @@ class Document(BaseModel, ABC):
|
|
|
526
670
|
|
|
527
671
|
Ensures the document name is secure and follows conventions:
|
|
528
672
|
- No path traversal characters (.., \\, /)
|
|
529
|
-
- Cannot end with .description.md
|
|
673
|
+
- Cannot end with .description.md or .sources.json
|
|
530
674
|
- No leading/trailing whitespace
|
|
531
675
|
- Must match FILES enum if defined
|
|
532
676
|
|
|
@@ -551,6 +695,9 @@ class Document(BaseModel, ABC):
|
|
|
551
695
|
f"Document names cannot end with {cls.DESCRIPTION_EXTENSION}: {v}"
|
|
552
696
|
)
|
|
553
697
|
|
|
698
|
+
if v.endswith(cls.SOURCES_EXTENSION):
|
|
699
|
+
raise DocumentNameError(f"Document names cannot end with {cls.SOURCES_EXTENSION}: {v}")
|
|
700
|
+
|
|
554
701
|
if ".." in v or "\\" in v or "/" in v:
|
|
555
702
|
raise DocumentNameError(f"Invalid filename - contains path traversal characters: {v}")
|
|
556
703
|
|
|
@@ -575,7 +722,7 @@ class Document(BaseModel, ABC):
|
|
|
575
722
|
2. str → UTF-8 encoding
|
|
576
723
|
3. dict/BaseModel + .json → JSON serialization (indented)
|
|
577
724
|
4. dict/BaseModel + .yaml/.yml → YAML serialization
|
|
578
|
-
5. list[str] + .md → Join with markdown
|
|
725
|
+
5. list[str] + .md → Join with markdown sections (validates format compatibility)
|
|
579
726
|
6. list[Any] + .json/.yaml → JSON/YAML array
|
|
580
727
|
7. int/float/bool + .json → JSON primitive
|
|
581
728
|
|
|
@@ -795,7 +942,7 @@ class Document(BaseModel, ABC):
|
|
|
795
942
|
This is computed once and cached for performance.
|
|
796
943
|
The hash is deterministic based on content only.
|
|
797
944
|
"""
|
|
798
|
-
return b32encode(hashlib.sha256(self.content).digest()).decode("ascii").upper()
|
|
945
|
+
return b32encode(hashlib.sha256(self.content).digest()).decode("ascii").upper().rstrip("=")
|
|
799
946
|
|
|
800
947
|
@final
|
|
801
948
|
@property
|
|
@@ -944,8 +1091,6 @@ class Document(BaseModel, ABC):
|
|
|
944
1091
|
def as_yaml(self) -> Any:
|
|
945
1092
|
r"""Parse document content as YAML.
|
|
946
1093
|
|
|
947
|
-
@public
|
|
948
|
-
|
|
949
1094
|
Parses the document's text content as YAML and returns Python objects.
|
|
950
1095
|
Uses ruamel.yaml which is safe by default (no code execution).
|
|
951
1096
|
|
|
@@ -973,8 +1118,6 @@ class Document(BaseModel, ABC):
|
|
|
973
1118
|
def as_json(self) -> Any:
|
|
974
1119
|
"""Parse document content as JSON.
|
|
975
1120
|
|
|
976
|
-
@public
|
|
977
|
-
|
|
978
1121
|
Parses the document's text content as JSON and returns Python objects.
|
|
979
1122
|
Document must contain valid JSON text.
|
|
980
1123
|
|
|
@@ -1069,7 +1212,7 @@ class Document(BaseModel, ABC):
|
|
|
1069
1212
|
|
|
1070
1213
|
@public
|
|
1071
1214
|
|
|
1072
|
-
Splits text content using markdown
|
|
1215
|
+
Splits text content automatically using markdown section separators.
|
|
1073
1216
|
Designed for markdown documents with multiple sections.
|
|
1074
1217
|
|
|
1075
1218
|
Returns:
|
|
@@ -1084,9 +1227,9 @@ class Document(BaseModel, ABC):
|
|
|
1084
1227
|
>>> doc = MyDocument.create(name="book.md", content=sections)
|
|
1085
1228
|
>>> doc.as_markdown_list() # Returns original sections
|
|
1086
1229
|
|
|
1087
|
-
>>> #
|
|
1088
|
-
>>>
|
|
1089
|
-
>>> doc2 = MyDocument(name="parts.md", content=
|
|
1230
|
+
>>> # Round-trip conversion works automatically
|
|
1231
|
+
>>> sections = ["Part 1", "Part 2", "Part 3"]
|
|
1232
|
+
>>> doc2 = MyDocument.create(name="parts.md", content=sections)
|
|
1090
1233
|
>>> doc2.as_markdown_list() # ['Part 1', 'Part 2', 'Part 3']
|
|
1091
1234
|
"""
|
|
1092
1235
|
return self.text.split(self.MARKDOWN_LIST_SEPARATOR)
|
|
@@ -1123,7 +1266,7 @@ class Document(BaseModel, ABC):
|
|
|
1123
1266
|
Extension Rules:
|
|
1124
1267
|
- .json → JSON parsing for dict/list/BaseModel
|
|
1125
1268
|
- .yaml/.yml → YAML parsing for dict/list/BaseModel
|
|
1126
|
-
- .md + list → Split
|
|
1269
|
+
- .md + list → Split automatically into sections
|
|
1127
1270
|
- Any + str → UTF-8 decode
|
|
1128
1271
|
- Any + bytes → Raw content
|
|
1129
1272
|
|
|
@@ -1139,8 +1282,7 @@ class Document(BaseModel, ABC):
|
|
|
1139
1282
|
|
|
1140
1283
|
>>> # Markdown list
|
|
1141
1284
|
>>> items = ["Item 1", "Item 2"]
|
|
1142
|
-
>>>
|
|
1143
|
-
>>> doc = MyDocument(name="list.md", content=content)
|
|
1285
|
+
>>> doc = MyDocument.create(name="list.md", content=items)
|
|
1144
1286
|
>>> doc.parse(list)
|
|
1145
1287
|
['Item 1', 'Item 2']
|
|
1146
1288
|
"""
|
|
@@ -1215,6 +1357,129 @@ class Document(BaseModel, ABC):
|
|
|
1215
1357
|
|
|
1216
1358
|
raise ValueError(f"Unsupported type {type_} for file {self.name}")
|
|
1217
1359
|
|
|
1360
|
+
def get_source_documents(self) -> list[str]:
|
|
1361
|
+
"""Get list of document SHA256 hashes referenced as sources.
|
|
1362
|
+
|
|
1363
|
+
Retrieves all document references from this document's sources list,
|
|
1364
|
+
filtering for valid SHA256 hashes that reference other documents.
|
|
1365
|
+
This is useful for building dependency graphs and tracking document
|
|
1366
|
+
lineage in processing pipelines.
|
|
1367
|
+
|
|
1368
|
+
Returns:
|
|
1369
|
+
List of SHA256 hashes (base32 encoded) for documents referenced
|
|
1370
|
+
as sources. Each hash uniquely identifies another document that
|
|
1371
|
+
contributed to creating this one.
|
|
1372
|
+
|
|
1373
|
+
Example:
|
|
1374
|
+
>>> # Create a derived document from multiple sources
|
|
1375
|
+
>>> source1 = MyDocument.create(name="data1.txt", content="First")
|
|
1376
|
+
>>> source2 = MyDocument.create(name="data2.txt", content="Second")
|
|
1377
|
+
>>>
|
|
1378
|
+
>>> merged = MyDocument.create(
|
|
1379
|
+
... name="merged.txt",
|
|
1380
|
+
... content="Combined data",
|
|
1381
|
+
... sources=[source1.sha256, source2.sha256, "https://api.example.com"]
|
|
1382
|
+
... )
|
|
1383
|
+
>>>
|
|
1384
|
+
>>> # Get only document references (not URLs)
|
|
1385
|
+
>>> doc_refs = merged.get_source_documents()
|
|
1386
|
+
>>> print(doc_refs) # [source1.sha256, source2.sha256]
|
|
1387
|
+
>>>
|
|
1388
|
+
>>> # Check if specific document is a source
|
|
1389
|
+
>>> if source1.sha256 in doc_refs:
|
|
1390
|
+
... print("Document derived from source1")
|
|
1391
|
+
"""
|
|
1392
|
+
return [src for src in self.sources if is_document_sha256(src)]
|
|
1393
|
+
|
|
1394
|
+
def get_source_references(self) -> list[str]:
|
|
1395
|
+
"""Get list of arbitrary reference strings from sources.
|
|
1396
|
+
|
|
1397
|
+
Retrieves all non-document references from this document's sources list.
|
|
1398
|
+
These are typically URLs, file paths, API endpoints, or descriptive strings
|
|
1399
|
+
that indicate where the document's content originated from, but are not
|
|
1400
|
+
references to other documents in the pipeline.
|
|
1401
|
+
|
|
1402
|
+
Returns:
|
|
1403
|
+
List of reference strings that are not document SHA256 hashes.
|
|
1404
|
+
Can include URLs, file paths, API endpoints, dataset names,
|
|
1405
|
+
or any other string that provides source context.
|
|
1406
|
+
|
|
1407
|
+
Example:
|
|
1408
|
+
>>> # Create document with mixed source types
|
|
1409
|
+
>>> doc = MyDocument.create(
|
|
1410
|
+
... name="report.txt",
|
|
1411
|
+
... content="Analysis results",
|
|
1412
|
+
... sources=[
|
|
1413
|
+
... other_doc.sha256, # Document reference
|
|
1414
|
+
... "https://api.example.com/data", # API URL
|
|
1415
|
+
... "dataset:customer-2024", # Dataset identifier
|
|
1416
|
+
... "/path/to/source.csv", # File path
|
|
1417
|
+
... ]
|
|
1418
|
+
... )
|
|
1419
|
+
>>>
|
|
1420
|
+
>>> # Get only non-document references
|
|
1421
|
+
>>> refs = doc.get_source_references()
|
|
1422
|
+
>>> print(refs)
|
|
1423
|
+
>>> # ["https://api.example.com/data", "dataset:customer-2024", "/path/to/source.csv"]
|
|
1424
|
+
>>>
|
|
1425
|
+
>>> # Use for attribution or debugging
|
|
1426
|
+
>>> for ref in refs:
|
|
1427
|
+
... print(f"Data sourced from: {ref}")
|
|
1428
|
+
"""
|
|
1429
|
+
return [src for src in self.sources if not is_document_sha256(src)]
|
|
1430
|
+
|
|
1431
|
+
def has_source(self, source: Document | str) -> bool:
|
|
1432
|
+
"""Check if a specific source is tracked for this document.
|
|
1433
|
+
|
|
1434
|
+
Verifies whether a given source (document or reference string) is
|
|
1435
|
+
included in this document's sources list. Useful for dependency
|
|
1436
|
+
checking, lineage verification, and conditional processing based
|
|
1437
|
+
on document origins.
|
|
1438
|
+
|
|
1439
|
+
Args:
|
|
1440
|
+
source: Source to check for. Can be:
|
|
1441
|
+
- Document: Checks if document's SHA256 is in sources
|
|
1442
|
+
- str: Checks if exact string is in sources (hash or reference)
|
|
1443
|
+
|
|
1444
|
+
Returns:
|
|
1445
|
+
True if the source is tracked in this document's sources,
|
|
1446
|
+
False otherwise.
|
|
1447
|
+
|
|
1448
|
+
Raises:
|
|
1449
|
+
TypeError: If source is not a Document or string.
|
|
1450
|
+
|
|
1451
|
+
Example:
|
|
1452
|
+
>>> # Check if document was derived from specific source
|
|
1453
|
+
>>> source_doc = MyDocument.create(name="original.txt", content="Data")
|
|
1454
|
+
>>> api_url = "https://api.example.com/data"
|
|
1455
|
+
>>>
|
|
1456
|
+
>>> derived = MyDocument.create(
|
|
1457
|
+
... name="processed.txt",
|
|
1458
|
+
... content="Processed data",
|
|
1459
|
+
... sources=[source_doc.sha256, api_url]
|
|
1460
|
+
... )
|
|
1461
|
+
>>>
|
|
1462
|
+
>>> # Check document source
|
|
1463
|
+
>>> if derived.has_source(source_doc):
|
|
1464
|
+
... print("Derived from source_doc")
|
|
1465
|
+
>>>
|
|
1466
|
+
>>> # Check string reference
|
|
1467
|
+
>>> if derived.has_source(api_url):
|
|
1468
|
+
... print("Data from API")
|
|
1469
|
+
>>>
|
|
1470
|
+
>>> # Check by SHA256 directly
|
|
1471
|
+
>>> if derived.has_source(source_doc.sha256):
|
|
1472
|
+
... print("Has specific hash")
|
|
1473
|
+
"""
|
|
1474
|
+
if isinstance(source, str):
|
|
1475
|
+
# Direct string comparison
|
|
1476
|
+
return source in self.sources
|
|
1477
|
+
elif isinstance(source, Document): # type: ignore[misc]
|
|
1478
|
+
# Check if document's SHA256 is in sources
|
|
1479
|
+
return source.sha256 in self.sources
|
|
1480
|
+
else:
|
|
1481
|
+
raise TypeError(f"Invalid source type: {type(source)}")
|
|
1482
|
+
|
|
1218
1483
|
@final
|
|
1219
1484
|
def serialize_model(self) -> dict[str, Any]:
|
|
1220
1485
|
"""Serialize document to dictionary for storage or transmission.
|
|
@@ -1230,8 +1495,9 @@ class Document(BaseModel, ABC):
|
|
|
1230
1495
|
- base_type: Persistence type - "flow", "task", or "temporary" (str)
|
|
1231
1496
|
- size: Content size in bytes (int)
|
|
1232
1497
|
- id: Short hash identifier, first 6 chars of SHA256 (str)
|
|
1233
|
-
- sha256: Full SHA256 hash in base32 encoding (str)
|
|
1498
|
+
- sha256: Full SHA256 hash in base32 encoding without padding (str)
|
|
1234
1499
|
- mime_type: Detected MIME type (str)
|
|
1500
|
+
- sources: List of source strings (list[str])
|
|
1235
1501
|
- content: Encoded content (str)
|
|
1236
1502
|
- content_encoding: Either "utf-8" or "base64" (str)
|
|
1237
1503
|
|
|
@@ -1254,6 +1520,7 @@ class Document(BaseModel, ABC):
|
|
|
1254
1520
|
"id": self.id,
|
|
1255
1521
|
"sha256": self.sha256,
|
|
1256
1522
|
"mime_type": self.mime_type,
|
|
1523
|
+
"sources": self.sources,
|
|
1257
1524
|
}
|
|
1258
1525
|
|
|
1259
1526
|
# Try to encode content as UTF-8, fall back to base64
|
|
@@ -1288,6 +1555,7 @@ class Document(BaseModel, ABC):
|
|
|
1288
1555
|
Optional keys:
|
|
1289
1556
|
- description: Document description (str | None)
|
|
1290
1557
|
- content_encoding: "utf-8" or "base64" (defaults to "utf-8")
|
|
1558
|
+
- sources: List of source strings
|
|
1291
1559
|
|
|
1292
1560
|
Returns:
|
|
1293
1561
|
New Document instance with restored content.
|
|
@@ -1326,9 +1594,9 @@ class Document(BaseModel, ABC):
|
|
|
1326
1594
|
else:
|
|
1327
1595
|
raise ValueError(f"Invalid content type: {type(content_raw)}")
|
|
1328
1596
|
|
|
1329
|
-
# Create document with the required fields
|
|
1330
1597
|
return cls(
|
|
1331
1598
|
name=data["name"],
|
|
1332
1599
|
content=content,
|
|
1333
1600
|
description=data.get("description"),
|
|
1601
|
+
sources=data.get("sources", []),
|
|
1334
1602
|
)
|