ai-pipeline-core 0.1.13__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34)
  1. ai_pipeline_core/__init__.py +25 -14
  2. ai_pipeline_core/documents/__init__.py +2 -1
  3. ai_pipeline_core/documents/document.py +317 -49
  4. ai_pipeline_core/documents/document_list.py +136 -33
  5. ai_pipeline_core/documents/flow_document.py +8 -29
  6. ai_pipeline_core/documents/task_document.py +6 -27
  7. ai_pipeline_core/documents/temporary_document.py +6 -27
  8. ai_pipeline_core/documents/utils.py +64 -1
  9. ai_pipeline_core/flow/config.py +174 -5
  10. ai_pipeline_core/flow/options.py +2 -2
  11. ai_pipeline_core/llm/__init__.py +6 -1
  12. ai_pipeline_core/llm/ai_messages.py +14 -7
  13. ai_pipeline_core/llm/client.py +143 -55
  14. ai_pipeline_core/llm/model_options.py +20 -5
  15. ai_pipeline_core/llm/model_response.py +77 -29
  16. ai_pipeline_core/llm/model_types.py +38 -40
  17. ai_pipeline_core/logging/__init__.py +0 -2
  18. ai_pipeline_core/logging/logging_config.py +0 -6
  19. ai_pipeline_core/logging/logging_mixin.py +2 -10
  20. ai_pipeline_core/pipeline.py +68 -65
  21. ai_pipeline_core/prefect.py +12 -3
  22. ai_pipeline_core/prompt_manager.py +6 -7
  23. ai_pipeline_core/settings.py +13 -5
  24. ai_pipeline_core/simple_runner/__init__.py +1 -11
  25. ai_pipeline_core/simple_runner/cli.py +13 -12
  26. ai_pipeline_core/simple_runner/simple_runner.py +34 -172
  27. ai_pipeline_core/storage/__init__.py +8 -0
  28. ai_pipeline_core/storage/storage.py +628 -0
  29. ai_pipeline_core/tracing.py +110 -26
  30. {ai_pipeline_core-0.1.13.dist-info → ai_pipeline_core-0.2.0.dist-info}/METADATA +60 -23
  31. ai_pipeline_core-0.2.0.dist-info/RECORD +38 -0
  32. ai_pipeline_core-0.1.13.dist-info/RECORD +0 -36
  33. {ai_pipeline_core-0.1.13.dist-info → ai_pipeline_core-0.2.0.dist-info}/WHEEL +0 -0
  34. {ai_pipeline_core-0.1.13.dist-info → ai_pipeline_core-0.2.0.dist-info}/licenses/LICENSE +0 -0
@@ -6,6 +6,8 @@ This module provides the core document abstraction for working with various type
  in AI pipelines. Documents are immutable Pydantic models that wrap binary content with metadata.
  """

+ from __future__ import annotations
+
  import base64
  import hashlib
  import json
@@ -30,13 +32,14 @@ from typing import (
  from pydantic import (
  BaseModel,
  ConfigDict,
+ Field,
  ValidationInfo,
  field_serializer,
  field_validator,
  )
  from ruamel.yaml import YAML

- from ai_pipeline_core.documents.utils import canonical_name_key
+ from ai_pipeline_core.documents.utils import canonical_name_key, is_document_sha256
  from ai_pipeline_core.exceptions import DocumentNameError, DocumentSizeError

  from .mime_type import (
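The newly imported is_document_sha256 helper lives in documents/utils.py (+64 lines in this release) and is used further down to split a document's sources into document hashes versus free-form references. Its implementation is not part of this diff; a minimal sketch of what such a check plausibly does, assuming it accepts exactly the format produced by the sha256 property shown later in this diff (uppercase base32 with the "=" padding stripped, hence 52 characters):

import re

# Hypothetical sketch only -- the real implementation is in
# ai_pipeline_core/documents/utils.py and is not shown in this diff.
# Base32 alphabet is A-Z and 2-7; a 32-byte SHA-256 digest encodes to
# 56 characters, of which the last 4 are "=" padding, leaving 52.
_DOC_SHA256_RE = re.compile(r"^[A-Z2-7]{52}$")

def is_document_sha256(value: str) -> bool:
    """Return True if value looks like a Document.sha256 identifier."""
    return bool(_DOC_SHA256_RE.fullmatch(value))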
@@ -58,8 +61,7 @@ class Document(BaseModel, ABC):
  Document is the fundamental data abstraction for all content flowing through
  pipelines. It provides automatic encoding, MIME type detection, serialization,
  and validation. All documents must be subclassed from FlowDocument or TaskDocument
- based on their persistence requirements. TemporaryDocument is a special concrete
- class that can be instantiated directly (not abstract).
+ based on their persistence requirements.

  VALIDATION IS AUTOMATIC - Do not add manual validation!
  Size validation, name validation, and MIME type detection are built-in.
@@ -71,7 +73,7 @@ class Document(BaseModel, ABC):
  document.validate_file_name(document.name) # NO! Automatic

  Best Practices:
- - Use create() classmethod for automatic type conversion (90% of cases)
+ - Use create() classmethod for automatic type conversion (default preferred)
  - Omit description parameter unless truly needed for metadata
  - When using LLM functions, pass AIMessages or str. Wrap any Document values
  in AIMessages([...]). Do not call .text yourself
@@ -94,6 +96,7 @@ class Document(BaseModel, ABC):
  - SHA256 hashing for deduplication
  - Support for text, JSON, YAML, PDF, and image formats
  - Conversion utilities between different formats
+ - Source provenance tracking via sources field

  Class Variables:
  MAX_CONTENT_SIZE: Maximum allowed content size in bytes (default 25MB)
@@ -102,6 +105,7 @@ class Document(BaseModel, ABC):
  name: Document filename (validated for security)
  description: Optional human-readable description
  content: Raw document content as bytes
+ sources: List of source references tracking document provenance

  Creating Documents:
  **Use the `create` classmethod** for most use cases. It accepts various
@@ -117,7 +121,7 @@ class Document(BaseModel, ABC):
  Warning:
  - Document subclasses should NOT start with 'Test' prefix (pytest conflict)
  - Cannot instantiate Document directly - must subclass FlowDocument or TaskDocument
- - Cannot add custom fields - only name, description, content are allowed
+ - Cannot add custom fields - only name, description, content, sources are allowed
  - Document is an abstract class and cannot be instantiated directly

  Metadata Attachment Patterns:
@@ -126,10 +130,62 @@ class Document(BaseModel, ABC):
  2. Embed metadata in content (e.g., JSON with data + metadata fields)
  3. Create a separate MetadataDocument type to accompany data documents
  4. Use document naming conventions (e.g., "data_v2_2024.json")
- 5. Store metadata in flow_options or pass through TraceInfo
+ 5. Store metadata in flow_options
+
+ FILES Enum Best Practice:
+ When defining a FILES enum, NEVER use magic strings to reference files.
+ Always use the enum values to maintain type safety and refactorability.
+
+ WRONG - Magic strings/numbers:
+ doc = ConfigDocument.create(name="config.yaml", content=data) # NO!
+ doc = docs.get_by("settings.json") # NO! Magic string
+ files = ["config.yaml", "settings.json"] # NO! Magic strings
+
+ CORRECT - Use enum references:
+ doc = ConfigDocument.create(
+ name=ConfigDocument.FILES.CONFIG, # YES! Type-safe
+ content=data
+ )
+ doc = docs.get_by(ConfigDocument.FILES.SETTINGS) # YES!
+ files = [
+ ConfigDocument.FILES.CONFIG,
+ ConfigDocument.FILES.SETTINGS
+ ] # YES! Refactorable
+
+ Pydantic Model Interaction:
+ Documents provide DIRECT support for Pydantic models. Use the built-in
+ methods instead of manual JSON conversion.
+
+ WRONG - Manual JSON conversion:
+ # Don't do this - manual JSON handling
+ json_str = doc.text
+ json_data = json.loads(json_str)
+ model = MyModel(**json_data) # NO! Use as_pydantic_model
+
+ # Don't do this - manual serialization
+ json_str = model.model_dump_json()
+ doc = MyDocument.create(name="data.json", content=json_str) # NO!
+
+ CORRECT - Direct Pydantic interaction:
+ # Reading Pydantic model from document
+ model = doc.as_pydantic_model(MyModel) # Direct conversion
+ models = doc.as_pydantic_model(list[MyModel]) # List support
+
+ # Creating document from Pydantic model
+ doc = MyDocument.create(
+ name="data.json",
+ content=model # Direct BaseModel support
+ )
+
+ # Round-trip is seamless
+ original_model = MyModel(field="value")
+ doc = MyDocument.create(name="data.json", content=original_model)
+ restored = doc.as_pydantic_model(MyModel)
+ assert restored == original_model # Perfect round-trip

  Example:
  >>> from enum import StrEnum
+ >>> from pydantic import BaseModel
  >>>
  >>> # Simple document:
  >>> class MyDocument(FlowDocument):
@@ -141,10 +197,32 @@ class Document(BaseModel, ABC):
  ... CONFIG = "config.yaml"
  ... SETTINGS = "settings.json"
  >>>
- >>> # RECOMMENDED: Use create for automatic conversion
- >>> doc = MyDocument.create(name="data.json", content={"key": "value"})
- >>> print(doc.is_text) # True
- >>> data = doc.as_json() # {'key': 'value'}
+ >>> # CORRECT FILES usage - no magic strings:
+ >>> doc = ConfigDocument.create(
+ ... name=ConfigDocument.FILES.CONFIG, # Use enum
+ ... content={"key": "value"}
+ ... )
+ >>>
+ >>> # CORRECT Pydantic usage:
+ >>> class Config(BaseModel):
+ ... key: str
+ >>>
+ >>> # Direct creation from Pydantic model
+ >>> config_model = Config(key="value")
+ >>> doc = MyDocument.create(name="data.json", content=config_model)
+ >>>
+ >>> # Direct extraction to Pydantic model
+ >>> restored = doc.as_pydantic_model(Config)
+ >>> print(restored.key) # "value"
+ >>>
+ >>> # Track document provenance with sources
+ >>> source_doc = MyDocument.create(name="input.txt", content="raw data")
+ >>> processed = MyDocument.create(
+ ... name="output.txt",
+ ... content="processed data",
+ ... sources=[source_doc.sha256] # Reference source document
+ ... )
+ >>> processed.has_source(source_doc) # True
  """

  MAX_CONTENT_SIZE: ClassVar[int] = 25 * 1024 * 1024
@@ -156,6 +234,9 @@ class Document(BaseModel, ABC):
  DESCRIPTION_EXTENSION: ClassVar[str] = ".description.md"
  """File extension for description files."""

+ SOURCES_EXTENSION: ClassVar[str] = ".sources.json"
+ """File extension for sources metadata files."""
+
  MARKDOWN_LIST_SEPARATOR: ClassVar[str] = "\n\n-----------------\n\n"
  """Separator for markdown list items."""

@@ -193,7 +274,7 @@ class Document(BaseModel, ABC):
  )
  # Check that the Document's model_fields only contain the allowed fields
  # It prevents AI models from adding additional fields to documents
- allowed = {"name", "description", "content"}
+ allowed = {"name", "description", "content", "sources"}
  current = set(getattr(cls, "model_fields", {}).keys())
  extras = current - allowed
  if extras:
@@ -204,25 +285,58 @@ class Document(BaseModel, ABC):

  @overload
  @classmethod
- def create(cls, *, name: str, content: bytes, description: str | None = None) -> Self: ...
+ def create(
+ cls,
+ *,
+ name: str,
+ content: bytes,
+ description: str | None = None,
+ sources: list[str] = [],
+ ) -> Self: ...

  @overload
  @classmethod
- def create(cls, *, name: str, content: str, description: str | None = None) -> Self: ...
+ def create(
+ cls,
+ *,
+ name: str,
+ content: str,
+ description: str | None = None,
+ sources: list[str] = [],
+ ) -> Self: ...

  @overload
  @classmethod
  def create(
- cls, *, name: str, content: dict[str, Any], description: str | None = None
+ cls,
+ *,
+ name: str,
+ content: dict[str, Any],
+ description: str | None = None,
+ sources: list[str] = [],
  ) -> Self: ...

  @overload
  @classmethod
- def create(cls, *, name: str, content: list[Any], description: str | None = None) -> Self: ...
+ def create(
+ cls,
+ *,
+ name: str,
+ content: list[Any],
+ description: str | None = None,
+ sources: list[str] = [],
+ ) -> Self: ...

  @overload
  @classmethod
- def create(cls, *, name: str, content: BaseModel, description: str | None = None) -> Self: ...
+ def create(
+ cls,
+ *,
+ name: str,
+ content: BaseModel,
+ description: str | None = None,
+ sources: list[str] = [],
+ ) -> Self: ...

  @classmethod
  def create(
@@ -231,6 +345,7 @@ class Document(BaseModel, ABC):
  name: str,
  content: str | bytes | dict[str, Any] | list[Any] | BaseModel,
  description: str | None = None,
+ sources: list[str] = [],
  ) -> Self:
  r"""Create a Document with automatic content type conversion (recommended).

@@ -240,7 +355,7 @@ class Document(BaseModel, ABC):
  content types and automatically converts them to bytes based on the file
  extension. Use the `parse` method to reverse this conversion.

- Best Practice (90% of cases):
+ Best Practice (by default, unless instructed otherwise):
  Only provide name and content. The description parameter is RARELY needed.

  Args:
@@ -254,19 +369,24 @@ class Document(BaseModel, ABC):
  - bytes: Used directly without conversion
  - str: Encoded to UTF-8 bytes
  - dict[str, Any]: Serialized to JSON (.json) or YAML (.yaml/.yml)
- - list[str]: Joined with separator for .md (validates no items
- contain separator), else JSON/YAML
+ - list[str]: Joined automatically for .md (validates format compatibility),
+ else JSON/YAML
  - list[BaseModel]: Serialized to JSON or YAML based on extension
  - BaseModel: Serialized to JSON or YAML based on extension
  description: Optional description - USUALLY OMIT THIS (defaults to None).
  Only use when meaningful metadata helps downstream processing
+ sources: Optional list of source strings (document SHA256 hashes or references).
+ Used to track what sources contributed to creating this document.
+ Can contain document SHA256 hashes (for referencing other documents)
+ or arbitrary reference strings (URLs, file paths, descriptions).
+ Defaults to empty list

  Returns:
  New Document instance with content converted to bytes

  Raises:
  ValueError: If content type is not supported for the file extension,
- or if markdown list items contain the separator
+ or if markdown list format is incompatible
  DocumentNameError: If filename violates validation rules
  DocumentSizeError: If content exceeds MAX_CONTENT_SIZE

@@ -276,7 +396,7 @@ class Document(BaseModel, ABC):
  returns the original dictionary {"key": "value"}.

  Example:
- >>> # CORRECT - no description needed (90% of cases)
+ >>> # CORRECT - no description needed (by default, unless instructed otherwise)
  >>> doc = MyDocument.create(name="test.txt", content="Hello World")
  >>> doc.content # b'Hello World'
  >>> doc.parse(str) # "Hello World"
@@ -306,11 +426,31 @@ class Document(BaseModel, ABC):
  >>> items = ["Section 1", "Section 2"]
  >>> doc = MyDocument.create(name="sections.md", content=items)
  >>> doc.parse(list) # ["Section 1", "Section 2"]
+
+ >>> # Document with sources for provenance tracking
+ >>> source_doc = MyDocument.create(name="source.txt", content="original")
+ >>> derived = MyDocument.create(
+ ... name="result.txt",
+ ... content="processed",
+ ... sources=[source_doc.sha256, "https://api.example.com/data"]
+ ... )
+ >>> derived.get_source_documents() # [source_doc.sha256]
+ >>> derived.get_source_references() # ["https://api.example.com/data"]
  """
  # Use model_validate to leverage the existing validator logic
- temp = cls.model_validate({"name": name, "content": content, "description": description})
+ temp = cls.model_validate({
+ "name": name,
+ "content": content,
+ "description": description,
+ "sources": sources,
+ })
  # Now construct with type-checker-friendly call (bytes only)
- return cls(name=temp.name, content=temp.content, description=temp.description)
+ return cls(
+ name=temp.name,
+ content=temp.content,
+ description=temp.description,
+ sources=temp.sources,
+ )

  def __init__(
  self,
@@ -318,6 +458,7 @@ class Document(BaseModel, ABC):
  name: str,
  content: bytes,
  description: str | None = None,
+ sources: list[str] = [],
  ) -> None:
  """Initialize a Document instance with raw bytes content.

@@ -335,6 +476,10 @@ class Document(BaseModel, ABC):
  name: Document filename (required, keyword-only)
  content: Document content as raw bytes (required, keyword-only)
  description: Optional human-readable description (keyword-only)
+ sources: Optional list of source strings for provenance tracking.
+ Can contain document SHA256 hashes (for referencing other documents)
+ or arbitrary reference strings (URLs, file paths, descriptions).
+ Defaults to empty list

  Raises:
  TypeError: If attempting to instantiate Document directly.
@@ -349,19 +494,21 @@ class Document(BaseModel, ABC):
  >>> doc = MyDocument.create(name="data.json", content={"key": "value"})
  >>> doc = MyDocument.create(name="config.yaml", content=my_model)
  >>> doc = MyDocument.create(name="items.md", content=["item1", "item2"])
-
- See Also:
- create: Recommended factory method with automatic type conversion
- parse: Method to reverse the conversion done by create
  """
  if type(self) is Document:
  raise TypeError("Cannot instantiate abstract Document class directly")

- super().__init__(name=name, content=content, description=description)
+ super().__init__(name=name, content=content, description=description, sources=sources)

  name: str
  description: str | None = None
  content: bytes # Note: constructor accepts str | bytes, but field stores bytes only
+ sources: list[str] = Field(
+ default_factory=list,
+ description="List of source references for tracking document provenance. "
+ "Can contain document SHA256 hashes (for referencing other documents) "
+ "or arbitrary reference strings (URLs, file paths, descriptions)",
+ )

  # Pydantic configuration
  model_config = ConfigDict(
@@ -383,8 +530,7 @@ class Document(BaseModel, ABC):

  Note:
  This method determines document persistence and lifecycle.
- FlowDocument returns "flow", TaskDocument returns "task",
- TemporaryDocument returns "temporary".
+ FlowDocument returns "flow", TaskDocument returns "task".
  """
  raise NotImplementedError("Subclasses must implement this method")

@@ -436,7 +582,7 @@ class Document(BaseModel, ABC):
  during execution.

  Returns:
- True if this is a TemporaryDocument, False otherwise.
+ True if this document is temporary, False otherwise.
  """
  return self.get_base_type() == "temporary"

@@ -481,8 +627,6 @@ class Document(BaseModel, ABC):
  def validate_file_name(cls, name: str) -> None:
  """Validate that a file name matches allowed patterns.

- @public
-
  DO NOT OVERRIDE this method if you define a FILES enum!
  The validation is automatic when FILES enum is present.

@@ -526,7 +670,7 @@ class Document(BaseModel, ABC):

  Ensures the document name is secure and follows conventions:
  - No path traversal characters (.., \\, /)
- - Cannot end with .description.md
+ - Cannot end with .description.md or .sources.json
  - No leading/trailing whitespace
  - Must match FILES enum if defined

@@ -551,6 +695,9 @@ class Document(BaseModel, ABC):
  f"Document names cannot end with {cls.DESCRIPTION_EXTENSION}: {v}"
  )

+ if v.endswith(cls.SOURCES_EXTENSION):
+ raise DocumentNameError(f"Document names cannot end with {cls.SOURCES_EXTENSION}: {v}")
+
  if ".." in v or "\\" in v or "/" in v:
  raise DocumentNameError(f"Invalid filename - contains path traversal characters: {v}")

@@ -575,7 +722,7 @@ class Document(BaseModel, ABC):
  2. str → UTF-8 encoding
  3. dict/BaseModel + .json → JSON serialization (indented)
  4. dict/BaseModel + .yaml/.yml → YAML serialization
- 5. list[str] + .md → Join with markdown separator (validates no items contain separator)
+ 5. list[str] + .md → Join with markdown sections (validates format compatibility)
  6. list[Any] + .json/.yaml → JSON/YAML array
  7. int/float/bool + .json → JSON primitive

@@ -795,7 +942,7 @@ class Document(BaseModel, ABC):
  This is computed once and cached for performance.
  The hash is deterministic based on content only.
  """
- return b32encode(hashlib.sha256(self.content).digest()).decode("ascii").upper()
+ return b32encode(hashlib.sha256(self.content).digest()).decode("ascii").upper().rstrip("=")

  @final
  @property
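The change above strips base32 padding from the content hash. This is lossless, not a new hash: a 32-byte SHA-256 digest always base32-encodes to 56 characters ending in exactly four "=" signs, so removing them leaves a fixed 52-character identifier. A quick illustration (our own, not package code):

import hashlib
from base64 import b32encode

digest = hashlib.sha256(b"example content").digest()
padded = b32encode(digest).decode("ascii")  # already uppercase

assert len(digest) == 32
assert len(padded) == 56 and padded.endswith("====")
assert len(padded.rstrip("=")) == 52  # what sha256 now returns

Note that hashes produced by 0.1.13 still carry the trailing "====", so values persisted by older releases will not compare equal to 0.2.0 hashes without the same stripping.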
@@ -944,8 +1091,6 @@ class Document(BaseModel, ABC):
  def as_yaml(self) -> Any:
  r"""Parse document content as YAML.

- @public
-
  Parses the document's text content as YAML and returns Python objects.
  Uses ruamel.yaml which is safe by default (no code execution).

@@ -973,8 +1118,6 @@ class Document(BaseModel, ABC):
  def as_json(self) -> Any:
  """Parse document content as JSON.

- @public
-
  Parses the document's text content as JSON and returns Python objects.
  Document must contain valid JSON text.

@@ -1069,7 +1212,7 @@ class Document(BaseModel, ABC):

  @public

- Splits text content using markdown separator ("\n\n-----------------\n\n").
+ Splits text content automatically using markdown section separators.
  Designed for markdown documents with multiple sections.

  Returns:
@@ -1084,9 +1227,9 @@ class Document(BaseModel, ABC):
  >>> doc = MyDocument.create(name="book.md", content=sections)
  >>> doc.as_markdown_list() # Returns original sections

- >>> # Manual creation with separator
- >>> content = "Part 1\n\n-----------------\n\nPart 2\n\n-----------------\n\nPart 3"
- >>> doc2 = MyDocument(name="parts.md", content=content.encode())
+ >>> # Round-trip conversion works automatically
+ >>> sections = ["Part 1", "Part 2", "Part 3"]
+ >>> doc2 = MyDocument.create(name="parts.md", content=sections)
  >>> doc2.as_markdown_list() # ['Part 1', 'Part 2', 'Part 3']
  """
  return self.text.split(self.MARKDOWN_LIST_SEPARATOR)
@@ -1123,7 +1266,7 @@ class Document(BaseModel, ABC):
  Extension Rules:
  - .json → JSON parsing for dict/list/BaseModel
  - .yaml/.yml → YAML parsing for dict/list/BaseModel
- - .md + list → Split by markdown separator
+ - .md + list → Split automatically into sections
  - Any + str → UTF-8 decode
  - Any + bytes → Raw content

@@ -1139,8 +1282,7 @@ class Document(BaseModel, ABC):

  >>> # Markdown list
  >>> items = ["Item 1", "Item 2"]
- >>> content = "\n\n---\n\n".join(items).encode()
- >>> doc = MyDocument(name="list.md", content=content)
+ >>> doc = MyDocument.create(name="list.md", content=items)
  >>> doc.parse(list)
  ['Item 1', 'Item 2']
  """
@@ -1215,6 +1357,129 @@ class Document(BaseModel, ABC):

  raise ValueError(f"Unsupported type {type_} for file {self.name}")

+ def get_source_documents(self) -> list[str]:
+ """Get list of document SHA256 hashes referenced as sources.
+
+ Retrieves all document references from this document's sources list,
+ filtering for valid SHA256 hashes that reference other documents.
+ This is useful for building dependency graphs and tracking document
+ lineage in processing pipelines.
+
+ Returns:
+ List of SHA256 hashes (base32 encoded) for documents referenced
+ as sources. Each hash uniquely identifies another document that
+ contributed to creating this one.
+
+ Example:
+ >>> # Create a derived document from multiple sources
+ >>> source1 = MyDocument.create(name="data1.txt", content="First")
+ >>> source2 = MyDocument.create(name="data2.txt", content="Second")
+ >>>
+ >>> merged = MyDocument.create(
+ ... name="merged.txt",
+ ... content="Combined data",
+ ... sources=[source1.sha256, source2.sha256, "https://api.example.com"]
+ ... )
+ >>>
+ >>> # Get only document references (not URLs)
+ >>> doc_refs = merged.get_source_documents()
+ >>> print(doc_refs) # [source1.sha256, source2.sha256]
+ >>>
+ >>> # Check if specific document is a source
+ >>> if source1.sha256 in doc_refs:
+ ... print("Document derived from source1")
+ """
+ return [src for src in self.sources if is_document_sha256(src)]
+
+ def get_source_references(self) -> list[str]:
+ """Get list of arbitrary reference strings from sources.
+
+ Retrieves all non-document references from this document's sources list.
+ These are typically URLs, file paths, API endpoints, or descriptive strings
+ that indicate where the document's content originated from, but are not
+ references to other documents in the pipeline.
+
+ Returns:
+ List of reference strings that are not document SHA256 hashes.
+ Can include URLs, file paths, API endpoints, dataset names,
+ or any other string that provides source context.
+
+ Example:
+ >>> # Create document with mixed source types
+ >>> doc = MyDocument.create(
+ ... name="report.txt",
+ ... content="Analysis results",
+ ... sources=[
+ ... other_doc.sha256, # Document reference
+ ... "https://api.example.com/data", # API URL
+ ... "dataset:customer-2024", # Dataset identifier
+ ... "/path/to/source.csv", # File path
+ ... ]
+ ... )
+ >>>
+ >>> # Get only non-document references
+ >>> refs = doc.get_source_references()
+ >>> print(refs)
+ >>> # ["https://api.example.com/data", "dataset:customer-2024", "/path/to/source.csv"]
+ >>>
+ >>> # Use for attribution or debugging
+ >>> for ref in refs:
+ ... print(f"Data sourced from: {ref}")
+ """
+ return [src for src in self.sources if not is_document_sha256(src)]
+
+ def has_source(self, source: Document | str) -> bool:
+ """Check if a specific source is tracked for this document.
+
+ Verifies whether a given source (document or reference string) is
+ included in this document's sources list. Useful for dependency
+ checking, lineage verification, and conditional processing based
+ on document origins.
+
+ Args:
+ source: Source to check for. Can be:
+ - Document: Checks if document's SHA256 is in sources
+ - str: Checks if exact string is in sources (hash or reference)
+
+ Returns:
+ True if the source is tracked in this document's sources,
+ False otherwise.
+
+ Raises:
+ TypeError: If source is not a Document or string.
+
+ Example:
+ >>> # Check if document was derived from specific source
+ >>> source_doc = MyDocument.create(name="original.txt", content="Data")
+ >>> api_url = "https://api.example.com/data"
+ >>>
+ >>> derived = MyDocument.create(
+ ... name="processed.txt",
+ ... content="Processed data",
+ ... sources=[source_doc.sha256, api_url]
+ ... )
+ >>>
+ >>> # Check document source
+ >>> if derived.has_source(source_doc):
+ ... print("Derived from source_doc")
+ >>>
+ >>> # Check string reference
+ >>> if derived.has_source(api_url):
+ ... print("Data from API")
+ >>>
+ >>> # Check by SHA256 directly
+ >>> if derived.has_source(source_doc.sha256):
+ ... print("Has specific hash")
+ """
+ if isinstance(source, str):
+ # Direct string comparison
+ return source in self.sources
+ elif isinstance(source, Document): # type: ignore[misc]
+ # Check if document's SHA256 is in sources
+ return source.sha256 in self.sources
+ else:
+ raise TypeError(f"Invalid source type: {type(source)}")
+
  @final
  def serialize_model(self) -> dict[str, Any]:
  """Serialize document to dictionary for storage or transmission.
@@ -1230,8 +1495,9 @@ class Document(BaseModel, ABC):
  - base_type: Persistence type - "flow", "task", or "temporary" (str)
  - size: Content size in bytes (int)
  - id: Short hash identifier, first 6 chars of SHA256 (str)
- - sha256: Full SHA256 hash in base32 encoding (str)
+ - sha256: Full SHA256 hash in base32 encoding without padding (str)
  - mime_type: Detected MIME type (str)
+ - sources: List of source strings (list[str])
  - content: Encoded content (str)
  - content_encoding: Either "utf-8" or "base64" (str)

@@ -1254,6 +1520,7 @@ class Document(BaseModel, ABC):
  "id": self.id,
  "sha256": self.sha256,
  "mime_type": self.mime_type,
+ "sources": self.sources,
  }

  # Try to encode content as UTF-8, fall back to base64
@@ -1288,6 +1555,7 @@ class Document(BaseModel, ABC):
  Optional keys:
  - description: Document description (str | None)
  - content_encoding: "utf-8" or "base64" (defaults to "utf-8")
+ - sources: List of source strings

  Returns:
  New Document instance with restored content.
@@ -1326,9 +1594,9 @@ class Document(BaseModel, ABC):
  else:
  raise ValueError(f"Invalid content type: {type(content_raw)}")

- # Create document with the required fields
  return cls(
  name=data["name"],
  content=content,
  description=data.get("description"),
+ sources=data.get("sources", []),
  )
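The serialization hunks above close the loop on provenance: serialize_model now emits the sources list, and the deserializing classmethod restores it, defaulting to [] for payloads written by 0.1.13. A short sketch of the serializing half (MyDocument is a hypothetical FlowDocument subclass; the deserializing classmethod's name is not visible in this diff, so it is not exercised here):

source = MyDocument.create(name="input.txt", content="raw data")
derived = MyDocument.create(
    name="output.txt",
    content="processed",
    sources=[source.sha256, "https://example.com/feed"],
)

data = derived.serialize_model()
assert data["sources"] == [source.sha256, "https://example.com/feed"]
assert data["sha256"] == derived.sha256  # base32, no "=" padding
assert data["content_encoding"] == "utf-8"  # text content round-trips as UTF-8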