ai-pipeline-core 0.1.13.tar.gz → 0.1.14.tar.gz

This diff compares two publicly released versions of the package as they appear in the public registry and is provided for informational purposes only.
Files changed (38)
  1. {ai_pipeline_core-0.1.13 → ai_pipeline_core-0.1.14}/PKG-INFO +42 -7
  2. {ai_pipeline_core-0.1.13 → ai_pipeline_core-0.1.14}/README.md +41 -6
  3. {ai_pipeline_core-0.1.13 → ai_pipeline_core-0.1.14}/ai_pipeline_core/__init__.py +5 -2
  4. {ai_pipeline_core-0.1.13 → ai_pipeline_core-0.1.14}/ai_pipeline_core/documents/__init__.py +2 -1
  5. {ai_pipeline_core-0.1.13 → ai_pipeline_core-0.1.14}/ai_pipeline_core/documents/document.py +239 -14
  6. {ai_pipeline_core-0.1.13 → ai_pipeline_core-0.1.14}/ai_pipeline_core/documents/document_list.py +72 -16
  7. {ai_pipeline_core-0.1.13 → ai_pipeline_core-0.1.14}/ai_pipeline_core/documents/flow_document.py +6 -23
  8. {ai_pipeline_core-0.1.13 → ai_pipeline_core-0.1.14}/ai_pipeline_core/documents/task_document.py +6 -23
  9. {ai_pipeline_core-0.1.13 → ai_pipeline_core-0.1.14}/ai_pipeline_core/documents/temporary_document.py +5 -19
  10. {ai_pipeline_core-0.1.13 → ai_pipeline_core-0.1.14}/ai_pipeline_core/documents/utils.py +64 -1
  11. {ai_pipeline_core-0.1.13 → ai_pipeline_core-0.1.14}/ai_pipeline_core/flow/options.py +2 -2
  12. {ai_pipeline_core-0.1.13 → ai_pipeline_core-0.1.14}/ai_pipeline_core/llm/__init__.py +5 -0
  13. {ai_pipeline_core-0.1.13 → ai_pipeline_core-0.1.14}/ai_pipeline_core/llm/ai_messages.py +0 -3
  14. {ai_pipeline_core-0.1.13 → ai_pipeline_core-0.1.14}/ai_pipeline_core/llm/client.py +50 -19
  15. {ai_pipeline_core-0.1.13 → ai_pipeline_core-0.1.14}/ai_pipeline_core/llm/model_options.py +18 -0
  16. {ai_pipeline_core-0.1.13 → ai_pipeline_core-0.1.14}/ai_pipeline_core/llm/model_response.py +62 -15
  17. ai_pipeline_core-0.1.14/ai_pipeline_core/llm/model_types.py +86 -0
  18. {ai_pipeline_core-0.1.13 → ai_pipeline_core-0.1.14}/ai_pipeline_core/pipeline.py +28 -2
  19. {ai_pipeline_core-0.1.13 → ai_pipeline_core-0.1.14}/ai_pipeline_core/simple_runner/simple_runner.py +18 -1
  20. {ai_pipeline_core-0.1.13 → ai_pipeline_core-0.1.14}/ai_pipeline_core/tracing.py +113 -6
  21. {ai_pipeline_core-0.1.13 → ai_pipeline_core-0.1.14}/pyproject.toml +2 -2
  22. ai_pipeline_core-0.1.13/ai_pipeline_core/llm/model_types.py +0 -84
  23. {ai_pipeline_core-0.1.13 → ai_pipeline_core-0.1.14}/.gitignore +0 -0
  24. {ai_pipeline_core-0.1.13 → ai_pipeline_core-0.1.14}/LICENSE +0 -0
  25. {ai_pipeline_core-0.1.13 → ai_pipeline_core-0.1.14}/ai_pipeline_core/documents/mime_type.py +0 -0
  26. {ai_pipeline_core-0.1.13 → ai_pipeline_core-0.1.14}/ai_pipeline_core/exceptions.py +0 -0
  27. {ai_pipeline_core-0.1.13 → ai_pipeline_core-0.1.14}/ai_pipeline_core/flow/__init__.py +0 -0
  28. {ai_pipeline_core-0.1.13 → ai_pipeline_core-0.1.14}/ai_pipeline_core/flow/config.py +0 -0
  29. {ai_pipeline_core-0.1.13 → ai_pipeline_core-0.1.14}/ai_pipeline_core/logging/__init__.py +0 -0
  30. {ai_pipeline_core-0.1.13 → ai_pipeline_core-0.1.14}/ai_pipeline_core/logging/logging.yml +0 -0
  31. {ai_pipeline_core-0.1.13 → ai_pipeline_core-0.1.14}/ai_pipeline_core/logging/logging_config.py +0 -0
  32. {ai_pipeline_core-0.1.13 → ai_pipeline_core-0.1.14}/ai_pipeline_core/logging/logging_mixin.py +0 -0
  33. {ai_pipeline_core-0.1.13 → ai_pipeline_core-0.1.14}/ai_pipeline_core/prefect.py +0 -0
  34. {ai_pipeline_core-0.1.13 → ai_pipeline_core-0.1.14}/ai_pipeline_core/prompt_manager.py +0 -0
  35. {ai_pipeline_core-0.1.13 → ai_pipeline_core-0.1.14}/ai_pipeline_core/py.typed +0 -0
  36. {ai_pipeline_core-0.1.13 → ai_pipeline_core-0.1.14}/ai_pipeline_core/settings.py +0 -0
  37. {ai_pipeline_core-0.1.13 → ai_pipeline_core-0.1.14}/ai_pipeline_core/simple_runner/__init__.py +0 -0
  38. {ai_pipeline_core-0.1.13 → ai_pipeline_core-0.1.14}/ai_pipeline_core/simple_runner/cli.py +0 -0
{ai_pipeline_core-0.1.13 → ai_pipeline_core-0.1.14}/PKG-INFO

@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: ai-pipeline-core
- Version: 0.1.13
+ Version: 0.1.14
  Summary: Core utilities for AI-powered processing pipelines using prefect
  Project-URL: Homepage, https://github.com/bbarwik/ai-pipeline-core
  Project-URL: Repository, https://github.com/bbarwik/ai-pipeline-core
@@ -57,11 +57,11 @@ AI Pipeline Core is a production-ready framework that combines document processi

  ### Key Features

- - **Document Processing**: Type-safe handling of text, JSON, YAML, PDFs, and images with automatic MIME type detection
- - **LLM Integration**: Unified interface to any model via LiteLLM proxy with intelligent context caching
+ - **Document Processing**: Type-safe handling of text, JSON, YAML, PDFs, and images with automatic MIME type detection and provenance tracking
+ - **LLM Integration**: Unified interface to any model via LiteLLM proxy with configurable context caching
  - **Structured Output**: Type-safe generation with Pydantic model validation
  - **Workflow Orchestration**: Prefect-based flows and tasks with automatic retries
- - **Observability**: Built-in distributed tracing via Laminar (LMNR) for debugging and monitoring
+ - **Observability**: Built-in distributed tracing via Laminar (LMNR) with cost tracking for debugging and monitoring
  - **Local Development**: Simple runner for testing pipelines without infrastructure

  ## Installation
@@ -178,6 +178,19 @@ doc = MyDocument.create(
  # Parse back to original type
  data = doc.parse(dict) # Returns {"key": "value"}

+ # Document provenance tracking (new in v0.1.14)
+ doc_with_sources = MyDocument.create(
+     name="derived.json",
+     content={"result": "processed"},
+     sources=[source_doc.sha256, "https://api.example.com/data"]
+ )
+
+ # Check provenance
+ for hash in doc_with_sources.get_source_documents():
+     print(f"Derived from document: {hash}")
+ for ref in doc_with_sources.get_source_references():
+     print(f"External source: {ref}")
+
  # Temporary documents (never persisted)
  temp = TemporaryDocument.create(
      name="api_response.json",
@@ -211,6 +224,10 @@ if doc.is_text:

  # Parse structured data
  data = doc.as_json() # or as_yaml(), as_pydantic_model()
+
+ # Enhanced filtering (new in v0.1.14)
+ filtered = documents.filter_by([Doc1, Doc2, Doc3]) # Multiple types
+ named = documents.filter_by(["file1.txt", "file2.txt"]) # Multiple names
  ```

  ### LLM Integration
@@ -233,7 +250,7 @@ static_context = AIMessages([large_document])
  # First call: caches context
  r1 = await llm.generate(
      model="gpt-5",
-     context=static_context, # Cached for 120 seconds
+     context=static_context, # Cached for 120 seconds by default
      messages="Summarize" # Dynamic query
  )

@@ -243,6 +260,22 @@ r2 = await llm.generate(
      context=static_context, # Reused from cache!
      messages="Key points?" # Different query
  )
+
+ # Custom cache TTL (new in v0.1.14)
+ response = await llm.generate(
+     model="gpt-5",
+     context=static_context,
+     messages="Analyze",
+     options=ModelOptions(cache_ttl="300s") # Cache for 5 minutes
+ )
+
+ # Disable caching for dynamic contexts
+ response = await llm.generate(
+     model="gpt-5",
+     context=dynamic_context,
+     messages="Process",
+     options=ModelOptions(cache_ttl=None) # No caching
+ )
  ```

  ### Flow Configuration
@@ -272,11 +305,13 @@ class ProcessingConfig(FlowConfig):
  Enhanced decorators with built-in tracing and monitoring:

  ```python
- from ai_pipeline_core import pipeline_flow, pipeline_task
+ from ai_pipeline_core import pipeline_flow, pipeline_task, set_trace_cost

  @pipeline_task # Automatic retry, tracing, and monitoring
  async def process_chunk(data: str) -> str:
-     return await transform(data)
+     result = await transform(data)
+     set_trace_cost(0.05) # Track costs (new in v0.1.14)
+     return result

  @pipeline_flow # Full observability and orchestration
  async def main_flow(
{ai_pipeline_core-0.1.13 → ai_pipeline_core-0.1.14}/README.md

@@ -13,11 +13,11 @@ AI Pipeline Core is a production-ready framework that combines document processi

  ### Key Features

- - **Document Processing**: Type-safe handling of text, JSON, YAML, PDFs, and images with automatic MIME type detection
- - **LLM Integration**: Unified interface to any model via LiteLLM proxy with intelligent context caching
+ - **Document Processing**: Type-safe handling of text, JSON, YAML, PDFs, and images with automatic MIME type detection and provenance tracking
+ - **LLM Integration**: Unified interface to any model via LiteLLM proxy with configurable context caching
  - **Structured Output**: Type-safe generation with Pydantic model validation
  - **Workflow Orchestration**: Prefect-based flows and tasks with automatic retries
- - **Observability**: Built-in distributed tracing via Laminar (LMNR) for debugging and monitoring
+ - **Observability**: Built-in distributed tracing via Laminar (LMNR) with cost tracking for debugging and monitoring
  - **Local Development**: Simple runner for testing pipelines without infrastructure

  ## Installation
@@ -134,6 +134,19 @@ doc = MyDocument.create(
  # Parse back to original type
  data = doc.parse(dict) # Returns {"key": "value"}

+ # Document provenance tracking (new in v0.1.14)
+ doc_with_sources = MyDocument.create(
+     name="derived.json",
+     content={"result": "processed"},
+     sources=[source_doc.sha256, "https://api.example.com/data"]
+ )
+
+ # Check provenance
+ for hash in doc_with_sources.get_source_documents():
+     print(f"Derived from document: {hash}")
+ for ref in doc_with_sources.get_source_references():
+     print(f"External source: {ref}")
+
  # Temporary documents (never persisted)
  temp = TemporaryDocument.create(
      name="api_response.json",
@@ -167,6 +180,10 @@ if doc.is_text:

  # Parse structured data
  data = doc.as_json() # or as_yaml(), as_pydantic_model()
+
+ # Enhanced filtering (new in v0.1.14)
+ filtered = documents.filter_by([Doc1, Doc2, Doc3]) # Multiple types
+ named = documents.filter_by(["file1.txt", "file2.txt"]) # Multiple names
  ```

  ### LLM Integration
@@ -189,7 +206,7 @@ static_context = AIMessages([large_document])
  # First call: caches context
  r1 = await llm.generate(
      model="gpt-5",
-     context=static_context, # Cached for 120 seconds
+     context=static_context, # Cached for 120 seconds by default
      messages="Summarize" # Dynamic query
  )

@@ -199,6 +216,22 @@ r2 = await llm.generate(
      context=static_context, # Reused from cache!
      messages="Key points?" # Different query
  )
+
+ # Custom cache TTL (new in v0.1.14)
+ response = await llm.generate(
+     model="gpt-5",
+     context=static_context,
+     messages="Analyze",
+     options=ModelOptions(cache_ttl="300s") # Cache for 5 minutes
+ )
+
+ # Disable caching for dynamic contexts
+ response = await llm.generate(
+     model="gpt-5",
+     context=dynamic_context,
+     messages="Process",
+     options=ModelOptions(cache_ttl=None) # No caching
+ )
  ```

  ### Flow Configuration
@@ -228,11 +261,13 @@ class ProcessingConfig(FlowConfig):
  Enhanced decorators with built-in tracing and monitoring:

  ```python
- from ai_pipeline_core import pipeline_flow, pipeline_task
+ from ai_pipeline_core import pipeline_flow, pipeline_task, set_trace_cost

  @pipeline_task # Automatic retry, tracing, and monitoring
  async def process_chunk(data: str) -> str:
-     return await transform(data)
+     result = await transform(data)
+     set_trace_cost(0.05) # Track costs (new in v0.1.14)
+     return result

  @pipeline_flow # Full observability and orchestration
  async def main_flow(
{ai_pipeline_core-0.1.13 → ai_pipeline_core-0.1.14}/ai_pipeline_core/__init__.py

@@ -88,6 +88,7 @@ from .documents import (
      TaskDocument,
      TemporaryDocument,
      canonical_name_key,
+     is_document_sha256,
      sanitize_url,
  )
  from .flow import FlowConfig, FlowOptions
@@ -111,9 +112,9 @@ from .pipeline import pipeline_flow, pipeline_task
  from .prefect import disable_run_logger, prefect_test_harness
  from .prompt_manager import PromptManager
  from .settings import Settings
- from .tracing import TraceInfo, TraceLevel, trace
+ from .tracing import TraceInfo, TraceLevel, set_trace_cost, trace

- __version__ = "0.1.13"
+ __version__ = "0.1.14"

  __all__ = [
      # Config/Settings
@@ -132,6 +133,7 @@ __all__ = [
      "TaskDocument",
      "TemporaryDocument",
      "canonical_name_key",
+     "is_document_sha256",
      "sanitize_url",
      # Flow/Task
      "FlowConfig",
@@ -154,6 +156,7 @@ __all__ = [
      "trace",
      "TraceLevel",
      "TraceInfo",
+     "set_trace_cost",
      # Utils
      "PromptManager",
  ]
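Taken together, these `__init__.py` hunks expose two new top-level names: `is_document_sha256` and `set_trace_cost`. A minimal sketch of how the pair might be used together; the task itself is illustrative and not part of the package:

```python
# Sketch only: is_document_sha256, set_trace_cost and pipeline_task are the
# exports shown above; the task body and its argument are illustrative.
from ai_pipeline_core import is_document_sha256, pipeline_task, set_trace_cost


@pipeline_task
async def label_sources(sources: list[str]) -> dict[str, str]:
    # Separate hashes of other documents from free-form reference strings
    labels = {
        src: "document" if is_document_sha256(src) else "reference"
        for src in sources
    }
    set_trace_cost(0.01)  # attach an estimated cost to the current trace span
    return labels
```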
{ai_pipeline_core-0.1.13 → ai_pipeline_core-0.1.14}/ai_pipeline_core/documents/__init__.py

@@ -12,7 +12,7 @@ from .document_list import DocumentList
  from .flow_document import FlowDocument
  from .task_document import TaskDocument
  from .temporary_document import TemporaryDocument
- from .utils import canonical_name_key, sanitize_url
+ from .utils import canonical_name_key, is_document_sha256, sanitize_url

  __all__ = [
      "Document",
@@ -21,5 +21,6 @@ __all__ = [
      "TaskDocument",
      "TemporaryDocument",
      "canonical_name_key",
+     "is_document_sha256",
      "sanitize_url",
  ]
{ai_pipeline_core-0.1.13 → ai_pipeline_core-0.1.14}/ai_pipeline_core/documents/document.py

@@ -6,6 +6,8 @@ This module provides the core document abstraction for working with various type
  in AI pipelines. Documents are immutable Pydantic models that wrap binary content with metadata.
  """

+ from __future__ import annotations
+
  import base64
  import hashlib
  import json
@@ -30,13 +32,14 @@ from typing import (
  from pydantic import (
      BaseModel,
      ConfigDict,
+     Field,
      ValidationInfo,
      field_serializer,
      field_validator,
  )
  from ruamel.yaml import YAML

- from ai_pipeline_core.documents.utils import canonical_name_key
+ from ai_pipeline_core.documents.utils import canonical_name_key, is_document_sha256
  from ai_pipeline_core.exceptions import DocumentNameError, DocumentSizeError

  from .mime_type import (
@@ -94,6 +97,7 @@ class Document(BaseModel, ABC):
      - SHA256 hashing for deduplication
      - Support for text, JSON, YAML, PDF, and image formats
      - Conversion utilities between different formats
+     - Source provenance tracking via sources field

      Class Variables:
          MAX_CONTENT_SIZE: Maximum allowed content size in bytes (default 25MB)
@@ -102,6 +106,7 @@ class Document(BaseModel, ABC):
          name: Document filename (validated for security)
          description: Optional human-readable description
          content: Raw document content as bytes
+         sources: List of source references tracking document provenance

      Creating Documents:
          **Use the `create` classmethod** for most use cases. It accepts various
@@ -117,7 +122,7 @@ class Document(BaseModel, ABC):
      Warning:
          - Document subclasses should NOT start with 'Test' prefix (pytest conflict)
          - Cannot instantiate Document directly - must subclass FlowDocument or TaskDocument
-         - Cannot add custom fields - only name, description, content are allowed
+         - Cannot add custom fields - only name, description, content, sources are allowed
          - Document is an abstract class and cannot be instantiated directly

      Metadata Attachment Patterns:
@@ -145,6 +150,15 @@ class Document(BaseModel, ABC):
          >>> doc = MyDocument.create(name="data.json", content={"key": "value"})
          >>> print(doc.is_text) # True
          >>> data = doc.as_json() # {'key': 'value'}
+         >>>
+         >>> # Track document provenance with sources
+         >>> source_doc = MyDocument.create(name="input.txt", content="raw data")
+         >>> processed = MyDocument.create(
+         ...     name="output.txt",
+         ...     content="processed data",
+         ...     sources=[source_doc.sha256] # Reference source document
+         ... )
+         >>> processed.has_source(source_doc) # True
      """

      MAX_CONTENT_SIZE: ClassVar[int] = 25 * 1024 * 1024
@@ -193,7 +207,7 @@ class Document(BaseModel, ABC):
          )
          # Check that the Document's model_fields only contain the allowed fields
          # It prevents AI models from adding additional fields to documents
-         allowed = {"name", "description", "content"}
+         allowed = {"name", "description", "content", "sources"}
          current = set(getattr(cls, "model_fields", {}).keys())
          extras = current - allowed
          if extras:
@@ -204,25 +218,58 @@ class Document(BaseModel, ABC):

      @overload
      @classmethod
-     def create(cls, *, name: str, content: bytes, description: str | None = None) -> Self: ...
+     def create(
+         cls,
+         *,
+         name: str,
+         content: bytes,
+         description: str | None = None,
+         sources: list[str] = [],
+     ) -> Self: ...

      @overload
      @classmethod
-     def create(cls, *, name: str, content: str, description: str | None = None) -> Self: ...
+     def create(
+         cls,
+         *,
+         name: str,
+         content: str,
+         description: str | None = None,
+         sources: list[str] = [],
+     ) -> Self: ...

      @overload
      @classmethod
      def create(
-         cls, *, name: str, content: dict[str, Any], description: str | None = None
+         cls,
+         *,
+         name: str,
+         content: dict[str, Any],
+         description: str | None = None,
+         sources: list[str] = [],
      ) -> Self: ...

      @overload
      @classmethod
-     def create(cls, *, name: str, content: list[Any], description: str | None = None) -> Self: ...
+     def create(
+         cls,
+         *,
+         name: str,
+         content: list[Any],
+         description: str | None = None,
+         sources: list[str] = [],
+     ) -> Self: ...

      @overload
      @classmethod
-     def create(cls, *, name: str, content: BaseModel, description: str | None = None) -> Self: ...
+     def create(
+         cls,
+         *,
+         name: str,
+         content: BaseModel,
+         description: str | None = None,
+         sources: list[str] = [],
+     ) -> Self: ...

      @classmethod
      def create(
@@ -231,6 +278,7 @@ class Document(BaseModel, ABC):
          name: str,
          content: str | bytes | dict[str, Any] | list[Any] | BaseModel,
          description: str | None = None,
+         sources: list[str] = [],
      ) -> Self:
          r"""Create a Document with automatic content type conversion (recommended).

@@ -260,6 +308,11 @@ class Document(BaseModel, ABC):
              - BaseModel: Serialized to JSON or YAML based on extension
              description: Optional description - USUALLY OMIT THIS (defaults to None).
                  Only use when meaningful metadata helps downstream processing
+             sources: Optional list of source strings (document SHA256 hashes or references).
+                 Used to track what sources contributed to creating this document.
+                 Can contain document SHA256 hashes (for referencing other documents)
+                 or arbitrary reference strings (URLs, file paths, descriptions).
+                 Defaults to empty list

          Returns:
              New Document instance with content converted to bytes
@@ -306,11 +359,31 @@ class Document(BaseModel, ABC):
              >>> items = ["Section 1", "Section 2"]
              >>> doc = MyDocument.create(name="sections.md", content=items)
              >>> doc.parse(list) # ["Section 1", "Section 2"]
+
+             >>> # Document with sources for provenance tracking
+             >>> source_doc = MyDocument.create(name="source.txt", content="original")
+             >>> derived = MyDocument.create(
+             ...     name="result.txt",
+             ...     content="processed",
+             ...     sources=[source_doc.sha256, "https://api.example.com/data"]
+             ... )
+             >>> derived.get_source_documents() # [source_doc.sha256]
+             >>> derived.get_source_references() # ["https://api.example.com/data"]
          """
          # Use model_validate to leverage the existing validator logic
-         temp = cls.model_validate({"name": name, "content": content, "description": description})
+         temp = cls.model_validate({
+             "name": name,
+             "content": content,
+             "description": description,
+             "sources": sources,
+         })
          # Now construct with type-checker-friendly call (bytes only)
-         return cls(name=temp.name, content=temp.content, description=temp.description)
+         return cls(
+             name=temp.name,
+             content=temp.content,
+             description=temp.description,
+             sources=temp.sources,
+         )

      def __init__(
          self,
@@ -318,6 +391,7 @@
          name: str,
          content: bytes,
          description: str | None = None,
+         sources: list[str] = [],
      ) -> None:
          """Initialize a Document instance with raw bytes content.

@@ -335,6 +409,10 @@
              name: Document filename (required, keyword-only)
              content: Document content as raw bytes (required, keyword-only)
              description: Optional human-readable description (keyword-only)
+             sources: Optional list of source strings for provenance tracking.
+                 Can contain document SHA256 hashes (for referencing other documents)
+                 or arbitrary reference strings (URLs, file paths, descriptions).
+                 Defaults to empty list

          Raises:
              TypeError: If attempting to instantiate Document directly.
@@ -357,11 +435,17 @@
          if type(self) is Document:
              raise TypeError("Cannot instantiate abstract Document class directly")

-         super().__init__(name=name, content=content, description=description)
+         super().__init__(name=name, content=content, description=description, sources=sources)

      name: str
      description: str | None = None
      content: bytes # Note: constructor accepts str | bytes, but field stores bytes only
+     sources: list[str] = Field(
+         default_factory=list,
+         description="List of source references for tracking document provenance. "
+         "Can contain document SHA256 hashes (for referencing other documents) "
+         "or arbitrary reference strings (URLs, file paths, descriptions)",
+     )

      # Pydantic configuration
      model_config = ConfigDict(
@@ -795,7 +879,7 @@
          This is computed once and cached for performance.
          The hash is deterministic based on content only.
          """
-         return b32encode(hashlib.sha256(self.content).digest()).decode("ascii").upper()
+         return b32encode(hashlib.sha256(self.content).digest()).decode("ascii").upper().rstrip("=")

      @final
      @property
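The only functional change in this hunk is the trailing `.rstrip("=")`, which drops base32 padding so `Document.sha256` shrinks from 56 to 52 characters. A standalone reproduction using only the standard library (not library code):

```python
# Reproduces the 0.1.14 hash format change with hashlib and base64 only.
import hashlib
from base64 import b32encode

content = b"example content"
encoded = b32encode(hashlib.sha256(content).digest()).decode("ascii").upper()

print(len(encoded))               # 56 -- old format, ends with "===="
print(len(encoded.rstrip("=")))   # 52 -- new unpadded value returned by Document.sha256
```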
@@ -1215,6 +1299,144 @@

          raise ValueError(f"Unsupported type {type_} for file {self.name}")

+     def get_source_documents(self) -> list[str]:
+         """Get list of document SHA256 hashes referenced as sources.
+
+         Retrieves all document references from this document's sources list,
+         filtering for valid SHA256 hashes that reference other documents.
+         This is useful for building dependency graphs and tracking document
+         lineage in processing pipelines.
+
+         Returns:
+             List of SHA256 hashes (base32 encoded) for documents referenced
+             as sources. Each hash uniquely identifies another document that
+             contributed to creating this one.
+
+         Example:
+             >>> # Create a derived document from multiple sources
+             >>> source1 = MyDocument.create(name="data1.txt", content="First")
+             >>> source2 = MyDocument.create(name="data2.txt", content="Second")
+             >>>
+             >>> merged = MyDocument.create(
+             ...     name="merged.txt",
+             ...     content="Combined data",
+             ...     sources=[source1.sha256, source2.sha256, "https://api.example.com"]
+             ... )
+             >>>
+             >>> # Get only document references (not URLs)
+             >>> doc_refs = merged.get_source_documents()
+             >>> print(doc_refs) # [source1.sha256, source2.sha256]
+             >>>
+             >>> # Check if specific document is a source
+             >>> if source1.sha256 in doc_refs:
+             ...     print("Document derived from source1")
+
+         See Also:
+             - get_source_references: Get non-document source references (URLs, etc.)
+             - has_source: Check if a specific source is tracked
+             - Document.create: Add sources when creating documents
+         """
+         return [src for src in self.sources if is_document_sha256(src)]
+
+     def get_source_references(self) -> list[str]:
+         """Get list of arbitrary reference strings from sources.
+
+         Retrieves all non-document references from this document's sources list.
+         These are typically URLs, file paths, API endpoints, or descriptive strings
+         that indicate where the document's content originated from, but are not
+         references to other documents in the pipeline.
+
+         Returns:
+             List of reference strings that are not document SHA256 hashes.
+             Can include URLs, file paths, API endpoints, dataset names,
+             or any other string that provides source context.
+
+         Example:
+             >>> # Create document with mixed source types
+             >>> doc = MyDocument.create(
+             ...     name="report.txt",
+             ...     content="Analysis results",
+             ...     sources=[
+             ...         other_doc.sha256, # Document reference
+             ...         "https://api.example.com/data", # API URL
+             ...         "dataset:customer-2024", # Dataset identifier
+             ...         "/path/to/source.csv", # File path
+             ...     ]
+             ... )
+             >>>
+             >>> # Get only non-document references
+             >>> refs = doc.get_source_references()
+             >>> print(refs)
+             >>> # ["https://api.example.com/data", "dataset:customer-2024", "/path/to/source.csv"]
+             >>>
+             >>> # Use for attribution or debugging
+             >>> for ref in refs:
+             ...     print(f"Data sourced from: {ref}")
+
+         See Also:
+             - get_source_documents: Get document SHA256 references
+             - has_source: Check if a specific source is tracked
+             - Document.create: Add sources when creating documents
+         """
+         return [src for src in self.sources if not is_document_sha256(src)]
+
+     def has_source(self, source: Document | str) -> bool:
+         """Check if a specific source is tracked for this document.
+
+         Verifies whether a given source (document or reference string) is
+         included in this document's sources list. Useful for dependency
+         checking, lineage verification, and conditional processing based
+         on document origins.
+
+         Args:
+             source: Source to check for. Can be:
+                 - Document: Checks if document's SHA256 is in sources
+                 - str: Checks if exact string is in sources (hash or reference)
+
+         Returns:
+             True if the source is tracked in this document's sources,
+             False otherwise.
+
+         Raises:
+             TypeError: If source is not a Document or string.
+
+         Example:
+             >>> # Check if document was derived from specific source
+             >>> source_doc = MyDocument.create(name="original.txt", content="Data")
+             >>> api_url = "https://api.example.com/data"
+             >>>
+             >>> derived = MyDocument.create(
+             ...     name="processed.txt",
+             ...     content="Processed data",
+             ...     sources=[source_doc.sha256, api_url]
+             ... )
+             >>>
+             >>> # Check document source
+             >>> if derived.has_source(source_doc):
+             ...     print("Derived from source_doc")
+             >>>
+             >>> # Check string reference
+             >>> if derived.has_source(api_url):
+             ...     print("Data from API")
+             >>>
+             >>> # Check by SHA256 directly
+             >>> if derived.has_source(source_doc.sha256):
+             ...     print("Has specific hash")
+
+         See Also:
+             - get_source_documents: Get all document sources
+             - get_source_references: Get all reference sources
+             - Document.create: Add sources when creating documents
+         """
+         if isinstance(source, str):
+             # Direct string comparison
+             return source in self.sources
+         elif isinstance(source, Document): # type: ignore[misc]
+             # Check if document's SHA256 is in sources
+             return source.sha256 in self.sources
+         else:
+             raise TypeError(f"Invalid source type: {type(source)}")
+

      @final
      def serialize_model(self) -> dict[str, Any]:
@@ -1230,8 +1452,9 @@
              - base_type: Persistence type - "flow", "task", or "temporary" (str)
              - size: Content size in bytes (int)
              - id: Short hash identifier, first 6 chars of SHA256 (str)
-             - sha256: Full SHA256 hash in base32 encoding (str)
+             - sha256: Full SHA256 hash in base32 encoding without padding (str)
              - mime_type: Detected MIME type (str)
+             - sources: List of source strings (list[dict])
              - content: Encoded content (str)
              - content_encoding: Either "utf-8" or "base64" (str)

@@ -1254,6 +1477,7 @@
              "id": self.id,
              "sha256": self.sha256,
              "mime_type": self.mime_type,
+             "sources": self.sources,
          }

          # Try to encode content as UTF-8, fall back to base64
@@ -1288,6 +1512,7 @@
          Optional keys:
              - description: Document description (str | None)
              - content_encoding: "utf-8" or "base64" (defaults to "utf-8")
+             - sources: List of source strings

          Returns:
              New Document instance with restored content.
@@ -1326,9 +1551,9 @@
          else:
              raise ValueError(f"Invalid content type: {type(content_raw)}")

-         # Create document with the required fields
          return cls(
              name=data["name"],
              content=content,
              description=data.get("description"),
+             sources=data.get("sources", []),
          )
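A short sketch of how the new `sources` key flows through `serialize_model()`; `MyDocument` stands in for any concrete `FlowDocument`/`TaskDocument` subclass, and the deserialization classmethod patched in the final hunk is not named in this excerpt:

```python
# Illustrative only: MyDocument is assumed to be a concrete Document subclass
# defined elsewhere in your project.
source = MyDocument.create(name="input.txt", content="raw data")
derived = MyDocument.create(
    name="output.txt",
    content="processed",
    sources=[source.sha256, "https://api.example.com/data"],
)

payload = derived.serialize_model()
print(payload["sources"])            # [source.sha256, "https://api.example.com/data"]
print(payload["content_encoding"])   # "utf-8" for this text content
```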