ai-pipeline-core 0.1.13__tar.gz → 0.1.14__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {ai_pipeline_core-0.1.13 → ai_pipeline_core-0.1.14}/PKG-INFO +42 -7
- {ai_pipeline_core-0.1.13 → ai_pipeline_core-0.1.14}/README.md +41 -6
- {ai_pipeline_core-0.1.13 → ai_pipeline_core-0.1.14}/ai_pipeline_core/__init__.py +5 -2
- {ai_pipeline_core-0.1.13 → ai_pipeline_core-0.1.14}/ai_pipeline_core/documents/__init__.py +2 -1
- {ai_pipeline_core-0.1.13 → ai_pipeline_core-0.1.14}/ai_pipeline_core/documents/document.py +239 -14
- {ai_pipeline_core-0.1.13 → ai_pipeline_core-0.1.14}/ai_pipeline_core/documents/document_list.py +72 -16
- {ai_pipeline_core-0.1.13 → ai_pipeline_core-0.1.14}/ai_pipeline_core/documents/flow_document.py +6 -23
- {ai_pipeline_core-0.1.13 → ai_pipeline_core-0.1.14}/ai_pipeline_core/documents/task_document.py +6 -23
- {ai_pipeline_core-0.1.13 → ai_pipeline_core-0.1.14}/ai_pipeline_core/documents/temporary_document.py +5 -19
- {ai_pipeline_core-0.1.13 → ai_pipeline_core-0.1.14}/ai_pipeline_core/documents/utils.py +64 -1
- {ai_pipeline_core-0.1.13 → ai_pipeline_core-0.1.14}/ai_pipeline_core/flow/options.py +2 -2
- {ai_pipeline_core-0.1.13 → ai_pipeline_core-0.1.14}/ai_pipeline_core/llm/__init__.py +5 -0
- {ai_pipeline_core-0.1.13 → ai_pipeline_core-0.1.14}/ai_pipeline_core/llm/ai_messages.py +0 -3
- {ai_pipeline_core-0.1.13 → ai_pipeline_core-0.1.14}/ai_pipeline_core/llm/client.py +50 -19
- {ai_pipeline_core-0.1.13 → ai_pipeline_core-0.1.14}/ai_pipeline_core/llm/model_options.py +18 -0
- {ai_pipeline_core-0.1.13 → ai_pipeline_core-0.1.14}/ai_pipeline_core/llm/model_response.py +62 -15
- ai_pipeline_core-0.1.14/ai_pipeline_core/llm/model_types.py +86 -0
- {ai_pipeline_core-0.1.13 → ai_pipeline_core-0.1.14}/ai_pipeline_core/pipeline.py +28 -2
- {ai_pipeline_core-0.1.13 → ai_pipeline_core-0.1.14}/ai_pipeline_core/simple_runner/simple_runner.py +18 -1
- {ai_pipeline_core-0.1.13 → ai_pipeline_core-0.1.14}/ai_pipeline_core/tracing.py +113 -6
- {ai_pipeline_core-0.1.13 → ai_pipeline_core-0.1.14}/pyproject.toml +2 -2
- ai_pipeline_core-0.1.13/ai_pipeline_core/llm/model_types.py +0 -84
- {ai_pipeline_core-0.1.13 → ai_pipeline_core-0.1.14}/.gitignore +0 -0
- {ai_pipeline_core-0.1.13 → ai_pipeline_core-0.1.14}/LICENSE +0 -0
- {ai_pipeline_core-0.1.13 → ai_pipeline_core-0.1.14}/ai_pipeline_core/documents/mime_type.py +0 -0
- {ai_pipeline_core-0.1.13 → ai_pipeline_core-0.1.14}/ai_pipeline_core/exceptions.py +0 -0
- {ai_pipeline_core-0.1.13 → ai_pipeline_core-0.1.14}/ai_pipeline_core/flow/__init__.py +0 -0
- {ai_pipeline_core-0.1.13 → ai_pipeline_core-0.1.14}/ai_pipeline_core/flow/config.py +0 -0
- {ai_pipeline_core-0.1.13 → ai_pipeline_core-0.1.14}/ai_pipeline_core/logging/__init__.py +0 -0
- {ai_pipeline_core-0.1.13 → ai_pipeline_core-0.1.14}/ai_pipeline_core/logging/logging.yml +0 -0
- {ai_pipeline_core-0.1.13 → ai_pipeline_core-0.1.14}/ai_pipeline_core/logging/logging_config.py +0 -0
- {ai_pipeline_core-0.1.13 → ai_pipeline_core-0.1.14}/ai_pipeline_core/logging/logging_mixin.py +0 -0
- {ai_pipeline_core-0.1.13 → ai_pipeline_core-0.1.14}/ai_pipeline_core/prefect.py +0 -0
- {ai_pipeline_core-0.1.13 → ai_pipeline_core-0.1.14}/ai_pipeline_core/prompt_manager.py +0 -0
- {ai_pipeline_core-0.1.13 → ai_pipeline_core-0.1.14}/ai_pipeline_core/py.typed +0 -0
- {ai_pipeline_core-0.1.13 → ai_pipeline_core-0.1.14}/ai_pipeline_core/settings.py +0 -0
- {ai_pipeline_core-0.1.13 → ai_pipeline_core-0.1.14}/ai_pipeline_core/simple_runner/__init__.py +0 -0
- {ai_pipeline_core-0.1.13 → ai_pipeline_core-0.1.14}/ai_pipeline_core/simple_runner/cli.py +0 -0
PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: ai-pipeline-core
-Version: 0.1.13
+Version: 0.1.14
 Summary: Core utilities for AI-powered processing pipelines using prefect
 Project-URL: Homepage, https://github.com/bbarwik/ai-pipeline-core
 Project-URL: Repository, https://github.com/bbarwik/ai-pipeline-core
@@ -57,11 +57,11 @@ AI Pipeline Core is a production-ready framework that combines document processi
 
 ### Key Features
 
-- **Document Processing**: Type-safe handling of text, JSON, YAML, PDFs, and images with automatic MIME type detection
-- **LLM Integration**: Unified interface to any model via LiteLLM proxy with
+- **Document Processing**: Type-safe handling of text, JSON, YAML, PDFs, and images with automatic MIME type detection and provenance tracking
+- **LLM Integration**: Unified interface to any model via LiteLLM proxy with configurable context caching
 - **Structured Output**: Type-safe generation with Pydantic model validation
 - **Workflow Orchestration**: Prefect-based flows and tasks with automatic retries
-- **Observability**: Built-in distributed tracing via Laminar (LMNR) for debugging and monitoring
+- **Observability**: Built-in distributed tracing via Laminar (LMNR) with cost tracking for debugging and monitoring
 - **Local Development**: Simple runner for testing pipelines without infrastructure
 
 ## Installation
@@ -178,6 +178,19 @@ doc = MyDocument.create(
 # Parse back to original type
 data = doc.parse(dict)  # Returns {"key": "value"}
 
+# Document provenance tracking (new in v0.1.14)
+doc_with_sources = MyDocument.create(
+    name="derived.json",
+    content={"result": "processed"},
+    sources=[source_doc.sha256, "https://api.example.com/data"]
+)
+
+# Check provenance
+for hash in doc_with_sources.get_source_documents():
+    print(f"Derived from document: {hash}")
+for ref in doc_with_sources.get_source_references():
+    print(f"External source: {ref}")
+
 # Temporary documents (never persisted)
 temp = TemporaryDocument.create(
     name="api_response.json",

@@ -211,6 +224,10 @@ if doc.is_text:
 
 # Parse structured data
 data = doc.as_json()  # or as_yaml(), as_pydantic_model()
+
+# Enhanced filtering (new in v0.1.14)
+filtered = documents.filter_by([Doc1, Doc2, Doc3])  # Multiple types
+named = documents.filter_by(["file1.txt", "file2.txt"])  # Multiple names
 ```
 
 ### LLM Integration
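The two hunks above introduce sources-based provenance and list-valued filter_by. A minimal sketch tying them together, assuming DocumentList can be built directly from a plain list; TemporaryDocument is used because the README creates it directly, and the file names are invented for the example:

```python
from ai_pipeline_core.documents import DocumentList, TemporaryDocument

report = TemporaryDocument.create(name="report.md", content="# Q1 report")
summary = TemporaryDocument.create(
    name="summary.md",
    content="Q1 in one paragraph",
    sources=[report.sha256, "https://example.com/q1-source"],  # provenance
)

docs = DocumentList([report, summary])  # assumed: DocumentList accepts a plain list
selected = docs.filter_by(["summary.md"])  # filter_by with a list of names (or types)

for doc in selected:
    print(doc.has_source(report))        # True: report.sha256 is recorded as a source
    print(doc.get_source_documents())    # [report.sha256]
    print(doc.get_source_references())   # ["https://example.com/q1-source"]
```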
@@ -233,7 +250,7 @@ static_context = AIMessages([large_document])
 # First call: caches context
 r1 = await llm.generate(
     model="gpt-5",
-    context=static_context,  # Cached for 120 seconds
+    context=static_context,  # Cached for 120 seconds by default
     messages="Summarize"  # Dynamic query
 )
 

@@ -243,6 +260,22 @@ r2 = await llm.generate(
     context=static_context,  # Reused from cache!
     messages="Key points?"  # Different query
 )
+
+# Custom cache TTL (new in v0.1.14)
+response = await llm.generate(
+    model="gpt-5",
+    context=static_context,
+    messages="Analyze",
+    options=ModelOptions(cache_ttl="300s")  # Cache for 5 minutes
+)
+
+# Disable caching for dynamic contexts
+response = await llm.generate(
+    model="gpt-5",
+    context=dynamic_context,
+    messages="Process",
+    options=ModelOptions(cache_ttl=None)  # No caching
+)
 ```
 
 ### Flow Configuration
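Because cache_ttl is just a field on ModelOptions, callers can centralize the choice. A minimal sketch, assuming llm and ModelOptions are importable as shown (the README's import lines are not part of this hunk); only the "300s"/None values and the 120-second default come from the diff itself:

```python
from ai_pipeline_core import llm
from ai_pipeline_core.llm import ModelOptions


async def generate_with_cache(context, prompt: str, reuse_expected: bool = True):
    # "300s" keeps the context cached for five minutes; None disables caching,
    # and omitting the option falls back to the 120-second default noted above.
    ttl = "300s" if reuse_expected else None
    return await llm.generate(
        model="gpt-5",
        context=context,
        messages=prompt,
        options=ModelOptions(cache_ttl=ttl),
    )
```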
@@ -272,11 +305,13 @@ class ProcessingConfig(FlowConfig):
 Enhanced decorators with built-in tracing and monitoring:
 
 ```python
-from ai_pipeline_core import pipeline_flow, pipeline_task
+from ai_pipeline_core import pipeline_flow, pipeline_task, set_trace_cost
 
 @pipeline_task  # Automatic retry, tracing, and monitoring
 async def process_chunk(data: str) -> str:
-
+    result = await transform(data)
+    set_trace_cost(0.05)  # Track costs (new in v0.1.14)
+    return result
 
 @pipeline_flow  # Full observability and orchestration
 async def main_flow(
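set_trace_cost attaches a cost figure to the current trace. A minimal sketch of calling it from a task with a computed value rather than a constant; transform() and the pricing rate are stand-ins invented for this example, while pipeline_task and set_trace_cost(<float>) come from the hunk above:

```python
from ai_pipeline_core import pipeline_task, set_trace_cost

PRICE_PER_MB = 0.01  # illustrative rate, not from the package


async def transform(data: str) -> str:
    # Stand-in for real work (LLM call, parsing, enrichment, ...)
    return data.upper()


@pipeline_task
async def process_chunk(data: str) -> str:
    result = await transform(data)
    # Record an estimated cost for this task's trace span
    set_trace_cost(len(data.encode()) / 1_000_000 * PRICE_PER_MB)
    return result
```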
README.md

@@ -13,11 +13,11 @@ AI Pipeline Core is a production-ready framework that combines document processi
 
 ### Key Features
 
-- **Document Processing**: Type-safe handling of text, JSON, YAML, PDFs, and images with automatic MIME type detection
-- **LLM Integration**: Unified interface to any model via LiteLLM proxy with
+- **Document Processing**: Type-safe handling of text, JSON, YAML, PDFs, and images with automatic MIME type detection and provenance tracking
+- **LLM Integration**: Unified interface to any model via LiteLLM proxy with configurable context caching
 - **Structured Output**: Type-safe generation with Pydantic model validation
 - **Workflow Orchestration**: Prefect-based flows and tasks with automatic retries
-- **Observability**: Built-in distributed tracing via Laminar (LMNR) for debugging and monitoring
+- **Observability**: Built-in distributed tracing via Laminar (LMNR) with cost tracking for debugging and monitoring
 - **Local Development**: Simple runner for testing pipelines without infrastructure
 
 ## Installation
@@ -134,6 +134,19 @@ doc = MyDocument.create(
 # Parse back to original type
 data = doc.parse(dict)  # Returns {"key": "value"}
 
+# Document provenance tracking (new in v0.1.14)
+doc_with_sources = MyDocument.create(
+    name="derived.json",
+    content={"result": "processed"},
+    sources=[source_doc.sha256, "https://api.example.com/data"]
+)
+
+# Check provenance
+for hash in doc_with_sources.get_source_documents():
+    print(f"Derived from document: {hash}")
+for ref in doc_with_sources.get_source_references():
+    print(f"External source: {ref}")
+
 # Temporary documents (never persisted)
 temp = TemporaryDocument.create(
     name="api_response.json",

@@ -167,6 +180,10 @@ if doc.is_text:
 
 # Parse structured data
 data = doc.as_json()  # or as_yaml(), as_pydantic_model()
+
+# Enhanced filtering (new in v0.1.14)
+filtered = documents.filter_by([Doc1, Doc2, Doc3])  # Multiple types
+named = documents.filter_by(["file1.txt", "file2.txt"])  # Multiple names
 ```
 
 ### LLM Integration
@@ -189,7 +206,7 @@ static_context = AIMessages([large_document])
 # First call: caches context
 r1 = await llm.generate(
     model="gpt-5",
-    context=static_context,  # Cached for 120 seconds
+    context=static_context,  # Cached for 120 seconds by default
     messages="Summarize"  # Dynamic query
 )
 

@@ -199,6 +216,22 @@ r2 = await llm.generate(
     context=static_context,  # Reused from cache!
     messages="Key points?"  # Different query
 )
+
+# Custom cache TTL (new in v0.1.14)
+response = await llm.generate(
+    model="gpt-5",
+    context=static_context,
+    messages="Analyze",
+    options=ModelOptions(cache_ttl="300s")  # Cache for 5 minutes
+)
+
+# Disable caching for dynamic contexts
+response = await llm.generate(
+    model="gpt-5",
+    context=dynamic_context,
+    messages="Process",
+    options=ModelOptions(cache_ttl=None)  # No caching
+)
 ```
 
 ### Flow Configuration
@@ -228,11 +261,13 @@ class ProcessingConfig(FlowConfig):
 Enhanced decorators with built-in tracing and monitoring:
 
 ```python
-from ai_pipeline_core import pipeline_flow, pipeline_task
+from ai_pipeline_core import pipeline_flow, pipeline_task, set_trace_cost
 
 @pipeline_task  # Automatic retry, tracing, and monitoring
 async def process_chunk(data: str) -> str:
-
+    result = await transform(data)
+    set_trace_cost(0.05)  # Track costs (new in v0.1.14)
+    return result
 
 @pipeline_flow  # Full observability and orchestration
 async def main_flow(
ai_pipeline_core/__init__.py

@@ -88,6 +88,7 @@ from .documents import (
     TaskDocument,
     TemporaryDocument,
     canonical_name_key,
+    is_document_sha256,
     sanitize_url,
 )
 from .flow import FlowConfig, FlowOptions
@@ -111,9 +112,9 @@ from .pipeline import pipeline_flow, pipeline_task
 from .prefect import disable_run_logger, prefect_test_harness
 from .prompt_manager import PromptManager
 from .settings import Settings
-from .tracing import TraceInfo, TraceLevel, trace
+from .tracing import TraceInfo, TraceLevel, set_trace_cost, trace
 
-__version__ = "0.1.13"
+__version__ = "0.1.14"
 
 __all__ = [
     # Config/Settings
@@ -132,6 +133,7 @@ __all__ = [
     "TaskDocument",
     "TemporaryDocument",
     "canonical_name_key",
+    "is_document_sha256",
     "sanitize_url",
     # Flow/Task
     "FlowConfig",

@@ -154,6 +156,7 @@ __all__ = [
     "trace",
     "TraceLevel",
     "TraceInfo",
+    "set_trace_cost",
     # Utils
     "PromptManager",
 ]
ai_pipeline_core/documents/__init__.py

@@ -12,7 +12,7 @@ from .document_list import DocumentList
 from .flow_document import FlowDocument
 from .task_document import TaskDocument
 from .temporary_document import TemporaryDocument
-from .utils import canonical_name_key, sanitize_url
+from .utils import canonical_name_key, is_document_sha256, sanitize_url
 
 __all__ = [
     "Document",

@@ -21,5 +21,6 @@ __all__ = [
     "TaskDocument",
     "TemporaryDocument",
     "canonical_name_key",
+    "is_document_sha256",
     "sanitize_url",
 ]
ai_pipeline_core/documents/document.py

@@ -6,6 +6,8 @@ This module provides the core document abstraction for working with various type
 in AI pipelines. Documents are immutable Pydantic models that wrap binary content with metadata.
 """
 
+from __future__ import annotations
+
 import base64
 import hashlib
 import json

@@ -30,13 +32,14 @@ from typing import (
 from pydantic import (
     BaseModel,
     ConfigDict,
+    Field,
     ValidationInfo,
     field_serializer,
     field_validator,
 )
 from ruamel.yaml import YAML
 
-from ai_pipeline_core.documents.utils import canonical_name_key
+from ai_pipeline_core.documents.utils import canonical_name_key, is_document_sha256
 from ai_pipeline_core.exceptions import DocumentNameError, DocumentSizeError
 
 from .mime_type import (
@@ -94,6 +97,7 @@ class Document(BaseModel, ABC):
     - SHA256 hashing for deduplication
     - Support for text, JSON, YAML, PDF, and image formats
     - Conversion utilities between different formats
+    - Source provenance tracking via sources field
 
     Class Variables:
         MAX_CONTENT_SIZE: Maximum allowed content size in bytes (default 25MB)

@@ -102,6 +106,7 @@ class Document(BaseModel, ABC):
         name: Document filename (validated for security)
         description: Optional human-readable description
         content: Raw document content as bytes
+        sources: List of source references tracking document provenance
 
     Creating Documents:
         **Use the `create` classmethod** for most use cases. It accepts various

@@ -117,7 +122,7 @@ class Document(BaseModel, ABC):
     Warning:
         - Document subclasses should NOT start with 'Test' prefix (pytest conflict)
         - Cannot instantiate Document directly - must subclass FlowDocument or TaskDocument
-        - Cannot add custom fields - only name, description, content are allowed
+        - Cannot add custom fields - only name, description, content, sources are allowed
         - Document is an abstract class and cannot be instantiated directly
 
     Metadata Attachment Patterns:

@@ -145,6 +150,15 @@ class Document(BaseModel, ABC):
         >>> doc = MyDocument.create(name="data.json", content={"key": "value"})
         >>> print(doc.is_text)  # True
         >>> data = doc.as_json()  # {'key': 'value'}
+        >>>
+        >>> # Track document provenance with sources
+        >>> source_doc = MyDocument.create(name="input.txt", content="raw data")
+        >>> processed = MyDocument.create(
+        ...     name="output.txt",
+        ...     content="processed data",
+        ...     sources=[source_doc.sha256]  # Reference source document
+        ... )
+        >>> processed.has_source(source_doc)  # True
     """
 
     MAX_CONTENT_SIZE: ClassVar[int] = 25 * 1024 * 1024
@@ -193,7 +207,7 @@ class Document(BaseModel, ABC):
         )
         # Check that the Document's model_fields only contain the allowed fields
         # It prevents AI models from adding additional fields to documents
-        allowed = {"name", "description", "content"}
+        allowed = {"name", "description", "content", "sources"}
         current = set(getattr(cls, "model_fields", {}).keys())
         extras = current - allowed
         if extras:

@@ -204,25 +218,58 @@ class Document(BaseModel, ABC):
 
     @overload
     @classmethod
-    def create(
+    def create(
+        cls,
+        *,
+        name: str,
+        content: bytes,
+        description: str | None = None,
+        sources: list[str] = [],
+    ) -> Self: ...
 
     @overload
     @classmethod
-    def create(
+    def create(
+        cls,
+        *,
+        name: str,
+        content: str,
+        description: str | None = None,
+        sources: list[str] = [],
+    ) -> Self: ...
 
     @overload
     @classmethod
     def create(
-        cls,
+        cls,
+        *,
+        name: str,
+        content: dict[str, Any],
+        description: str | None = None,
+        sources: list[str] = [],
     ) -> Self: ...
 
     @overload
     @classmethod
-    def create(
+    def create(
+        cls,
+        *,
+        name: str,
+        content: list[Any],
+        description: str | None = None,
+        sources: list[str] = [],
+    ) -> Self: ...
 
     @overload
     @classmethod
-    def create(
+    def create(
+        cls,
+        *,
+        name: str,
+        content: BaseModel,
+        description: str | None = None,
+        sources: list[str] = [],
+    ) -> Self: ...
 
     @classmethod
     def create(
@@ -231,6 +278,7 @@ class Document(BaseModel, ABC):
         name: str,
         content: str | bytes | dict[str, Any] | list[Any] | BaseModel,
         description: str | None = None,
+        sources: list[str] = [],
     ) -> Self:
         r"""Create a Document with automatic content type conversion (recommended).
 

@@ -260,6 +308,11 @@ class Document(BaseModel, ABC):
                 - BaseModel: Serialized to JSON or YAML based on extension
             description: Optional description - USUALLY OMIT THIS (defaults to None).
                 Only use when meaningful metadata helps downstream processing
+            sources: Optional list of source strings (document SHA256 hashes or references).
+                Used to track what sources contributed to creating this document.
+                Can contain document SHA256 hashes (for referencing other documents)
+                or arbitrary reference strings (URLs, file paths, descriptions).
+                Defaults to empty list
 
         Returns:
             New Document instance with content converted to bytes

@@ -306,11 +359,31 @@ class Document(BaseModel, ABC):
             >>> items = ["Section 1", "Section 2"]
             >>> doc = MyDocument.create(name="sections.md", content=items)
             >>> doc.parse(list)  # ["Section 1", "Section 2"]
+
+            >>> # Document with sources for provenance tracking
+            >>> source_doc = MyDocument.create(name="source.txt", content="original")
+            >>> derived = MyDocument.create(
+            ...     name="result.txt",
+            ...     content="processed",
+            ...     sources=[source_doc.sha256, "https://api.example.com/data"]
+            ... )
+            >>> derived.get_source_documents()  # [source_doc.sha256]
+            >>> derived.get_source_references()  # ["https://api.example.com/data"]
         """
         # Use model_validate to leverage the existing validator logic
-        temp = cls.model_validate({
+        temp = cls.model_validate({
+            "name": name,
+            "content": content,
+            "description": description,
+            "sources": sources,
+        })
         # Now construct with type-checker-friendly call (bytes only)
-        return cls(
+        return cls(
+            name=temp.name,
+            content=temp.content,
+            description=temp.description,
+            sources=temp.sources,
+        )
 
     def __init__(
         self,
@@ -318,6 +391,7 @@ class Document(BaseModel, ABC):
         name: str,
         content: bytes,
         description: str | None = None,
+        sources: list[str] = [],
     ) -> None:
         """Initialize a Document instance with raw bytes content.
 

@@ -335,6 +409,10 @@ class Document(BaseModel, ABC):
             name: Document filename (required, keyword-only)
             content: Document content as raw bytes (required, keyword-only)
             description: Optional human-readable description (keyword-only)
+            sources: Optional list of source strings for provenance tracking.
+                Can contain document SHA256 hashes (for referencing other documents)
+                or arbitrary reference strings (URLs, file paths, descriptions).
+                Defaults to empty list
 
         Raises:
             TypeError: If attempting to instantiate Document directly.

@@ -357,11 +435,17 @@ class Document(BaseModel, ABC):
         if type(self) is Document:
             raise TypeError("Cannot instantiate abstract Document class directly")
 
-        super().__init__(name=name, content=content, description=description)
+        super().__init__(name=name, content=content, description=description, sources=sources)
 
     name: str
     description: str | None = None
     content: bytes  # Note: constructor accepts str | bytes, but field stores bytes only
+    sources: list[str] = Field(
+        default_factory=list,
+        description="List of source references for tracking document provenance. "
+        "Can contain document SHA256 hashes (for referencing other documents) "
+        "or arbitrary reference strings (URLs, file paths, descriptions)",
+    )
 
     # Pydantic configuration
     model_config = ConfigDict(
@@ -795,7 +879,7 @@ class Document(BaseModel, ABC):
         This is computed once and cached for performance.
         The hash is deterministic based on content only.
         """
-        return b32encode(hashlib.sha256(self.content).digest()).decode("ascii").upper()
+        return b32encode(hashlib.sha256(self.content).digest()).decode("ascii").upper().rstrip("=")
 
     @final
     @property
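The unpadded form introduced above can be reproduced with the same expression outside the class: a 32-byte SHA-256 digest base32-encodes to 56 characters, of which the last four are "=" padding, so the stored value is always 52 characters long.

```python
import hashlib
from base64 import b32encode

digest = hashlib.sha256(b"example content").digest()                  # 32 bytes
sha256_field = b32encode(digest).decode("ascii").upper().rstrip("=")  # mirrors Document.sha256
assert len(sha256_field) == 52                                        # was 56 with "====" padding
```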
@@ -1215,6 +1299,144 @@
 
         raise ValueError(f"Unsupported type {type_} for file {self.name}")
 
+    def get_source_documents(self) -> list[str]:
+        """Get list of document SHA256 hashes referenced as sources.
+
+        Retrieves all document references from this document's sources list,
+        filtering for valid SHA256 hashes that reference other documents.
+        This is useful for building dependency graphs and tracking document
+        lineage in processing pipelines.
+
+        Returns:
+            List of SHA256 hashes (base32 encoded) for documents referenced
+            as sources. Each hash uniquely identifies another document that
+            contributed to creating this one.
+
+        Example:
+            >>> # Create a derived document from multiple sources
+            >>> source1 = MyDocument.create(name="data1.txt", content="First")
+            >>> source2 = MyDocument.create(name="data2.txt", content="Second")
+            >>>
+            >>> merged = MyDocument.create(
+            ...     name="merged.txt",
+            ...     content="Combined data",
+            ...     sources=[source1.sha256, source2.sha256, "https://api.example.com"]
+            ... )
+            >>>
+            >>> # Get only document references (not URLs)
+            >>> doc_refs = merged.get_source_documents()
+            >>> print(doc_refs)  # [source1.sha256, source2.sha256]
+            >>>
+            >>> # Check if specific document is a source
+            >>> if source1.sha256 in doc_refs:
+            ...     print("Document derived from source1")
+
+        See Also:
+            - get_source_references: Get non-document source references (URLs, etc.)
+            - has_source: Check if a specific source is tracked
+            - Document.create: Add sources when creating documents
+        """
+        return [src for src in self.sources if is_document_sha256(src)]
+
+    def get_source_references(self) -> list[str]:
+        """Get list of arbitrary reference strings from sources.
+
+        Retrieves all non-document references from this document's sources list.
+        These are typically URLs, file paths, API endpoints, or descriptive strings
+        that indicate where the document's content originated from, but are not
+        references to other documents in the pipeline.
+
+        Returns:
+            List of reference strings that are not document SHA256 hashes.
+            Can include URLs, file paths, API endpoints, dataset names,
+            or any other string that provides source context.
+
+        Example:
+            >>> # Create document with mixed source types
+            >>> doc = MyDocument.create(
+            ...     name="report.txt",
+            ...     content="Analysis results",
+            ...     sources=[
+            ...         other_doc.sha256,  # Document reference
+            ...         "https://api.example.com/data",  # API URL
+            ...         "dataset:customer-2024",  # Dataset identifier
+            ...         "/path/to/source.csv",  # File path
+            ...     ]
+            ... )
+            >>>
+            >>> # Get only non-document references
+            >>> refs = doc.get_source_references()
+            >>> print(refs)
+            >>> # ["https://api.example.com/data", "dataset:customer-2024", "/path/to/source.csv"]
+            >>>
+            >>> # Use for attribution or debugging
+            >>> for ref in refs:
+            ...     print(f"Data sourced from: {ref}")
+
+        See Also:
+            - get_source_documents: Get document SHA256 references
+            - has_source: Check if a specific source is tracked
+            - Document.create: Add sources when creating documents
+        """
+        return [src for src in self.sources if not is_document_sha256(src)]
+
+    def has_source(self, source: Document | str) -> bool:
+        """Check if a specific source is tracked for this document.
+
+        Verifies whether a given source (document or reference string) is
+        included in this document's sources list. Useful for dependency
+        checking, lineage verification, and conditional processing based
+        on document origins.
+
+        Args:
+            source: Source to check for. Can be:
+                - Document: Checks if document's SHA256 is in sources
+                - str: Checks if exact string is in sources (hash or reference)
+
+        Returns:
+            True if the source is tracked in this document's sources,
+            False otherwise.
+
+        Raises:
+            TypeError: If source is not a Document or string.
+
+        Example:
+            >>> # Check if document was derived from specific source
+            >>> source_doc = MyDocument.create(name="original.txt", content="Data")
+            >>> api_url = "https://api.example.com/data"
+            >>>
+            >>> derived = MyDocument.create(
+            ...     name="processed.txt",
+            ...     content="Processed data",
+            ...     sources=[source_doc.sha256, api_url]
+            ... )
+            >>>
+            >>> # Check document source
+            >>> if derived.has_source(source_doc):
+            ...     print("Derived from source_doc")
+            >>>
+            >>> # Check string reference
+            >>> if derived.has_source(api_url):
+            ...     print("Data from API")
+            >>>
+            >>> # Check by SHA256 directly
+            >>> if derived.has_source(source_doc.sha256):
+            ...     print("Has specific hash")
+
+        See Also:
+            - get_source_documents: Get all document sources
+            - get_source_references: Get all reference sources
+            - Document.create: Add sources when creating documents
+        """
+        if isinstance(source, str):
+            # Direct string comparison
+            return source in self.sources
+        elif isinstance(source, Document):  # type: ignore[misc]
+            # Check if document's SHA256 is in sources
+            return source.sha256 in self.sources
+        else:
+            raise TypeError(f"Invalid source type: {type(source)}")
+
     @final
     def serialize_model(self) -> dict[str, Any]:
         """Serialize document to dictionary for storage or transmission.
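Both getters above delegate to is_document_sha256, whose body lives in utils.py and is not shown in this diff. A plausible sketch, purely as an assumption, recognizing the 52-character unpadded base32 form produced by Document.sha256:

```python
import re

# Hypothetical re-implementation for illustration only; the real helper is
# ai_pipeline_core.documents.utils.is_document_sha256.
_B32_SHA256 = re.compile(r"^[A-Z2-7]{52}$")


def is_document_sha256_sketch(value: str) -> bool:
    # 32-byte SHA-256 digest -> 52 base32 characters once "=" padding is stripped
    return bool(_B32_SHA256.fullmatch(value))
```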
@@ -1230,8 +1452,9 @@
             - base_type: Persistence type - "flow", "task", or "temporary" (str)
             - size: Content size in bytes (int)
             - id: Short hash identifier, first 6 chars of SHA256 (str)
-            - sha256: Full SHA256 hash in base32 encoding (str)
+            - sha256: Full SHA256 hash in base32 encoding without padding (str)
             - mime_type: Detected MIME type (str)
+            - sources: List of source strings (list[dict])
             - content: Encoded content (str)
             - content_encoding: Either "utf-8" or "base64" (str)
 

@@ -1254,6 +1477,7 @@
             "id": self.id,
             "sha256": self.sha256,
             "mime_type": self.mime_type,
+            "sources": self.sources,
         }
 
         # Try to encode content as UTF-8, fall back to base64

@@ -1288,6 +1512,7 @@
         Optional keys:
             - description: Document description (str | None)
             - content_encoding: "utf-8" or "base64" (defaults to "utf-8")
+            - sources: List of source strings
 
         Returns:
             New Document instance with restored content.

@@ -1326,9 +1551,9 @@
         else:
             raise ValueError(f"Invalid content type: {type(content_raw)}")
 
-        # Create document with the required fields
         return cls(
             name=data["name"],
             content=content,
             description=data.get("description"),
+            sources=data.get("sources", []),
         )
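Taken together, the serialization hunks mean stored documents keep their provenance. A short sketch of the write side through serialize_model, where MyDocument is a hypothetical FlowDocument subclass and only the shown keys ("sha256", "sources") are taken from the hunks above:

```python
from ai_pipeline_core.documents import FlowDocument


class MyDocument(FlowDocument):
    """Hypothetical document type used only for this example."""


source = MyDocument.create(name="input.txt", content="raw data")
derived = MyDocument.create(
    name="output.txt",
    content="processed",
    sources=[source.sha256, "https://api.example.com/data"],
)

payload = derived.serialize_model()
print(payload["sha256"])   # unpadded base32 SHA-256 (52 characters)
print(payload["sources"])  # [source.sha256, "https://api.example.com/data"]
```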