ai-pipeline-core 0.1.12__py3-none-any.whl → 0.1.14__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ai_pipeline_core/__init__.py +5 -2
- ai_pipeline_core/documents/__init__.py +2 -1
- ai_pipeline_core/documents/document.py +239 -14
- ai_pipeline_core/documents/document_list.py +72 -16
- ai_pipeline_core/documents/flow_document.py +6 -23
- ai_pipeline_core/documents/task_document.py +6 -23
- ai_pipeline_core/documents/temporary_document.py +5 -19
- ai_pipeline_core/documents/utils.py +64 -1
- ai_pipeline_core/flow/options.py +2 -2
- ai_pipeline_core/llm/__init__.py +5 -0
- ai_pipeline_core/llm/ai_messages.py +0 -3
- ai_pipeline_core/llm/client.py +50 -19
- ai_pipeline_core/llm/model_options.py +18 -0
- ai_pipeline_core/llm/model_response.py +62 -15
- ai_pipeline_core/llm/model_types.py +38 -36
- ai_pipeline_core/pipeline.py +28 -2
- ai_pipeline_core/settings.py +4 -0
- ai_pipeline_core/simple_runner/simple_runner.py +18 -1
- ai_pipeline_core/tracing.py +115 -7
- {ai_pipeline_core-0.1.12.dist-info → ai_pipeline_core-0.1.14.dist-info}/METADATA +42 -7
- ai_pipeline_core-0.1.14.dist-info/RECORD +36 -0
- ai_pipeline_core-0.1.12.dist-info/RECORD +0 -36
- {ai_pipeline_core-0.1.12.dist-info → ai_pipeline_core-0.1.14.dist-info}/WHEEL +0 -0
- {ai_pipeline_core-0.1.12.dist-info → ai_pipeline_core-0.1.14.dist-info}/licenses/LICENSE +0 -0
ai_pipeline_core/__init__.py
CHANGED
|
@@ -88,6 +88,7 @@ from .documents import (
|
|
|
88
88
|
TaskDocument,
|
|
89
89
|
TemporaryDocument,
|
|
90
90
|
canonical_name_key,
|
|
91
|
+
is_document_sha256,
|
|
91
92
|
sanitize_url,
|
|
92
93
|
)
|
|
93
94
|
from .flow import FlowConfig, FlowOptions
|
|
@@ -111,9 +112,9 @@ from .pipeline import pipeline_flow, pipeline_task
|
|
|
111
112
|
from .prefect import disable_run_logger, prefect_test_harness
|
|
112
113
|
from .prompt_manager import PromptManager
|
|
113
114
|
from .settings import Settings
|
|
114
|
-
from .tracing import TraceInfo, TraceLevel, trace
|
|
115
|
+
from .tracing import TraceInfo, TraceLevel, set_trace_cost, trace
|
|
115
116
|
|
|
116
|
-
__version__ = "0.1.12"
|
|
117
|
+
__version__ = "0.1.14"
|
|
117
118
|
|
|
118
119
|
__all__ = [
|
|
119
120
|
# Config/Settings
|
|
@@ -132,6 +133,7 @@ __all__ = [
|
|
|
132
133
|
"TaskDocument",
|
|
133
134
|
"TemporaryDocument",
|
|
134
135
|
"canonical_name_key",
|
|
136
|
+
"is_document_sha256",
|
|
135
137
|
"sanitize_url",
|
|
136
138
|
# Flow/Task
|
|
137
139
|
"FlowConfig",
|
|
@@ -154,6 +156,7 @@ __all__ = [
|
|
|
154
156
|
"trace",
|
|
155
157
|
"TraceLevel",
|
|
156
158
|
"TraceInfo",
|
|
159
|
+
"set_trace_cost",
|
|
157
160
|
# Utils
|
|
158
161
|
"PromptManager",
|
|
159
162
|
]
|
|
@@ -12,7 +12,7 @@ from .document_list import DocumentList
|
|
|
12
12
|
from .flow_document import FlowDocument
|
|
13
13
|
from .task_document import TaskDocument
|
|
14
14
|
from .temporary_document import TemporaryDocument
|
|
15
|
-
from .utils import canonical_name_key, sanitize_url
|
|
15
|
+
from .utils import canonical_name_key, is_document_sha256, sanitize_url
|
|
16
16
|
|
|
17
17
|
__all__ = [
|
|
18
18
|
"Document",
|
|
@@ -21,5 +21,6 @@ __all__ = [
|
|
|
21
21
|
"TaskDocument",
|
|
22
22
|
"TemporaryDocument",
|
|
23
23
|
"canonical_name_key",
|
|
24
|
+
"is_document_sha256",
|
|
24
25
|
"sanitize_url",
|
|
25
26
|
]
|
|
@@ -6,6 +6,8 @@ This module provides the core document abstraction for working with various type
|
|
|
6
6
|
in AI pipelines. Documents are immutable Pydantic models that wrap binary content with metadata.
|
|
7
7
|
"""
|
|
8
8
|
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
9
11
|
import base64
|
|
10
12
|
import hashlib
|
|
11
13
|
import json
|
|
@@ -30,13 +32,14 @@ from typing import (
|
|
|
30
32
|
from pydantic import (
|
|
31
33
|
BaseModel,
|
|
32
34
|
ConfigDict,
|
|
35
|
+
Field,
|
|
33
36
|
ValidationInfo,
|
|
34
37
|
field_serializer,
|
|
35
38
|
field_validator,
|
|
36
39
|
)
|
|
37
40
|
from ruamel.yaml import YAML
|
|
38
41
|
|
|
39
|
-
from ai_pipeline_core.documents.utils import canonical_name_key
|
|
42
|
+
from ai_pipeline_core.documents.utils import canonical_name_key, is_document_sha256
|
|
40
43
|
from ai_pipeline_core.exceptions import DocumentNameError, DocumentSizeError
|
|
41
44
|
|
|
42
45
|
from .mime_type import (
|
|
@@ -94,6 +97,7 @@ class Document(BaseModel, ABC):
|
|
|
94
97
|
- SHA256 hashing for deduplication
|
|
95
98
|
- Support for text, JSON, YAML, PDF, and image formats
|
|
96
99
|
- Conversion utilities between different formats
|
|
100
|
+
- Source provenance tracking via sources field
|
|
97
101
|
|
|
98
102
|
Class Variables:
|
|
99
103
|
MAX_CONTENT_SIZE: Maximum allowed content size in bytes (default 25MB)
|
|
@@ -102,6 +106,7 @@ class Document(BaseModel, ABC):
|
|
|
102
106
|
name: Document filename (validated for security)
|
|
103
107
|
description: Optional human-readable description
|
|
104
108
|
content: Raw document content as bytes
|
|
109
|
+
sources: List of source references tracking document provenance
|
|
105
110
|
|
|
106
111
|
Creating Documents:
|
|
107
112
|
**Use the `create` classmethod** for most use cases. It accepts various
|
|
@@ -117,7 +122,7 @@ class Document(BaseModel, ABC):
|
|
|
117
122
|
Warning:
|
|
118
123
|
- Document subclasses should NOT start with 'Test' prefix (pytest conflict)
|
|
119
124
|
- Cannot instantiate Document directly - must subclass FlowDocument or TaskDocument
|
|
120
|
-
- Cannot add custom fields - only name, description, content are allowed
|
|
125
|
+
- Cannot add custom fields - only name, description, content, sources are allowed
|
|
121
126
|
- Document is an abstract class and cannot be instantiated directly
|
|
122
127
|
|
|
123
128
|
Metadata Attachment Patterns:
|
|
@@ -145,6 +150,15 @@ class Document(BaseModel, ABC):
|
|
|
145
150
|
>>> doc = MyDocument.create(name="data.json", content={"key": "value"})
|
|
146
151
|
>>> print(doc.is_text) # True
|
|
147
152
|
>>> data = doc.as_json() # {'key': 'value'}
|
|
153
|
+
>>>
|
|
154
|
+
>>> # Track document provenance with sources
|
|
155
|
+
>>> source_doc = MyDocument.create(name="input.txt", content="raw data")
|
|
156
|
+
>>> processed = MyDocument.create(
|
|
157
|
+
... name="output.txt",
|
|
158
|
+
... content="processed data",
|
|
159
|
+
... sources=[source_doc.sha256] # Reference source document
|
|
160
|
+
... )
|
|
161
|
+
>>> processed.has_source(source_doc) # True
|
|
148
162
|
"""
|
|
149
163
|
|
|
150
164
|
MAX_CONTENT_SIZE: ClassVar[int] = 25 * 1024 * 1024
|
|
@@ -193,7 +207,7 @@ class Document(BaseModel, ABC):
|
|
|
193
207
|
)
|
|
194
208
|
# Check that the Document's model_fields only contain the allowed fields
|
|
195
209
|
# It prevents AI models from adding additional fields to documents
|
|
196
|
-
allowed = {"name", "description", "content"}
|
|
210
|
+
allowed = {"name", "description", "content", "sources"}
|
|
197
211
|
current = set(getattr(cls, "model_fields", {}).keys())
|
|
198
212
|
extras = current - allowed
|
|
199
213
|
if extras:
|
|
@@ -204,25 +218,58 @@ class Document(BaseModel, ABC):
|
|
|
204
218
|
|
|
205
219
|
@overload
|
|
206
220
|
@classmethod
|
|
207
|
-
def create(
|
|
221
|
+
def create(
|
|
222
|
+
cls,
|
|
223
|
+
*,
|
|
224
|
+
name: str,
|
|
225
|
+
content: bytes,
|
|
226
|
+
description: str | None = None,
|
|
227
|
+
sources: list[str] = [],
|
|
228
|
+
) -> Self: ...
|
|
208
229
|
|
|
209
230
|
@overload
|
|
210
231
|
@classmethod
|
|
211
|
-
def create(
|
|
232
|
+
def create(
|
|
233
|
+
cls,
|
|
234
|
+
*,
|
|
235
|
+
name: str,
|
|
236
|
+
content: str,
|
|
237
|
+
description: str | None = None,
|
|
238
|
+
sources: list[str] = [],
|
|
239
|
+
) -> Self: ...
|
|
212
240
|
|
|
213
241
|
@overload
|
|
214
242
|
@classmethod
|
|
215
243
|
def create(
|
|
216
|
-
cls,
|
|
244
|
+
cls,
|
|
245
|
+
*,
|
|
246
|
+
name: str,
|
|
247
|
+
content: dict[str, Any],
|
|
248
|
+
description: str | None = None,
|
|
249
|
+
sources: list[str] = [],
|
|
217
250
|
) -> Self: ...
|
|
218
251
|
|
|
219
252
|
@overload
|
|
220
253
|
@classmethod
|
|
221
|
-
def create(
|
|
254
|
+
def create(
|
|
255
|
+
cls,
|
|
256
|
+
*,
|
|
257
|
+
name: str,
|
|
258
|
+
content: list[Any],
|
|
259
|
+
description: str | None = None,
|
|
260
|
+
sources: list[str] = [],
|
|
261
|
+
) -> Self: ...
|
|
222
262
|
|
|
223
263
|
@overload
|
|
224
264
|
@classmethod
|
|
225
|
-
def create(
|
|
265
|
+
def create(
|
|
266
|
+
cls,
|
|
267
|
+
*,
|
|
268
|
+
name: str,
|
|
269
|
+
content: BaseModel,
|
|
270
|
+
description: str | None = None,
|
|
271
|
+
sources: list[str] = [],
|
|
272
|
+
) -> Self: ...
|
|
226
273
|
|
|
227
274
|
@classmethod
|
|
228
275
|
def create(
|
|
@@ -231,6 +278,7 @@ class Document(BaseModel, ABC):
|
|
|
231
278
|
name: str,
|
|
232
279
|
content: str | bytes | dict[str, Any] | list[Any] | BaseModel,
|
|
233
280
|
description: str | None = None,
|
|
281
|
+
sources: list[str] = [],
|
|
234
282
|
) -> Self:
|
|
235
283
|
r"""Create a Document with automatic content type conversion (recommended).
|
|
236
284
|
|
|
@@ -260,6 +308,11 @@ class Document(BaseModel, ABC):
|
|
|
260
308
|
- BaseModel: Serialized to JSON or YAML based on extension
|
|
261
309
|
description: Optional description - USUALLY OMIT THIS (defaults to None).
|
|
262
310
|
Only use when meaningful metadata helps downstream processing
|
|
311
|
+
sources: Optional list of source strings (document SHA256 hashes or references).
|
|
312
|
+
Used to track what sources contributed to creating this document.
|
|
313
|
+
Can contain document SHA256 hashes (for referencing other documents)
|
|
314
|
+
or arbitrary reference strings (URLs, file paths, descriptions).
|
|
315
|
+
Defaults to empty list
|
|
263
316
|
|
|
264
317
|
Returns:
|
|
265
318
|
New Document instance with content converted to bytes
|
|
@@ -306,11 +359,31 @@ class Document(BaseModel, ABC):
|
|
|
306
359
|
>>> items = ["Section 1", "Section 2"]
|
|
307
360
|
>>> doc = MyDocument.create(name="sections.md", content=items)
|
|
308
361
|
>>> doc.parse(list) # ["Section 1", "Section 2"]
|
|
362
|
+
|
|
363
|
+
>>> # Document with sources for provenance tracking
|
|
364
|
+
>>> source_doc = MyDocument.create(name="source.txt", content="original")
|
|
365
|
+
>>> derived = MyDocument.create(
|
|
366
|
+
... name="result.txt",
|
|
367
|
+
... content="processed",
|
|
368
|
+
... sources=[source_doc.sha256, "https://api.example.com/data"]
|
|
369
|
+
... )
|
|
370
|
+
>>> derived.get_source_documents() # [source_doc.sha256]
|
|
371
|
+
>>> derived.get_source_references() # ["https://api.example.com/data"]
|
|
309
372
|
"""
|
|
310
373
|
# Use model_validate to leverage the existing validator logic
|
|
311
|
-
temp = cls.model_validate({
|
|
374
|
+
temp = cls.model_validate({
|
|
375
|
+
"name": name,
|
|
376
|
+
"content": content,
|
|
377
|
+
"description": description,
|
|
378
|
+
"sources": sources,
|
|
379
|
+
})
|
|
312
380
|
# Now construct with type-checker-friendly call (bytes only)
|
|
313
|
-
return cls(
|
|
381
|
+
return cls(
|
|
382
|
+
name=temp.name,
|
|
383
|
+
content=temp.content,
|
|
384
|
+
description=temp.description,
|
|
385
|
+
sources=temp.sources,
|
|
386
|
+
)
|
|
314
387
|
|
|
315
388
|
def __init__(
|
|
316
389
|
self,
|
|
@@ -318,6 +391,7 @@ class Document(BaseModel, ABC):
|
|
|
318
391
|
name: str,
|
|
319
392
|
content: bytes,
|
|
320
393
|
description: str | None = None,
|
|
394
|
+
sources: list[str] = [],
|
|
321
395
|
) -> None:
|
|
322
396
|
"""Initialize a Document instance with raw bytes content.
|
|
323
397
|
|
|
@@ -335,6 +409,10 @@ class Document(BaseModel, ABC):
|
|
|
335
409
|
name: Document filename (required, keyword-only)
|
|
336
410
|
content: Document content as raw bytes (required, keyword-only)
|
|
337
411
|
description: Optional human-readable description (keyword-only)
|
|
412
|
+
sources: Optional list of source strings for provenance tracking.
|
|
413
|
+
Can contain document SHA256 hashes (for referencing other documents)
|
|
414
|
+
or arbitrary reference strings (URLs, file paths, descriptions).
|
|
415
|
+
Defaults to empty list
|
|
338
416
|
|
|
339
417
|
Raises:
|
|
340
418
|
TypeError: If attempting to instantiate Document directly.
|
|
@@ -357,11 +435,17 @@ class Document(BaseModel, ABC):
|
|
|
357
435
|
if type(self) is Document:
|
|
358
436
|
raise TypeError("Cannot instantiate abstract Document class directly")
|
|
359
437
|
|
|
360
|
-
super().__init__(name=name, content=content, description=description)
|
|
438
|
+
super().__init__(name=name, content=content, description=description, sources=sources)
|
|
361
439
|
|
|
362
440
|
name: str
|
|
363
441
|
description: str | None = None
|
|
364
442
|
content: bytes # Note: constructor accepts str | bytes, but field stores bytes only
|
|
443
|
+
sources: list[str] = Field(
|
|
444
|
+
default_factory=list,
|
|
445
|
+
description="List of source references for tracking document provenance. "
|
|
446
|
+
"Can contain document SHA256 hashes (for referencing other documents) "
|
|
447
|
+
"or arbitrary reference strings (URLs, file paths, descriptions)",
|
|
448
|
+
)
|
|
365
449
|
|
|
366
450
|
# Pydantic configuration
|
|
367
451
|
model_config = ConfigDict(
|
|
@@ -795,7 +879,7 @@ class Document(BaseModel, ABC):
|
|
|
795
879
|
This is computed once and cached for performance.
|
|
796
880
|
The hash is deterministic based on content only.
|
|
797
881
|
"""
|
|
798
|
-
return b32encode(hashlib.sha256(self.content).digest()).decode("ascii").upper()
|
|
882
|
+
return b32encode(hashlib.sha256(self.content).digest()).decode("ascii").upper().rstrip("=")
|
|
799
883
|
|
|
800
884
|
@final
|
|
801
885
|
@property
|
|
@@ -1215,6 +1299,144 @@ class Document(BaseModel, ABC):
|
|
|
1215
1299
|
|
|
1216
1300
|
raise ValueError(f"Unsupported type {type_} for file {self.name}")
|
|
1217
1301
|
|
|
1302
|
+
def get_source_documents(self) -> list[str]:
|
|
1303
|
+
"""Get list of document SHA256 hashes referenced as sources.
|
|
1304
|
+
|
|
1305
|
+
Retrieves all document references from this document's sources list,
|
|
1306
|
+
filtering for valid SHA256 hashes that reference other documents.
|
|
1307
|
+
This is useful for building dependency graphs and tracking document
|
|
1308
|
+
lineage in processing pipelines.
|
|
1309
|
+
|
|
1310
|
+
Returns:
|
|
1311
|
+
List of SHA256 hashes (base32 encoded) for documents referenced
|
|
1312
|
+
as sources. Each hash uniquely identifies another document that
|
|
1313
|
+
contributed to creating this one.
|
|
1314
|
+
|
|
1315
|
+
Example:
|
|
1316
|
+
>>> # Create a derived document from multiple sources
|
|
1317
|
+
>>> source1 = MyDocument.create(name="data1.txt", content="First")
|
|
1318
|
+
>>> source2 = MyDocument.create(name="data2.txt", content="Second")
|
|
1319
|
+
>>>
|
|
1320
|
+
>>> merged = MyDocument.create(
|
|
1321
|
+
... name="merged.txt",
|
|
1322
|
+
... content="Combined data",
|
|
1323
|
+
... sources=[source1.sha256, source2.sha256, "https://api.example.com"]
|
|
1324
|
+
... )
|
|
1325
|
+
>>>
|
|
1326
|
+
>>> # Get only document references (not URLs)
|
|
1327
|
+
>>> doc_refs = merged.get_source_documents()
|
|
1328
|
+
>>> print(doc_refs) # [source1.sha256, source2.sha256]
|
|
1329
|
+
>>>
|
|
1330
|
+
>>> # Check if specific document is a source
|
|
1331
|
+
>>> if source1.sha256 in doc_refs:
|
|
1332
|
+
... print("Document derived from source1")
|
|
1333
|
+
|
|
1334
|
+
See Also:
|
|
1335
|
+
- get_source_references: Get non-document source references (URLs, etc.)
|
|
1336
|
+
- has_source: Check if a specific source is tracked
|
|
1337
|
+
- Document.create: Add sources when creating documents
|
|
1338
|
+
"""
|
|
1339
|
+
return [src for src in self.sources if is_document_sha256(src)]
|
|
1340
|
+
|
|
1341
|
+
def get_source_references(self) -> list[str]:
|
|
1342
|
+
"""Get list of arbitrary reference strings from sources.
|
|
1343
|
+
|
|
1344
|
+
Retrieves all non-document references from this document's sources list.
|
|
1345
|
+
These are typically URLs, file paths, API endpoints, or descriptive strings
|
|
1346
|
+
that indicate where the document's content originated from, but are not
|
|
1347
|
+
references to other documents in the pipeline.
|
|
1348
|
+
|
|
1349
|
+
Returns:
|
|
1350
|
+
List of reference strings that are not document SHA256 hashes.
|
|
1351
|
+
Can include URLs, file paths, API endpoints, dataset names,
|
|
1352
|
+
or any other string that provides source context.
|
|
1353
|
+
|
|
1354
|
+
Example:
|
|
1355
|
+
>>> # Create document with mixed source types
|
|
1356
|
+
>>> doc = MyDocument.create(
|
|
1357
|
+
... name="report.txt",
|
|
1358
|
+
... content="Analysis results",
|
|
1359
|
+
... sources=[
|
|
1360
|
+
... other_doc.sha256, # Document reference
|
|
1361
|
+
... "https://api.example.com/data", # API URL
|
|
1362
|
+
... "dataset:customer-2024", # Dataset identifier
|
|
1363
|
+
... "/path/to/source.csv", # File path
|
|
1364
|
+
... ]
|
|
1365
|
+
... )
|
|
1366
|
+
>>>
|
|
1367
|
+
>>> # Get only non-document references
|
|
1368
|
+
>>> refs = doc.get_source_references()
|
|
1369
|
+
>>> print(refs)
|
|
1370
|
+
>>> # ["https://api.example.com/data", "dataset:customer-2024", "/path/to/source.csv"]
|
|
1371
|
+
>>>
|
|
1372
|
+
>>> # Use for attribution or debugging
|
|
1373
|
+
>>> for ref in refs:
|
|
1374
|
+
... print(f"Data sourced from: {ref}")
|
|
1375
|
+
|
|
1376
|
+
See Also:
|
|
1377
|
+
- get_source_documents: Get document SHA256 references
|
|
1378
|
+
- has_source: Check if a specific source is tracked
|
|
1379
|
+
- Document.create: Add sources when creating documents
|
|
1380
|
+
"""
|
|
1381
|
+
return [src for src in self.sources if not is_document_sha256(src)]
|
|
1382
|
+
|
|
1383
|
+
def has_source(self, source: Document | str) -> bool:
|
|
1384
|
+
"""Check if a specific source is tracked for this document.
|
|
1385
|
+
|
|
1386
|
+
Verifies whether a given source (document or reference string) is
|
|
1387
|
+
included in this document's sources list. Useful for dependency
|
|
1388
|
+
checking, lineage verification, and conditional processing based
|
|
1389
|
+
on document origins.
|
|
1390
|
+
|
|
1391
|
+
Args:
|
|
1392
|
+
source: Source to check for. Can be:
|
|
1393
|
+
- Document: Checks if document's SHA256 is in sources
|
|
1394
|
+
- str: Checks if exact string is in sources (hash or reference)
|
|
1395
|
+
|
|
1396
|
+
Returns:
|
|
1397
|
+
True if the source is tracked in this document's sources,
|
|
1398
|
+
False otherwise.
|
|
1399
|
+
|
|
1400
|
+
Raises:
|
|
1401
|
+
TypeError: If source is not a Document or string.
|
|
1402
|
+
|
|
1403
|
+
Example:
|
|
1404
|
+
>>> # Check if document was derived from specific source
|
|
1405
|
+
>>> source_doc = MyDocument.create(name="original.txt", content="Data")
|
|
1406
|
+
>>> api_url = "https://api.example.com/data"
|
|
1407
|
+
>>>
|
|
1408
|
+
>>> derived = MyDocument.create(
|
|
1409
|
+
... name="processed.txt",
|
|
1410
|
+
... content="Processed data",
|
|
1411
|
+
... sources=[source_doc.sha256, api_url]
|
|
1412
|
+
... )
|
|
1413
|
+
>>>
|
|
1414
|
+
>>> # Check document source
|
|
1415
|
+
>>> if derived.has_source(source_doc):
|
|
1416
|
+
... print("Derived from source_doc")
|
|
1417
|
+
>>>
|
|
1418
|
+
>>> # Check string reference
|
|
1419
|
+
>>> if derived.has_source(api_url):
|
|
1420
|
+
... print("Data from API")
|
|
1421
|
+
>>>
|
|
1422
|
+
>>> # Check by SHA256 directly
|
|
1423
|
+
>>> if derived.has_source(source_doc.sha256):
|
|
1424
|
+
... print("Has specific hash")
|
|
1425
|
+
|
|
1426
|
+
See Also:
|
|
1427
|
+
- get_source_documents: Get all document sources
|
|
1428
|
+
- get_source_references: Get all reference sources
|
|
1429
|
+
- Document.create: Add sources when creating documents
|
|
1430
|
+
"""
|
|
1431
|
+
if isinstance(source, str):
|
|
1432
|
+
# Direct string comparison
|
|
1433
|
+
return source in self.sources
|
|
1434
|
+
elif isinstance(source, Document): # type: ignore[misc]
|
|
1435
|
+
# Check if document's SHA256 is in sources
|
|
1436
|
+
return source.sha256 in self.sources
|
|
1437
|
+
else:
|
|
1438
|
+
raise TypeError(f"Invalid source type: {type(source)}")
|
|
1439
|
+
|
|
1218
1440
|
@final
|
|
1219
1441
|
def serialize_model(self) -> dict[str, Any]:
|
|
1220
1442
|
"""Serialize document to dictionary for storage or transmission.
|
|
@@ -1230,8 +1452,9 @@ class Document(BaseModel, ABC):
|
|
|
1230
1452
|
- base_type: Persistence type - "flow", "task", or "temporary" (str)
|
|
1231
1453
|
- size: Content size in bytes (int)
|
|
1232
1454
|
- id: Short hash identifier, first 6 chars of SHA256 (str)
|
|
1233
|
-
- sha256: Full SHA256 hash in base32 encoding (str)
|
|
1455
|
+
- sha256: Full SHA256 hash in base32 encoding without padding (str)
|
|
1234
1456
|
- mime_type: Detected MIME type (str)
|
|
1457
|
+
- sources: List of source strings (list[str])
|
|
1235
1458
|
- content: Encoded content (str)
|
|
1236
1459
|
- content_encoding: Either "utf-8" or "base64" (str)
|
|
1237
1460
|
|
|
@@ -1254,6 +1477,7 @@ class Document(BaseModel, ABC):
|
|
|
1254
1477
|
"id": self.id,
|
|
1255
1478
|
"sha256": self.sha256,
|
|
1256
1479
|
"mime_type": self.mime_type,
|
|
1480
|
+
"sources": self.sources,
|
|
1257
1481
|
}
|
|
1258
1482
|
|
|
1259
1483
|
# Try to encode content as UTF-8, fall back to base64
|
|
@@ -1288,6 +1512,7 @@ class Document(BaseModel, ABC):
|
|
|
1288
1512
|
Optional keys:
|
|
1289
1513
|
- description: Document description (str | None)
|
|
1290
1514
|
- content_encoding: "utf-8" or "base64" (defaults to "utf-8")
|
|
1515
|
+
- sources: List of source strings
|
|
1291
1516
|
|
|
1292
1517
|
Returns:
|
|
1293
1518
|
New Document instance with restored content.
|
|
@@ -1326,9 +1551,9 @@ class Document(BaseModel, ABC):
|
|
|
1326
1551
|
else:
|
|
1327
1552
|
raise ValueError(f"Invalid content type: {type(content_raw)}")
|
|
1328
1553
|
|
|
1329
|
-
# Create document with the required fields
|
|
1330
1554
|
return cls(
|
|
1331
1555
|
name=data["name"],
|
|
1332
1556
|
content=content,
|
|
1333
1557
|
description=data.get("description"),
|
|
1558
|
+
sources=data.get("sources", []),
|
|
1334
1559
|
)
|
|
@@ -152,41 +152,97 @@ class DocumentList(list[Document]):
|
|
|
152
152
|
def filter_by(self, arg: type[Document]) -> "DocumentList": ...
|
|
153
153
|
|
|
154
154
|
@overload
|
|
155
|
-
def filter_by(self, arg: list[type[Document]]) -> "DocumentList": ...
|
|
155
|
+
def filter_by(self, arg: Iterable[type[Document]]) -> "DocumentList": ...
|
|
156
156
|
|
|
157
|
-
|
|
158
|
-
|
|
157
|
+
@overload
|
|
158
|
+
def filter_by(self, arg: Iterable[str]) -> "DocumentList": ...
|
|
159
|
+
|
|
160
|
+
def filter_by(
|
|
161
|
+
self, arg: str | type[Document] | Iterable[type[Document]] | Iterable[str]
|
|
162
|
+
) -> "DocumentList":
|
|
163
|
+
"""Filter documents by name(s) or type(s).
|
|
159
164
|
|
|
160
165
|
@public
|
|
161
166
|
|
|
162
167
|
Args:
|
|
163
|
-
arg:
|
|
168
|
+
arg: Can be one of:
|
|
169
|
+
- str: Single document name to filter by
|
|
170
|
+
- type[Document]: Single document type to filter by (includes subclasses)
|
|
171
|
+
- Iterable[type[Document]]: Multiple document types to filter by
|
|
172
|
+
(list, tuple, set, generator, or any iterable)
|
|
173
|
+
- Iterable[str]: Multiple document names to filter by
|
|
174
|
+
(list, tuple, set, generator, or any iterable)
|
|
164
175
|
|
|
165
176
|
Returns:
|
|
166
177
|
New DocumentList with filtered documents.
|
|
167
178
|
|
|
168
179
|
Raises:
|
|
169
|
-
TypeError: If arg is not a valid type (str,
|
|
180
|
+
TypeError: If arg is not a valid type (not str, type, or iterable),
|
|
181
|
+
or if iterable contains mixed types (strings and types together).
|
|
182
|
+
AttributeError: If arg is expected to be iterable but doesn't support iteration.
|
|
170
183
|
|
|
171
184
|
Example:
|
|
172
|
-
>>> docs.filter_by("file.txt") # Filter by name
|
|
173
|
-
>>> docs.filter_by(MyDocument) # Filter by type
|
|
174
|
-
>>> docs.filter_by([Doc1, Doc2]) # Filter by multiple types
|
|
185
|
+
>>> docs.filter_by("file.txt") # Filter by single name
|
|
186
|
+
>>> docs.filter_by(MyDocument) # Filter by single type
|
|
187
|
+
>>> docs.filter_by([Doc1, Doc2]) # Filter by multiple types (list)
|
|
188
|
+
>>> docs.filter_by({"file1.txt", "file2.txt"}) # Filter by multiple names (set)
|
|
189
|
+
>>> docs.filter_by((SubDoc, AnotherDoc)) # Filter by multiple types (tuple)
|
|
190
|
+
>>> docs.filter_by(name for name in ["a.txt", "b.txt"]) # Generator expression
|
|
175
191
|
"""
|
|
176
192
|
if isinstance(arg, str):
|
|
177
|
-
# Filter by name
|
|
193
|
+
# Filter by single name
|
|
178
194
|
return DocumentList([doc for doc in self if doc.name == arg])
|
|
179
195
|
elif isinstance(arg, type):
|
|
180
196
|
# Filter by single type (including subclasses)
|
|
197
|
+
# The type system ensures arg is type[Document] due to overloads
|
|
181
198
|
return DocumentList([doc for doc in self if isinstance(doc, arg)])
|
|
182
|
-
elif isinstance(arg, list): # type: ignore[reportUnnecessaryIsInstance]
|
|
183
|
-
# Filter by multiple types
|
|
184
|
-
documents = DocumentList()
|
|
185
|
-
for document_type in arg:
|
|
186
|
-
documents.extend([doc for doc in self if isinstance(doc, document_type)])
|
|
187
|
-
return documents
|
|
188
199
|
else:
|
|
189
|
-
|
|
200
|
+
# Try to consume as iterable
|
|
201
|
+
try:
|
|
202
|
+
# Convert to list to check the first element and allow reuse
|
|
203
|
+
items = list(arg) # type: ignore[arg-type]
|
|
204
|
+
if not items:
|
|
205
|
+
return DocumentList()
|
|
206
|
+
|
|
207
|
+
first_item = items[0]
|
|
208
|
+
if isinstance(first_item, str):
|
|
209
|
+
# Iterable of names - validate all items are strings
|
|
210
|
+
for item in items:
|
|
211
|
+
if not isinstance(item, str):
|
|
212
|
+
raise TypeError(
|
|
213
|
+
"Iterable must contain only strings or only Document types, "
|
|
214
|
+
"not mixed types"
|
|
215
|
+
)
|
|
216
|
+
names_set = set(items)
|
|
217
|
+
return DocumentList([doc for doc in self if doc.name in names_set])
|
|
218
|
+
elif isinstance(first_item, type): # type: ignore[reportUnnecessaryIsInstance]
|
|
219
|
+
# Iterable of document types - validate all items are types
|
|
220
|
+
for item in items:
|
|
221
|
+
if not isinstance(item, type):
|
|
222
|
+
raise TypeError(
|
|
223
|
+
"Iterable must contain only strings or only Document types, "
|
|
224
|
+
"not mixed types"
|
|
225
|
+
)
|
|
226
|
+
# Convert to set for efficient lookup
|
|
227
|
+
types_set = set(items)
|
|
228
|
+
# Filter documents that match any of the requested types
|
|
229
|
+
matching = [
|
|
230
|
+
doc
|
|
231
|
+
for doc in self
|
|
232
|
+
if any(isinstance(doc, doc_type) for doc_type in types_set) # type: ignore[arg-type]
|
|
233
|
+
]
|
|
234
|
+
return DocumentList(matching)
|
|
235
|
+
else:
|
|
236
|
+
raise TypeError(
|
|
237
|
+
f"Iterable must contain strings or Document types, "
|
|
238
|
+
f"got {type(first_item).__name__}"
|
|
239
|
+
)
|
|
240
|
+
except (TypeError, AttributeError) as e:
|
|
241
|
+
# If the error message already mentions Iterable, re-raise it
|
|
242
|
+
if "Iterable" in str(e) or "strings or Document types" in str(e):
|
|
243
|
+
raise
|
|
244
|
+
# Otherwise, provide a generic error message
|
|
245
|
+
raise TypeError(f"Invalid argument type for filter_by: {type(arg).__name__}") from e
|
|
190
246
|
|
|
191
247
|
@overload
|
|
192
248
|
def get_by(self, arg: str) -> Document: ...
|
|
@@ -27,24 +27,8 @@ class FlowDocument(Document):
|
|
|
27
27
|
- Saved in directories named after the document's canonical name
|
|
28
28
|
|
|
29
29
|
Creating FlowDocuments:
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
>>> from enum import StrEnum
|
|
34
|
-
>>>
|
|
35
|
-
>>> # Simple document with pass:
|
|
36
|
-
>>> class MyDoc(FlowDocument):
|
|
37
|
-
... pass
|
|
38
|
-
>>>
|
|
39
|
-
>>> # Document with restricted file names:
|
|
40
|
-
>>> class ConfigDoc(FlowDocument):
|
|
41
|
-
... class FILES(StrEnum):
|
|
42
|
-
... CONFIG = "config.yaml"
|
|
43
|
-
... SETTINGS = "settings.json"
|
|
44
|
-
>>>
|
|
45
|
-
>>> # RECOMMENDED - automatic conversion:
|
|
46
|
-
>>> doc = MyDoc.create(name="data.json", content={"key": "value"})
|
|
47
|
-
>>> doc = ConfigDoc.create(name="config.yaml", content={"host": "localhost"})
|
|
30
|
+
Same as Document - use `create()` for automatic conversion, `__init__` for bytes.
|
|
31
|
+
See Document.create() for detailed usage examples.
|
|
48
32
|
|
|
49
33
|
Persistence:
|
|
50
34
|
Documents are saved to: {output_dir}/{canonical_name}/{filename}
|
|
@@ -66,13 +50,11 @@ class FlowDocument(Document):
|
|
|
66
50
|
name: str,
|
|
67
51
|
content: bytes,
|
|
68
52
|
description: str | None = None,
|
|
53
|
+
sources: list[str] = [],
|
|
69
54
|
) -> None:
|
|
70
55
|
"""Initialize a FlowDocument with raw bytes content.
|
|
71
56
|
|
|
72
|
-
|
|
73
|
-
**Most users should use the `create` classmethod instead of __init__.**
|
|
74
|
-
The create method provides automatic content conversion for various types
|
|
75
|
-
(str, dict, list, Pydantic models) while __init__ only accepts bytes.
|
|
57
|
+
See Document.__init__() for parameter details and usage notes.
|
|
76
58
|
|
|
77
59
|
Prevents direct instantiation of the abstract FlowDocument class.
|
|
78
60
|
FlowDocument must be subclassed for specific document types.
|
|
@@ -81,6 +63,7 @@ class FlowDocument(Document):
|
|
|
81
63
|
name: Document filename (required, keyword-only)
|
|
82
64
|
content: Document content as raw bytes (required, keyword-only)
|
|
83
65
|
description: Optional human-readable description (keyword-only)
|
|
66
|
+
sources: Optional list of strings for provenance tracking
|
|
84
67
|
|
|
85
68
|
Raises:
|
|
86
69
|
TypeError: If attempting to instantiate FlowDocument directly
|
|
@@ -109,7 +92,7 @@ class FlowDocument(Document):
|
|
|
109
92
|
"""
|
|
110
93
|
if type(self) is FlowDocument:
|
|
111
94
|
raise TypeError("Cannot instantiate abstract FlowDocument class directly")
|
|
112
|
-
super().__init__(name=name, content=content, description=description)
|
|
95
|
+
super().__init__(name=name, content=content, description=description, sources=sources)
|
|
113
96
|
|
|
114
97
|
@final
|
|
115
98
|
def get_base_type(self) -> Literal["flow"]:
|