ai-pipeline-core 0.1.12__py3-none-any.whl → 0.1.14__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -88,6 +88,7 @@ from .documents import (
88
88
  TaskDocument,
89
89
  TemporaryDocument,
90
90
  canonical_name_key,
91
+ is_document_sha256,
91
92
  sanitize_url,
92
93
  )
93
94
  from .flow import FlowConfig, FlowOptions
@@ -111,9 +112,9 @@ from .pipeline import pipeline_flow, pipeline_task
111
112
  from .prefect import disable_run_logger, prefect_test_harness
112
113
  from .prompt_manager import PromptManager
113
114
  from .settings import Settings
114
- from .tracing import TraceInfo, TraceLevel, trace
115
+ from .tracing import TraceInfo, TraceLevel, set_trace_cost, trace
115
116
 
116
- __version__ = "0.1.12"
117
+ __version__ = "0.1.14"
117
118
 
118
119
  __all__ = [
119
120
  # Config/Settings
@@ -132,6 +133,7 @@ __all__ = [
132
133
  "TaskDocument",
133
134
  "TemporaryDocument",
134
135
  "canonical_name_key",
136
+ "is_document_sha256",
135
137
  "sanitize_url",
136
138
  # Flow/Task
137
139
  "FlowConfig",
@@ -154,6 +156,7 @@ __all__ = [
154
156
  "trace",
155
157
  "TraceLevel",
156
158
  "TraceInfo",
159
+ "set_trace_cost",
157
160
  # Utils
158
161
  "PromptManager",
159
162
  ]
@@ -12,7 +12,7 @@ from .document_list import DocumentList
12
12
  from .flow_document import FlowDocument
13
13
  from .task_document import TaskDocument
14
14
  from .temporary_document import TemporaryDocument
15
- from .utils import canonical_name_key, sanitize_url
15
+ from .utils import canonical_name_key, is_document_sha256, sanitize_url
16
16
 
17
17
  __all__ = [
18
18
  "Document",
@@ -21,5 +21,6 @@ __all__ = [
21
21
  "TaskDocument",
22
22
  "TemporaryDocument",
23
23
  "canonical_name_key",
24
+ "is_document_sha256",
24
25
  "sanitize_url",
25
26
  ]
@@ -6,6 +6,8 @@ This module provides the core document abstraction for working with various type
6
6
  in AI pipelines. Documents are immutable Pydantic models that wrap binary content with metadata.
7
7
  """
8
8
 
9
+ from __future__ import annotations
10
+
9
11
  import base64
10
12
  import hashlib
11
13
  import json
@@ -30,13 +32,14 @@ from typing import (
30
32
  from pydantic import (
31
33
  BaseModel,
32
34
  ConfigDict,
35
+ Field,
33
36
  ValidationInfo,
34
37
  field_serializer,
35
38
  field_validator,
36
39
  )
37
40
  from ruamel.yaml import YAML
38
41
 
39
- from ai_pipeline_core.documents.utils import canonical_name_key
42
+ from ai_pipeline_core.documents.utils import canonical_name_key, is_document_sha256
40
43
  from ai_pipeline_core.exceptions import DocumentNameError, DocumentSizeError
41
44
 
42
45
  from .mime_type import (
@@ -94,6 +97,7 @@ class Document(BaseModel, ABC):
94
97
  - SHA256 hashing for deduplication
95
98
  - Support for text, JSON, YAML, PDF, and image formats
96
99
  - Conversion utilities between different formats
100
+ - Source provenance tracking via sources field
97
101
 
98
102
  Class Variables:
99
103
  MAX_CONTENT_SIZE: Maximum allowed content size in bytes (default 25MB)
@@ -102,6 +106,7 @@ class Document(BaseModel, ABC):
102
106
  name: Document filename (validated for security)
103
107
  description: Optional human-readable description
104
108
  content: Raw document content as bytes
109
+ sources: List of source references tracking document provenance
105
110
 
106
111
  Creating Documents:
107
112
  **Use the `create` classmethod** for most use cases. It accepts various
@@ -117,7 +122,7 @@ class Document(BaseModel, ABC):
117
122
  Warning:
118
123
  - Document subclasses should NOT start with 'Test' prefix (pytest conflict)
119
124
  - Cannot instantiate Document directly - must subclass FlowDocument or TaskDocument
120
- - Cannot add custom fields - only name, description, content are allowed
125
+ - Cannot add custom fields - only name, description, content, sources are allowed
121
126
  - Document is an abstract class and cannot be instantiated directly
122
127
 
123
128
  Metadata Attachment Patterns:
@@ -145,6 +150,15 @@ class Document(BaseModel, ABC):
145
150
  >>> doc = MyDocument.create(name="data.json", content={"key": "value"})
146
151
  >>> print(doc.is_text) # True
147
152
  >>> data = doc.as_json() # {'key': 'value'}
153
+ >>>
154
+ >>> # Track document provenance with sources
155
+ >>> source_doc = MyDocument.create(name="input.txt", content="raw data")
156
+ >>> processed = MyDocument.create(
157
+ ... name="output.txt",
158
+ ... content="processed data",
159
+ ... sources=[source_doc.sha256] # Reference source document
160
+ ... )
161
+ >>> processed.has_source(source_doc) # True
148
162
  """
149
163
 
150
164
  MAX_CONTENT_SIZE: ClassVar[int] = 25 * 1024 * 1024
@@ -193,7 +207,7 @@ class Document(BaseModel, ABC):
193
207
  )
194
208
  # Check that the Document's model_fields only contain the allowed fields
195
209
  # It prevents AI models from adding additional fields to documents
196
- allowed = {"name", "description", "content"}
210
+ allowed = {"name", "description", "content", "sources"}
197
211
  current = set(getattr(cls, "model_fields", {}).keys())
198
212
  extras = current - allowed
199
213
  if extras:
@@ -204,25 +218,58 @@ class Document(BaseModel, ABC):
204
218
 
205
219
  @overload
206
220
  @classmethod
207
- def create(cls, *, name: str, content: bytes, description: str | None = None) -> Self: ...
221
+ def create(
222
+ cls,
223
+ *,
224
+ name: str,
225
+ content: bytes,
226
+ description: str | None = None,
227
+ sources: list[str] = [],
228
+ ) -> Self: ...
208
229
 
209
230
  @overload
210
231
  @classmethod
211
- def create(cls, *, name: str, content: str, description: str | None = None) -> Self: ...
232
+ def create(
233
+ cls,
234
+ *,
235
+ name: str,
236
+ content: str,
237
+ description: str | None = None,
238
+ sources: list[str] = [],
239
+ ) -> Self: ...
212
240
 
213
241
  @overload
214
242
  @classmethod
215
243
  def create(
216
- cls, *, name: str, content: dict[str, Any], description: str | None = None
244
+ cls,
245
+ *,
246
+ name: str,
247
+ content: dict[str, Any],
248
+ description: str | None = None,
249
+ sources: list[str] = [],
217
250
  ) -> Self: ...
218
251
 
219
252
  @overload
220
253
  @classmethod
221
- def create(cls, *, name: str, content: list[Any], description: str | None = None) -> Self: ...
254
+ def create(
255
+ cls,
256
+ *,
257
+ name: str,
258
+ content: list[Any],
259
+ description: str | None = None,
260
+ sources: list[str] = [],
261
+ ) -> Self: ...
222
262
 
223
263
  @overload
224
264
  @classmethod
225
- def create(cls, *, name: str, content: BaseModel, description: str | None = None) -> Self: ...
265
+ def create(
266
+ cls,
267
+ *,
268
+ name: str,
269
+ content: BaseModel,
270
+ description: str | None = None,
271
+ sources: list[str] = [],
272
+ ) -> Self: ...
226
273
 
227
274
  @classmethod
228
275
  def create(
@@ -231,6 +278,7 @@ class Document(BaseModel, ABC):
231
278
  name: str,
232
279
  content: str | bytes | dict[str, Any] | list[Any] | BaseModel,
233
280
  description: str | None = None,
281
+ sources: list[str] = [],
234
282
  ) -> Self:
235
283
  r"""Create a Document with automatic content type conversion (recommended).
236
284
 
@@ -260,6 +308,11 @@ class Document(BaseModel, ABC):
260
308
  - BaseModel: Serialized to JSON or YAML based on extension
261
309
  description: Optional description - USUALLY OMIT THIS (defaults to None).
262
310
  Only use when meaningful metadata helps downstream processing
311
+ sources: Optional list of source strings (document SHA256 hashes or references).
312
+ Used to track what sources contributed to creating this document.
313
+ Can contain document SHA256 hashes (for referencing other documents)
314
+ or arbitrary reference strings (URLs, file paths, descriptions).
315
+ Defaults to empty list
263
316
 
264
317
  Returns:
265
318
  New Document instance with content converted to bytes
@@ -306,11 +359,31 @@ class Document(BaseModel, ABC):
306
359
  >>> items = ["Section 1", "Section 2"]
307
360
  >>> doc = MyDocument.create(name="sections.md", content=items)
308
361
  >>> doc.parse(list) # ["Section 1", "Section 2"]
362
+
363
+ >>> # Document with sources for provenance tracking
364
+ >>> source_doc = MyDocument.create(name="source.txt", content="original")
365
+ >>> derived = MyDocument.create(
366
+ ... name="result.txt",
367
+ ... content="processed",
368
+ ... sources=[source_doc.sha256, "https://api.example.com/data"]
369
+ ... )
370
+ >>> derived.get_source_documents() # [source_doc.sha256]
371
+ >>> derived.get_source_references() # ["https://api.example.com/data"]
309
372
  """
310
373
  # Use model_validate to leverage the existing validator logic
311
- temp = cls.model_validate({"name": name, "content": content, "description": description})
374
+ temp = cls.model_validate({
375
+ "name": name,
376
+ "content": content,
377
+ "description": description,
378
+ "sources": sources,
379
+ })
312
380
  # Now construct with type-checker-friendly call (bytes only)
313
- return cls(name=temp.name, content=temp.content, description=temp.description)
381
+ return cls(
382
+ name=temp.name,
383
+ content=temp.content,
384
+ description=temp.description,
385
+ sources=temp.sources,
386
+ )
314
387
 
315
388
  def __init__(
316
389
  self,
@@ -318,6 +391,7 @@ class Document(BaseModel, ABC):
318
391
  name: str,
319
392
  content: bytes,
320
393
  description: str | None = None,
394
+ sources: list[str] = [],
321
395
  ) -> None:
322
396
  """Initialize a Document instance with raw bytes content.
323
397
 
@@ -335,6 +409,10 @@ class Document(BaseModel, ABC):
335
409
  name: Document filename (required, keyword-only)
336
410
  content: Document content as raw bytes (required, keyword-only)
337
411
  description: Optional human-readable description (keyword-only)
412
+ sources: Optional list of source strings for provenance tracking.
413
+ Can contain document SHA256 hashes (for referencing other documents)
414
+ or arbitrary reference strings (URLs, file paths, descriptions).
415
+ Defaults to empty list
338
416
 
339
417
  Raises:
340
418
  TypeError: If attempting to instantiate Document directly.
@@ -357,11 +435,17 @@ class Document(BaseModel, ABC):
357
435
  if type(self) is Document:
358
436
  raise TypeError("Cannot instantiate abstract Document class directly")
359
437
 
360
- super().__init__(name=name, content=content, description=description)
438
+ super().__init__(name=name, content=content, description=description, sources=sources)
361
439
 
362
440
  name: str
363
441
  description: str | None = None
364
442
  content: bytes # Note: constructor accepts str | bytes, but field stores bytes only
443
+ sources: list[str] = Field(
444
+ default_factory=list,
445
+ description="List of source references for tracking document provenance. "
446
+ "Can contain document SHA256 hashes (for referencing other documents) "
447
+ "or arbitrary reference strings (URLs, file paths, descriptions)",
448
+ )
365
449
 
366
450
  # Pydantic configuration
367
451
  model_config = ConfigDict(
@@ -795,7 +879,7 @@ class Document(BaseModel, ABC):
795
879
  This is computed once and cached for performance.
796
880
  The hash is deterministic based on content only.
797
881
  """
798
- return b32encode(hashlib.sha256(self.content).digest()).decode("ascii").upper()
882
+ return b32encode(hashlib.sha256(self.content).digest()).decode("ascii").upper().rstrip("=")
799
883
 
800
884
  @final
801
885
  @property
@@ -1215,6 +1299,144 @@ class Document(BaseModel, ABC):
1215
1299
 
1216
1300
  raise ValueError(f"Unsupported type {type_} for file {self.name}")
1217
1301
 
1302
+ def get_source_documents(self) -> list[str]:
1303
+ """Get list of document SHA256 hashes referenced as sources.
1304
+
1305
+ Retrieves all document references from this document's sources list,
1306
+ filtering for valid SHA256 hashes that reference other documents.
1307
+ This is useful for building dependency graphs and tracking document
1308
+ lineage in processing pipelines.
1309
+
1310
+ Returns:
1311
+ List of SHA256 hashes (base32 encoded) for documents referenced
1312
+ as sources. Each hash uniquely identifies another document that
1313
+ contributed to creating this one.
1314
+
1315
+ Example:
1316
+ >>> # Create a derived document from multiple sources
1317
+ >>> source1 = MyDocument.create(name="data1.txt", content="First")
1318
+ >>> source2 = MyDocument.create(name="data2.txt", content="Second")
1319
+ >>>
1320
+ >>> merged = MyDocument.create(
1321
+ ... name="merged.txt",
1322
+ ... content="Combined data",
1323
+ ... sources=[source1.sha256, source2.sha256, "https://api.example.com"]
1324
+ ... )
1325
+ >>>
1326
+ >>> # Get only document references (not URLs)
1327
+ >>> doc_refs = merged.get_source_documents()
1328
+ >>> print(doc_refs) # [source1.sha256, source2.sha256]
1329
+ >>>
1330
+ >>> # Check if specific document is a source
1331
+ >>> if source1.sha256 in doc_refs:
1332
+ ... print("Document derived from source1")
1333
+
1334
+ See Also:
1335
+ - get_source_references: Get non-document source references (URLs, etc.)
1336
+ - has_source: Check if a specific source is tracked
1337
+ - Document.create: Add sources when creating documents
1338
+ """
1339
+ return [src for src in self.sources if is_document_sha256(src)]
1340
+
1341
+ def get_source_references(self) -> list[str]:
1342
+ """Get list of arbitrary reference strings from sources.
1343
+
1344
+ Retrieves all non-document references from this document's sources list.
1345
+ These are typically URLs, file paths, API endpoints, or descriptive strings
1346
+ that indicate where the document's content originated from, but are not
1347
+ references to other documents in the pipeline.
1348
+
1349
+ Returns:
1350
+ List of reference strings that are not document SHA256 hashes.
1351
+ Can include URLs, file paths, API endpoints, dataset names,
1352
+ or any other string that provides source context.
1353
+
1354
+ Example:
1355
+ >>> # Create document with mixed source types
1356
+ >>> doc = MyDocument.create(
1357
+ ... name="report.txt",
1358
+ ... content="Analysis results",
1359
+ ... sources=[
1360
+ ... other_doc.sha256, # Document reference
1361
+ ... "https://api.example.com/data", # API URL
1362
+ ... "dataset:customer-2024", # Dataset identifier
1363
+ ... "/path/to/source.csv", # File path
1364
+ ... ]
1365
+ ... )
1366
+ >>>
1367
+ >>> # Get only non-document references
1368
+ >>> refs = doc.get_source_references()
1369
+ >>> print(refs)
1370
+ >>> # ["https://api.example.com/data", "dataset:customer-2024", "/path/to/source.csv"]
1371
+ >>>
1372
+ >>> # Use for attribution or debugging
1373
+ >>> for ref in refs:
1374
+ ... print(f"Data sourced from: {ref}")
1375
+
1376
+ See Also:
1377
+ - get_source_documents: Get document SHA256 references
1378
+ - has_source: Check if a specific source is tracked
1379
+ - Document.create: Add sources when creating documents
1380
+ """
1381
+ return [src for src in self.sources if not is_document_sha256(src)]
1382
+
1383
+ def has_source(self, source: Document | str) -> bool:
1384
+ """Check if a specific source is tracked for this document.
1385
+
1386
+ Verifies whether a given source (document or reference string) is
1387
+ included in this document's sources list. Useful for dependency
1388
+ checking, lineage verification, and conditional processing based
1389
+ on document origins.
1390
+
1391
+ Args:
1392
+ source: Source to check for. Can be:
1393
+ - Document: Checks if document's SHA256 is in sources
1394
+ - str: Checks if exact string is in sources (hash or reference)
1395
+
1396
+ Returns:
1397
+ True if the source is tracked in this document's sources,
1398
+ False otherwise.
1399
+
1400
+ Raises:
1401
+ TypeError: If source is not a Document or string.
1402
+
1403
+ Example:
1404
+ >>> # Check if document was derived from specific source
1405
+ >>> source_doc = MyDocument.create(name="original.txt", content="Data")
1406
+ >>> api_url = "https://api.example.com/data"
1407
+ >>>
1408
+ >>> derived = MyDocument.create(
1409
+ ... name="processed.txt",
1410
+ ... content="Processed data",
1411
+ ... sources=[source_doc.sha256, api_url]
1412
+ ... )
1413
+ >>>
1414
+ >>> # Check document source
1415
+ >>> if derived.has_source(source_doc):
1416
+ ... print("Derived from source_doc")
1417
+ >>>
1418
+ >>> # Check string reference
1419
+ >>> if derived.has_source(api_url):
1420
+ ... print("Data from API")
1421
+ >>>
1422
+ >>> # Check by SHA256 directly
1423
+ >>> if derived.has_source(source_doc.sha256):
1424
+ ... print("Has specific hash")
1425
+
1426
+ See Also:
1427
+ - get_source_documents: Get all document sources
1428
+ - get_source_references: Get all reference sources
1429
+ - Document.create: Add sources when creating documents
1430
+ """
1431
+ if isinstance(source, str):
1432
+ # Direct string comparison
1433
+ return source in self.sources
1434
+ elif isinstance(source, Document): # type: ignore[misc]
1435
+ # Check if document's SHA256 is in sources
1436
+ return source.sha256 in self.sources
1437
+ else:
1438
+ raise TypeError(f"Invalid source type: {type(source)}")
1439
+
1218
1440
  @final
1219
1441
  def serialize_model(self) -> dict[str, Any]:
1220
1442
  """Serialize document to dictionary for storage or transmission.
@@ -1230,8 +1452,9 @@ class Document(BaseModel, ABC):
1230
1452
  - base_type: Persistence type - "flow", "task", or "temporary" (str)
1231
1453
  - size: Content size in bytes (int)
1232
1454
  - id: Short hash identifier, first 6 chars of SHA256 (str)
1233
- - sha256: Full SHA256 hash in base32 encoding (str)
1455
+ - sha256: Full SHA256 hash in base32 encoding without padding (str)
1234
1456
  - mime_type: Detected MIME type (str)
1457
+ - sources: List of source strings (list[str])
1235
1458
  - content: Encoded content (str)
1236
1459
  - content_encoding: Either "utf-8" or "base64" (str)
1237
1460
 
@@ -1254,6 +1477,7 @@ class Document(BaseModel, ABC):
1254
1477
  "id": self.id,
1255
1478
  "sha256": self.sha256,
1256
1479
  "mime_type": self.mime_type,
1480
+ "sources": self.sources,
1257
1481
  }
1258
1482
 
1259
1483
  # Try to encode content as UTF-8, fall back to base64
@@ -1288,6 +1512,7 @@ class Document(BaseModel, ABC):
1288
1512
  Optional keys:
1289
1513
  - description: Document description (str | None)
1290
1514
  - content_encoding: "utf-8" or "base64" (defaults to "utf-8")
1515
+ - sources: List of source strings (list[str], defaults to an empty list)
1291
1516
 
1292
1517
  Returns:
1293
1518
  New Document instance with restored content.
@@ -1326,9 +1551,9 @@ class Document(BaseModel, ABC):
1326
1551
  else:
1327
1552
  raise ValueError(f"Invalid content type: {type(content_raw)}")
1328
1553
 
1329
- # Create document with the required fields
1330
1554
  return cls(
1331
1555
  name=data["name"],
1332
1556
  content=content,
1333
1557
  description=data.get("description"),
1558
+ sources=data.get("sources", []),
1334
1559
  )
@@ -152,41 +152,97 @@ class DocumentList(list[Document]):
152
152
  def filter_by(self, arg: type[Document]) -> "DocumentList": ...
153
153
 
154
154
  @overload
155
- def filter_by(self, arg: list[type[Document]]) -> "DocumentList": ...
155
+ def filter_by(self, arg: Iterable[type[Document]]) -> "DocumentList": ...
156
156
 
157
- def filter_by(self, arg: str | type[Document] | list[type[Document]]) -> "DocumentList":
158
- """Filter documents by name or type(s).
157
+ @overload
158
+ def filter_by(self, arg: Iterable[str]) -> "DocumentList": ...
159
+
160
+ def filter_by(
161
+ self, arg: str | type[Document] | Iterable[type[Document]] | Iterable[str]
162
+ ) -> "DocumentList":
163
+ """Filter documents by name(s) or type(s).
159
164
 
160
165
  @public
161
166
 
162
167
  Args:
163
- arg: Document name (str), single document type, or list of document types.
168
+ arg: Can be one of:
169
+ - str: Single document name to filter by
170
+ - type[Document]: Single document type to filter by (includes subclasses)
171
+ - Iterable[type[Document]]: Multiple document types to filter by
172
+ (list, tuple, set, generator, or any iterable)
173
+ - Iterable[str]: Multiple document names to filter by
174
+ (list, tuple, set, generator, or any iterable)
164
175
 
165
176
  Returns:
166
177
  New DocumentList with filtered documents.
167
178
 
168
179
  Raises:
169
- TypeError: If arg is not a valid type (str, Document type, or list of Document types).
180
+ TypeError: If arg is not a valid type (not str, type, or iterable),
181
+ or if iterable contains mixed types (strings and types together).
182
+ An arg that does not support iteration is also reported as TypeError; an AttributeError raised while consuming it is normally caught and re-raised as TypeError.
170
183
 
171
184
  Example:
172
- >>> docs.filter_by("file.txt") # Filter by name
173
- >>> docs.filter_by(MyDocument) # Filter by type
174
- >>> docs.filter_by([Doc1, Doc2]) # Filter by multiple types
185
+ >>> docs.filter_by("file.txt") # Filter by single name
186
+ >>> docs.filter_by(MyDocument) # Filter by single type
187
+ >>> docs.filter_by([Doc1, Doc2]) # Filter by multiple types (list)
188
+ >>> docs.filter_by({"file1.txt", "file2.txt"}) # Filter by multiple names (set)
189
+ >>> docs.filter_by((SubDoc, AnotherDoc)) # Filter by multiple types (tuple)
190
+ >>> docs.filter_by(name for name in ["a.txt", "b.txt"]) # Generator expression
175
191
  """
176
192
  if isinstance(arg, str):
177
- # Filter by name
193
+ # Filter by single name
178
194
  return DocumentList([doc for doc in self if doc.name == arg])
179
195
  elif isinstance(arg, type):
180
196
  # Filter by single type (including subclasses)
197
+ # The type system ensures arg is type[Document] due to overloads
181
198
  return DocumentList([doc for doc in self if isinstance(doc, arg)])
182
- elif isinstance(arg, list): # type: ignore[reportUnnecessaryIsInstance]
183
- # Filter by multiple types
184
- documents = DocumentList()
185
- for document_type in arg:
186
- documents.extend([doc for doc in self if isinstance(doc, document_type)])
187
- return documents
188
199
  else:
189
- raise TypeError(f"Invalid argument type for filter_by: {type(arg)}")
200
+ # Try to consume as iterable
201
+ try:
202
+ # Convert to list to check the first element and allow reuse
203
+ items = list(arg) # type: ignore[arg-type]
204
+ if not items:
205
+ return DocumentList()
206
+
207
+ first_item = items[0]
208
+ if isinstance(first_item, str):
209
+ # Iterable of names - validate all items are strings
210
+ for item in items:
211
+ if not isinstance(item, str):
212
+ raise TypeError(
213
+ "Iterable must contain only strings or only Document types, "
214
+ "not mixed types"
215
+ )
216
+ names_set = set(items)
217
+ return DocumentList([doc for doc in self if doc.name in names_set])
218
+ elif isinstance(first_item, type): # type: ignore[reportUnnecessaryIsInstance]
219
+ # Iterable of document types - validate all items are types
220
+ for item in items:
221
+ if not isinstance(item, type):
222
+ raise TypeError(
223
+ "Iterable must contain only strings or only Document types, "
224
+ "not mixed types"
225
+ )
226
+ # Convert to set for efficient lookup
227
+ types_set = set(items)
228
+ # Filter documents that match any of the requested types
229
+ matching = [
230
+ doc
231
+ for doc in self
232
+ if any(isinstance(doc, doc_type) for doc_type in types_set) # type: ignore[arg-type]
233
+ ]
234
+ return DocumentList(matching)
235
+ else:
236
+ raise TypeError(
237
+ f"Iterable must contain strings or Document types, "
238
+ f"got {type(first_item).__name__}"
239
+ )
240
+ except (TypeError, AttributeError) as e:
241
+ # If the error message already mentions Iterable, re-raise it
242
+ if "Iterable" in str(e) or "strings or Document types" in str(e):
243
+ raise
244
+ # Otherwise, provide a generic error message
245
+ raise TypeError(f"Invalid argument type for filter_by: {type(arg).__name__}") from e
190
246
 
191
247
  @overload
192
248
  def get_by(self, arg: str) -> Document: ...
@@ -27,24 +27,8 @@ class FlowDocument(Document):
27
27
  - Saved in directories named after the document's canonical name
28
28
 
29
29
  Creating FlowDocuments:
30
- **Use the `create` classmethod** for most use cases. It handles automatic
31
- conversion of various content types. Only use __init__ when you have bytes.
32
-
33
- >>> from enum import StrEnum
34
- >>>
35
- >>> # Simple document with pass:
36
- >>> class MyDoc(FlowDocument):
37
- ... pass
38
- >>>
39
- >>> # Document with restricted file names:
40
- >>> class ConfigDoc(FlowDocument):
41
- ... class FILES(StrEnum):
42
- ... CONFIG = "config.yaml"
43
- ... SETTINGS = "settings.json"
44
- >>>
45
- >>> # RECOMMENDED - automatic conversion:
46
- >>> doc = MyDoc.create(name="data.json", content={"key": "value"})
47
- >>> doc = ConfigDoc.create(name="config.yaml", content={"host": "localhost"})
30
+ Same as Document - use `create()` for automatic conversion, `__init__` for bytes.
31
+ See Document.create() for detailed usage examples.
48
32
 
49
33
  Persistence:
50
34
  Documents are saved to: {output_dir}/{canonical_name}/{filename}
@@ -66,13 +50,11 @@ class FlowDocument(Document):
66
50
  name: str,
67
51
  content: bytes,
68
52
  description: str | None = None,
53
+ sources: list[str] = [],
69
54
  ) -> None:
70
55
  """Initialize a FlowDocument with raw bytes content.
71
56
 
72
- Important:
73
- **Most users should use the `create` classmethod instead of __init__.**
74
- The create method provides automatic content conversion for various types
75
- (str, dict, list, Pydantic models) while __init__ only accepts bytes.
57
+ See Document.__init__() for parameter details and usage notes.
76
58
 
77
59
  Prevents direct instantiation of the abstract FlowDocument class.
78
60
  FlowDocument must be subclassed for specific document types.
@@ -81,6 +63,7 @@ class FlowDocument(Document):
81
63
  name: Document filename (required, keyword-only)
82
64
  content: Document content as raw bytes (required, keyword-only)
83
65
  description: Optional human-readable description (keyword-only)
66
+ sources: Optional list of strings for provenance tracking
84
67
 
85
68
  Raises:
86
69
  TypeError: If attempting to instantiate FlowDocument directly
@@ -109,7 +92,7 @@ class FlowDocument(Document):
109
92
  """
110
93
  if type(self) is FlowDocument:
111
94
  raise TypeError("Cannot instantiate abstract FlowDocument class directly")
112
- super().__init__(name=name, content=content, description=description)
95
+ super().__init__(name=name, content=content, description=description, sources=sources)
113
96
 
114
97
  @final
115
98
  def get_base_type(self) -> Literal["flow"]: