ai-pipeline-core 0.1.13__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ai_pipeline_core/__init__.py +25 -14
- ai_pipeline_core/documents/__init__.py +2 -1
- ai_pipeline_core/documents/document.py +317 -49
- ai_pipeline_core/documents/document_list.py +136 -33
- ai_pipeline_core/documents/flow_document.py +8 -29
- ai_pipeline_core/documents/task_document.py +6 -27
- ai_pipeline_core/documents/temporary_document.py +6 -27
- ai_pipeline_core/documents/utils.py +64 -1
- ai_pipeline_core/flow/config.py +174 -5
- ai_pipeline_core/flow/options.py +2 -2
- ai_pipeline_core/llm/__init__.py +6 -1
- ai_pipeline_core/llm/ai_messages.py +14 -7
- ai_pipeline_core/llm/client.py +143 -55
- ai_pipeline_core/llm/model_options.py +20 -5
- ai_pipeline_core/llm/model_response.py +77 -29
- ai_pipeline_core/llm/model_types.py +38 -40
- ai_pipeline_core/logging/__init__.py +0 -2
- ai_pipeline_core/logging/logging_config.py +0 -6
- ai_pipeline_core/logging/logging_mixin.py +2 -10
- ai_pipeline_core/pipeline.py +68 -65
- ai_pipeline_core/prefect.py +12 -3
- ai_pipeline_core/prompt_manager.py +6 -7
- ai_pipeline_core/settings.py +13 -5
- ai_pipeline_core/simple_runner/__init__.py +1 -11
- ai_pipeline_core/simple_runner/cli.py +13 -12
- ai_pipeline_core/simple_runner/simple_runner.py +34 -172
- ai_pipeline_core/storage/__init__.py +8 -0
- ai_pipeline_core/storage/storage.py +628 -0
- ai_pipeline_core/tracing.py +110 -26
- {ai_pipeline_core-0.1.13.dist-info → ai_pipeline_core-0.2.0.dist-info}/METADATA +60 -23
- ai_pipeline_core-0.2.0.dist-info/RECORD +38 -0
- ai_pipeline_core-0.1.13.dist-info/RECORD +0 -36
- {ai_pipeline_core-0.1.13.dist-info → ai_pipeline_core-0.2.0.dist-info}/WHEEL +0 -0
- {ai_pipeline_core-0.1.13.dist-info → ai_pipeline_core-0.2.0.dist-info}/licenses/LICENSE +0 -0
|
@@ -17,8 +17,8 @@ class DocumentList(list[Document]):
|
|
|
17
17
|
|
|
18
18
|
Specialized list with validation and filtering for documents.
|
|
19
19
|
|
|
20
|
-
Best Practice: Use default constructor
|
|
21
|
-
validate_same_type or validate_duplicates when you explicitly need them.
|
|
20
|
+
Best Practice: Use the default constructor unless instructed otherwise.
|
|
21
|
+
Only enable validate_same_type or validate_duplicates when you explicitly need them.
|
|
22
22
|
|
|
23
23
|
Example:
|
|
24
24
|
>>> # RECOMMENDED - default constructor for most cases
|
|
@@ -152,41 +152,109 @@ class DocumentList(list[Document]):
|
|
|
152
152
|
def filter_by(self, arg: type[Document]) -> "DocumentList": ...
|
|
153
153
|
|
|
154
154
|
@overload
|
|
155
|
-
def filter_by(self, arg:
|
|
155
|
+
def filter_by(self, arg: Iterable[type[Document]]) -> "DocumentList": ...
|
|
156
156
|
|
|
157
|
-
|
|
158
|
-
|
|
157
|
+
@overload
|
|
158
|
+
def filter_by(self, arg: Iterable[str]) -> "DocumentList": ...
|
|
159
|
+
|
|
160
|
+
def filter_by(
|
|
161
|
+
self, arg: str | type[Document] | Iterable[type[Document]] | Iterable[str]
|
|
162
|
+
) -> "DocumentList":
|
|
163
|
+
"""Filter documents by name(s) or type(s).
|
|
159
164
|
|
|
160
165
|
@public
|
|
161
166
|
|
|
167
|
+
ALWAYS returns a DocumentList (which may be empty), never raises an exception
|
|
168
|
+
for no matches. Use this when you want to process all matching documents.
|
|
169
|
+
|
|
162
170
|
Args:
|
|
163
|
-
arg:
|
|
171
|
+
arg: Can be one of:
|
|
172
|
+
- str: Single document name to filter by
|
|
173
|
+
- type[Document]: Single document type to filter by (includes subclasses)
|
|
174
|
+
- Iterable[type[Document]]: Multiple document types to filter by
|
|
175
|
+
(list, tuple, set, generator, or any iterable)
|
|
176
|
+
- Iterable[str]: Multiple document names to filter by
|
|
177
|
+
(list, tuple, set, generator, or any iterable)
|
|
164
178
|
|
|
165
179
|
Returns:
|
|
166
|
-
New DocumentList with filtered documents.
|
|
180
|
+
New DocumentList with filtered documents (may be empty).
|
|
181
|
+
- Returns ALL matching documents
|
|
182
|
+
- Empty DocumentList if no matches found
|
|
167
183
|
|
|
168
184
|
Raises:
|
|
169
|
-
TypeError: If arg is not a valid type (str,
|
|
185
|
+
TypeError: If arg is not a valid type (not str, type, or iterable),
|
|
186
|
+
or if iterable contains mixed types (strings and types together).
|
|
187
|
+
Note: an AttributeError raised while consuming arg as an iterable is normally re-raised as TypeError.
|
|
170
188
|
|
|
171
189
|
Example:
|
|
172
|
-
>>>
|
|
173
|
-
>>> docs.filter_by(
|
|
174
|
-
>>>
|
|
190
|
+
>>> # Returns list with all matching documents
|
|
191
|
+
>>> matching_docs = docs.filter_by("file.txt") # May be empty
|
|
192
|
+
>>> for doc in matching_docs:
|
|
193
|
+
... process(doc)
|
|
194
|
+
>>>
|
|
195
|
+
>>> # Filter by type - returns all instances
|
|
196
|
+
>>> config_docs = docs.filter_by(ConfigDocument)
|
|
197
|
+
>>> print(f"Found {len(config_docs)} config documents")
|
|
198
|
+
>>>
|
|
199
|
+
>>> # Filter by multiple names
|
|
200
|
+
>>> important_docs = docs.filter_by(["config.yaml", "settings.json"])
|
|
201
|
+
>>> if not important_docs: # Check if empty
|
|
202
|
+
... print("No important documents found")
|
|
175
203
|
"""
|
|
176
204
|
if isinstance(arg, str):
|
|
177
|
-
# Filter by name
|
|
205
|
+
# Filter by single name
|
|
178
206
|
return DocumentList([doc for doc in self if doc.name == arg])
|
|
179
207
|
elif isinstance(arg, type):
|
|
180
208
|
# Filter by single type (including subclasses)
|
|
209
|
+
# The type system ensures arg is type[Document] due to overloads
|
|
181
210
|
return DocumentList([doc for doc in self if isinstance(doc, arg)])
|
|
182
|
-
elif isinstance(arg, list): # type: ignore[reportUnnecessaryIsInstance]
|
|
183
|
-
# Filter by multiple types
|
|
184
|
-
documents = DocumentList()
|
|
185
|
-
for document_type in arg:
|
|
186
|
-
documents.extend([doc for doc in self if isinstance(doc, document_type)])
|
|
187
|
-
return documents
|
|
188
211
|
else:
|
|
189
|
-
|
|
212
|
+
# Try to consume as iterable
|
|
213
|
+
try:
|
|
214
|
+
# Convert to list to check the first element and allow reuse
|
|
215
|
+
items = list(arg) # type: ignore[arg-type]
|
|
216
|
+
if not items:
|
|
217
|
+
return DocumentList()
|
|
218
|
+
|
|
219
|
+
first_item = items[0]
|
|
220
|
+
if isinstance(first_item, str):
|
|
221
|
+
# Iterable of names - validate all items are strings
|
|
222
|
+
for item in items:
|
|
223
|
+
if not isinstance(item, str):
|
|
224
|
+
raise TypeError(
|
|
225
|
+
"Iterable must contain only strings or only Document types, "
|
|
226
|
+
"not mixed types"
|
|
227
|
+
)
|
|
228
|
+
names_set = set(items)
|
|
229
|
+
return DocumentList([doc for doc in self if doc.name in names_set])
|
|
230
|
+
elif isinstance(first_item, type): # type: ignore[reportUnnecessaryIsInstance]
|
|
231
|
+
# Iterable of document types - validate all items are types
|
|
232
|
+
for item in items:
|
|
233
|
+
if not isinstance(item, type):
|
|
234
|
+
raise TypeError(
|
|
235
|
+
"Iterable must contain only strings or only Document types, "
|
|
236
|
+
"not mixed types"
|
|
237
|
+
)
|
|
238
|
+
# Convert to set for efficient lookup
|
|
239
|
+
types_set = set(items)
|
|
240
|
+
# Filter documents that match any of the requested types
|
|
241
|
+
matching = [
|
|
242
|
+
doc
|
|
243
|
+
for doc in self
|
|
244
|
+
if any(isinstance(doc, doc_type) for doc_type in types_set) # type: ignore[arg-type]
|
|
245
|
+
]
|
|
246
|
+
return DocumentList(matching)
|
|
247
|
+
else:
|
|
248
|
+
raise TypeError(
|
|
249
|
+
f"Iterable must contain strings or Document types, "
|
|
250
|
+
f"got {type(first_item).__name__}"
|
|
251
|
+
)
|
|
252
|
+
except (TypeError, AttributeError) as e:
|
|
253
|
+
# If the error message already mentions Iterable, re-raise it
|
|
254
|
+
if "Iterable" in str(e) or "strings or Document types" in str(e):
|
|
255
|
+
raise
|
|
256
|
+
# Otherwise, provide a generic error message
|
|
257
|
+
raise TypeError(f"Invalid argument type for filter_by: {type(arg).__name__}") from e
|
|
190
258
|
|
|
191
259
|
@overload
|
|
192
260
|
def get_by(self, arg: str) -> Document: ...
|
|
@@ -201,38 +269,73 @@ class DocumentList(list[Document]):
|
|
|
201
269
|
def get_by(self, arg: type[Document], required: bool = True) -> Document | None: ...
|
|
202
270
|
|
|
203
271
|
def get_by(self, arg: str | type[Document], required: bool = True) -> Document | None:
|
|
204
|
-
"""Get
|
|
272
|
+
"""Get EXACTLY ONE document by name or type.
|
|
205
273
|
|
|
206
274
|
@public
|
|
207
275
|
|
|
276
|
+
IMPORTANT: This method expects to find exactly one matching document.
|
|
277
|
+
- If no matches and required=True: raises ValueError
|
|
278
|
+
- If no matches and required=False: returns None
|
|
279
|
+
- If multiple matches: ALWAYS raises ValueError (ambiguous)
|
|
280
|
+
|
|
281
|
+
When required=True (default), you do NOT need to check for None:
|
|
282
|
+
>>> doc = docs.get_by("config.yaml") # Will raise if not found
|
|
283
|
+
>>> # No need for: if doc is not None <- This is redundant!
|
|
284
|
+
>>> print(doc.content) # Safe to use directly
|
|
285
|
+
|
|
208
286
|
Args:
|
|
209
287
|
arg: Document name (str) or document type.
|
|
210
|
-
required: If True, raises ValueError when not found.
|
|
288
|
+
required: If True (default), raises ValueError when not found.
|
|
289
|
+
If False, returns None when not found.
|
|
211
290
|
|
|
212
291
|
Returns:
|
|
213
|
-
The
|
|
292
|
+
The single matching document, or None if not found and required=False.
|
|
214
293
|
|
|
215
294
|
Raises:
|
|
216
|
-
ValueError: If required=True and document not found
|
|
295
|
+
ValueError: If required=True and document not found, OR if multiple
|
|
296
|
+
documents match (ambiguous result).
|
|
217
297
|
TypeError: If arg is not a string or Document type.
|
|
218
298
|
|
|
219
299
|
Example:
|
|
220
|
-
>>>
|
|
221
|
-
>>> doc = docs.get_by(
|
|
300
|
+
>>> # CORRECT - No need to check for None when required=True (default)
|
|
301
|
+
>>> doc = docs.get_by("file.txt") # Raises if not found
|
|
302
|
+
>>> print(doc.content) # Safe to use directly
|
|
303
|
+
>>>
|
|
304
|
+
>>> # When using required=False, check for None
|
|
305
|
+
>>> doc = docs.get_by("optional.txt", required=False)
|
|
306
|
+
>>> if doc is not None:
|
|
307
|
+
... print(doc.content)
|
|
308
|
+
>>>
|
|
309
|
+
>>> # Will raise if multiple documents have same type
|
|
310
|
+
>>> # Use filter_by() instead if you want all matches
|
|
311
|
+
>>> try:
|
|
312
|
+
... doc = docs.get_by(ConfigDocument) # Error if 2+ configs
|
|
313
|
+
... except ValueError as e:
|
|
314
|
+
... configs = docs.filter_by(ConfigDocument) # Get all instead
|
|
222
315
|
"""
|
|
223
316
|
if isinstance(arg, str):
|
|
224
|
-
# Get by name
|
|
225
|
-
for doc in self
|
|
226
|
-
|
|
227
|
-
|
|
317
|
+
# Get by name - collect all matches to check for duplicates
|
|
318
|
+
matches = [doc for doc in self if doc.name == arg]
|
|
319
|
+
if len(matches) > 1:
|
|
320
|
+
raise ValueError(
|
|
321
|
+
f"Multiple documents found with name '{arg}'. "
|
|
322
|
+
f"Found {len(matches)} matches. Use filter_by() to get all matches."
|
|
323
|
+
)
|
|
324
|
+
if matches:
|
|
325
|
+
return matches[0]
|
|
228
326
|
if required:
|
|
229
327
|
raise ValueError(f"Document with name '{arg}' not found")
|
|
230
328
|
return None
|
|
231
329
|
elif isinstance(arg, type): # type: ignore[reportUnnecessaryIsInstance]
|
|
232
|
-
# Get by type (including subclasses)
|
|
233
|
-
for doc in self
|
|
234
|
-
|
|
235
|
-
|
|
330
|
+
# Get by type (including subclasses) - collect all matches
|
|
331
|
+
matches = [doc for doc in self if isinstance(doc, arg)]
|
|
332
|
+
if len(matches) > 1:
|
|
333
|
+
raise ValueError(
|
|
334
|
+
f"Multiple documents found of type '{arg.__name__}'. "
|
|
335
|
+
f"Found {len(matches)} matches. Use filter_by() to get all matches."
|
|
336
|
+
)
|
|
337
|
+
if matches:
|
|
338
|
+
return matches[0]
|
|
236
339
|
if required:
|
|
237
340
|
raise ValueError(f"Document of type '{arg.__name__}' not found")
|
|
238
341
|
return None
|
|
@@ -24,40 +24,20 @@ class FlowDocument(Document):
|
|
|
24
24
|
- Persisted to file system between pipeline steps
|
|
25
25
|
- Survives across multiple flow runs
|
|
26
26
|
- Used for flow inputs and outputs
|
|
27
|
-
- Saved in directories
|
|
27
|
+
- Saved in directories organized by the document's type/name
|
|
28
28
|
|
|
29
29
|
Creating FlowDocuments:
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
>>> from enum import StrEnum
|
|
34
|
-
>>>
|
|
35
|
-
>>> # Simple document with pass:
|
|
36
|
-
>>> class MyDoc(FlowDocument):
|
|
37
|
-
... pass
|
|
38
|
-
>>>
|
|
39
|
-
>>> # Document with restricted file names:
|
|
40
|
-
>>> class ConfigDoc(FlowDocument):
|
|
41
|
-
... class FILES(StrEnum):
|
|
42
|
-
... CONFIG = "config.yaml"
|
|
43
|
-
... SETTINGS = "settings.json"
|
|
44
|
-
>>>
|
|
45
|
-
>>> # RECOMMENDED - automatic conversion:
|
|
46
|
-
>>> doc = MyDoc.create(name="data.json", content={"key": "value"})
|
|
47
|
-
>>> doc = ConfigDoc.create(name="config.yaml", content={"host": "localhost"})
|
|
30
|
+
Same as Document - use `create()` for automatic conversion, `__init__` for bytes.
|
|
31
|
+
See Document.create() for detailed usage examples.
|
|
48
32
|
|
|
49
33
|
Persistence:
|
|
50
|
-
Documents are saved
|
|
34
|
+
Documents are saved under an output directory path associated with the document's type/name.
|
|
51
35
|
For example: output/my_doc/data.json
|
|
52
36
|
|
|
53
37
|
Note:
|
|
54
38
|
- Cannot instantiate FlowDocument directly - must subclass
|
|
55
39
|
- Used with FlowConfig to define flow input/output types
|
|
56
40
|
- No additional abstract methods to implement
|
|
57
|
-
|
|
58
|
-
See Also:
|
|
59
|
-
TaskDocument: For temporary documents within task execution
|
|
60
|
-
TemporaryDocument: For documents that are never persisted
|
|
61
41
|
"""
|
|
62
42
|
|
|
63
43
|
def __init__(
|
|
@@ -66,13 +46,11 @@ class FlowDocument(Document):
|
|
|
66
46
|
name: str,
|
|
67
47
|
content: bytes,
|
|
68
48
|
description: str | None = None,
|
|
49
|
+
sources: list[str] = [],
|
|
69
50
|
) -> None:
|
|
70
51
|
"""Initialize a FlowDocument with raw bytes content.
|
|
71
52
|
|
|
72
|
-
|
|
73
|
-
**Most users should use the `create` classmethod instead of __init__.**
|
|
74
|
-
The create method provides automatic content conversion for various types
|
|
75
|
-
(str, dict, list, Pydantic models) while __init__ only accepts bytes.
|
|
53
|
+
See Document.__init__() for parameter details and usage notes.
|
|
76
54
|
|
|
77
55
|
Prevents direct instantiation of the abstract FlowDocument class.
|
|
78
56
|
FlowDocument must be subclassed for specific document types.
|
|
@@ -81,6 +59,7 @@ class FlowDocument(Document):
|
|
|
81
59
|
name: Document filename (required, keyword-only)
|
|
82
60
|
content: Document content as raw bytes (required, keyword-only)
|
|
83
61
|
description: Optional human-readable description (keyword-only)
|
|
62
|
+
sources: Optional list of strings for provenance tracking
|
|
84
63
|
|
|
85
64
|
Raises:
|
|
86
65
|
TypeError: If attempting to instantiate FlowDocument directly
|
|
@@ -109,7 +88,7 @@ class FlowDocument(Document):
|
|
|
109
88
|
"""
|
|
110
89
|
if type(self) is FlowDocument:
|
|
111
90
|
raise TypeError("Cannot instantiate abstract FlowDocument class directly")
|
|
112
|
-
super().__init__(name=name, content=content, description=description)
|
|
91
|
+
super().__init__(name=name, content=content, description=description, sources=sources)
|
|
113
92
|
|
|
114
93
|
@final
|
|
115
94
|
def get_base_type(self) -> Literal["flow"]:
|
|
@@ -29,24 +29,8 @@ class TaskDocument(Document):
|
|
|
29
29
|
- Reduces persistent I/O for temporary data
|
|
30
30
|
|
|
31
31
|
Creating TaskDocuments:
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
>>> from enum import StrEnum
|
|
36
|
-
>>>
|
|
37
|
-
>>> # Simple task document:
|
|
38
|
-
>>> class TempDoc(TaskDocument):
|
|
39
|
-
... pass
|
|
40
|
-
>>>
|
|
41
|
-
>>> # With restricted files:
|
|
42
|
-
>>> class CacheDoc(TaskDocument):
|
|
43
|
-
... class FILES(StrEnum):
|
|
44
|
-
... CACHE = "cache.json"
|
|
45
|
-
... INDEX = "index.dat"
|
|
46
|
-
>>>
|
|
47
|
-
>>> # RECOMMENDED - automatic conversion:
|
|
48
|
-
>>> doc = TempDoc.create(name="temp.json", content={"status": "processing"})
|
|
49
|
-
>>> doc = CacheDoc.create(name="cache.json", content={"data": [1, 2, 3]})
|
|
32
|
+
Same as Document - use `create()` for automatic conversion, `__init__` for bytes.
|
|
33
|
+
See Document.create() for detailed usage examples.
|
|
50
34
|
|
|
51
35
|
Use Cases:
|
|
52
36
|
- Intermediate transformation results
|
|
@@ -59,10 +43,6 @@ class TaskDocument(Document):
|
|
|
59
43
|
- Not saved by simple_runner utilities
|
|
60
44
|
- Reduces I/O overhead for temporary data
|
|
61
45
|
- No additional abstract methods to implement
|
|
62
|
-
|
|
63
|
-
See Also:
|
|
64
|
-
FlowDocument: For documents that persist across flow runs
|
|
65
|
-
TemporaryDocument: Alternative for non-persistent documents
|
|
66
46
|
"""
|
|
67
47
|
|
|
68
48
|
def __init__(
|
|
@@ -71,13 +51,11 @@ class TaskDocument(Document):
|
|
|
71
51
|
name: str,
|
|
72
52
|
content: bytes,
|
|
73
53
|
description: str | None = None,
|
|
54
|
+
sources: list[str] = [],
|
|
74
55
|
) -> None:
|
|
75
56
|
"""Initialize a TaskDocument with raw bytes content.
|
|
76
57
|
|
|
77
|
-
|
|
78
|
-
**Most users should use the `create` classmethod instead of __init__.**
|
|
79
|
-
The create method provides automatic content conversion for various types
|
|
80
|
-
(str, dict, list, Pydantic models) while __init__ only accepts bytes.
|
|
58
|
+
See Document.__init__() for parameter details and usage notes.
|
|
81
59
|
|
|
82
60
|
Prevents direct instantiation of the abstract TaskDocument class.
|
|
83
61
|
TaskDocument must be subclassed for specific temporary document types.
|
|
@@ -86,6 +64,7 @@ class TaskDocument(Document):
|
|
|
86
64
|
name: Document filename (required, keyword-only)
|
|
87
65
|
content: Document content as raw bytes (required, keyword-only)
|
|
88
66
|
description: Optional human-readable description (keyword-only)
|
|
67
|
+
sources: Optional list of strings for provenance tracking
|
|
89
68
|
|
|
90
69
|
Raises:
|
|
91
70
|
TypeError: If attempting to instantiate TaskDocument directly
|
|
@@ -114,7 +93,7 @@ class TaskDocument(Document):
|
|
|
114
93
|
"""
|
|
115
94
|
if type(self) is TaskDocument:
|
|
116
95
|
raise TypeError("Cannot instantiate abstract TaskDocument class directly")
|
|
117
|
-
super().__init__(name=name, content=content, description=description)
|
|
96
|
+
super().__init__(name=name, content=content, description=description, sources=sources)
|
|
118
97
|
|
|
119
98
|
@final
|
|
120
99
|
def get_base_type(self) -> Literal["task"]:
|
|
@@ -1,7 +1,5 @@
|
|
|
1
1
|
"""Temporary document implementation for non-persistent data.
|
|
2
2
|
|
|
3
|
-
@public
|
|
4
|
-
|
|
5
3
|
This module provides the TemporaryDocument class for documents that
|
|
6
4
|
are never persisted, regardless of context.
|
|
7
5
|
"""
|
|
@@ -15,8 +13,6 @@ from .document import Document
|
|
|
15
13
|
class TemporaryDocument(Document):
|
|
16
14
|
r"""Concrete document class for data that is never persisted.
|
|
17
15
|
|
|
18
|
-
@public
|
|
19
|
-
|
|
20
16
|
TemporaryDocument is a final (non-subclassable) document type for
|
|
21
17
|
data that should never be saved to disk, regardless of whether it's
|
|
22
18
|
used in a flow or task context. Unlike FlowDocument and TaskDocument
|
|
@@ -28,27 +24,14 @@ class TemporaryDocument(Document):
|
|
|
28
24
|
- Cannot be subclassed (annotated with Python's @final decorator in code)
|
|
29
25
|
- Useful for transient data like API responses or intermediate calculations
|
|
30
26
|
- Ignored by simple_runner save operations
|
|
27
|
+
- Useful for tests and debugging
|
|
31
28
|
|
|
32
29
|
Creating TemporaryDocuments:
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
>>> doc = TemporaryDocument.create(
|
|
38
|
-
... name="api_response.json",
|
|
39
|
-
... content={"status": "ok", "data": [1, 2, 3]}
|
|
40
|
-
... )
|
|
41
|
-
>>> doc = TemporaryDocument.create(
|
|
42
|
-
... name="credentials.txt",
|
|
43
|
-
... content="secret_token_xyz"
|
|
44
|
-
... )
|
|
45
|
-
>>>
|
|
46
|
-
>>> # Direct constructor - only for bytes:
|
|
47
|
-
>>> doc = TemporaryDocument(
|
|
48
|
-
... name="binary.dat",
|
|
49
|
-
... content=b"\x00\x01\x02"
|
|
50
|
-
... )
|
|
51
|
-
>>>
|
|
30
|
+
Same as Document - use `create()` for automatic conversion, `__init__` for bytes.
|
|
31
|
+
Unlike abstract document types, TemporaryDocument can be instantiated directly.
|
|
32
|
+
See Document.create() for detailed usage examples.
|
|
33
|
+
|
|
34
|
+
>>> doc = TemporaryDocument.create(name="api.json", content={"status": "ok"})
|
|
52
35
|
>>> doc.is_temporary # Always True
|
|
53
36
|
|
|
54
37
|
Use Cases:
|
|
@@ -62,10 +45,6 @@ class TemporaryDocument(Document):
|
|
|
62
45
|
- This is a final class and cannot be subclassed
|
|
63
46
|
- Use when you explicitly want to prevent persistence
|
|
64
47
|
- Useful for sensitive data that shouldn't be written to disk
|
|
65
|
-
|
|
66
|
-
See Also:
|
|
67
|
-
FlowDocument: For documents that persist across flow runs
|
|
68
|
-
TaskDocument: For documents temporary within task execution
|
|
69
48
|
"""
|
|
70
49
|
|
|
71
50
|
def __init_subclass__(cls, **kwargs: Any) -> None:
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
"""Utility functions for document handling.
|
|
2
2
|
|
|
3
3
|
Provides helper functions for URL sanitization, naming conventions,
|
|
4
|
-
|
|
4
|
+
canonical key generation, and hash validation used throughout the document system.
|
|
5
5
|
"""
|
|
6
6
|
|
|
7
7
|
import re
|
|
@@ -115,3 +115,66 @@ def canonical_name_key(
|
|
|
115
115
|
break
|
|
116
116
|
|
|
117
117
|
return camel_to_snake(name)
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
def is_document_sha256(value: str) -> bool:
|
|
121
|
+
"""Check if a string is a valid base32-encoded SHA256 hash with proper entropy.
|
|
122
|
+
|
|
123
|
+
@public
|
|
124
|
+
|
|
125
|
+
This function validates that a string is not just formatted like a SHA256 hash,
|
|
126
|
+
but actually has the entropy characteristics of a real hash. It checks:
|
|
127
|
+
1. Correct length (52 characters without padding)
|
|
128
|
+
2. Valid base32 characters (A-Z, 2-7)
|
|
129
|
+
3. Sufficient entropy (at least 8 unique characters)
|
|
130
|
+
|
|
131
|
+
The entropy check prevents false positives like 'AAAAAAA...AAA' from being
|
|
132
|
+
identified as valid document hashes.
|
|
133
|
+
|
|
134
|
+
Args:
|
|
135
|
+
value: String to check if it's a document SHA256 hash.
|
|
136
|
+
|
|
137
|
+
Returns:
|
|
138
|
+
True if the string appears to be a real base32-encoded SHA256 hash,
|
|
139
|
+
False otherwise.
|
|
140
|
+
|
|
141
|
+
Examples:
|
|
142
|
+
>>> # Real SHA256 hash
|
|
143
|
+
>>> is_document_sha256("P3AEMA2PSYILKFYVBUALJLMIYWVZIS2QDI3S5VTMD2X7SOODF2YQ")
|
|
144
|
+
True
|
|
145
|
+
|
|
146
|
+
>>> # Too uniform - lacks entropy
|
|
147
|
+
>>> is_document_sha256("A" * 52)
|
|
148
|
+
False
|
|
149
|
+
|
|
150
|
+
>>> # Wrong length
|
|
151
|
+
>>> is_document_sha256("ABC123")
|
|
152
|
+
False
|
|
153
|
+
|
|
154
|
+
>>> # Invalid characters
|
|
155
|
+
>>> is_document_sha256("a" * 52) # lowercase
|
|
156
|
+
False
|
|
157
|
+
"""
|
|
158
|
+
# Check basic format: exactly 52 uppercase base32 characters
|
|
159
|
+
try:
|
|
160
|
+
if not value or len(value) != 52:
|
|
161
|
+
return False
|
|
162
|
+
except (TypeError, AttributeError):
|
|
163
|
+
return False
|
|
164
|
+
|
|
165
|
+
# Check if all characters are valid base32 (A-Z, 2-7)
|
|
166
|
+
try:
|
|
167
|
+
if not re.match(r"^[A-Z2-7]{52}$", value):
|
|
168
|
+
return False
|
|
169
|
+
except TypeError:
|
|
170
|
+
# re.match raises TypeError for non-string types like bytes
|
|
171
|
+
return False
|
|
172
|
+
|
|
173
|
+
# Check entropy: real SHA256 hashes have high entropy
|
|
174
|
+
# Require at least 8 unique characters (out of 32 possible in base32)
|
|
175
|
+
# This prevents patterns like "AAAAAAA..." from being identified as real hashes
|
|
176
|
+
unique_chars = len(set(value))
|
|
177
|
+
if unique_chars < 8:
|
|
178
|
+
return False
|
|
179
|
+
|
|
180
|
+
return True
|