ai-pipeline-core 0.1.14__py3-none-any.whl → 0.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ai_pipeline_core/__init__.py +21 -13
- ai_pipeline_core/documents/document.py +202 -51
- ai_pipeline_core/documents/document_list.py +148 -24
- ai_pipeline_core/documents/flow_document.py +2 -6
- ai_pipeline_core/documents/task_document.py +0 -4
- ai_pipeline_core/documents/temporary_document.py +1 -8
- ai_pipeline_core/flow/config.py +174 -5
- ai_pipeline_core/llm/__init__.py +1 -6
- ai_pipeline_core/llm/ai_messages.py +137 -4
- ai_pipeline_core/llm/client.py +118 -65
- ai_pipeline_core/llm/model_options.py +6 -7
- ai_pipeline_core/llm/model_response.py +17 -16
- ai_pipeline_core/llm/model_types.py +3 -7
- ai_pipeline_core/logging/__init__.py +0 -2
- ai_pipeline_core/logging/logging_config.py +0 -6
- ai_pipeline_core/logging/logging_mixin.py +2 -10
- ai_pipeline_core/pipeline.py +54 -68
- ai_pipeline_core/prefect.py +12 -3
- ai_pipeline_core/prompt_manager.py +14 -7
- ai_pipeline_core/settings.py +13 -5
- ai_pipeline_core/simple_runner/__init__.py +1 -11
- ai_pipeline_core/simple_runner/cli.py +13 -12
- ai_pipeline_core/simple_runner/simple_runner.py +34 -189
- ai_pipeline_core/storage/__init__.py +8 -0
- ai_pipeline_core/storage/storage.py +628 -0
- ai_pipeline_core/tracing.py +234 -30
- {ai_pipeline_core-0.1.14.dist-info → ai_pipeline_core-0.2.1.dist-info}/METADATA +35 -20
- ai_pipeline_core-0.2.1.dist-info/RECORD +38 -0
- ai_pipeline_core-0.1.14.dist-info/RECORD +0 -36
- {ai_pipeline_core-0.1.14.dist-info → ai_pipeline_core-0.2.1.dist-info}/WHEEL +0 -0
- {ai_pipeline_core-0.1.14.dist-info → ai_pipeline_core-0.2.1.dist-info}/licenses/LICENSE +0 -0
|
@@ -3,7 +3,8 @@
|
|
|
3
3
|
@public
|
|
4
4
|
"""
|
|
5
5
|
|
|
6
|
-
from
|
|
6
|
+
from copy import deepcopy
|
|
7
|
+
from typing import Any, Callable, Iterable, SupportsIndex, Union, overload
|
|
7
8
|
|
|
8
9
|
from typing_extensions import Self
|
|
9
10
|
|
|
@@ -17,8 +18,8 @@ class DocumentList(list[Document]):
|
|
|
17
18
|
|
|
18
19
|
Specialized list with validation and filtering for documents.
|
|
19
20
|
|
|
20
|
-
Best Practice: Use default constructor
|
|
21
|
-
validate_same_type or validate_duplicates when you explicitly need them.
|
|
21
|
+
Best Practice: Use default constructor by default, unless instructed otherwise.
|
|
22
|
+
Only enable validate_same_type or validate_duplicates when you explicitly need them.
|
|
22
23
|
|
|
23
24
|
Example:
|
|
24
25
|
>>> # RECOMMENDED - default constructor for most cases
|
|
@@ -37,6 +38,7 @@ class DocumentList(list[Document]):
|
|
|
37
38
|
documents: list[Document] | None = None,
|
|
38
39
|
validate_same_type: bool = False,
|
|
39
40
|
validate_duplicates: bool = False,
|
|
41
|
+
frozen: bool = False,
|
|
40
42
|
) -> None:
|
|
41
43
|
"""Initialize DocumentList.
|
|
42
44
|
|
|
@@ -46,12 +48,15 @@ class DocumentList(list[Document]):
|
|
|
46
48
|
documents: Initial list of documents.
|
|
47
49
|
validate_same_type: Enforce same document type.
|
|
48
50
|
validate_duplicates: Prevent duplicate filenames.
|
|
51
|
+
frozen: If True, list is immutable from creation.
|
|
49
52
|
"""
|
|
50
53
|
super().__init__()
|
|
51
54
|
self._validate_same_type = validate_same_type
|
|
52
55
|
self._validate_duplicates = validate_duplicates
|
|
56
|
+
self._frozen = False # Initialize as unfrozen to allow initial population
|
|
53
57
|
if documents:
|
|
54
58
|
self.extend(documents)
|
|
59
|
+
self._frozen = frozen # Set frozen state after initial population
|
|
55
60
|
|
|
56
61
|
def _validate_no_duplicates(self) -> None:
|
|
57
62
|
"""Check for duplicate document names.
|
|
@@ -109,18 +114,51 @@ class DocumentList(list[Document]):
|
|
|
109
114
|
self._validate_no_description_files()
|
|
110
115
|
self._validate_types()
|
|
111
116
|
|
|
117
|
+
def freeze(self) -> None:
|
|
118
|
+
"""Permanently freeze the list, preventing modifications.
|
|
119
|
+
|
|
120
|
+
Once frozen, the list cannot be unfrozen.
|
|
121
|
+
"""
|
|
122
|
+
self._frozen = True
|
|
123
|
+
|
|
124
|
+
def copy(self) -> "DocumentList":
|
|
125
|
+
"""Create an unfrozen deep copy of the list.
|
|
126
|
+
|
|
127
|
+
Returns:
|
|
128
|
+
New unfrozen DocumentList with deep-copied documents.
|
|
129
|
+
"""
|
|
130
|
+
copied_docs = deepcopy(list(self))
|
|
131
|
+
return DocumentList(
|
|
132
|
+
documents=copied_docs,
|
|
133
|
+
validate_same_type=self._validate_same_type,
|
|
134
|
+
validate_duplicates=self._validate_duplicates,
|
|
135
|
+
frozen=False, # Copies are always unfrozen
|
|
136
|
+
)
|
|
137
|
+
|
|
138
|
+
def _check_frozen(self) -> None:
|
|
139
|
+
"""Check if list is frozen and raise if it is.
|
|
140
|
+
|
|
141
|
+
Raises:
|
|
142
|
+
RuntimeError: If the list is frozen.
|
|
143
|
+
"""
|
|
144
|
+
if self._frozen:
|
|
145
|
+
raise RuntimeError("Cannot modify frozen DocumentList")
|
|
146
|
+
|
|
112
147
|
def append(self, document: Document) -> None:
|
|
113
148
|
"""Add a document to the end of the list."""
|
|
149
|
+
self._check_frozen()
|
|
114
150
|
super().append(document)
|
|
115
151
|
self._validate()
|
|
116
152
|
|
|
117
153
|
def extend(self, documents: Iterable[Document]) -> None:
|
|
118
154
|
"""Add multiple documents to the list."""
|
|
155
|
+
self._check_frozen()
|
|
119
156
|
super().extend(documents)
|
|
120
157
|
self._validate()
|
|
121
158
|
|
|
122
159
|
def insert(self, index: SupportsIndex, document: Document) -> None:
|
|
123
160
|
"""Insert a document at the specified position."""
|
|
161
|
+
self._check_frozen()
|
|
124
162
|
super().insert(index, document)
|
|
125
163
|
self._validate()
|
|
126
164
|
|
|
@@ -132,6 +170,7 @@ class DocumentList(list[Document]):
|
|
|
132
170
|
|
|
133
171
|
def __setitem__(self, index: Union[SupportsIndex, slice], value: Any) -> None:
|
|
134
172
|
"""Set item or slice with validation."""
|
|
173
|
+
self._check_frozen()
|
|
135
174
|
super().__setitem__(index, value)
|
|
136
175
|
self._validate()
|
|
137
176
|
|
|
@@ -141,10 +180,48 @@ class DocumentList(list[Document]):
|
|
|
141
180
|
Returns:
|
|
142
181
|
Self: This DocumentList after modification.
|
|
143
182
|
"""
|
|
183
|
+
self._check_frozen()
|
|
144
184
|
result = super().__iadd__(other)
|
|
145
185
|
self._validate()
|
|
146
186
|
return result
|
|
147
187
|
|
|
188
|
+
def __delitem__(self, index: Union[SupportsIndex, slice]) -> None:
|
|
189
|
+
"""Delete item or slice from list."""
|
|
190
|
+
self._check_frozen()
|
|
191
|
+
super().__delitem__(index)
|
|
192
|
+
|
|
193
|
+
def pop(self, index: SupportsIndex = -1) -> Document:
|
|
194
|
+
"""Remove and return item at index.
|
|
195
|
+
|
|
196
|
+
Returns:
|
|
197
|
+
Document removed from the list.
|
|
198
|
+
"""
|
|
199
|
+
self._check_frozen()
|
|
200
|
+
return super().pop(index)
|
|
201
|
+
|
|
202
|
+
def remove(self, document: Document) -> None:
|
|
203
|
+
"""Remove first occurrence of document."""
|
|
204
|
+
self._check_frozen()
|
|
205
|
+
super().remove(document)
|
|
206
|
+
|
|
207
|
+
def clear(self) -> None:
|
|
208
|
+
"""Remove all items from list."""
|
|
209
|
+
self._check_frozen()
|
|
210
|
+
super().clear()
|
|
211
|
+
|
|
212
|
+
def reverse(self) -> None:
|
|
213
|
+
"""Reverse list in place."""
|
|
214
|
+
self._check_frozen()
|
|
215
|
+
super().reverse()
|
|
216
|
+
|
|
217
|
+
def sort(self, *, key: Callable[[Document], Any] | None = None, reverse: bool = False) -> None:
|
|
218
|
+
"""Sort list in place."""
|
|
219
|
+
self._check_frozen()
|
|
220
|
+
if key is None:
|
|
221
|
+
super().sort(reverse=reverse) # type: ignore[call-arg]
|
|
222
|
+
else:
|
|
223
|
+
super().sort(key=key, reverse=reverse)
|
|
224
|
+
|
|
148
225
|
@overload
|
|
149
226
|
def filter_by(self, arg: str) -> "DocumentList": ...
|
|
150
227
|
|
|
@@ -164,6 +241,9 @@ class DocumentList(list[Document]):
|
|
|
164
241
|
|
|
165
242
|
@public
|
|
166
243
|
|
|
244
|
+
ALWAYS returns a DocumentList (which may be empty), never raises an exception
|
|
245
|
+
for no matches. Use this when you want to process all matching documents.
|
|
246
|
+
|
|
167
247
|
Args:
|
|
168
248
|
arg: Can be one of:
|
|
169
249
|
- str: Single document name to filter by
|
|
@@ -174,7 +254,9 @@ class DocumentList(list[Document]):
|
|
|
174
254
|
(list, tuple, set, generator, or any iterable)
|
|
175
255
|
|
|
176
256
|
Returns:
|
|
177
|
-
New DocumentList with filtered documents.
|
|
257
|
+
New DocumentList with filtered documents (may be empty).
|
|
258
|
+
- Returns ALL matching documents
|
|
259
|
+
- Empty DocumentList if no matches found
|
|
178
260
|
|
|
179
261
|
Raises:
|
|
180
262
|
TypeError: If arg is not a valid type (not str, type, or iterable),
|
|
@@ -182,12 +264,19 @@ class DocumentList(list[Document]):
|
|
|
182
264
|
AttributeError: If arg is expected to be iterable but doesn't support iteration.
|
|
183
265
|
|
|
184
266
|
Example:
|
|
185
|
-
>>>
|
|
186
|
-
>>> docs.filter_by(
|
|
187
|
-
>>>
|
|
188
|
-
|
|
189
|
-
>>>
|
|
190
|
-
>>>
|
|
267
|
+
>>> # Returns list with all matching documents
|
|
268
|
+
>>> matching_docs = docs.filter_by("file.txt") # May be empty
|
|
269
|
+
>>> for doc in matching_docs:
|
|
270
|
+
... process(doc)
|
|
271
|
+
>>>
|
|
272
|
+
>>> # Filter by type - returns all instances
|
|
273
|
+
>>> config_docs = docs.filter_by(ConfigDocument)
|
|
274
|
+
>>> print(f"Found {len(config_docs)} config documents")
|
|
275
|
+
>>>
|
|
276
|
+
>>> # Filter by multiple names
|
|
277
|
+
>>> important_docs = docs.filter_by(["config.yaml", "settings.json"])
|
|
278
|
+
>>> if not important_docs: # Check if empty
|
|
279
|
+
... print("No important documents found")
|
|
191
280
|
"""
|
|
192
281
|
if isinstance(arg, str):
|
|
193
282
|
# Filter by single name
|
|
@@ -257,38 +346,73 @@ class DocumentList(list[Document]):
|
|
|
257
346
|
def get_by(self, arg: type[Document], required: bool = True) -> Document | None: ...
|
|
258
347
|
|
|
259
348
|
def get_by(self, arg: str | type[Document], required: bool = True) -> Document | None:
|
|
260
|
-
"""Get
|
|
349
|
+
"""Get EXACTLY ONE document by name or type.
|
|
261
350
|
|
|
262
351
|
@public
|
|
263
352
|
|
|
353
|
+
IMPORTANT: This method expects to find exactly one matching document.
|
|
354
|
+
- If no matches and required=True: raises ValueError
|
|
355
|
+
- If no matches and required=False: returns None
|
|
356
|
+
- If multiple matches: ALWAYS raises ValueError (ambiguous)
|
|
357
|
+
|
|
358
|
+
When required=True (default), you do NOT need to check for None:
|
|
359
|
+
>>> doc = docs.get_by("config.yaml") # Will raise if not found
|
|
360
|
+
>>> # No need for: if doc is not None <- This is redundant!
|
|
361
|
+
>>> print(doc.content) # Safe to use directly
|
|
362
|
+
|
|
264
363
|
Args:
|
|
265
364
|
arg: Document name (str) or document type.
|
|
266
|
-
required: If True, raises ValueError when not found.
|
|
365
|
+
required: If True (default), raises ValueError when not found.
|
|
366
|
+
If False, returns None when not found.
|
|
267
367
|
|
|
268
368
|
Returns:
|
|
269
|
-
The
|
|
369
|
+
The single matching document, or None if not found and required=False.
|
|
270
370
|
|
|
271
371
|
Raises:
|
|
272
|
-
ValueError: If required=True and document not found
|
|
372
|
+
ValueError: If required=True and document not found, OR if multiple
|
|
373
|
+
documents match (ambiguous result).
|
|
273
374
|
TypeError: If arg is not a string or Document type.
|
|
274
375
|
|
|
275
376
|
Example:
|
|
276
|
-
>>>
|
|
277
|
-
>>> doc = docs.get_by(
|
|
377
|
+
>>> # CORRECT - No need to check for None when required=True (default)
|
|
378
|
+
>>> doc = docs.get_by("file.txt") # Raises if not found
|
|
379
|
+
>>> print(doc.content) # Safe to use directly
|
|
380
|
+
>>>
|
|
381
|
+
>>> # When using required=False, check for None
|
|
382
|
+
>>> doc = docs.get_by("optional.txt", required=False)
|
|
383
|
+
>>> if doc is not None:
|
|
384
|
+
... print(doc.content)
|
|
385
|
+
>>>
|
|
386
|
+
>>> # Will raise if multiple documents have same type
|
|
387
|
+
>>> # Use filter_by() instead if you want all matches
|
|
388
|
+
>>> try:
|
|
389
|
+
... doc = docs.get_by(ConfigDocument) # Error if 2+ configs
|
|
390
|
+
>>> except ValueError as e:
|
|
391
|
+
... configs = docs.filter_by(ConfigDocument) # Get all instead
|
|
278
392
|
"""
|
|
279
393
|
if isinstance(arg, str):
|
|
280
|
-
# Get by name
|
|
281
|
-
for doc in self
|
|
282
|
-
|
|
283
|
-
|
|
394
|
+
# Get by name - collect all matches to check for duplicates
|
|
395
|
+
matches = [doc for doc in self if doc.name == arg]
|
|
396
|
+
if len(matches) > 1:
|
|
397
|
+
raise ValueError(
|
|
398
|
+
f"Multiple documents found with name '{arg}'. "
|
|
399
|
+
f"Found {len(matches)} matches. Use filter_by() to get all matches."
|
|
400
|
+
)
|
|
401
|
+
if matches:
|
|
402
|
+
return matches[0]
|
|
284
403
|
if required:
|
|
285
404
|
raise ValueError(f"Document with name '{arg}' not found")
|
|
286
405
|
return None
|
|
287
406
|
elif isinstance(arg, type): # type: ignore[reportUnnecessaryIsInstance]
|
|
288
|
-
# Get by type (including subclasses)
|
|
289
|
-
for doc in self
|
|
290
|
-
|
|
291
|
-
|
|
407
|
+
# Get by type (including subclasses) - collect all matches
|
|
408
|
+
matches = [doc for doc in self if isinstance(doc, arg)]
|
|
409
|
+
if len(matches) > 1:
|
|
410
|
+
raise ValueError(
|
|
411
|
+
f"Multiple documents found of type '{arg.__name__}'. "
|
|
412
|
+
f"Found {len(matches)} matches. Use filter_by() to get all matches."
|
|
413
|
+
)
|
|
414
|
+
if matches:
|
|
415
|
+
return matches[0]
|
|
292
416
|
if required:
|
|
293
417
|
raise ValueError(f"Document of type '{arg.__name__}' not found")
|
|
294
418
|
return None
|
|
@@ -24,24 +24,20 @@ class FlowDocument(Document):
|
|
|
24
24
|
- Persisted to file system between pipeline steps
|
|
25
25
|
- Survives across multiple flow runs
|
|
26
26
|
- Used for flow inputs and outputs
|
|
27
|
-
- Saved in directories
|
|
27
|
+
- Saved in directories organized by the document's type/name
|
|
28
28
|
|
|
29
29
|
Creating FlowDocuments:
|
|
30
30
|
Same as Document - use `create()` for automatic conversion, `__init__` for bytes.
|
|
31
31
|
See Document.create() for detailed usage examples.
|
|
32
32
|
|
|
33
33
|
Persistence:
|
|
34
|
-
Documents are saved
|
|
34
|
+
Documents are saved under an output directory path associated with the document's type/name.
|
|
35
35
|
For example: output/my_doc/data.json
|
|
36
36
|
|
|
37
37
|
Note:
|
|
38
38
|
- Cannot instantiate FlowDocument directly - must subclass
|
|
39
39
|
- Used with FlowConfig to define flow input/output types
|
|
40
40
|
- No additional abstract methods to implement
|
|
41
|
-
|
|
42
|
-
See Also:
|
|
43
|
-
TaskDocument: For temporary documents within task execution
|
|
44
|
-
TemporaryDocument: For documents that are never persisted
|
|
45
41
|
"""
|
|
46
42
|
|
|
47
43
|
def __init__(
|
|
@@ -43,10 +43,6 @@ class TaskDocument(Document):
|
|
|
43
43
|
- Not saved by simple_runner utilities
|
|
44
44
|
- Reduces I/O overhead for temporary data
|
|
45
45
|
- No additional abstract methods to implement
|
|
46
|
-
|
|
47
|
-
See Also:
|
|
48
|
-
FlowDocument: For documents that persist across flow runs
|
|
49
|
-
TemporaryDocument: Alternative for non-persistent documents
|
|
50
46
|
"""
|
|
51
47
|
|
|
52
48
|
def __init__(
|
|
@@ -1,7 +1,5 @@
|
|
|
1
1
|
"""Temporary document implementation for non-persistent data.
|
|
2
2
|
|
|
3
|
-
@public
|
|
4
|
-
|
|
5
3
|
This module provides the TemporaryDocument class for documents that
|
|
6
4
|
are never persisted, regardless of context.
|
|
7
5
|
"""
|
|
@@ -15,8 +13,6 @@ from .document import Document
|
|
|
15
13
|
class TemporaryDocument(Document):
|
|
16
14
|
r"""Concrete document class for data that is never persisted.
|
|
17
15
|
|
|
18
|
-
@public
|
|
19
|
-
|
|
20
16
|
TemporaryDocument is a final (non-subclassable) document type for
|
|
21
17
|
data that should never be saved to disk, regardless of whether it's
|
|
22
18
|
used in a flow or task context. Unlike FlowDocument and TaskDocument
|
|
@@ -28,6 +24,7 @@ class TemporaryDocument(Document):
|
|
|
28
24
|
- Cannot be subclassed (annotated with Python's @final decorator in code)
|
|
29
25
|
- Useful for transient data like API responses or intermediate calculations
|
|
30
26
|
- Ignored by simple_runner save operations
|
|
27
|
+
- Useful for tests and debugging
|
|
31
28
|
|
|
32
29
|
Creating TemporaryDocuments:
|
|
33
30
|
Same as Document - use `create()` for automatic conversion, `__init__` for bytes.
|
|
@@ -48,10 +45,6 @@ class TemporaryDocument(Document):
|
|
|
48
45
|
- This is a final class and cannot be subclassed
|
|
49
46
|
- Use when you explicitly want to prevent persistence
|
|
50
47
|
- Useful for sensitive data that shouldn't be written to disk
|
|
51
|
-
|
|
52
|
-
See Also:
|
|
53
|
-
FlowDocument: For documents that persist across flow runs
|
|
54
|
-
TaskDocument: For documents temporary within task execution
|
|
55
48
|
"""
|
|
56
49
|
|
|
57
50
|
def __init_subclass__(cls, **kwargs: Any) -> None:
|
ai_pipeline_core/flow/config.py
CHANGED
|
@@ -10,11 +10,16 @@ Best Practice:
|
|
|
10
10
|
to ensure type safety and proper validation of output documents.
|
|
11
11
|
"""
|
|
12
12
|
|
|
13
|
+
import json
|
|
13
14
|
from abc import ABC
|
|
14
15
|
from typing import Any, ClassVar, Iterable
|
|
15
16
|
|
|
16
|
-
from ai_pipeline_core.documents import DocumentList, FlowDocument
|
|
17
|
+
from ai_pipeline_core.documents import Document, DocumentList, FlowDocument
|
|
17
18
|
from ai_pipeline_core.exceptions import DocumentValidationError
|
|
19
|
+
from ai_pipeline_core.logging import get_pipeline_logger
|
|
20
|
+
from ai_pipeline_core.storage import Storage
|
|
21
|
+
|
|
22
|
+
logger = get_pipeline_logger(__name__)
|
|
18
23
|
|
|
19
24
|
|
|
20
25
|
class FlowConfig(ABC):
|
|
@@ -51,8 +56,10 @@ class FlowConfig(ABC):
|
|
|
51
56
|
... OUTPUT_DOCUMENT_TYPE = ProcessedDocument # Different type!
|
|
52
57
|
>>>
|
|
53
58
|
>>> # Use in @pipeline_flow - RECOMMENDED PATTERN
|
|
54
|
-
>>> @pipeline_flow(name="processing")
|
|
55
|
-
>>> async def process(
|
|
59
|
+
>>> @pipeline_flow(config=ProcessingFlowConfig, name="processing")
|
|
60
|
+
>>> async def process(
|
|
61
|
+
... project_name: str, docs: DocumentList, flow_options: FlowOptions
|
|
62
|
+
... ) -> DocumentList:
|
|
56
63
|
... outputs = []
|
|
57
64
|
... # ... processing logic ...
|
|
58
65
|
... return config.create_and_validate_output(outputs)
|
|
@@ -289,8 +296,10 @@ class FlowConfig(ABC):
|
|
|
289
296
|
DocumentValidationError: If output type doesn't match OUTPUT_DOCUMENT_TYPE.
|
|
290
297
|
|
|
291
298
|
Example:
|
|
292
|
-
>>> @pipeline_flow(name="my_flow")
|
|
293
|
-
>>> async def process_flow(
|
|
299
|
+
>>> @pipeline_flow(config=MyFlowConfig, name="my_flow")
|
|
300
|
+
>>> async def process_flow(
|
|
301
|
+
... project_name: str, documents: DocumentList, flow_options: FlowOptions
|
|
302
|
+
... ) -> DocumentList:
|
|
294
303
|
>>> outputs = []
|
|
295
304
|
>>> # ... processing logic ...
|
|
296
305
|
>>> outputs.append(OutputDoc(...))
|
|
@@ -312,3 +321,163 @@ class FlowConfig(ABC):
|
|
|
312
321
|
documents = DocumentList(list(output)) # type: ignore[arg-type]
|
|
313
322
|
cls.validate_output_documents(documents)
|
|
314
323
|
return documents
|
|
324
|
+
|
|
325
|
+
@classmethod
|
|
326
|
+
async def load_documents(
|
|
327
|
+
cls,
|
|
328
|
+
uri: str,
|
|
329
|
+
) -> DocumentList:
|
|
330
|
+
"""Load documents from storage matching INPUT_DOCUMENT_TYPES.
|
|
331
|
+
|
|
332
|
+
Loads documents from a storage location based on the class's INPUT_DOCUMENT_TYPES.
|
|
333
|
+
Supports both local filesystem and Google Cloud Storage backends.
|
|
334
|
+
Automatically loads metadata (.description.md and .sources.json) when present.
|
|
335
|
+
|
|
336
|
+
Args:
|
|
337
|
+
uri: Storage URI (file://, gs://, or local path)
|
|
338
|
+
|
|
339
|
+
Returns:
|
|
340
|
+
DocumentList containing loaded documents matching INPUT_DOCUMENT_TYPES
|
|
341
|
+
|
|
342
|
+
Example:
|
|
343
|
+
>>> # Load from local filesystem
|
|
344
|
+
>>> docs = await MyFlowConfig.load_documents("./data")
|
|
345
|
+
>>>
|
|
346
|
+
>>> # Load from GCS (uses GCS_SERVICE_ACCOUNT_FILE from settings if configured)
|
|
347
|
+
>>> docs = await MyFlowConfig.load_documents("gs://bucket/data")
|
|
348
|
+
"""
|
|
349
|
+
# Use INPUT_DOCUMENT_TYPES if not specified
|
|
350
|
+
storage = await Storage.from_uri(uri)
|
|
351
|
+
loaded_documents = DocumentList()
|
|
352
|
+
|
|
353
|
+
# Process each document type
|
|
354
|
+
for doc_type in cls.INPUT_DOCUMENT_TYPES:
|
|
355
|
+
canonical_name = doc_type.canonical_name()
|
|
356
|
+
doc_storage = storage.with_base(canonical_name)
|
|
357
|
+
|
|
358
|
+
# Check if subdirectory exists
|
|
359
|
+
if not await doc_storage.exists(""):
|
|
360
|
+
logger.debug(f"Subdirectory {canonical_name} not found, skipping")
|
|
361
|
+
continue
|
|
362
|
+
|
|
363
|
+
# List files in subdirectory
|
|
364
|
+
objects = await doc_storage.list("", recursive=False, include_dirs=False)
|
|
365
|
+
|
|
366
|
+
# Create lookup set for metadata files
|
|
367
|
+
object_keys = {obj.key for obj in objects}
|
|
368
|
+
|
|
369
|
+
# Filter out metadata files
|
|
370
|
+
doc_files = [
|
|
371
|
+
obj
|
|
372
|
+
for obj in objects
|
|
373
|
+
if not obj.key.endswith(Document.DESCRIPTION_EXTENSION)
|
|
374
|
+
and not obj.key.endswith(Document.SOURCES_EXTENSION)
|
|
375
|
+
]
|
|
376
|
+
|
|
377
|
+
for obj in doc_files:
|
|
378
|
+
try:
|
|
379
|
+
# Load document content
|
|
380
|
+
content = await doc_storage.read_bytes(obj.key)
|
|
381
|
+
|
|
382
|
+
# Load metadata if present
|
|
383
|
+
description = None
|
|
384
|
+
sources: list[str] = []
|
|
385
|
+
|
|
386
|
+
# Check for description in objects list
|
|
387
|
+
desc_path = f"{obj.key}{Document.DESCRIPTION_EXTENSION}"
|
|
388
|
+
if desc_path in object_keys:
|
|
389
|
+
try:
|
|
390
|
+
description = await doc_storage.read_text(desc_path)
|
|
391
|
+
except Exception as e:
|
|
392
|
+
logger.warning(f"Failed to load description for {obj.key}: {e}")
|
|
393
|
+
|
|
394
|
+
# Check for sources in objects list
|
|
395
|
+
sources_path = f"{obj.key}{Document.SOURCES_EXTENSION}"
|
|
396
|
+
if sources_path in object_keys:
|
|
397
|
+
try:
|
|
398
|
+
sources_text = await doc_storage.read_text(sources_path)
|
|
399
|
+
sources = json.loads(sources_text)
|
|
400
|
+
except Exception as e:
|
|
401
|
+
logger.warning(f"Failed to load sources for {obj.key}: {e}")
|
|
402
|
+
|
|
403
|
+
# Create document instance
|
|
404
|
+
doc = doc_type(
|
|
405
|
+
name=obj.key,
|
|
406
|
+
content=content,
|
|
407
|
+
description=description,
|
|
408
|
+
sources=sources,
|
|
409
|
+
)
|
|
410
|
+
|
|
411
|
+
loaded_documents.append(doc)
|
|
412
|
+
logger.debug(f"Loaded {doc_type.__name__} document: {obj.key}")
|
|
413
|
+
except Exception as e:
|
|
414
|
+
logger.error(f"Failed to load {doc_type.__name__} document {obj.key}: {e}")
|
|
415
|
+
|
|
416
|
+
logger.info(f"Loaded {len(loaded_documents)} documents from {uri}")
|
|
417
|
+
return loaded_documents
|
|
418
|
+
|
|
419
|
+
@classmethod
|
|
420
|
+
async def save_documents(
|
|
421
|
+
cls,
|
|
422
|
+
uri: str,
|
|
423
|
+
documents: DocumentList,
|
|
424
|
+
*,
|
|
425
|
+
validate_output_type: bool = True,
|
|
426
|
+
) -> None:
|
|
427
|
+
"""Save documents to storage with metadata.
|
|
428
|
+
|
|
429
|
+
Saves FlowDocument instances to a storage location with their content
|
|
430
|
+
and metadata files (Document.DESCRIPTION_EXTENSION and Document.SOURCES_EXTENSION).
|
|
431
|
+
Non-FlowDocument instances (TaskDocument, TemporaryDocument) are skipped.
|
|
432
|
+
|
|
433
|
+
Args:
|
|
434
|
+
uri: Storage URI (file://, gs://, or local path)
|
|
435
|
+
documents: DocumentList to save
|
|
436
|
+
validate_output_type: If True, validate documents match cls.OUTPUT_DOCUMENT_TYPE
|
|
437
|
+
|
|
438
|
+
Raises:
|
|
439
|
+
DocumentValidationError: If validate_output_type=True and documents don't match
|
|
440
|
+
OUTPUT_DOCUMENT_TYPE
|
|
441
|
+
|
|
442
|
+
Example:
|
|
443
|
+
>>> # Save to local filesystem
|
|
444
|
+
>>> await MyFlowConfig.save_documents("./output", docs)
|
|
445
|
+
>>>
|
|
446
|
+
>>> # Save to GCS (uses GCS_SERVICE_ACCOUNT_FILE from settings if configured)
|
|
447
|
+
>>> await MyFlowConfig.save_documents("gs://bucket/output", docs)
|
|
448
|
+
"""
|
|
449
|
+
# Validate output type if requested
|
|
450
|
+
if validate_output_type:
|
|
451
|
+
cls.validate_output_documents(documents)
|
|
452
|
+
|
|
453
|
+
storage = await Storage.from_uri(uri)
|
|
454
|
+
saved_count = 0
|
|
455
|
+
|
|
456
|
+
for doc in documents:
|
|
457
|
+
# Skip non-FlowDocument instances
|
|
458
|
+
if not isinstance(doc, FlowDocument):
|
|
459
|
+
logger.warning(f"Skipping non-FlowDocument: {type(doc).__name__}")
|
|
460
|
+
continue
|
|
461
|
+
|
|
462
|
+
# Get canonical name for subdirectory
|
|
463
|
+
canonical_name = doc.canonical_name()
|
|
464
|
+
doc_storage = storage.with_base(canonical_name)
|
|
465
|
+
|
|
466
|
+
# Save document content
|
|
467
|
+
await doc_storage.write_bytes(doc.name, doc.content)
|
|
468
|
+
saved_count += 1
|
|
469
|
+
|
|
470
|
+
# Save description if present
|
|
471
|
+
if doc.description:
|
|
472
|
+
desc_path = f"{doc.name}{Document.DESCRIPTION_EXTENSION}"
|
|
473
|
+
await doc_storage.write_text(desc_path, doc.description)
|
|
474
|
+
|
|
475
|
+
# Save sources if present
|
|
476
|
+
if doc.sources:
|
|
477
|
+
sources_path = f"{doc.name}{Document.SOURCES_EXTENSION}"
|
|
478
|
+
sources_json = json.dumps(doc.sources, indent=2)
|
|
479
|
+
await doc_storage.write_text(sources_path, sources_json)
|
|
480
|
+
|
|
481
|
+
logger.debug(f"Saved {type(doc).__name__} document: {doc.name}")
|
|
482
|
+
|
|
483
|
+
logger.info(f"Saved {saved_count} documents to {uri}")
|
ai_pipeline_core/llm/__init__.py
CHANGED
|
@@ -8,8 +8,6 @@ from .ai_messages import AIMessages, AIMessageType
|
|
|
8
8
|
from .client import (
|
|
9
9
|
generate,
|
|
10
10
|
generate_structured,
|
|
11
|
-
generate_with_retry_for_testing,
|
|
12
|
-
process_messages_for_testing,
|
|
13
11
|
)
|
|
14
12
|
from .model_options import ModelOptions
|
|
15
13
|
from .model_response import ModelResponse, StructuredModelResponse
|
|
@@ -19,12 +17,9 @@ __all__ = [
|
|
|
19
17
|
"AIMessages",
|
|
20
18
|
"AIMessageType",
|
|
21
19
|
"ModelName",
|
|
22
|
-
"ModelOptions",
|
|
23
20
|
"ModelResponse",
|
|
21
|
+
"ModelOptions",
|
|
24
22
|
"StructuredModelResponse",
|
|
25
23
|
"generate",
|
|
26
24
|
"generate_structured",
|
|
27
|
-
# Internal functions exposed for testing only
|
|
28
|
-
"process_messages_for_testing",
|
|
29
|
-
"generate_with_retry_for_testing",
|
|
30
25
|
]
|