ai-pipeline-core 0.1.14__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff compares two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the package contents exactly as they appear in the public registry.
- ai_pipeline_core/__init__.py +21 -13
- ai_pipeline_core/documents/document.py +93 -50
- ai_pipeline_core/documents/document_list.py +70 -23
- ai_pipeline_core/documents/flow_document.py +2 -6
- ai_pipeline_core/documents/task_document.py +0 -4
- ai_pipeline_core/documents/temporary_document.py +1 -8
- ai_pipeline_core/flow/config.py +174 -5
- ai_pipeline_core/llm/__init__.py +1 -1
- ai_pipeline_core/llm/ai_messages.py +14 -4
- ai_pipeline_core/llm/client.py +116 -59
- ai_pipeline_core/llm/model_options.py +2 -5
- ai_pipeline_core/llm/model_response.py +17 -16
- ai_pipeline_core/llm/model_types.py +0 -4
- ai_pipeline_core/logging/__init__.py +0 -2
- ai_pipeline_core/logging/logging_config.py +0 -6
- ai_pipeline_core/logging/logging_mixin.py +2 -10
- ai_pipeline_core/pipeline.py +45 -68
- ai_pipeline_core/prefect.py +12 -3
- ai_pipeline_core/prompt_manager.py +6 -7
- ai_pipeline_core/settings.py +13 -5
- ai_pipeline_core/simple_runner/__init__.py +1 -11
- ai_pipeline_core/simple_runner/cli.py +13 -12
- ai_pipeline_core/simple_runner/simple_runner.py +34 -189
- ai_pipeline_core/storage/__init__.py +8 -0
- ai_pipeline_core/storage/storage.py +628 -0
- ai_pipeline_core/tracing.py +3 -26
- {ai_pipeline_core-0.1.14.dist-info → ai_pipeline_core-0.2.0.dist-info}/METADATA +19 -17
- ai_pipeline_core-0.2.0.dist-info/RECORD +38 -0
- ai_pipeline_core-0.1.14.dist-info/RECORD +0 -36
- {ai_pipeline_core-0.1.14.dist-info → ai_pipeline_core-0.2.0.dist-info}/WHEEL +0 -0
- {ai_pipeline_core-0.1.14.dist-info → ai_pipeline_core-0.2.0.dist-info}/licenses/LICENSE +0 -0
@@ -5,24 +5,24 @@ locally without full Prefect orchestration. It handles document I/O,
 flow sequencing, and error management.
 
 Key components:
-- Document I/O from/to filesystem directories
+- Document I/O from/to filesystem directories via FlowConfig
 - Single and multi-flow execution
 - Automatic document validation and passing between flows
 - Step-based execution control (start/end steps)
 
 Directory structure:
     working_dir/
-    ├──
+    ├── inputdocument/                 # Documents of type InputDocument (lowercase)
     │   ├── file1.txt
    │   └── file1.txt.description.md   # Optional description
-    └──
+    └── outputdocument/                # Documents of type OutputDocument (lowercase)
         └── result.json
 
 Example:
-    >>> from ai_pipeline_core
+    >>> from ai_pipeline_core import simple_runner
     >>>
     >>> # Run single flow
-    >>> results = await run_pipeline(
+    >>> results = await simple_runner.run_pipeline(
     ...     flow_func=MyFlow,
     ...     config=MyConfig,
     ...     project_name="test",
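The rewritten module docstring switches the example directory names from class-cased to lowercase (`inputdocument/`, `outputdocument/`). A minimal illustrative sketch of that mapping, assuming `canonical_name()` simply lowercases the class name (this is an inference from the docstring, not the package's actual implementation, which may handle multi-word names differently):

```python
# Illustration only: one plausible reading of canonical_name() that matches
# the lowercase folder names in the 0.2.0 docstring above.
def canonical_name(cls: type) -> str:
    return cls.__name__.lower()

class InputDocument: ...
class OutputDocument: ...

assert canonical_name(InputDocument) == "inputdocument"    # -> working_dir/inputdocument/
assert canonical_name(OutputDocument) == "outputdocument"  # -> working_dir/outputdocument/
```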
@@ -31,16 +31,14 @@ Example:
     ... )
 
 Note:
-    Document directories are
-
+    Document directories are organized by document type names (lowercase)
+    for consistent structure and easy access.
 """
 
-import json
 from pathlib import Path
-from typing import Any, Callable, Sequence
+from typing import Any, Callable, Sequence
 
-from ai_pipeline_core.documents import
-from ai_pipeline_core.flow.config import FlowConfig
+from ai_pipeline_core.documents import DocumentList
 from ai_pipeline_core.flow.options import FlowOptions
 from ai_pipeline_core.logging import get_pipeline_logger
 
@@ -49,163 +47,9 @@ logger = get_pipeline_logger(__name__)
 FlowSequence = Sequence[Callable[..., Any]]
 """Type alias for a sequence of flow functions."""
 
-ConfigSequence = Sequence[Type[FlowConfig]]
-"""Type alias for a sequence of flow configuration classes."""
-
-
-def load_documents_from_directory(
-    base_dir: Path, document_types: Sequence[Type[FlowDocument]]
-) -> DocumentList:
-    """Load documents from filesystem directories by type.
-
-    Scans subdirectories of base_dir for documents matching the provided
-    types. Each document type has its own subdirectory named after its
-    canonical_name().
-
-    Args:
-        base_dir: Base directory containing document subdirectories.
-        document_types: Sequence of FlowDocument subclasses to load.
-            Each type corresponds to a subdirectory.
-
-    Returns:
-        DocumentList containing all successfully loaded documents.
-        Empty list if no documents found or directories don't exist.
-
-    Directory structure:
-        base_dir/
-        ├── DocumentTypeA/              # canonical_name() of DocumentTypeA
-        │   ├── doc1.txt
-        │   ├── doc1.txt.description.md # Optional description file
-        │   └── doc2.json
-        └── DocumentTypeB/
-            └── data.csv
-
-    File handling:
-        - Document content is read as bytes
-        - Optional .description.md files provide document descriptions
-        - Failed loads are logged but don't stop processing
-        - Non-file entries are skipped
-
-    Example:
-        >>> from my_docs import InputDoc, ConfigDoc
-        >>> docs = load_documents_from_directory(
-        ...     Path("./data"),
-        ...     [InputDoc, ConfigDoc]
-        ... )
-        >>> print(f"Loaded {len(docs)} documents")
-
-    Note:
-        - Uses canonical_name() for directory names (e.g., "InputDocument")
-        - Descriptions are loaded from "{filename}.description.md" files
-        - All file types are supported (determined by document class)
-    """
-    documents = DocumentList()
-
-    for doc_class in document_types:
-        dir_name = doc_class.canonical_name()
-        type_dir = base_dir / dir_name
-
-        if not type_dir.exists() or not type_dir.is_dir():
-            continue
-
-        logger.info(f"Loading documents from {type_dir.relative_to(base_dir)}")
-
-        for file_path in type_dir.iterdir():
-            if not file_path.is_file() or file_path.name.endswith(Document.DESCRIPTION_EXTENSION):
-                continue
-
-            # Skip .sources.json files - they are metadata, not documents
-            if file_path.name.endswith(".sources.json"):
-                continue
-
-            try:
-                content = file_path.read_bytes()
-
-                # Load sources if .sources.json exists
-                sources = []
-                sources_file = file_path.with_name(file_path.name + ".sources.json")
-                if sources_file.exists():
-                    sources = json.loads(sources_file.read_text(encoding="utf-8"))
-
-                doc = doc_class(name=file_path.name, content=content, sources=sources)
-
-                desc_file = file_path.with_name(file_path.name + Document.DESCRIPTION_EXTENSION)
-                if desc_file.exists():
-                    object.__setattr__(doc, "description", desc_file.read_text(encoding="utf-8"))
-
-                documents.append(doc)
-            except Exception as e:
-                logger.error(
-                    f" Failed to load {file_path.name} as {doc_class.__name__}: {e}", exc_info=True
-                )
-
-    return documents
-
-
-def save_documents_to_directory(base_dir: Path, documents: DocumentList) -> None:
-    """Save documents to filesystem directories by type.
-
-    Creates subdirectories under base_dir for each document type and
-    saves documents with their original filenames. Only FlowDocument
-    instances are saved (temporary documents are skipped).
-
-    Args:
-        base_dir: Base directory for saving document subdirectories.
-            Created if it doesn't exist.
-        documents: DocumentList containing documents to save.
-            Non-FlowDocument instances are silently skipped.
-
-    Side effects:
-        - Creates base_dir and subdirectories as needed
-        - Overwrites existing files with the same name
-        - Logs each saved document
-        - Creates .description.md files for documents with descriptions
-
-    Directory structure created:
-        base_dir/
-        └── DocumentType/                   # canonical_name() of document
-            ├── output.json                 # Document content
-            └── output.json.description.md  # Optional description
-
-    Example:
-        >>> docs = DocumentList([
-        ...     OutputDoc(name="result.txt", content=b"data"),
-        ...     OutputDoc(name="stats.json", content=b'{...}')
-        ... ])
-        >>> save_documents_to_directory(Path("./output"), docs)
-        >>> # Creates ./output/OutputDocument/result.txt
-        >>> # and ./output/OutputDocument/stats.json
-
-    Note:
-        - Only FlowDocument subclasses are saved
-        - TaskDocument and other temporary documents are skipped
-        - Descriptions are saved as separate .description.md files
-    """
-    for document in documents:
-        if not isinstance(document, FlowDocument):
-            continue
-
-        dir_name = document.canonical_name()
-        document_dir = base_dir / dir_name
-        document_dir.mkdir(parents=True, exist_ok=True)
-
-        file_path = document_dir / document.name
-        file_path.write_bytes(document.content)
-        logger.info(f"Saved: {dir_name}/{document.name}")
-
-        if document.description:
-            desc_file = file_path.with_name(file_path.name + Document.DESCRIPTION_EXTENSION)
-            desc_file.write_text(document.description, encoding="utf-8")
-
-        # Save sources to .sources.json if present
-        if document.sources:
-            sources_file = file_path.with_name(file_path.name + ".sources.json")
-            sources_file.write_text(json.dumps(document.sources, indent=2), encoding="utf-8")
-
 
 async def run_pipeline(
     flow_func: Callable[..., Any],
-    config: Type[FlowConfig],
     project_name: str,
     output_dir: Path,
     flow_options: FlowOptions,
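This hunk deletes the standalone `load_documents_from_directory` / `save_documents_to_directory` helpers; later hunks show `run_pipeline` calling async methods on the flow's config instead. A hedged migration sketch, where `my_config` stands in for your own FlowConfig subclass and the call shapes are copied from the `run_pipeline` body shown further down in this diff:

```python
from pathlib import Path

async def load_run_save(my_config, output_dir: Path) -> None:
    # 0.1.14 (removed above):
    #   docs = load_documents_from_directory(output_dir, my_config.INPUT_DOCUMENT_TYPES)
    #   ... run flow ...
    #   save_documents_to_directory(output_dir, result_docs)
    # 0.2.0 replacement, mirroring the run_pipeline body in this diff:
    docs = await my_config.load_documents(str(output_dir))
    result_docs = docs  # placeholder for actual flow execution
    await my_config.save_documents(str(output_dir), result_docs)
```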
@@ -219,7 +63,7 @@ async def run_pipeline(
 
     The execution proceeds through these steps:
     1. Load input documents from output_dir subdirectories
-    2. Validate input documents against config requirements
+    2. Validate input documents against flow's config requirements
     3. Execute flow function with documents and options
     4. Validate output documents match config.OUTPUT_DOCUMENT_TYPE
     5. Save output documents to output_dir subdirectories
@@ -227,9 +71,7 @@
     Args:
         flow_func: Async flow function decorated with @pipeline_flow.
             Must accept (project_name, documents, flow_options).
-
-        config: FlowConfig subclass defining input/output document types.
-            Used for validation and directory organization.
+            The flow must have a config attribute set by @pipeline_flow.
 
         project_name: Name of the project/pipeline for logging and tracking.
 
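Per the rewritten Args section, the config now travels on the decorated flow rather than being passed separately. A sketch of what a 0.2.0 flow definition presumably looks like; the import paths and FlowConfig attributes are assumptions pieced together from identifiers elsewhere in this diff (`INPUT_DOCUMENT_TYPES`, `OUTPUT_DOCUMENT_TYPE`, `@pipeline_flow(config=YourConfig)`), not a verified API:

```python
# Sketch under stated assumptions; not verified against the 0.2.0 API.
from ai_pipeline_core import pipeline_flow          # assumed export
from ai_pipeline_core.documents import DocumentList
from ai_pipeline_core.flow.config import FlowConfig  # path as in the 0.1.x import removed above
from ai_pipeline_core.flow.options import FlowOptions

class AnalysisConfig(FlowConfig):
    INPUT_DOCUMENT_TYPES = [...]  # your FlowDocument subclasses
    OUTPUT_DOCUMENT_TYPE = ...    # your output FlowDocument subclass

@pipeline_flow(config=AnalysisConfig)  # attaches .config to the flow
async def AnalysisFlow(
    project_name: str, documents: DocumentList, flow_options: FlowOptions
) -> DocumentList:
    return documents
```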
@@ -246,14 +88,14 @@
         DocumentList containing the flow's output documents.
 
     Raises:
-        RuntimeError: If required input documents are missing
+        RuntimeError: If required input documents are missing or if
+            flow doesn't have a config attribute.
 
     Example:
-        >>> from my_flows import AnalysisFlow
+        >>> from my_flows import AnalysisFlow
         >>>
         >>> results = await run_pipeline(
         ...     flow_func=AnalysisFlow,
-        ...     config=AnalysisConfig,
         ...     project_name="analysis_001",
         ...     output_dir=Path("./results"),
         ...     flow_options=FlowOptions(temperature=0.7)
@@ -261,8 +103,8 @@
         >>> print(f"Generated {len(results)} documents")
 
     Note:
-        - Flow must be async (decorated with @pipeline_flow)
-        - Input documents are loaded based on config.INPUT_DOCUMENT_TYPES
+        - Flow must be async (decorated with @pipeline_flow with config)
+        - Input documents are loaded based on flow's config.INPUT_DOCUMENT_TYPES
         - Output is validated against config.OUTPUT_DOCUMENT_TYPE
         - All I/O is logged for debugging
     """
@@ -273,7 +115,16 @@
 
     logger.info(f"Running Flow: {flow_name}")
 
-
+    # Get config from the flow function (attached by @pipeline_flow decorator)
+    config = getattr(flow_func, "config", None)
+    if config is None:
+        raise RuntimeError(
+            f"Flow {flow_name} does not have a config attribute. "
+            "Ensure it's decorated with @pipeline_flow(config=YourConfig)"
+        )
+
+    # Load input documents using FlowConfig's new async method
+    input_documents = await config.load_documents(str(output_dir))
 
     if not config.has_input_documents(input_documents):
         raise RuntimeError(f"Missing input documents for flow {flow_name}")
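`run_pipeline` now discovers the config via `getattr(flow_func, "config", None)`. The mechanism this implies is ordinary attribute attachment by the decorator; a self-contained toy version for illustration (not the package's actual `@pipeline_flow` implementation):

```python
# Toy decorator demonstrating the attribute lookup that run_pipeline relies on.
def toy_pipeline_flow(config):
    def wrap(fn):
        fn.config = config  # what getattr(flow_func, "config", None) reads
        return fn
    return wrap

class ToyConfig: ...

@toy_pipeline_flow(config=ToyConfig)
async def my_flow(project_name, documents, flow_options):
    return documents

assert getattr(my_flow, "config", None) is ToyConfig
```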
@@ -282,7 +133,8 @@
 
     config.validate_output_documents(result_documents)
 
-
+    # Save output documents using FlowConfig's new async method
+    await config.save_documents(str(output_dir), result_documents)
 
     logger.info(f"Completed Flow: {flow_name}")
 
@@ -293,7 +145,6 @@ async def run_pipelines(
     project_name: str,
     output_dir: Path,
     flows: FlowSequence,
-    flow_configs: ConfigSequence,
     flow_options: FlowOptions,
     start_step: int = 1,
     end_step: int | None = None,
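With `flow_configs` removed from the signature, call sites pass only the flows; each decorated flow carries its own config. A before/after sketch based on the docstring example later in this file (the flow names, `options`, and `run_pipelines` binding are the docstring's placeholders, assumed to be in scope):

```python
from pathlib import Path

async def main() -> None:
    # 0.1.14 (removed): paired sequences that had to stay in sync
    #   await run_pipelines(..., flows=[ExtractFlow, AnalyzeFlow, SummarizeFlow],
    #                       flow_configs=[ExtractConfig, AnalyzeConfig, SummaryConfig], ...)
    # 0.2.0: configs travel with the decorated flows
    await run_pipelines(
        project_name="analysis",
        output_dir=Path("./work"),
        flows=[ExtractFlow, AnalyzeFlow, SummarizeFlow],
        flow_options=options,
    )
```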
@@ -305,7 +156,7 @@
     for debugging and resuming failed pipelines.
 
     Execution proceeds by:
-    1. Validating step indices
+    1. Validating step indices
     2. For each flow in range [start_step, end_step]:
        a. Loading input documents from output_dir
        b. Executing flow with documents
@@ -322,9 +173,8 @@
         output_dir: Directory for document I/O between flows.
             Shared by all flows in the sequence.
         flows: Sequence of flow functions to execute in order.
-            Must all be async functions decorated with @pipeline_flow
-
-            Must have same length as flows sequence.
+            Must all be async functions decorated with @pipeline_flow
+            with a config parameter.
         flow_options: Options passed to all flows in the sequence.
             Individual flows can use different fields.
         start_step: First flow to execute (1-based index).
@@ -333,8 +183,8 @@
             None runs through the last flow.
 
     Raises:
-        ValueError: If
-
+        ValueError: If start_step or end_step are out of range.
+        RuntimeError: If any flow doesn't have a config attribute.
 
     Example:
         >>> # Run full pipeline
@@ -342,7 +192,6 @@
         ...     project_name="analysis",
         ...     output_dir=Path("./work"),
         ...     flows=[ExtractFlow, AnalyzeFlow, SummarizeFlow],
-        ...     flow_configs=[ExtractConfig, AnalyzeConfig, SummaryConfig],
         ...     flow_options=options
         ... )
         >>>
@@ -354,14 +203,12 @@
         ... )
 
     Note:
+        - Each flow must be decorated with @pipeline_flow(config=...)
         - Each flow's output must match the next flow's input types
         - Failed flows stop the entire pipeline
         - Progress is logged with step numbers for debugging
         - Documents persist in output_dir between runs
     """
-    if len(flows) != len(flow_configs):
-        raise ValueError("The number of flows and flow configs must match.")
-
     num_steps = len(flows)
     start_index = start_step - 1
     end_index = (end_step if end_step is not None else num_steps) - 1
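The body above converts the public 1-based, inclusive `start_step`/`end_step` into 0-based indices. A small self-contained check of that arithmetic; the exact bounds validation is an assumption inferred from the documented ValueError, not code visible in this diff:

```python
def step_window(num_steps: int, start_step: int = 1, end_step: int | None = None) -> range:
    # Mirrors the index math in run_pipelines above; the validation is assumed.
    start_index = start_step - 1
    end_index = (end_step if end_step is not None else num_steps) - 1
    if not (0 <= start_index <= end_index < num_steps):
        raise ValueError("start_step/end_step out of range")
    return range(start_index, end_index + 1)

assert list(step_window(3)) == [0, 1, 2]             # full pipeline
assert list(step_window(3, start_step=2)) == [1, 2]  # resume from step 2
assert list(step_window(3, end_step=2)) == [0, 1]    # stop after step 2
```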
@@ -377,7 +224,6 @@
 
     for i in range(start_index, end_index + 1):
         flow_func = flows[i]
-        config = flow_configs[i]
         # For Prefect Flow objects, use their name attribute; for functions, use __name__
         flow_name = getattr(flow_func, "name", None) or getattr(
             flow_func, "__name__", f"flow_{i + 1}"
@@ -388,7 +234,6 @@
         try:
             await run_pipeline(
                 flow_func=flow_func,
-                config=config,
                 project_name=project_name,
                 output_dir=output_dir,
                 flow_options=flow_options,