ai-pipeline-core 0.1.5__py3-none-any.whl → 0.1.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ai_pipeline_core/__init__.py +54 -13
- ai_pipeline_core/documents/__init__.py +3 -0
- ai_pipeline_core/documents/document.py +57 -3
- ai_pipeline_core/documents/mime_type.py +64 -32
- ai_pipeline_core/flow/__init__.py +5 -1
- ai_pipeline_core/flow/options.py +26 -0
- ai_pipeline_core/llm/client.py +5 -3
- ai_pipeline_core/pipeline.py +418 -0
- ai_pipeline_core/prefect.py +7 -0
- ai_pipeline_core/simple_runner/__init__.py +19 -0
- ai_pipeline_core/simple_runner/cli.py +95 -0
- ai_pipeline_core/simple_runner/simple_runner.py +147 -0
- ai_pipeline_core/tracing.py +63 -20
- {ai_pipeline_core-0.1.5.dist-info → ai_pipeline_core-0.1.7.dist-info}/METADATA +92 -30
- {ai_pipeline_core-0.1.5.dist-info → ai_pipeline_core-0.1.7.dist-info}/RECORD +17 -11
- {ai_pipeline_core-0.1.5.dist-info → ai_pipeline_core-0.1.7.dist-info}/WHEEL +0 -0
- {ai_pipeline_core-0.1.5.dist-info → ai_pipeline_core-0.1.7.dist-info}/licenses/LICENSE +0 -0
ai_pipeline_core/pipeline.py (new file, +418):

```diff
@@ -0,0 +1,418 @@
+"""Pipeline decorators that combine Prefect functionality with tracing support.
+
+These decorators extend the base Prefect decorators with automatic tracing capabilities.
+"""
+
+import datetime
+import functools
+import inspect
+from typing import (
+    TYPE_CHECKING,
+    Any,
+    Callable,
+    Coroutine,
+    Dict,
+    Iterable,
+    Optional,
+    TypeVar,
+    Union,
+    cast,
+    overload,
+)
+
+from prefect.assets import Asset
+from prefect.cache_policies import CachePolicy
+from prefect.context import TaskRunContext
+from prefect.flows import Flow, FlowStateHook
+from prefect.futures import PrefectFuture
+from prefect.results import ResultSerializer, ResultStorage
+from prefect.task_runners import TaskRunner
+from prefect.tasks import (
+    RetryConditionCallable,
+    StateHookCallable,
+    Task,
+    TaskRunNameValueOrCallable,
+)
+from prefect.utilities.annotations import NotSet
+from typing_extensions import Concatenate, ParamSpec
+
+from ai_pipeline_core.documents import DocumentList
+from ai_pipeline_core.flow.options import FlowOptions
+from ai_pipeline_core.prefect import flow, task
+from ai_pipeline_core.tracing import TraceLevel, trace
+
+if TYPE_CHECKING:
+    pass
+
+P = ParamSpec("P")
+R = TypeVar("R")
+
+# ============================================================================
+# PIPELINE TASK DECORATOR
+# ============================================================================
+
+
+@overload
+def pipeline_task(__fn: Callable[P, R], /) -> Task[P, R]: ...
+
+
+@overload
+def pipeline_task(
+    *,
+    # Tracing parameters
+    trace_level: TraceLevel = "always",
+    trace_ignore_input: bool = False,
+    trace_ignore_output: bool = False,
+    trace_ignore_inputs: list[str] | None = None,
+    trace_input_formatter: Optional[Callable[..., str]] = None,
+    trace_output_formatter: Optional[Callable[..., str]] = None,
+    # Prefect parameters
+    name: Optional[str] = None,
+    description: Optional[str] = None,
+    tags: Optional[Iterable[str]] = None,
+    version: Optional[str] = None,
+    cache_policy: Union[CachePolicy, type[NotSet]] = NotSet,
+    cache_key_fn: Optional[Callable[[TaskRunContext, Dict[str, Any]], Optional[str]]] = None,
+    cache_expiration: Optional[datetime.timedelta] = None,
+    task_run_name: Optional[TaskRunNameValueOrCallable] = None,
+    retries: Optional[int] = None,
+    retry_delay_seconds: Optional[
+        Union[float, int, list[float], Callable[[int], list[float]]]
+    ] = None,
+    retry_jitter_factor: Optional[float] = None,
+    persist_result: Optional[bool] = None,
+    result_storage: Optional[Union[ResultStorage, str]] = None,
+    result_serializer: Optional[Union[ResultSerializer, str]] = None,
+    result_storage_key: Optional[str] = None,
+    cache_result_in_memory: bool = True,
+    timeout_seconds: Union[int, float, None] = None,
+    log_prints: Optional[bool] = False,
+    refresh_cache: Optional[bool] = None,
+    on_completion: Optional[list[StateHookCallable]] = None,
+    on_failure: Optional[list[StateHookCallable]] = None,
+    retry_condition_fn: Optional[RetryConditionCallable] = None,
+    viz_return_value: Optional[bool] = None,
+    asset_deps: Optional[list[Union[str, Asset]]] = None,
+) -> Callable[[Callable[P, R]], Task[P, R]]: ...
+
+
+def pipeline_task(
+    __fn: Optional[Callable[P, R]] = None,
+    /,
+    *,
+    # Tracing parameters
+    trace_level: TraceLevel = "always",
+    trace_ignore_input: bool = False,
+    trace_ignore_output: bool = False,
+    trace_ignore_inputs: list[str] | None = None,
+    trace_input_formatter: Optional[Callable[..., str]] = None,
+    trace_output_formatter: Optional[Callable[..., str]] = None,
+    # Prefect parameters
+    name: Optional[str] = None,
+    description: Optional[str] = None,
+    tags: Optional[Iterable[str]] = None,
+    version: Optional[str] = None,
+    cache_policy: Union[CachePolicy, type[NotSet]] = NotSet,
+    cache_key_fn: Optional[Callable[[TaskRunContext, Dict[str, Any]], Optional[str]]] = None,
+    cache_expiration: Optional[datetime.timedelta] = None,
+    task_run_name: Optional[TaskRunNameValueOrCallable] = None,
+    retries: Optional[int] = None,
+    retry_delay_seconds: Optional[
+        Union[float, int, list[float], Callable[[int], list[float]]]
+    ] = None,
+    retry_jitter_factor: Optional[float] = None,
+    persist_result: Optional[bool] = None,
+    result_storage: Optional[Union[ResultStorage, str]] = None,
+    result_serializer: Optional[Union[ResultSerializer, str]] = None,
+    result_storage_key: Optional[str] = None,
+    cache_result_in_memory: bool = True,
+    timeout_seconds: Union[int, float, None] = None,
+    log_prints: Optional[bool] = False,
+    refresh_cache: Optional[bool] = None,
+    on_completion: Optional[list[StateHookCallable]] = None,
+    on_failure: Optional[list[StateHookCallable]] = None,
+    retry_condition_fn: Optional[RetryConditionCallable] = None,
+    viz_return_value: Optional[bool] = None,
+    asset_deps: Optional[list[Union[str, Asset]]] = None,
+) -> Union[Task[P, R], Callable[[Callable[P, R]], Task[P, R]]]:
+    """
+    Pipeline task decorator that combines Prefect task functionality with automatic tracing.
+
+    This decorator applies tracing before the Prefect task decorator, allowing you to
+    monitor task execution with LMNR while maintaining all Prefect functionality.
+
+    Args:
+        trace_level: Control tracing ("always", "debug", "off")
+        trace_ignore_input: Whether to ignore input in traces
+        trace_ignore_output: Whether to ignore output in traces
+        trace_ignore_inputs: List of input parameter names to ignore
+        trace_input_formatter: Custom formatter for inputs
+        trace_output_formatter: Custom formatter for outputs
+
+    Plus all standard Prefect task parameters...
+    """
+
+    def decorator(fn: Callable[P, R]) -> Task[P, R]:
+        # Apply tracing first if enabled
+        if trace_level != "off":
+            traced_fn = trace(
+                level=trace_level,
+                name=name or fn.__name__,
+                ignore_input=trace_ignore_input,
+                ignore_output=trace_ignore_output,
+                ignore_inputs=trace_ignore_inputs,
+                input_formatter=trace_input_formatter,
+                output_formatter=trace_output_formatter,
+            )(fn)
+        else:
+            traced_fn = fn
+
+        # Then apply Prefect task decorator
+        return task(  # pyright: ignore[reportCallIssue,reportUnknownVariableType]
+            traced_fn,  # pyright: ignore[reportArgumentType]
+            name=name,
+            description=description,
+            tags=tags,
+            version=version,
+            cache_policy=cache_policy,
+            cache_key_fn=cache_key_fn,
+            cache_expiration=cache_expiration,
+            task_run_name=task_run_name,
+            retries=retries or 0,
+            retry_delay_seconds=retry_delay_seconds,
+            retry_jitter_factor=retry_jitter_factor,
+            persist_result=persist_result,
+            result_storage=result_storage,
+            result_serializer=result_serializer,
+            result_storage_key=result_storage_key,
+            cache_result_in_memory=cache_result_in_memory,
+            timeout_seconds=timeout_seconds,
+            log_prints=log_prints,
+            refresh_cache=refresh_cache,
+            on_completion=on_completion,
+            on_failure=on_failure,
+            retry_condition_fn=retry_condition_fn,
+            viz_return_value=viz_return_value,
+            asset_deps=asset_deps,
+        )
+
+    if __fn:
+        return decorator(__fn)
+    return decorator
+
+
+# ============================================================================
+# PIPELINE FLOW DECORATOR WITH DOCUMENT PROCESSING
+# ============================================================================
+
+# Type aliases for document flow signatures
+DocumentsFlowSig = Callable[
+    Concatenate[str, DocumentList, FlowOptions, P],
+    Union[DocumentList, Coroutine[Any, Any, DocumentList]],
+]
+
+DocumentsFlowResult = Flow[Concatenate[str, DocumentList, FlowOptions, P], DocumentList]
+
+
+@overload
+def pipeline_flow(
+    __fn: DocumentsFlowSig[P],
+    /,
+) -> DocumentsFlowResult[P]: ...
+
+
+@overload
+def pipeline_flow(
+    *,
+    # Tracing parameters
+    trace_level: TraceLevel = "always",
+    trace_ignore_input: bool = False,
+    trace_ignore_output: bool = False,
+    trace_ignore_inputs: list[str] | None = None,
+    trace_input_formatter: Optional[Callable[..., str]] = None,
+    trace_output_formatter: Optional[Callable[..., str]] = None,
+    # Prefect parameters
+    name: Optional[str] = None,
+    version: Optional[str] = None,
+    flow_run_name: Optional[Union[Callable[[], str], str]] = None,
+    retries: Optional[int] = None,
+    retry_delay_seconds: Optional[Union[int, float]] = None,
+    task_runner: Optional[TaskRunner[PrefectFuture[Any]]] = None,
+    description: Optional[str] = None,
+    timeout_seconds: Union[int, float, None] = None,
+    validate_parameters: bool = True,
+    persist_result: Optional[bool] = None,
+    result_storage: Optional[Union[ResultStorage, str]] = None,
+    result_serializer: Optional[Union[ResultSerializer, str]] = None,
+    cache_result_in_memory: bool = True,
+    log_prints: Optional[bool] = None,
+    on_completion: Optional[list["FlowStateHook[..., Any]"]] = None,
+    on_failure: Optional[list["FlowStateHook[..., Any]"]] = None,
+    on_cancellation: Optional[list["FlowStateHook[..., Any]"]] = None,
+    on_crashed: Optional[list["FlowStateHook[..., Any]"]] = None,
+    on_running: Optional[list["FlowStateHook[..., Any]"]] = None,
+) -> Callable[[DocumentsFlowSig[P]], DocumentsFlowResult[P]]: ...
+
+
+def pipeline_flow(
+    __fn: Optional[DocumentsFlowSig[P]] = None,
+    /,
+    *,
+    # Tracing parameters
+    trace_level: TraceLevel = "always",
+    trace_ignore_input: bool = False,
+    trace_ignore_output: bool = False,
+    trace_ignore_inputs: list[str] | None = None,
+    trace_input_formatter: Optional[Callable[..., str]] = None,
+    trace_output_formatter: Optional[Callable[..., str]] = None,
+    # Prefect parameters
+    name: Optional[str] = None,
+    version: Optional[str] = None,
+    flow_run_name: Optional[Union[Callable[[], str], str]] = None,
+    retries: Optional[int] = None,
+    retry_delay_seconds: Optional[Union[int, float]] = None,
+    task_runner: Optional[TaskRunner[PrefectFuture[Any]]] = None,
+    description: Optional[str] = None,
+    timeout_seconds: Union[int, float, None] = None,
+    validate_parameters: bool = True,
+    persist_result: Optional[bool] = None,
+    result_storage: Optional[Union[ResultStorage, str]] = None,
+    result_serializer: Optional[Union[ResultSerializer, str]] = None,
+    cache_result_in_memory: bool = True,
+    log_prints: Optional[bool] = None,
+    on_completion: Optional[list["FlowStateHook[..., Any]"]] = None,
+    on_failure: Optional[list["FlowStateHook[..., Any]"]] = None,
+    on_cancellation: Optional[list["FlowStateHook[..., Any]"]] = None,
+    on_crashed: Optional[list["FlowStateHook[..., Any]"]] = None,
+    on_running: Optional[list["FlowStateHook[..., Any]"]] = None,
+) -> Union[DocumentsFlowResult[P], Callable[[DocumentsFlowSig[P]], DocumentsFlowResult[P]]]:
+    """
+    Pipeline flow for document processing with standardized signature.
+
+    This decorator enforces a specific signature for document processing flows:
+    - First parameter: project_name (str)
+    - Second parameter: documents (DocumentList)
+    - Third parameter: flow_options (FlowOptions or subclass)
+    - Additional parameters allowed
+    - Must return DocumentList
+
+    It includes automatic tracing and all Prefect flow functionality.
+
+    Args:
+        trace_level: Control tracing ("always", "debug", "off")
+        trace_ignore_input: Whether to ignore input in traces
+        trace_ignore_output: Whether to ignore output in traces
+        trace_ignore_inputs: List of input parameter names to ignore
+        trace_input_formatter: Custom formatter for inputs
+        trace_output_formatter: Custom formatter for outputs
+
+    Plus all standard Prefect flow parameters...
+    """
+
+    def decorator(func: DocumentsFlowSig[P]) -> DocumentsFlowResult[P]:
+        sig = inspect.signature(func)
+        params = list(sig.parameters.values())
+
+        if len(params) < 3:
+            raise TypeError(
+                f"@pipeline_flow '{func.__name__}' must accept at least 3 arguments: "
+                "(project_name, documents, flow_options)"
+            )
+
+        # Validate parameter types (optional but recommended)
+        # We check names as a convention, not strict type checking at decoration time
+        expected_names = ["project_name", "documents", "flow_options"]
+        for i, expected in enumerate(expected_names):
+            if i < len(params) and params[i].name != expected:
+                print(
+                    f"Warning: Parameter {i + 1} of '{func.__name__}' is named '{params[i].name}' "
+                    f"but convention suggests '{expected}'"
+                )
+
+        # Create wrapper that ensures return type
+        if inspect.iscoroutinefunction(func):
+
+            @functools.wraps(func)
+            async def wrapper(  # pyright: ignore[reportRedeclaration]
+                project_name: str,
+                documents: DocumentList,
+                flow_options: FlowOptions,
+                *args,  # pyright: ignore[reportMissingParameterType]
+                **kwargs,  # pyright: ignore[reportMissingParameterType]
+            ) -> DocumentList:
+                result = await func(project_name, documents, flow_options, *args, **kwargs)
+                # Runtime type checking
+                DL = DocumentList  # Avoid recomputation
+                if not isinstance(result, DL):
+                    raise TypeError(
+                        f"Flow '{func.__name__}' must return a DocumentList, "
+                        f"but returned {type(result).__name__}"
+                    )
+                return result
+        else:
+
+            @functools.wraps(func)
+            def wrapper(  # pyright: ignore[reportRedeclaration]
+                project_name: str,
+                documents: DocumentList,
+                flow_options: FlowOptions,
+                *args,  # pyright: ignore[reportMissingParameterType]
+                **kwargs,  # pyright: ignore[reportMissingParameterType]
+            ) -> DocumentList:
+                result = func(project_name, documents, flow_options, *args, **kwargs)
+                # Runtime type checking
+                DL = DocumentList  # Avoid recomputation
+                if not isinstance(result, DL):
+                    raise TypeError(
+                        f"Flow '{func.__name__}' must return a DocumentList, "
+                        f"but returned {type(result).__name__}"
+                    )
+                return result
+
+        # Apply tracing first if enabled
+        if trace_level != "off":
+            traced_wrapper = trace(
+                level=trace_level,
+                name=name or func.__name__,
+                ignore_input=trace_ignore_input,
+                ignore_output=trace_ignore_output,
+                ignore_inputs=trace_ignore_inputs,
+                input_formatter=trace_input_formatter,
+                output_formatter=trace_output_formatter,
+            )(wrapper)
+        else:
+            traced_wrapper = wrapper
+
+        # Then apply Prefect flow decorator
+        return cast(
+            DocumentsFlowResult[P],
+            flow(  # pyright: ignore[reportCallIssue,reportUnknownVariableType]
+                traced_wrapper,  # pyright: ignore[reportArgumentType]
+                name=name,
+                version=version,
+                flow_run_name=flow_run_name,
+                retries=retries,
+                retry_delay_seconds=retry_delay_seconds,
+                task_runner=task_runner,
+                description=description,
+                timeout_seconds=timeout_seconds,
+                validate_parameters=validate_parameters,
+                persist_result=persist_result,
+                result_storage=result_storage,
+                result_serializer=result_serializer,
+                cache_result_in_memory=cache_result_in_memory,
+                log_prints=log_prints,
+                on_completion=on_completion,
+                on_failure=on_failure,
+                on_cancellation=on_cancellation,
+                on_crashed=on_crashed,
+                on_running=on_running,
+            ),
+        )
+
+    if __fn:
+        return decorator(__fn)
+    return decorator
+
+
+__all__ = ["pipeline_task", "pipeline_flow"]
```
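Together the two decorators keep plain-Prefect call-site ergonomics while layering LMNR tracing underneath (trace wrapper first, then `task`/`flow`). A minimal usage sketch, assuming hypothetical names (`summarize`, `summarize_flow`) that are not part of the package:

```python
# Hypothetical usage sketch of the new decorators.
from ai_pipeline_core.documents import DocumentList
from ai_pipeline_core.flow.options import FlowOptions
from ai_pipeline_core.pipeline import pipeline_flow, pipeline_task


@pipeline_task(retries=2, trace_level="debug")
async def summarize(text: str) -> str:
    return text[:100]  # stand-in for real work


@pipeline_flow(name="summarize-flow", trace_ignore_inputs=["flow_options"])
async def summarize_flow(
    project_name: str,
    documents: DocumentList,
    flow_options: FlowOptions,
) -> DocumentList:
    # Tasks are awaited inside the flow as with plain Prefect.
    _ = await summarize(project_name)
    # Parameter order (project_name, documents, flow_options) is checked at
    # decoration time; returning anything but a DocumentList raises TypeError
    # at runtime.
    return documents
```

Note that at the Prefect layer `retries` falls back to 0 (`retries or 0`), and `trace_level="off"` skips the trace wrapper entirely.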
ai_pipeline_core/simple_runner/__init__.py (new file, +19):

```diff
@@ -0,0 +1,19 @@
+from .cli import run_cli
+from .simple_runner import (
+    ConfigSequence,
+    FlowSequence,
+    load_documents_from_directory,
+    run_pipeline,
+    run_pipelines,
+    save_documents_to_directory,
+)
+
+__all__ = [
+    "run_cli",
+    "run_pipeline",
+    "run_pipelines",
+    "load_documents_from_directory",
+    "save_documents_to_directory",
+    "FlowSequence",
+    "ConfigSequence",
+]
```
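The `__init__` flattens the subpackage's API, so consumers import everything from `ai_pipeline_core.simple_runner` directly rather than from the `cli` or `simple_runner` submodules:

```python
from ai_pipeline_core.simple_runner import (
    load_documents_from_directory,
    run_cli,
    run_pipelines,
)
```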
ai_pipeline_core/simple_runner/cli.py (new file, +95):

```diff
@@ -0,0 +1,95 @@
+from __future__ import annotations
+
+import asyncio
+from pathlib import Path
+from typing import Callable, Type, TypeVar, cast
+
+from lmnr import Laminar
+from pydantic_settings import CliPositionalArg, SettingsConfigDict
+
+from ai_pipeline_core.documents import DocumentList
+from ai_pipeline_core.flow.options import FlowOptions
+from ai_pipeline_core.logging import get_pipeline_logger, setup_logging
+
+from .simple_runner import ConfigSequence, FlowSequence, run_pipelines, save_documents_to_directory
+
+logger = get_pipeline_logger(__name__)
+
+TOptions = TypeVar("TOptions", bound=FlowOptions)
+InitializerFunc = Callable[[FlowOptions], tuple[str, DocumentList]] | None
+
+
+def _initialize_environment() -> None:
+    setup_logging()
+    try:
+        Laminar.initialize()
+        logger.info("LMNR tracing initialized.")
+    except Exception as e:
+        logger.warning(f"Failed to initialize LMNR tracing: {e}")
+
+
+def run_cli(
+    *,
+    flows: FlowSequence,
+    flow_configs: ConfigSequence,
+    options_cls: Type[TOptions],
+    initializer: InitializerFunc = None,
+) -> None:
+    """
+    Parse CLI+env into options, then run the pipeline.
+
+    - working_directory: required positional arg
+    - --project-name: optional, defaults to directory name
+    - --start/--end: optional, 1-based step bounds
+    - all other flags come from options_cls (fields & Field descriptions)
+    """
+    _initialize_environment()
+
+    class _RunnerOptions(  # type: ignore[reportRedeclaration]
+        options_cls,
+        cli_parse_args=True,
+        cli_kebab_case=True,
+        cli_exit_on_error=False,
+    ):
+        working_directory: CliPositionalArg[Path]
+        project_name: str | None = None
+        start: int = 1
+        end: int | None = None
+
+        model_config = SettingsConfigDict(frozen=True, extra="ignore")
+
+    opts = cast(FlowOptions, _RunnerOptions())  # type: ignore[reportCallIssue]
+
+    wd: Path = cast(Path, getattr(opts, "working_directory"))
+    wd.mkdir(parents=True, exist_ok=True)
+
+    # Get project name from options or use directory basename
+    project_name = getattr(opts, "project_name", None)
+    if not project_name:  # None or empty string
+        project_name = wd.name
+
+    # Ensure project_name is not empty
+    if not project_name:
+        raise ValueError("Project name cannot be empty")
+
+    # Use initializer if provided, otherwise use defaults
+    initial_documents = DocumentList([])
+    if initializer:
+        init_result = initializer(opts)
+        # Always expect tuple format from initializer
+        _, initial_documents = init_result  # Ignore project name from initializer
+
+    if getattr(opts, "start", 1) == 1 and initial_documents:
+        save_documents_to_directory(wd, initial_documents)
+
+    asyncio.run(
+        run_pipelines(
+            project_name=project_name,
+            output_dir=wd,
+            flows=flows,
+            flow_configs=flow_configs,
+            flow_options=opts,
+            start_step=getattr(opts, "start", 1),
+            end_step=getattr(opts, "end", None),
+        )
+    )
```
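`run_cli` synthesizes a one-off pydantic-settings class on top of the caller's `FlowOptions` subclass, so every field of that subclass becomes a kebab-case CLI flag alongside the built-in positional `working_directory` and the `--project-name`/`--start`/`--end` options. A sketch of a typical entry point; `MyOptions`, `my_flow`, and `MyFlowConfig` are illustrative assumptions, not part of the package:

```python
# Hypothetical entry-point module.
from ai_pipeline_core.documents import DocumentList
from ai_pipeline_core.flow.options import FlowOptions
from ai_pipeline_core.simple_runner import run_cli

from my_project.flows import MyFlowConfig, my_flow  # hypothetical module


class MyOptions(FlowOptions):
    temperature: float = 0.7  # surfaced as a --temperature flag


def seed(options: FlowOptions) -> tuple[str, DocumentList]:
    # The returned name is ignored; only the documents are used, and they
    # are written to disk only when --start is 1.
    return "unused", DocumentList([])


if __name__ == "__main__":
    run_cli(
        flows=[my_flow],
        flow_configs=[MyFlowConfig],
        options_cls=MyOptions,
        initializer=seed,
    )
```

Invoked as, e.g., `python main.py ./workdir --project-name demo --temperature 0.2 --start 1`.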
ai_pipeline_core/simple_runner/simple_runner.py (new file, +147):

```diff
@@ -0,0 +1,147 @@
+from pathlib import Path
+from typing import Any, Callable, Sequence, Type
+
+from ai_pipeline_core.documents import Document, DocumentList, FlowDocument
+from ai_pipeline_core.flow.config import FlowConfig
+from ai_pipeline_core.flow.options import FlowOptions
+from ai_pipeline_core.logging import get_pipeline_logger
+
+logger = get_pipeline_logger(__name__)
+
+FlowSequence = Sequence[Callable[..., Any]]
+ConfigSequence = Sequence[Type[FlowConfig]]
+
+
+def load_documents_from_directory(
+    base_dir: Path, document_types: Sequence[Type[FlowDocument]]
+) -> DocumentList:
+    """Loads documents using canonical_name."""
+    documents = DocumentList()
+
+    for doc_class in document_types:
+        dir_name = doc_class.canonical_name()
+        type_dir = base_dir / dir_name
+
+        if not type_dir.exists() or not type_dir.is_dir():
+            continue
+
+        logger.info(f"Loading documents from {type_dir.relative_to(base_dir)}")
+
+        for file_path in type_dir.iterdir():
+            if not file_path.is_file() or file_path.name.endswith(Document.DESCRIPTION_EXTENSION):
+                continue
+
+            try:
+                content = file_path.read_bytes()
+                doc = doc_class(name=file_path.name, content=content)
+
+                desc_file = file_path.with_name(file_path.name + Document.DESCRIPTION_EXTENSION)
+                if desc_file.exists():
+                    object.__setattr__(doc, "description", desc_file.read_text(encoding="utf-8"))
+
+                documents.append(doc)
+            except Exception as e:
+                logger.error(
+                    f" Failed to load {file_path.name} as {doc_class.__name__}: {e}", exc_info=True
+                )
+
+    return documents
+
+
+def save_documents_to_directory(base_dir: Path, documents: DocumentList) -> None:
+    """Saves documents using canonical_name."""
+    for document in documents:
+        if not isinstance(document, FlowDocument):
+            continue
+
+        dir_name = document.canonical_name()
+        document_dir = base_dir / dir_name
+        document_dir.mkdir(parents=True, exist_ok=True)
+
+        file_path = document_dir / document.name
+        file_path.write_bytes(document.content)
+        logger.info(f"Saved: {dir_name}/{document.name}")
+
+        if document.description:
+            desc_file = file_path.with_name(file_path.name + Document.DESCRIPTION_EXTENSION)
+            desc_file.write_text(document.description, encoding="utf-8")
+
+
+async def run_pipeline(
+    flow_func: Callable[..., Any],
+    config: Type[FlowConfig],
+    project_name: str,
+    output_dir: Path,
+    flow_options: FlowOptions,
+    flow_name: str | None = None,
+) -> DocumentList:
+    """Execute a single pipeline flow."""
+    if flow_name is None:
+        flow_name = getattr(flow_func, "name", getattr(flow_func, "__name__", "flow"))
+
+    logger.info(f"Running Flow: {flow_name}")
+
+    input_documents = load_documents_from_directory(output_dir, config.INPUT_DOCUMENT_TYPES)
+
+    if not config.has_input_documents(input_documents):
+        raise RuntimeError(f"Missing input documents for flow {flow_name}")
+
+    result_documents = await flow_func(project_name, input_documents, flow_options)
+
+    config.validate_output_documents(result_documents)
+
+    save_documents_to_directory(output_dir, result_documents)
+
+    logger.info(f"Completed Flow: {flow_name}")
+
+    return result_documents
+
+
+async def run_pipelines(
+    project_name: str,
+    output_dir: Path,
+    flows: FlowSequence,
+    flow_configs: ConfigSequence,
+    flow_options: FlowOptions,
+    start_step: int = 1,
+    end_step: int | None = None,
+) -> None:
+    """Executes multiple pipeline flows sequentially."""
+    if len(flows) != len(flow_configs):
+        raise ValueError("The number of flows and flow configs must match.")
+
+    num_steps = len(flows)
+    start_index = start_step - 1
+    end_index = (end_step if end_step is not None else num_steps) - 1
+
+    if (
+        not (0 <= start_index < num_steps)
+        or not (0 <= end_index < num_steps)
+        or start_index > end_index
+    ):
+        raise ValueError("Invalid start/end steps.")
+
+    logger.info(f"Starting pipeline '{project_name}' (Steps {start_step} to {end_index + 1})")
+
+    for i in range(start_index, end_index + 1):
+        flow_func = flows[i]
+        config = flow_configs[i]
+        flow_name = getattr(flow_func, "name", getattr(flow_func, "__name__", f"flow_{i + 1}"))
+
+        logger.info(f"--- [Step {i + 1}/{num_steps}] Running Flow: {flow_name} ---")
+
+        try:
+            await run_pipeline(
+                flow_func=flow_func,
+                config=config,
+                project_name=project_name,
+                output_dir=output_dir,
+                flow_options=flow_options,
+                flow_name=f"[Step {i + 1}/{num_steps}] {flow_name}",
+            )
+
+        except Exception as e:
+            logger.error(
+                f"--- [Step {i + 1}/{num_steps}] Flow {flow_name} Failed: {e} ---", exc_info=True
+            )
+            raise
```
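The load/save helpers define the on-disk contract the runner relies on: each `FlowDocument` subclass maps to a directory named by its `canonical_name()`, document bytes are written under the document's own name, and descriptions live in sidecar files suffixed with `Document.DESCRIPTION_EXTENSION`. A round-trip sketch; `ReportDocument` is a hypothetical subclass and assumes `FlowDocument` needs no extra fields:

```python
# Hypothetical round-trip through the new helpers.
from pathlib import Path

from ai_pipeline_core.documents import DocumentList, FlowDocument
from ai_pipeline_core.simple_runner import (
    load_documents_from_directory,
    save_documents_to_directory,
)


class ReportDocument(FlowDocument):
    pass


workdir = Path("./workdir")
docs = DocumentList([ReportDocument(name="summary.md", content=b"# Summary")])

# Writes ./workdir/<canonical_name>/summary.md, plus a sidecar description
# file when document.description is set.
save_documents_to_directory(workdir, docs)

# Reads the same layout back, skipping sidecar description files.
loaded = load_documents_from_directory(workdir, [ReportDocument])
```

This is also why `run_pipelines` can chain flows through a single `output_dir`: each step's saved outputs become the next step's loadable inputs, as declared by the next config's `INPUT_DOCUMENT_TYPES`.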