ai-pipeline-core 0.1.5__py3-none-any.whl → 0.1.7__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package as they appear in their public registry, and is provided for informational purposes only.
@@ -0,0 +1,418 @@
+ """Pipeline decorators that combine Prefect functionality with tracing support.
+
+ These decorators extend the base Prefect decorators with automatic tracing capabilities.
+ """
+
+ import datetime
+ import functools
+ import inspect
+ from typing import (
+     TYPE_CHECKING,
+     Any,
+     Callable,
+     Coroutine,
+     Dict,
+     Iterable,
+     Optional,
+     TypeVar,
+     Union,
+     cast,
+     overload,
+ )
+
+ from prefect.assets import Asset
+ from prefect.cache_policies import CachePolicy
+ from prefect.context import TaskRunContext
+ from prefect.flows import Flow, FlowStateHook
+ from prefect.futures import PrefectFuture
+ from prefect.results import ResultSerializer, ResultStorage
+ from prefect.task_runners import TaskRunner
+ from prefect.tasks import (
+     RetryConditionCallable,
+     StateHookCallable,
+     Task,
+     TaskRunNameValueOrCallable,
+ )
+ from prefect.utilities.annotations import NotSet
+ from typing_extensions import Concatenate, ParamSpec
+
+ from ai_pipeline_core.documents import DocumentList
+ from ai_pipeline_core.flow.options import FlowOptions
+ from ai_pipeline_core.prefect import flow, task
+ from ai_pipeline_core.tracing import TraceLevel, trace
+
+ if TYPE_CHECKING:
+     pass
+
+ P = ParamSpec("P")
+ R = TypeVar("R")
+
+ # ============================================================================
+ # PIPELINE TASK DECORATOR
+ # ============================================================================
+
+
+ @overload
+ def pipeline_task(__fn: Callable[P, R], /) -> Task[P, R]: ...
+
+
+ @overload
+ def pipeline_task(
+     *,
+     # Tracing parameters
+     trace_level: TraceLevel = "always",
+     trace_ignore_input: bool = False,
+     trace_ignore_output: bool = False,
+     trace_ignore_inputs: list[str] | None = None,
+     trace_input_formatter: Optional[Callable[..., str]] = None,
+     trace_output_formatter: Optional[Callable[..., str]] = None,
+     # Prefect parameters
+     name: Optional[str] = None,
+     description: Optional[str] = None,
+     tags: Optional[Iterable[str]] = None,
+     version: Optional[str] = None,
+     cache_policy: Union[CachePolicy, type[NotSet]] = NotSet,
+     cache_key_fn: Optional[Callable[[TaskRunContext, Dict[str, Any]], Optional[str]]] = None,
+     cache_expiration: Optional[datetime.timedelta] = None,
+     task_run_name: Optional[TaskRunNameValueOrCallable] = None,
+     retries: Optional[int] = None,
+     retry_delay_seconds: Optional[
+         Union[float, int, list[float], Callable[[int], list[float]]]
+     ] = None,
+     retry_jitter_factor: Optional[float] = None,
+     persist_result: Optional[bool] = None,
+     result_storage: Optional[Union[ResultStorage, str]] = None,
+     result_serializer: Optional[Union[ResultSerializer, str]] = None,
+     result_storage_key: Optional[str] = None,
+     cache_result_in_memory: bool = True,
+     timeout_seconds: Union[int, float, None] = None,
+     log_prints: Optional[bool] = False,
+     refresh_cache: Optional[bool] = None,
+     on_completion: Optional[list[StateHookCallable]] = None,
+     on_failure: Optional[list[StateHookCallable]] = None,
+     retry_condition_fn: Optional[RetryConditionCallable] = None,
+     viz_return_value: Optional[bool] = None,
+     asset_deps: Optional[list[Union[str, Asset]]] = None,
+ ) -> Callable[[Callable[P, R]], Task[P, R]]: ...
+
+
+ def pipeline_task(
+     __fn: Optional[Callable[P, R]] = None,
+     /,
+     *,
+     # Tracing parameters
+     trace_level: TraceLevel = "always",
+     trace_ignore_input: bool = False,
+     trace_ignore_output: bool = False,
+     trace_ignore_inputs: list[str] | None = None,
+     trace_input_formatter: Optional[Callable[..., str]] = None,
+     trace_output_formatter: Optional[Callable[..., str]] = None,
+     # Prefect parameters
+     name: Optional[str] = None,
+     description: Optional[str] = None,
+     tags: Optional[Iterable[str]] = None,
+     version: Optional[str] = None,
+     cache_policy: Union[CachePolicy, type[NotSet]] = NotSet,
+     cache_key_fn: Optional[Callable[[TaskRunContext, Dict[str, Any]], Optional[str]]] = None,
+     cache_expiration: Optional[datetime.timedelta] = None,
+     task_run_name: Optional[TaskRunNameValueOrCallable] = None,
+     retries: Optional[int] = None,
+     retry_delay_seconds: Optional[
+         Union[float, int, list[float], Callable[[int], list[float]]]
+     ] = None,
+     retry_jitter_factor: Optional[float] = None,
+     persist_result: Optional[bool] = None,
+     result_storage: Optional[Union[ResultStorage, str]] = None,
+     result_serializer: Optional[Union[ResultSerializer, str]] = None,
+     result_storage_key: Optional[str] = None,
+     cache_result_in_memory: bool = True,
+     timeout_seconds: Union[int, float, None] = None,
+     log_prints: Optional[bool] = False,
+     refresh_cache: Optional[bool] = None,
+     on_completion: Optional[list[StateHookCallable]] = None,
+     on_failure: Optional[list[StateHookCallable]] = None,
+     retry_condition_fn: Optional[RetryConditionCallable] = None,
+     viz_return_value: Optional[bool] = None,
+     asset_deps: Optional[list[Union[str, Asset]]] = None,
+ ) -> Union[Task[P, R], Callable[[Callable[P, R]], Task[P, R]]]:
+     """
+     Pipeline task decorator that combines Prefect task functionality with automatic tracing.
+
+     This decorator applies tracing before the Prefect task decorator, allowing you to
+     monitor task execution with LMNR while maintaining all Prefect functionality.
+
+     Args:
+         trace_level: Control tracing ("always", "debug", "off")
+         trace_ignore_input: Whether to ignore input in traces
+         trace_ignore_output: Whether to ignore output in traces
+         trace_ignore_inputs: List of input parameter names to ignore
+         trace_input_formatter: Custom formatter for inputs
+         trace_output_formatter: Custom formatter for outputs
+
+     Plus all standard Prefect task parameters...
+     """
+
+     def decorator(fn: Callable[P, R]) -> Task[P, R]:
+         # Apply tracing first if enabled
+         if trace_level != "off":
+             traced_fn = trace(
+                 level=trace_level,
+                 name=name or fn.__name__,
+                 ignore_input=trace_ignore_input,
+                 ignore_output=trace_ignore_output,
+                 ignore_inputs=trace_ignore_inputs,
+                 input_formatter=trace_input_formatter,
+                 output_formatter=trace_output_formatter,
+             )(fn)
+         else:
+             traced_fn = fn
+
+         # Then apply Prefect task decorator
+         return task(  # pyright: ignore[reportCallIssue,reportUnknownVariableType]
+             traced_fn,  # pyright: ignore[reportArgumentType]
+             name=name,
+             description=description,
+             tags=tags,
+             version=version,
+             cache_policy=cache_policy,
+             cache_key_fn=cache_key_fn,
+             cache_expiration=cache_expiration,
+             task_run_name=task_run_name,
+             retries=retries or 0,
+             retry_delay_seconds=retry_delay_seconds,
+             retry_jitter_factor=retry_jitter_factor,
+             persist_result=persist_result,
+             result_storage=result_storage,
+             result_serializer=result_serializer,
+             result_storage_key=result_storage_key,
+             cache_result_in_memory=cache_result_in_memory,
+             timeout_seconds=timeout_seconds,
+             log_prints=log_prints,
+             refresh_cache=refresh_cache,
+             on_completion=on_completion,
+             on_failure=on_failure,
+             retry_condition_fn=retry_condition_fn,
+             viz_return_value=viz_return_value,
+             asset_deps=asset_deps,
+         )
+
+     if __fn:
+         return decorator(__fn)
+     return decorator
+
+
+ # ============================================================================
+ # PIPELINE FLOW DECORATOR WITH DOCUMENT PROCESSING
+ # ============================================================================
+
+ # Type aliases for document flow signatures
+ DocumentsFlowSig = Callable[
+     Concatenate[str, DocumentList, FlowOptions, P],
+     Union[DocumentList, Coroutine[Any, Any, DocumentList]],
+ ]
+
+ DocumentsFlowResult = Flow[Concatenate[str, DocumentList, FlowOptions, P], DocumentList]
+
+
+ @overload
+ def pipeline_flow(
+     __fn: DocumentsFlowSig[P],
+     /,
+ ) -> DocumentsFlowResult[P]: ...
+
+
+ @overload
+ def pipeline_flow(
+     *,
+     # Tracing parameters
+     trace_level: TraceLevel = "always",
+     trace_ignore_input: bool = False,
+     trace_ignore_output: bool = False,
+     trace_ignore_inputs: list[str] | None = None,
+     trace_input_formatter: Optional[Callable[..., str]] = None,
+     trace_output_formatter: Optional[Callable[..., str]] = None,
+     # Prefect parameters
+     name: Optional[str] = None,
+     version: Optional[str] = None,
+     flow_run_name: Optional[Union[Callable[[], str], str]] = None,
+     retries: Optional[int] = None,
+     retry_delay_seconds: Optional[Union[int, float]] = None,
+     task_runner: Optional[TaskRunner[PrefectFuture[Any]]] = None,
+     description: Optional[str] = None,
+     timeout_seconds: Union[int, float, None] = None,
+     validate_parameters: bool = True,
+     persist_result: Optional[bool] = None,
+     result_storage: Optional[Union[ResultStorage, str]] = None,
+     result_serializer: Optional[Union[ResultSerializer, str]] = None,
+     cache_result_in_memory: bool = True,
+     log_prints: Optional[bool] = None,
+     on_completion: Optional[list["FlowStateHook[..., Any]"]] = None,
+     on_failure: Optional[list["FlowStateHook[..., Any]"]] = None,
+     on_cancellation: Optional[list["FlowStateHook[..., Any]"]] = None,
+     on_crashed: Optional[list["FlowStateHook[..., Any]"]] = None,
+     on_running: Optional[list["FlowStateHook[..., Any]"]] = None,
+ ) -> Callable[[DocumentsFlowSig[P]], DocumentsFlowResult[P]]: ...
+
+
+ def pipeline_flow(
+     __fn: Optional[DocumentsFlowSig[P]] = None,
+     /,
+     *,
+     # Tracing parameters
+     trace_level: TraceLevel = "always",
+     trace_ignore_input: bool = False,
+     trace_ignore_output: bool = False,
+     trace_ignore_inputs: list[str] | None = None,
+     trace_input_formatter: Optional[Callable[..., str]] = None,
+     trace_output_formatter: Optional[Callable[..., str]] = None,
+     # Prefect parameters
+     name: Optional[str] = None,
+     version: Optional[str] = None,
+     flow_run_name: Optional[Union[Callable[[], str], str]] = None,
+     retries: Optional[int] = None,
+     retry_delay_seconds: Optional[Union[int, float]] = None,
+     task_runner: Optional[TaskRunner[PrefectFuture[Any]]] = None,
+     description: Optional[str] = None,
+     timeout_seconds: Union[int, float, None] = None,
+     validate_parameters: bool = True,
+     persist_result: Optional[bool] = None,
+     result_storage: Optional[Union[ResultStorage, str]] = None,
+     result_serializer: Optional[Union[ResultSerializer, str]] = None,
+     cache_result_in_memory: bool = True,
+     log_prints: Optional[bool] = None,
+     on_completion: Optional[list["FlowStateHook[..., Any]"]] = None,
+     on_failure: Optional[list["FlowStateHook[..., Any]"]] = None,
+     on_cancellation: Optional[list["FlowStateHook[..., Any]"]] = None,
+     on_crashed: Optional[list["FlowStateHook[..., Any]"]] = None,
+     on_running: Optional[list["FlowStateHook[..., Any]"]] = None,
+ ) -> Union[DocumentsFlowResult[P], Callable[[DocumentsFlowSig[P]], DocumentsFlowResult[P]]]:
+     """
+     Pipeline flow for document processing with standardized signature.
+
+     This decorator enforces a specific signature for document processing flows:
+     - First parameter: project_name (str)
+     - Second parameter: documents (DocumentList)
+     - Third parameter: flow_options (FlowOptions or subclass)
+     - Additional parameters allowed
+     - Must return DocumentList
+
+     It includes automatic tracing and all Prefect flow functionality.
+
+     Args:
+         trace_level: Control tracing ("always", "debug", "off")
+         trace_ignore_input: Whether to ignore input in traces
+         trace_ignore_output: Whether to ignore output in traces
+         trace_ignore_inputs: List of input parameter names to ignore
+         trace_input_formatter: Custom formatter for inputs
+         trace_output_formatter: Custom formatter for outputs
+
+     Plus all standard Prefect flow parameters...
+     """
+
+     def decorator(func: DocumentsFlowSig[P]) -> DocumentsFlowResult[P]:
+         sig = inspect.signature(func)
+         params = list(sig.parameters.values())
+
+         if len(params) < 3:
+             raise TypeError(
+                 f"@pipeline_flow '{func.__name__}' must accept at least 3 arguments: "
+                 "(project_name, documents, flow_options)"
+             )
+
+         # Validate parameter types (optional but recommended)
+         # We check names as a convention, not strict type checking at decoration time
+         expected_names = ["project_name", "documents", "flow_options"]
+         for i, expected in enumerate(expected_names):
+             if i < len(params) and params[i].name != expected:
+                 print(
+                     f"Warning: Parameter {i + 1} of '{func.__name__}' is named '{params[i].name}' "
+                     f"but convention suggests '{expected}'"
+                 )
+
+         # Create wrapper that ensures return type
+         if inspect.iscoroutinefunction(func):
+
+             @functools.wraps(func)
+             async def wrapper(  # pyright: ignore[reportRedeclaration]
+                 project_name: str,
+                 documents: DocumentList,
+                 flow_options: FlowOptions,
+                 *args,  # pyright: ignore[reportMissingParameterType]
+                 **kwargs,  # pyright: ignore[reportMissingParameterType]
+             ) -> DocumentList:
+                 result = await func(project_name, documents, flow_options, *args, **kwargs)
+                 # Runtime type checking
+                 DL = DocumentList  # Avoid recomputation
+                 if not isinstance(result, DL):
+                     raise TypeError(
+                         f"Flow '{func.__name__}' must return a DocumentList, "
+                         f"but returned {type(result).__name__}"
+                     )
+                 return result
+         else:
+
+             @functools.wraps(func)
+             def wrapper(  # pyright: ignore[reportRedeclaration]
+                 project_name: str,
+                 documents: DocumentList,
+                 flow_options: FlowOptions,
+                 *args,  # pyright: ignore[reportMissingParameterType]
+                 **kwargs,  # pyright: ignore[reportMissingParameterType]
+             ) -> DocumentList:
+                 result = func(project_name, documents, flow_options, *args, **kwargs)
+                 # Runtime type checking
+                 DL = DocumentList  # Avoid recomputation
+                 if not isinstance(result, DL):
+                     raise TypeError(
+                         f"Flow '{func.__name__}' must return a DocumentList, "
+                         f"but returned {type(result).__name__}"
+                     )
+                 return result
+
+         # Apply tracing first if enabled
+         if trace_level != "off":
+             traced_wrapper = trace(
+                 level=trace_level,
+                 name=name or func.__name__,
+                 ignore_input=trace_ignore_input,
+                 ignore_output=trace_ignore_output,
+                 ignore_inputs=trace_ignore_inputs,
+                 input_formatter=trace_input_formatter,
+                 output_formatter=trace_output_formatter,
+             )(wrapper)
+         else:
+             traced_wrapper = wrapper
+
+         # Then apply Prefect flow decorator
+         return cast(
+             DocumentsFlowResult[P],
+             flow(  # pyright: ignore[reportCallIssue,reportUnknownVariableType]
+                 traced_wrapper,  # pyright: ignore[reportArgumentType]
+                 name=name,
+                 version=version,
+                 flow_run_name=flow_run_name,
+                 retries=retries,
+                 retry_delay_seconds=retry_delay_seconds,
+                 task_runner=task_runner,
+                 description=description,
+                 timeout_seconds=timeout_seconds,
+                 validate_parameters=validate_parameters,
+                 persist_result=persist_result,
+                 result_storage=result_storage,
+                 result_serializer=result_serializer,
+                 cache_result_in_memory=cache_result_in_memory,
+                 log_prints=log_prints,
+                 on_completion=on_completion,
+                 on_failure=on_failure,
+                 on_cancellation=on_cancellation,
+                 on_crashed=on_crashed,
+                 on_running=on_running,
+             ),
+         )
+
+     if __fn:
+         return decorator(__fn)
+     return decorator
+
+
+ __all__ = ["pipeline_task", "pipeline_flow"]
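
For orientation, here is a minimal usage sketch of the two decorators above. The task, flow, and option values are hypothetical, and the top-level import path is assumed, since this diff does not show where the decorators are re-exported:

```python
# Hypothetical usage of pipeline_task / pipeline_flow.
from ai_pipeline_core import pipeline_flow, pipeline_task  # assumed export path
from ai_pipeline_core.documents import DocumentList
from ai_pipeline_core.flow.options import FlowOptions


@pipeline_task(retries=2, trace_ignore_inputs=["api_key"])
async def summarize(text: str, api_key: str) -> str:
    # Traced by LMNR first, then wrapped as a Prefect task; the "api_key"
    # argument is excluded from the recorded trace inputs.
    return text[:200]  # placeholder for a real model call


@pipeline_flow(name="summarize_flow", trace_level="debug")
async def summarize_flow(
    project_name: str,
    documents: DocumentList,
    flow_options: FlowOptions,
) -> DocumentList:
    # The wrapper enforces this (project_name, documents, flow_options)
    # signature and raises TypeError unless a DocumentList is returned.
    for doc in documents:
        await summarize(doc.content.decode("utf-8"), api_key="...")
    return documents
```

Note that `trace_level="off"` skips the tracing wrapper entirely, so the function is handed to Prefect unchanged.
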
@@ -0,0 +1,7 @@
+ """Prefect core features."""
+
+ from prefect import flow, task
+ from prefect.logging import disable_run_logger
+ from prefect.testing.utilities import prefect_test_harness
+
+ __all__ = ["task", "flow", "disable_run_logger", "prefect_test_harness"]
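
These re-exports give pipeline code a single import surface for Prefect. A sketch of how the bundled test utility is commonly used (the flow under test is hypothetical):

```python
# Hypothetical test built on the re-exports above; prefect_test_harness()
# runs flows against a temporary local Prefect backend instead of a server.
from ai_pipeline_core.prefect import flow, prefect_test_harness


@flow
def add_one(x: int) -> int:
    return x + 1


def test_add_one() -> None:
    with prefect_test_harness():
        assert add_one(1) == 2
```
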
@@ -0,0 +1,19 @@
+ from .cli import run_cli
+ from .simple_runner import (
+     ConfigSequence,
+     FlowSequence,
+     load_documents_from_directory,
+     run_pipeline,
+     run_pipelines,
+     save_documents_to_directory,
+ )
+
+ __all__ = [
+     "run_cli",
+     "run_pipeline",
+     "run_pipelines",
+     "load_documents_from_directory",
+     "save_documents_to_directory",
+     "FlowSequence",
+     "ConfigSequence",
+ ]
@@ -0,0 +1,95 @@
+ from __future__ import annotations
+
+ import asyncio
+ from pathlib import Path
+ from typing import Callable, Type, TypeVar, cast
+
+ from lmnr import Laminar
+ from pydantic_settings import CliPositionalArg, SettingsConfigDict
+
+ from ai_pipeline_core.documents import DocumentList
+ from ai_pipeline_core.flow.options import FlowOptions
+ from ai_pipeline_core.logging import get_pipeline_logger, setup_logging
+
+ from .simple_runner import ConfigSequence, FlowSequence, run_pipelines, save_documents_to_directory
+
+ logger = get_pipeline_logger(__name__)
+
+ TOptions = TypeVar("TOptions", bound=FlowOptions)
+ InitializerFunc = Callable[[FlowOptions], tuple[str, DocumentList]] | None
+
+
+ def _initialize_environment() -> None:
+     setup_logging()
+     try:
+         Laminar.initialize()
+         logger.info("LMNR tracing initialized.")
+     except Exception as e:
+         logger.warning(f"Failed to initialize LMNR tracing: {e}")
+
+
+ def run_cli(
+     *,
+     flows: FlowSequence,
+     flow_configs: ConfigSequence,
+     options_cls: Type[TOptions],
+     initializer: InitializerFunc = None,
+ ) -> None:
+     """
+     Parse CLI+env into options, then run the pipeline.
+
+     - working_directory: required positional arg
+     - --project-name: optional, defaults to directory name
+     - --start/--end: optional, 1-based step bounds
+     - all other flags come from options_cls (fields & Field descriptions)
+     """
+     _initialize_environment()
+
+     class _RunnerOptions(  # type: ignore[reportRedeclaration]
+         options_cls,
+         cli_parse_args=True,
+         cli_kebab_case=True,
+         cli_exit_on_error=False,
+     ):
+         working_directory: CliPositionalArg[Path]
+         project_name: str | None = None
+         start: int = 1
+         end: int | None = None
+
+         model_config = SettingsConfigDict(frozen=True, extra="ignore")
+
+     opts = cast(FlowOptions, _RunnerOptions())  # type: ignore[reportCallIssue]
+
+     wd: Path = cast(Path, getattr(opts, "working_directory"))
+     wd.mkdir(parents=True, exist_ok=True)
+
+     # Get project name from options or use directory basename
+     project_name = getattr(opts, "project_name", None)
+     if not project_name:  # None or empty string
+         project_name = wd.name
+
+     # Ensure project_name is not empty
+     if not project_name:
+         raise ValueError("Project name cannot be empty")
+
+     # Use initializer if provided, otherwise use defaults
+     initial_documents = DocumentList([])
+     if initializer:
+         init_result = initializer(opts)
+         # Always expect tuple format from initializer
+         _, initial_documents = init_result  # Ignore project name from initializer
+
+     if getattr(opts, "start", 1) == 1 and initial_documents:
+         save_documents_to_directory(wd, initial_documents)
+
+     asyncio.run(
+         run_pipelines(
+             project_name=project_name,
+             output_dir=wd,
+             flows=flows,
+             flow_configs=flow_configs,
+             flow_options=opts,
+             start_step=getattr(opts, "start", 1),
+             end_step=getattr(opts, "end", None),
+         )
+     )
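
`run_cli` builds a pydantic-settings CLI by subclassing the caller's `options_cls`, so each option field becomes a kebab-case flag alongside the built-in `working_directory` positional and `--project-name`/`--start`/`--end`. A sketch of the expected wiring (the flow, config, and option names are hypothetical, and the import path for `run_cli` is assumed since the diff omits file names):

```python
# Hypothetical entry point wiring for run_cli.
from pydantic import Field

from ai_pipeline_core.flow.options import FlowOptions
from ai_pipeline_core.simple_runner import run_cli  # assumed import path
from my_project.flows import SummarizeConfig, summarize_flow  # hypothetical


class MyOptions(FlowOptions):
    temperature: float = Field(0.7, description="Sampling temperature")


if __name__ == "__main__":
    # e.g.: python main.py ./workdir --project-name demo --temperature 0.2
    run_cli(
        flows=[summarize_flow],
        flow_configs=[SummarizeConfig],
        options_cls=MyOptions,
    )
```
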
@@ -0,0 +1,147 @@
+ from pathlib import Path
+ from typing import Any, Callable, Sequence, Type
+
+ from ai_pipeline_core.documents import Document, DocumentList, FlowDocument
+ from ai_pipeline_core.flow.config import FlowConfig
+ from ai_pipeline_core.flow.options import FlowOptions
+ from ai_pipeline_core.logging import get_pipeline_logger
+
+ logger = get_pipeline_logger(__name__)
+
+ FlowSequence = Sequence[Callable[..., Any]]
+ ConfigSequence = Sequence[Type[FlowConfig]]
+
+
+ def load_documents_from_directory(
+     base_dir: Path, document_types: Sequence[Type[FlowDocument]]
+ ) -> DocumentList:
+     """Loads documents using canonical_name."""
+     documents = DocumentList()
+
+     for doc_class in document_types:
+         dir_name = doc_class.canonical_name()
+         type_dir = base_dir / dir_name
+
+         if not type_dir.exists() or not type_dir.is_dir():
+             continue
+
+         logger.info(f"Loading documents from {type_dir.relative_to(base_dir)}")
+
+         for file_path in type_dir.iterdir():
+             if not file_path.is_file() or file_path.name.endswith(Document.DESCRIPTION_EXTENSION):
+                 continue
+
+             try:
+                 content = file_path.read_bytes()
+                 doc = doc_class(name=file_path.name, content=content)
+
+                 desc_file = file_path.with_name(file_path.name + Document.DESCRIPTION_EXTENSION)
+                 if desc_file.exists():
+                     object.__setattr__(doc, "description", desc_file.read_text(encoding="utf-8"))
+
+                 documents.append(doc)
+             except Exception as e:
+                 logger.error(
+                     f" Failed to load {file_path.name} as {doc_class.__name__}: {e}", exc_info=True
+                 )
+
+     return documents
+
+
+ def save_documents_to_directory(base_dir: Path, documents: DocumentList) -> None:
+     """Saves documents using canonical_name."""
+     for document in documents:
+         if not isinstance(document, FlowDocument):
+             continue
+
+         dir_name = document.canonical_name()
+         document_dir = base_dir / dir_name
+         document_dir.mkdir(parents=True, exist_ok=True)
+
+         file_path = document_dir / document.name
+         file_path.write_bytes(document.content)
+         logger.info(f"Saved: {dir_name}/{document.name}")
+
+         if document.description:
+             desc_file = file_path.with_name(file_path.name + Document.DESCRIPTION_EXTENSION)
+             desc_file.write_text(document.description, encoding="utf-8")
+
+
+ async def run_pipeline(
+     flow_func: Callable[..., Any],
+     config: Type[FlowConfig],
+     project_name: str,
+     output_dir: Path,
+     flow_options: FlowOptions,
+     flow_name: str | None = None,
+ ) -> DocumentList:
+     """Execute a single pipeline flow."""
+     if flow_name is None:
+         flow_name = getattr(flow_func, "name", getattr(flow_func, "__name__", "flow"))
+
+     logger.info(f"Running Flow: {flow_name}")
+
+     input_documents = load_documents_from_directory(output_dir, config.INPUT_DOCUMENT_TYPES)
+
+     if not config.has_input_documents(input_documents):
+         raise RuntimeError(f"Missing input documents for flow {flow_name}")
+
+     result_documents = await flow_func(project_name, input_documents, flow_options)
+
+     config.validate_output_documents(result_documents)
+
+     save_documents_to_directory(output_dir, result_documents)
+
+     logger.info(f"Completed Flow: {flow_name}")
+
+     return result_documents
+
+
+ async def run_pipelines(
+     project_name: str,
+     output_dir: Path,
+     flows: FlowSequence,
+     flow_configs: ConfigSequence,
+     flow_options: FlowOptions,
+     start_step: int = 1,
+     end_step: int | None = None,
+ ) -> None:
+     """Executes multiple pipeline flows sequentially."""
+     if len(flows) != len(flow_configs):
+         raise ValueError("The number of flows and flow configs must match.")
+
+     num_steps = len(flows)
+     start_index = start_step - 1
+     end_index = (end_step if end_step is not None else num_steps) - 1
+
+     if (
+         not (0 <= start_index < num_steps)
+         or not (0 <= end_index < num_steps)
+         or start_index > end_index
+     ):
+         raise ValueError("Invalid start/end steps.")
+
+     logger.info(f"Starting pipeline '{project_name}' (Steps {start_step} to {end_index + 1})")
+
+     for i in range(start_index, end_index + 1):
+         flow_func = flows[i]
+         config = flow_configs[i]
+         flow_name = getattr(flow_func, "name", getattr(flow_func, "__name__", f"flow_{i + 1}"))
+
+         logger.info(f"--- [Step {i + 1}/{num_steps}] Running Flow: {flow_name} ---")
+
+         try:
+             await run_pipeline(
+                 flow_func=flow_func,
+                 config=config,
+                 project_name=project_name,
+                 output_dir=output_dir,
+                 flow_options=flow_options,
+                 flow_name=f"[Step {i + 1}/{num_steps}] {flow_name}",
+             )
+
+         except Exception as e:
+             logger.error(
+                 f"--- [Step {i + 1}/{num_steps}] Flow {flow_name} Failed: {e} ---", exc_info=True
+             )
+             raise
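
On disk, each `FlowDocument` subclass gets its own subdirectory named by `canonical_name()`, with an optional description sidecar per file, and `run_pipelines` chains flows over that layout. A programmatic sketch (the document, flow, and config classes are hypothetical, and the module path is assumed):

```python
# Hypothetical direct use of run_pipelines; each step reads its inputs from
# <output_dir>/<canonical_name()>/ and writes its outputs back the same way,
# with "<name><DESCRIPTION_EXTENSION>" sidecars holding descriptions.
import asyncio
from pathlib import Path

from ai_pipeline_core.flow.options import FlowOptions
from ai_pipeline_core.simple_runner import run_pipelines  # assumed import path
from my_project.flows import CleanConfig, EnrichConfig, clean_flow, enrich_flow  # hypothetical

asyncio.run(
    run_pipelines(
        project_name="demo",
        output_dir=Path("./workdir"),
        flows=[clean_flow, enrich_flow],
        flow_configs=[CleanConfig, EnrichConfig],
        flow_options=FlowOptions(),  # assumes all option fields have defaults
        start_step=1,  # 1-based, inclusive
        end_step=None,  # None runs through the final step
    )
)
```
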