runnable 0.50.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- extensions/README.md +0 -0
- extensions/__init__.py +0 -0
- extensions/catalog/README.md +0 -0
- extensions/catalog/any_path.py +214 -0
- extensions/catalog/file_system.py +52 -0
- extensions/catalog/minio.py +72 -0
- extensions/catalog/pyproject.toml +14 -0
- extensions/catalog/s3.py +11 -0
- extensions/job_executor/README.md +0 -0
- extensions/job_executor/__init__.py +236 -0
- extensions/job_executor/emulate.py +70 -0
- extensions/job_executor/k8s.py +553 -0
- extensions/job_executor/k8s_job_spec.yaml +37 -0
- extensions/job_executor/local.py +35 -0
- extensions/job_executor/local_container.py +161 -0
- extensions/job_executor/pyproject.toml +16 -0
- extensions/nodes/README.md +0 -0
- extensions/nodes/__init__.py +0 -0
- extensions/nodes/conditional.py +301 -0
- extensions/nodes/fail.py +78 -0
- extensions/nodes/loop.py +394 -0
- extensions/nodes/map.py +477 -0
- extensions/nodes/parallel.py +281 -0
- extensions/nodes/pyproject.toml +15 -0
- extensions/nodes/stub.py +93 -0
- extensions/nodes/success.py +78 -0
- extensions/nodes/task.py +156 -0
- extensions/pipeline_executor/README.md +0 -0
- extensions/pipeline_executor/__init__.py +871 -0
- extensions/pipeline_executor/argo.py +1266 -0
- extensions/pipeline_executor/emulate.py +119 -0
- extensions/pipeline_executor/local.py +226 -0
- extensions/pipeline_executor/local_container.py +369 -0
- extensions/pipeline_executor/mocked.py +159 -0
- extensions/pipeline_executor/pyproject.toml +16 -0
- extensions/run_log_store/README.md +0 -0
- extensions/run_log_store/__init__.py +0 -0
- extensions/run_log_store/any_path.py +100 -0
- extensions/run_log_store/chunked_fs.py +122 -0
- extensions/run_log_store/chunked_minio.py +141 -0
- extensions/run_log_store/file_system.py +91 -0
- extensions/run_log_store/generic_chunked.py +549 -0
- extensions/run_log_store/minio.py +114 -0
- extensions/run_log_store/pyproject.toml +15 -0
- extensions/secrets/README.md +0 -0
- extensions/secrets/dotenv.py +62 -0
- extensions/secrets/pyproject.toml +15 -0
- runnable/__init__.py +108 -0
- runnable/catalog.py +141 -0
- runnable/cli.py +484 -0
- runnable/context.py +730 -0
- runnable/datastore.py +1058 -0
- runnable/defaults.py +159 -0
- runnable/entrypoints.py +390 -0
- runnable/exceptions.py +137 -0
- runnable/executor.py +561 -0
- runnable/gantt.py +1646 -0
- runnable/graph.py +501 -0
- runnable/names.py +546 -0
- runnable/nodes.py +593 -0
- runnable/parameters.py +217 -0
- runnable/pickler.py +96 -0
- runnable/sdk.py +1277 -0
- runnable/secrets.py +92 -0
- runnable/tasks.py +1268 -0
- runnable/telemetry.py +142 -0
- runnable/utils.py +423 -0
- runnable-0.50.0.dist-info/METADATA +189 -0
- runnable-0.50.0.dist-info/RECORD +72 -0
- runnable-0.50.0.dist-info/WHEEL +4 -0
- runnable-0.50.0.dist-info/entry_points.txt +53 -0
- runnable-0.50.0.dist-info/licenses/LICENSE +201 -0
runnable/sdk.py
ADDED
@@ -0,0 +1,1277 @@

from __future__ import annotations

import asyncio
import inspect
import logging
import re
from abc import ABC, abstractmethod
from pathlib import Path
from typing import Any, Callable, Dict, List, Optional, Union, cast

from pydantic import (
    BaseModel,
    ConfigDict,
    Field,
    PrivateAttr,
    computed_field,
    field_validator,
    model_validator,
)
from typing_extensions import Self

from extensions.nodes.conditional import ConditionalNode
from extensions.nodes.fail import FailNode
from extensions.nodes.loop import LoopNode
from extensions.nodes.map import MapNode
from extensions.nodes.parallel import ParallelNode
from extensions.nodes.stub import StubNode
from extensions.nodes.success import SuccessNode
from extensions.nodes.task import TaskNode
from runnable import defaults, graph
from runnable.executor import BaseJobExecutor
from runnable.nodes import TraversalNode
from runnable.tasks import BaseTaskType as RunnableTask
from runnable.tasks import TaskReturns, create_task

logger = logging.getLogger(defaults.LOGGER_NAME)


StepType = Union[
    "Stub",
    "PythonTask",
    "NotebookTask",
    "ShellTask",
    "Parallel",
    "Map",
    "Loop",
    "Conditional",
]

# Async-compatible step types for AsyncPipeline
AsyncStepType = Union[
    "Stub",
    "AsyncPythonTask",
    "Parallel",
    "Map",
    "Loop",
    "Conditional",
]


def pickled(name: str) -> TaskReturns:
    return TaskReturns(name=name, kind="object")


def json(name: str) -> TaskReturns:
    return TaskReturns(name=name, kind="json")


def metric(name: str) -> TaskReturns:
    return TaskReturns(name=name, kind="metric")


class Catalog(BaseModel):
    """
    Use to instruct a task to sync data from/to the central catalog.
    Please refer to [concepts](concepts/catalog.md) for more information.

    Attributes:
        get (List[str]): List of glob patterns to get from the central catalog to the compute data folder.
        put (List[str]): List of glob patterns to put into the central catalog from the compute data folder.
        store_copy (bool): Whether to store a copy of the data in the central catalog.

    Examples:
        >>> from runnable import Catalog
        >>> catalog = Catalog(get=["*.csv"], put=["*.csv"])

    """

    model_config = ConfigDict(
        extra="forbid"
    )  # Needed for command; would be validated later
    # Note: compute_data_folder was confusing to explain, might be introduced later.
    # compute_data_folder: str = Field(default="", alias="compute_data_folder")
    get: List[str] = Field(default_factory=list, alias="get")
    put: List[str] = Field(default_factory=list, alias="put")
    store_copy: bool = Field(default=True, alias="store_copy")


class BaseTraversal(ABC, BaseModel):
    name: str
    next_node: str = Field(default="", serialization_alias="next_node")
    terminate_with_success: bool = Field(default=False, exclude=True)
    terminate_with_failure: bool = Field(default=False, exclude=True)
    on_failure: Optional["Pipeline | AsyncPipeline"] = Field(default=None)

    model_config = ConfigDict(extra="forbid")

    @computed_field  # type: ignore
    @property
    def internal_name(self) -> str:
        return self.name

    def __hash__(self):
        """
        Needed to deduplicate DataCatalog objects.
        """
        return hash(self.name)

    def __rshift__(
        self, other: "Union[StepType, AsyncStepType]"
    ) -> "Union[StepType, AsyncStepType]":
        if self.next_node:
            raise Exception(
                f"The node {self} already has a next node: {self.next_node}"
            )
        self.next_node = other.name

        return other

    def __lshift__(self, other: TraversalNode) -> TraversalNode:
        if other.next_node:
            raise Exception(
                f"The {other} node already has a next node: {other.next_node}"
            )
        other.next_node = self.name

        return other

    @model_validator(mode="after")
    def validate_terminations(self) -> Self:
        if self.terminate_with_failure and self.terminate_with_success:
            raise AssertionError("A node cannot terminate with success and failure")

        if self.terminate_with_failure or self.terminate_with_success:
            if self.next_node and self.next_node not in ["success", "fail"]:
                raise AssertionError(
                    "A node being terminated cannot have a user defined next node"
                )

        if self.terminate_with_failure:
            self.next_node = "fail"

        if self.terminate_with_success:
            self.next_node = "success"

        return self

    @abstractmethod
    def create_node(self) -> TraversalNode: ...
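
The `__rshift__`/`__lshift__` operators above are what make `step1 >> step2` work when a `Pipeline` links its steps (see `Pipeline.model_post_init` further down). A minimal sketch of their effect on `next_node`, using `Stub`, which is defined later in this module:

```python
from runnable.sdk import Stub

step1 = Stub(name="step1")
step2 = Stub(name="step2", terminate_with_success=True)

step1 >> step2  # assigns step1.next_node = "step2" and returns step2

assert step1.next_node == "step2"
assert step2.next_node == "success"  # set by validate_terminations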
class BaseTask(BaseTraversal):
    """
    Base task type which has catalog, overrides, returns and secrets.
    """

    catalog: Optional[Catalog] = Field(default=None, alias="catalog")
    overrides: Dict[str, Any] = Field(default_factory=dict, alias="overrides")
    returns: List[Union[str, TaskReturns]] = Field(
        default_factory=list, alias="returns"
    )
    secrets: List[str] = Field(default_factory=list)

    @field_validator("returns", mode="before")
    @classmethod
    def serialize_returns(
        cls, returns: List[Union[str, TaskReturns]]
    ) -> List[TaskReturns]:
        task_returns = []

        for x in returns:
            if isinstance(x, str):
                task_returns.append(TaskReturns(name=x, kind="json"))
                continue
            # It is already a TaskReturns
            task_returns.append(x)

        return task_returns

    def create_node(self) -> TaskNode:
        if not self.next_node:
            if not (self.terminate_with_failure or self.terminate_with_success):
                raise AssertionError(
                    "A node not being terminated must have a user defined next node"
                )

        if self.on_failure:
            self.on_failure = self.on_failure.steps[0].name  # type: ignore

        return TaskNode.parse_from_config(
            self.model_dump(exclude_none=True, by_alias=True)
        )

    def create_job(self) -> RunnableTask:
        raise NotImplementedError(
            "This method should be implemented in the child class"
        )

    def as_pipeline(self) -> "Pipeline":
        return Pipeline(steps=[self], name=self.internal_name)  # type: ignore


class PythonTask(BaseTask):
    """
    An execution node of the pipeline that runs a Python function.
    Please refer to [concepts](concepts/task.md/#python_functions) for more information.

    Attributes:
        name (str): The name of the node.
        function (callable): The function to execute.

        terminate_with_success (bool): Whether to terminate the pipeline with a success after this node.
            Defaults to False.
        terminate_with_failure (bool): Whether to terminate the pipeline with a failure after this node.
            Defaults to False.

        on_failure (str): The name of the node to execute if the step fails.

        returns (List[Union[str, TaskReturns]]): A list of the names of variables to return from the task.
            The names should match the order of the variables returned by the function.

            ```TaskReturns```: can be JSON friendly variables, objects or metrics.

            By default, all variables are assumed to be JSON friendly and will be serialized to JSON.
            Pydantic models are readily supported and will be serialized to JSON.

            To return a python object, please use ```pickled(<name>)```.
            It is also advised to use ```pickled(<name>)``` for big JSON friendly variables.

            For example,
            ```python
            from runnable import pickled

            def f():
                ...
                x = 1
                return x, df  # A simple JSON friendly variable and a python object.

            task = PythonTask(name="task", function=f, returns=["x", pickled("df")])
            ```

            To mark any JSON friendly variable as a ```metric```, please use ```metric(x)```.
            Metric variables should be JSON friendly and can be treated just like any other parameter.

        catalog (Optional[Catalog]): The files to sync data from/to; refer to Catalog.

        secrets (List[str]): List of secrets to pass to the task. They are exposed as environment variables
            and removed after execution.

        overrides (Dict[str, Any]): Any overrides to the command.
            Individual tasks can override the global executor configuration by referring to the
            specific override.

            For example,
            ### Global configuration
            ```yaml
            executor:
              type: local-container
              config:
                docker_image: "runnable/runnable:latest"
                overrides:
                  custom_docker_image:
                    docker_image: "runnable/runnable:custom"
            ```
            ### Task specific configuration
            ```python
            task = PythonTask(name="task", function=my_function,
                    overrides={'local-container': 'custom_docker_image'})
            ```
    """

    function: Callable = Field(exclude=True)

    @computed_field
    def command_type(self) -> str:
        return "python"

    @computed_field
    def command(self) -> str:
        module = self.function.__module__
        name = self.function.__name__

        return f"{module}.{name}"
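
Putting the docstring together, a minimal runnable sketch of a `PythonTask` (the function and return names are illustrative; execution uses whatever services are configured by default):

```python
from runnable.sdk import PythonTask, pickled

def train():
    accuracy = 0.92   # JSON friendly
    model = object()  # stand-in for a non JSON friendly object
    return accuracy, model

# Return names map positionally onto the function's return values.
task = PythonTask(
    name="train",
    function=train,
    returns=["accuracy", pickled("model")],
    terminate_with_success=True,
)
pipeline = task.as_pipeline()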
class NotebookTask(BaseTask):
    """
    An execution node of the pipeline that runs a notebook.
    Please refer to [concepts](concepts/task.md/#notebooks) for more information.

    We internally use [Ploomber engine](https://github.com/ploomber/ploomber-engine) to execute the notebook.

    Attributes:
        name (str): The name of the node.
        notebook (str): The path to the notebook relative to the project root.
        optional_ploomber_args (Dict[str, Any]): Any optional ploomber args, please refer to
            [Ploomber engine](https://github.com/ploomber/ploomber-engine) for more information.

        terminate_with_success (bool): Whether to terminate the pipeline with a success after this node.
            Defaults to False.
        terminate_with_failure (bool): Whether to terminate the pipeline with a failure after this node.
            Defaults to False.

        on_failure (str): The name of the node to execute if the step fails.

        returns (List[Union[str, TaskReturns]]): A list of the names of variables to return from the task.
            The names should match the order of the variables returned by the function.

            ```TaskReturns```: can be JSON friendly variables, objects or metrics.

            By default, all variables are assumed to be JSON friendly and will be serialized to JSON.
            Pydantic models are readily supported and will be serialized to JSON.

            To return a python object, please use ```pickled(<name>)```.
            It is also advised to use ```pickled(<name>)``` for big JSON friendly variables.

            For example,
            ```python
            from runnable import pickled

            # assume example.ipynb is the notebook with df and x as variables in some cells.

            task = NotebookTask(name="task", notebook="example.ipynb", returns=["x", pickled("df")])
            ```

            To mark any JSON friendly variable as a ```metric```, please use ```metric(x)```.
            Metric variables should be JSON friendly and can be treated just like any other parameter.

        catalog (Optional[Catalog]): The files to sync data from/to; refer to Catalog.

        secrets (List[str]): List of secrets to pass to the task. They are exposed as environment variables
            and removed after execution.

        overrides (Dict[str, Any]): Any overrides to the command.
            Individual tasks can override the global executor configuration by referring to the
            specific override.

            For example,
            ### Global configuration
            ```yaml
            executor:
              type: local-container
              config:
                docker_image: "runnable/runnable:latest"
                overrides:
                  custom_docker_image:
                    docker_image: "runnable/runnable:custom"
            ```
            ### Task specific configuration
            ```python
            task = NotebookTask(name="task", notebook="example.ipynb",
                    overrides={'local-container': 'custom_docker_image'})
            ```
    """

    notebook: str = Field(serialization_alias="command")
    optional_ploomber_args: Optional[Dict[str, Any]] = Field(
        default=None, alias="optional_ploomber_args"
    )

    @computed_field
    def command_type(self) -> str:
        return "notebook"


class ShellTask(BaseTask):
    """
    An execution node of the pipeline that runs a shell command.
    Please refer to [concepts](concepts/task.md/#shell) for more information.


    Attributes:
        name (str): The name of the node.
        command (str): The shell command to execute.
        terminate_with_success (bool): Whether to terminate the pipeline with a success after this node.
            Defaults to False.
        terminate_with_failure (bool): Whether to terminate the pipeline with a failure after this node.
            Defaults to False.

        on_failure (str): The name of the node to execute if the step fails.

        returns (List[str]): A list of the names of environment variables to collect from the task.

            Shell based tasks can only return JSON friendly variables.

            To mark any JSON friendly variable as a ```metric```, please use ```metric(x)```.
            Metric variables should be JSON friendly and can be treated just like any other parameter.

        catalog (Optional[Catalog]): The files to sync data from/to; refer to Catalog.

        secrets (List[str]): List of secrets to pass to the task. They are exposed as environment variables
            and removed after execution.

        overrides (Dict[str, Any]): Any overrides to the command.
            Individual tasks can override the global executor configuration by referring to the
            specific override.

            For example,
            ### Global configuration
            ```yaml
            executor:
              type: local-container
              config:
                docker_image: "runnable/runnable:latest"
                overrides:
                  custom_docker_image:
                    docker_image: "runnable/runnable:custom"
            ```
            ### Task specific configuration
            ```python
            task = ShellTask(name="task", command="export x=1",
                    overrides={'local-container': 'custom_docker_image'})
            ```

    """

    command: str = Field(alias="command")

    @computed_field
    def command_type(self) -> str:
        return "shell"


class AsyncPythonTask(BaseTask):
    """
    An execution node for async Python functions.
    Please refer to [concepts](concepts/task.md/#async_python_functions) for more information.

    Attributes:
        name (str): The name of the node.
        function (callable): The async function to execute.

        terminate_with_success (bool): Whether to terminate the pipeline with a success after this node.
            Defaults to False.
        terminate_with_failure (bool): Whether to terminate the pipeline with a failure after this node.
            Defaults to False.

        on_failure (str): The name of the node to execute if the step fails.

        returns (List[Union[str, TaskReturns]]): A list of the names of variables to return from the task.
            The names should match the order of the variables returned by the function.

            ```TaskReturns```: can be JSON friendly variables, objects or metrics.

            By default, all variables are assumed to be JSON friendly and will be serialized to JSON.
            Pydantic models are readily supported and will be serialized to JSON.

            To return a python object, please use ```pickled(<name>)```.

            For example,
            ```python
            from runnable import pickled, AsyncPythonTask

            async def my_async_func():
                ...
                x = 1
                return x, df

            task = AsyncPythonTask(name="task", function=my_async_func, returns=["x", pickled("df")])
            ```

        catalog (Optional[Catalog]): The files to sync data from/to; refer to Catalog.

        secrets (List[str]): List of secrets to pass to the task. They are exposed as environment variables
            and removed after execution.

        overrides (Dict[str, Any]): Any overrides to the command.

        stream_end_type (str): The event type that indicates end of stream for AsyncGenerator functions.
            When this event type is encountered, its contents (excluding 'type') are extracted as return values.
            Defaults to "done".
    """

    function: Callable = Field(exclude=True)
    stream_end_type: str = Field(default="done")

    @computed_field
    def command_type(self) -> str:
        return "async-python"

    @computed_field
    def command(self) -> str:
        module = self.function.__module__
        name = self.function.__name__

        return f"{module}.{name}"

    def as_async_pipeline(self) -> "AsyncPipeline":
        return AsyncPipeline(steps=[self], name=self.internal_name)  # type: ignore
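
Per the `stream_end_type` docstring above, an AsyncGenerator function can emit intermediate events and then signal completion. A sketch (the event shapes are illustrative, not prescribed by the package):

```python
from runnable.sdk import AsyncPythonTask

async def stream_tokens():
    # Intermediate events are streamed to the caller
    # (see AsyncPipeline.execute_streaming below).
    yield {"type": "token", "value": "hello"}
    yield {"type": "token", "value": "world"}
    # The event matching stream_end_type ends the stream; its contents
    # (excluding "type") become the task's return values.
    yield {"type": "done", "text": "hello world"}

task = AsyncPythonTask(
    name="stream",
    function=stream_tokens,
    returns=["text"],
    stream_end_type="done",
    terminate_with_success=True,
)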
class Stub(BaseTraversal):
    """
    A node that passes through the pipeline with no action. Just like ```pass``` in Python.
    Please refer to [concepts](concepts/task.md/#stub) for more information.

    A stub node can take an arbitrary number of arguments.

    Attributes:
        name (str): The name of the node.
        terminate_with_success (bool): Whether to terminate the pipeline with a success after this node.
            Defaults to False.
        terminate_with_failure (bool): Whether to terminate the pipeline with a failure after this node.
            Defaults to False.

        on_failure (str): The name of the node to execute if the step fails.
    """

    model_config = ConfigDict(extra="ignore")
    catalog: Optional[Catalog] = Field(default=None, alias="catalog")

    def create_node(self) -> StubNode:
        if not self.next_node:
            if not (self.terminate_with_failure or self.terminate_with_success):
                raise AssertionError(
                    "A node not being terminated must have a user defined next node"
                )

        return StubNode.parse_from_config(self.model_dump(exclude_none=True))

    def as_pipeline(self) -> "Pipeline":
        return Pipeline(steps=[self])


class Parallel(BaseTraversal):
    """
    A node that executes multiple branches in parallel.
    Please refer to [concepts](concepts/parallel.md) for more information.

    Attributes:
        name (str): The name of the node.
        branches (Dict[str, Pipeline]): A dictionary of branches to execute in parallel.
        terminate_with_failure (bool): Whether to terminate the pipeline with a failure after this node.
        terminate_with_success (bool): Whether to terminate the pipeline with a success after this node.
        on_failure (str): The name of the node to execute if any of the branches fail.
    """

    branches: Dict[str, "Pipeline | AsyncPipeline"]

    @computed_field  # type: ignore
    @property
    def graph_branches(self) -> Dict[str, graph.Graph]:
        return {
            name: pipeline._dag.model_copy() for name, pipeline in self.branches.items()
        }

    def create_node(self) -> ParallelNode:
        if not self.next_node:
            if not (self.terminate_with_failure or self.terminate_with_success):
                raise AssertionError(
                    "A node not being terminated must have a user defined next node"
                )

        node = ParallelNode(
            name=self.name,
            branches=self.graph_branches,
            internal_name="",
            next_node=self.next_node,
        )
        return node


class Conditional(BaseTraversal):
    branches: Dict[str, "Pipeline | AsyncPipeline"]
    parameter: str  # the name of the parameter; should be isalnum

    @field_validator("parameter")
    @classmethod
    def validate_parameter(cls, parameter: str) -> str:
        if not parameter.isalnum():
            raise AssertionError(
                "The parameter name should be alphanumeric and not empty"
            )
        return parameter

    @field_validator("branches")
    @classmethod
    def validate_branches(
        cls, branches: Dict[str, "Pipeline"]
    ) -> Dict[str, "Pipeline"]:
        for branch_name in branches.keys():
            if not branch_name.isalnum():
                raise ValueError(f"Branch '{branch_name}' must be alphanumeric.")
        return branches

    @computed_field  # type: ignore
    @property
    def graph_branches(self) -> Dict[str, graph.Graph]:
        return {
            name: pipeline._dag.model_copy() for name, pipeline in self.branches.items()
        }

    def create_node(self) -> ConditionalNode:
        if not self.next_node:
            if not (self.terminate_with_failure or self.terminate_with_success):
                raise AssertionError(
                    "A node not being terminated must have a user defined next node"
                )

        node = ConditionalNode(
            name=self.name,
            branches=self.graph_branches,
            internal_name="",
            next_node=self.next_node,
            parameter=self.parameter,
        )
        return node
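
`Conditional` ships without a docstring. From its validators (alphanumeric `parameter` and branch names) and the fields handed to `ConditionalNode`, usage presumably looks like the sketch below; the branch-selection semantics live in `extensions/nodes/conditional.py`:

```python
from runnable.sdk import Conditional, Stub

# Branch keys are presumably matched against the runtime value of the
# parameter named by `parameter` (here, a parameter called "mode").
conditional = Conditional(
    name="route",
    parameter="mode",
    branches={
        "fast": Stub(name="quick").as_pipeline(),
        "thorough": Stub(name="full").as_pipeline(),
    },
    terminate_with_success=True,
)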
class Map(BaseTraversal):
    """
    A node that iterates over a list of items and executes a pipeline for each item.
    Please refer to [concepts](concepts/map.md) for more information.

    Attributes:
        branch (Pipeline): The pipeline to execute for each item.

        iterate_on (str): The name of the parameter to iterate over.
            The parameter should be defined either by previous steps or statically at the start of execution.

        iterate_as (str): The name of the iterable to be passed to functions.
        reducer (Optional[str]): The reducer to apply to the results of the branches.

    """

    branch: "Pipeline | AsyncPipeline"
    iterate_on: str
    iterate_as: str
    reducer: Optional[str] = Field(default=None, alias="reducer")

    @computed_field  # type: ignore
    @property
    def graph_branch(self) -> graph.Graph:
        return self.branch._dag.model_copy()

    def create_node(self) -> MapNode:
        if not self.next_node:
            if not (self.terminate_with_failure or self.terminate_with_success):
                raise AssertionError(
                    "A node not being terminated must have a user defined next node"
                )

        node = MapNode(
            name=self.name,
            branch=self.graph_branch,
            internal_name="",
            next_node=self.next_node,
            iterate_on=self.iterate_on,
            iterate_as=self.iterate_as,
            reducer=self.reducer,
        )

        return node
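
A sketch of `Map` fanning a pipeline out over a list parameter (the parameter name `chunks` is illustrative and must exist at runtime, set by an earlier step or a parameters file):

```python
from runnable.sdk import Map, PythonTask

def process(chunk: str):
    print(f"processing {chunk}")

step = PythonTask(name="process", function=process, terminate_with_success=True)

# Each value of the "chunks" parameter is passed to the branch as "chunk".
map_step = Map(
    name="fan_out",
    branch=step.as_pipeline(),
    iterate_on="chunks",
    iterate_as="chunk",
    terminate_with_success=True,
)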
class Loop(BaseTraversal):
    """
    A loop node that iterates over a branch until a break condition is met.
    Please refer to [concepts](concepts/loop.md) for more information.

    The loop executes the branch repeatedly until either:
    - parameters[break_on] == True
    - max_iterations is reached (safety limit)

    Attributes:
        branch (Pipeline | AsyncPipeline): The pipeline to execute repeatedly.

        max_iterations (int): Maximum number of iterations (safety limit).

        break_on (str): The name of the boolean parameter that controls loop exit.
            When this parameter becomes True, the loop exits.

        index_as (str): The name of the environment variable that will contain
            the current iteration index (0-based).

    Example:
        ```python
        from runnable.sdk import Pipeline, PythonTask, Loop, json

        def process_data(iteration_num):
            # iteration_num is available as an environment variable
            result = call_api()
            return {"success": result.ok, "should_stop": result.ok}

        task = PythonTask(
            name="process",
            function=process_data,
            returns=[json("success"), json("should_stop")]
        )

        process_pipeline = Pipeline(steps=[task])

        loop = Loop(
            name="retry_loop",
            branch=process_pipeline,
            max_iterations=5,
            break_on="should_stop",
            index_as="iteration_num"
        )
        ```
    """

    branch: "Pipeline | AsyncPipeline"
    max_iterations: int
    break_on: str
    index_as: str

    @computed_field  # type: ignore
    @property
    def graph_branch(self) -> graph.Graph:
        return self.branch._dag.model_copy()

    def create_node(self) -> LoopNode:
        if not self.next_node:
            if not (self.terminate_with_failure or self.terminate_with_success):
                raise AssertionError(
                    "A node not being terminated must have a user defined next node"
                )

        node = LoopNode(
            name=self.name,
            branch=self.graph_branch,
            internal_name="",
            next_node=self.next_node,
            max_iterations=self.max_iterations,
            break_on=self.break_on,
            index_as=self.index_as,
        )

        return node


class Success(BaseModel):
    """
    A node that represents a successful execution of the pipeline.

    Most often, there is no need to use this node: nodes can be instructed to
    terminate_with_success, and pipelines add terminal nodes (add_terminal_nodes=True).

    Attributes:
        name (str): The name of the node.
    """

    name: str = "success"

    @computed_field  # type: ignore
    @property
    def internal_name(self) -> str:
        return self.name

    def create_node(self) -> SuccessNode:
        return SuccessNode.parse_from_config(self.model_dump())


class Fail(BaseModel):
    """
    A node that represents a failed execution of the pipeline.

    Most often, there is no need to use this node: nodes can be instructed to
    terminate_with_failure, and pipelines add terminal nodes (add_terminal_nodes=True).

    Attributes:
        name (str): The name of the node.
    """

    name: str = "fail"

    @computed_field  # type: ignore
    @property
    def internal_name(self) -> str:
        return self.name

    def create_node(self) -> FailNode:
        return FailNode.parse_from_config(self.model_dump())


class Pipeline(BaseModel):
    """
    A Pipeline is a sequence of Steps.

    Attributes:
        steps (List[Stub | PythonTask | NotebookTask | ShellTask | Parallel | Map | Loop | Conditional]):
            A list of Steps that make up the Pipeline.

            The order of steps is important as it determines the order of execution.
            Any on-failure behavior should be the first step of an ```on_failure``` pipeline.

        on_failure (List[Pipeline], optional): A list of Pipelines to execute in case of failure.

            For example, for the pipeline below:
                step1 >> step2
            where step1 should reach step3 in case of failure:

                failure_pipeline = Pipeline(steps=[step1, step3])

                pipeline = Pipeline(steps=[step1, step2], on_failure=[failure_pipeline])

        name (str, optional): The name of the Pipeline. Defaults to "".
        description (str, optional): A description of the Pipeline. Defaults to "".

    The pipeline implicitly adds success and fail nodes.

    """

    steps: List[StepType]
    name: str = ""
    description: str = ""

    internal_branch_name: str = ""

    @property
    def add_terminal_nodes(self) -> bool:
        return True

    _dag: graph.Graph = PrivateAttr()
    model_config = ConfigDict(extra="forbid")

    def model_post_init(self, __context: Any) -> None:
        """
        The sequence of steps can either be:
            [step1, step2, ..., stepN]
        which indicates:
            - step1 > step2 > ... > stepN
            - We expect terminate with success or fail to be explicitly stated on a step.
            - If it is stated, the step cannot have a next step defined apart from "success" and "fail".
        Any definition of a pipeline should have one node that terminates with success.
        """
        # The last step of the pipeline is defaulted to be a success step
        # unless it is explicitly stated to terminate with failure.
        terminal_step: StepType = self.steps[-1]
        if not terminal_step.terminate_with_failure:
            terminal_step.terminate_with_success = True
            terminal_step.next_node = "success"

        # Assert that there is only one termination node with success or failure.
        # Assert that there are no duplicate step names.
        observed: Dict[str, str] = {}
        count_termination: int = 0

        for step in self.steps:
            if isinstance(
                step, (Stub, PythonTask, NotebookTask, ShellTask, Parallel, Map)
            ):
                if step.terminate_with_success or step.terminate_with_failure:
                    count_termination += 1
                if step.name in observed:
                    raise Exception(
                        f"Step names should be unique. Found duplicate: {step.name}"
                    )
                observed[step.name] = step.name

        if count_termination > 1:
            raise AssertionError(
                "A pipeline can only have one termination node with success or failure"
            )

        # Link the steps by assigning each step's next_node to the name of the
        # node immediately after it.
        for i in range(len(self.steps) - 1):
            self.steps[i] >> self.steps[i + 1]

        # Add any on_failure pipelines to the steps
        gathered_on_failure: List[StepType] = []
        for step in self.steps:
            if step.on_failure:
                gathered_on_failure.extend(cast(List[StepType], step.on_failure.steps))

        self._dag = graph.Graph(
            start_at=self.steps[0].name,
            description=self.description,
            internal_branch_name=self.internal_branch_name,
        )

        self.steps.extend(gathered_on_failure)

        for step in self.steps:
            self._dag.add_node(step.create_node())

        self._dag.add_terminal_nodes()

        self._dag.check_graph()

    def return_dag(self) -> graph.Graph:
        dag_definition = self._dag.model_dump(by_alias=True, exclude_none=True)
        return graph.create_graph(dag_definition)

    def _is_called_for_definition(self) -> bool:
        """
        If the run context is set, we are coming in only to get the pipeline definition.
        """
        from runnable.context import get_run_context

        if get_run_context() is None:
            return False
        return True

    def get_caller(self) -> str:
        caller_stack = inspect.stack()[2]
        relative_to_root = str(Path(caller_stack.filename).relative_to(Path.cwd()))

        module_name = re.sub(r"\b.py\b", "", relative_to_root.replace("/", "."))
        module_to_call = f"{module_name}.{caller_stack.function}"

        return module_to_call

    def execute(
        self,
        configuration_file: str = "",
        run_id: str = "",
        tag: str = "",
        parameters_file: str = "",
        log_level: str = defaults.LOG_LEVEL,
    ):
        """
        Overloaded method:
        - Could be called by the user when executing the pipeline via the SDK
        - Could be called by the system itself when getting the pipeline definition
        """
        if self._is_called_for_definition():
            # Immediately return as this call is only for getting the pipeline definition
            return {}

        from runnable import context

        logger.setLevel(log_level)

        service_configurations = context.ServiceConfigurations(
            configuration_file=configuration_file,
            execution_context=context.ExecutionContext.PIPELINE,
        )

        configurations = {
            "pipeline_definition_file": self.get_caller(),
            "parameters_file": parameters_file,
            "tag": tag,
            "run_id": run_id,
            "execution_mode": context.ExecutionMode.PYTHON,
            "configuration_file": configuration_file,
            **service_configurations.services,
        }

        run_context = context.PipelineContext.model_validate(configurations)
        context.set_run_context(run_context)

        assert isinstance(run_context, context.PipelineContext)

        run_context.execute()
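
A minimal end-to-end sketch of building and running a `Pipeline`. Steps are linked in list order by `model_post_init`, and the last step is defaulted to terminate with success; here we assume returned parameters are injected into downstream functions by argument name:

```python
from runnable.sdk import Pipeline, PythonTask

def extract():
    return 42

def load(x: int = 0):
    print(x)

step1 = PythonTask(name="extract", function=extract, returns=["x"])
step2 = PythonTask(name="load", function=load, terminate_with_success=True)

pipeline = Pipeline(steps=[step1, step2], name="etl")

if __name__ == "__main__":
    pipeline.execute()  # uses the default services unless a configuration file is given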
class AsyncPipeline(BaseModel):
    """
    An AsyncPipeline is a sequence of async-compatible Steps that executes asynchronously.

    Use this when you have async functions that need native async execution with await support.

    Attributes:
        steps (List[Stub | AsyncPythonTask]):
            A list of async-compatible Steps that make up the Pipeline.

            The order of steps is important as it determines the order of execution.

        name (str, optional): The name of the Pipeline. Defaults to "".
        description (str, optional): A description of the Pipeline. Defaults to "".

    Example:
        ```python
        from runnable import AsyncPipeline, AsyncPythonTask
        import asyncio

        async def my_async_func():
            await asyncio.sleep(1)
            return "done"

        async def main():
            pipeline = AsyncPipeline(
                steps=[AsyncPythonTask(name="task", function=my_async_func, returns=["result"])]
            )
            await pipeline.execute()

        asyncio.run(main())
        ```
    """

    steps: List[AsyncStepType]
    name: str = ""
    description: str = ""

    internal_branch_name: str = ""

    @property
    def add_terminal_nodes(self) -> bool:
        return True

    _dag: graph.Graph = PrivateAttr()
    model_config = ConfigDict(extra="forbid")

    def model_post_init(self, __context: Any) -> None:
        """
        The sequence of steps can either be:
            [step1, step2, ..., stepN]
        which indicates:
            - step1 > step2 > ... > stepN
            - We expect terminate with success or fail to be explicitly stated on a step.
            - If it is stated, the step cannot have a next step defined apart from "success" and "fail".
        Any definition of a pipeline should have one node that terminates with success.
        """
        # The last step of the pipeline is defaulted to be a success step
        # unless it is explicitly stated to terminate with failure.
        terminal_step: AsyncStepType = self.steps[-1]
        if not terminal_step.terminate_with_failure:
            terminal_step.terminate_with_success = True
            terminal_step.next_node = "success"

        # Assert that there is only one termination node with success or failure.
        # Assert that there are no duplicate step names.
        observed: Dict[str, str] = {}
        count_termination: int = 0

        for step in self.steps:
            if isinstance(step, (Stub, AsyncPythonTask)):
                if step.terminate_with_success or step.terminate_with_failure:
                    count_termination += 1
                if step.name in observed:
                    raise Exception(
                        f"Step names should be unique. Found duplicate: {step.name}"
                    )
                observed[step.name] = step.name

        if count_termination > 1:
            raise AssertionError(
                "A pipeline can only have one termination node with success or failure"
            )

        # Link the steps by assigning each step's next_node to the name of the
        # node immediately after it.
        for i in range(len(self.steps) - 1):
            self.steps[i] >> self.steps[i + 1]

        # Add any on_failure pipelines to the steps
        gathered_on_failure: List[AsyncStepType] = []
        for step in self.steps:
            if step.on_failure:
                gathered_on_failure.extend(
                    cast(List[AsyncStepType], step.on_failure.steps)
                )

        self._dag = graph.Graph(
            start_at=self.steps[0].name,
            description=self.description,
            internal_branch_name=self.internal_branch_name,
        )

        self.steps.extend(gathered_on_failure)

        for step in self.steps:
            self._dag.add_node(step.create_node())

        self._dag.add_terminal_nodes()

        self._dag.check_graph()

    def return_dag(self) -> graph.Graph:
        dag_definition = self._dag.model_dump(by_alias=True, exclude_none=True)
        return graph.create_graph(dag_definition)

    async def execute_streaming(
        self,
        configuration_file: str = "",
        run_id: str = "",
        tag: str = "",
        parameters_file: str = "",
        log_level: str = defaults.LOG_LEVEL,
    ):
        """
        Execute the async pipeline and yield events as an AsyncGenerator.

        This method allows streaming events from AsyncGenerator functions
        directly to the caller, enabling patterns like SSE streaming.

        Usage:
            async for event in pipeline.execute_streaming():
                print(event)

        Yields:
            dict: Events yielded by AsyncGenerator functions in the pipeline.
        """
        from runnable import context

        logger.setLevel(log_level)

        service_configurations = context.ServiceConfigurations(
            configuration_file=configuration_file,
            execution_context=context.ExecutionContext.PIPELINE,
        )

        configurations = {
            "dag": self.return_dag(),
            "parameters_file": parameters_file,
            "tag": tag,
            "run_id": run_id,
            "configuration_file": configuration_file,
            **service_configurations.services,
        }

        run_context = context.AsyncPipelineContext.model_validate(configurations)
        context.set_run_context(run_context)

        # Use an asyncio.Queue to bridge the callback to an AsyncGenerator
        queue: asyncio.Queue = asyncio.Queue()

        # Set the callback on the executor
        run_context.pipeline_executor._event_callback = queue.put_nowait

        async def run_pipeline():
            try:
                await run_context.execute_async()
            finally:
                await queue.put(None)  # Sentinel to signal completion

        # Start pipeline execution in the background
        task = asyncio.create_task(run_pipeline())

        # Yield events as they arrive
        while True:
            event = await queue.get()
            if event is None:
                break
            yield event

        # Ensure the task completed (will raise if there was an exception)
        await task
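
A sketch of consuming `execute_streaming` (event dicts come from AsyncGenerator tasks; internally a `None` sentinel terminates the queue, as shown in the method body above):

```python
import asyncio

from runnable.sdk import AsyncPipeline, AsyncPythonTask

async def emit():
    yield {"type": "progress", "pct": 50}
    yield {"type": "done"}

pipeline = AsyncPipeline(
    steps=[AsyncPythonTask(name="emit", function=emit, terminate_with_success=True)]
)

async def main():
    async for event in pipeline.execute_streaming():
        print(event)  # e.g. {"type": "progress", "pct": 50}

asyncio.run(main())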
class BaseJob(BaseModel):
    catalog: Optional[Catalog] = Field(default=None, alias="catalog", exclude=True)
    returns: List[Union[str, TaskReturns]] = Field(
        default_factory=list, alias="returns"
    )
    secrets: List[str] = Field(default_factory=list)

    @field_validator("returns", mode="before")
    @classmethod
    def serialize_returns(
        cls, returns: List[Union[str, TaskReturns]]
    ) -> List[TaskReturns]:
        task_returns = []

        for x in returns:
            if isinstance(x, str):
                task_returns.append(TaskReturns(name=x, kind="json"))
                continue
            # It is already a TaskReturns
            task_returns.append(x)

        return task_returns

    @field_validator("catalog", mode="after")
    @classmethod
    def validate_catalog(cls, catalog: Optional[Catalog]) -> Optional[Catalog]:
        if catalog is None:
            return None

        if catalog.get:
            raise Exception("Catalog get is not supported for jobs")

        return catalog

    def get_task(self) -> RunnableTask:
        return create_task(self.model_dump(by_alias=True, exclude_none=True))

    def get_caller(self) -> str:
        caller_stack = inspect.stack()[2]
        relative_to_root = str(Path(caller_stack.filename).relative_to(Path.cwd()))

        module_name = re.sub(r"\b.py\b", "", relative_to_root.replace("/", "."))
        module_to_call = f"{module_name}.{caller_stack.function}"

        return module_to_call

    def return_catalog_settings(self) -> Optional[List[str]]:
        if self.catalog is None:
            return []
        return self.catalog.put

    def return_bool_catalog_store_copy(self) -> bool:
        if self.catalog is None:
            return True
        return self.catalog.store_copy

    def _is_called_for_definition(self) -> bool:
        """
        If the run context is set, we are coming in only to get the job definition.
        """
        from runnable.context import get_run_context

        if get_run_context() is None:
            return False
        return True

    def execute(
        self,
        configuration_file: str = "",
        job_id: str = "",
        tag: str = "",
        parameters_file: str = "",
        log_level: str = defaults.LOG_LEVEL,
    ):
        if self._is_called_for_definition():
            # Immediately return as this call is only for getting the job definition
            return {}
        from runnable import context

        logger.setLevel(log_level)

        service_configurations = context.ServiceConfigurations(
            configuration_file=configuration_file,
            execution_context=context.ExecutionContext.JOB,
        )

        configurations = {
            "job_definition_file": self.get_caller(),
            "parameters_file": parameters_file,
            "tag": tag,
            "run_id": job_id,
            "execution_mode": context.ExecutionMode.PYTHON,
            "configuration_file": configuration_file,
            "job": self.get_task(),
            "catalog_settings": self.return_catalog_settings(),
            **service_configurations.services,
        }

        run_context = context.JobContext.model_validate(configurations)
        run_context.catalog_store_copy = self.return_bool_catalog_store_copy()

        assert isinstance(run_context.job_executor, BaseJobExecutor)

        run_context.execute()


class PythonJob(BaseJob):
    function: Callable = Field(exclude=True)
    command_type: str = Field(default="python")

    @computed_field
    def command(self) -> str:
        module = self.function.__module__
        name = self.function.__name__

        return f"{module}.{name}"


class NotebookJob(BaseJob):
    notebook: str = Field(serialization_alias="command")
    optional_ploomber_args: Optional[Dict[str, Any]] = Field(
        default=None, alias="optional_ploomber_args"
    )
    command_type: str = Field(default="notebook")


class ShellJob(BaseJob):
    command: str = Field(alias="command")
    command_type: str = Field(default="shell")