argo-kedro 0.1.5__tar.gz → 0.1.7__tar.gz
- {argo_kedro-0.1.5 → argo_kedro-0.1.7}/PKG-INFO +2 -1
- argo_kedro-0.1.7/argo_kedro/framework/cli/cli.py +400 -0
- argo_kedro-0.1.7/argo_kedro/framework/hooks/argo_hook.py +66 -0
- argo_kedro-0.1.7/argo_kedro/pipeline/__init__.py +4 -0
- {argo_kedro-0.1.5 → argo_kedro-0.1.7}/argo_kedro/pipeline/fused_pipeline.py +9 -4
- argo_kedro-0.1.7/argo_kedro/pipeline/node.py +26 -0
- argo_kedro-0.1.7/argo_kedro/templates/argo.yml +9 -0
- {argo_kedro-0.1.5 → argo_kedro-0.1.7}/argo_kedro/templates/argo_wf_spec.tmpl +25 -1
- {argo_kedro-0.1.5 → argo_kedro-0.1.7}/argo_kedro.egg-info/PKG-INFO +2 -1
- {argo_kedro-0.1.5 → argo_kedro-0.1.7}/argo_kedro.egg-info/SOURCES.txt +9 -1
- {argo_kedro-0.1.5 → argo_kedro-0.1.7}/argo_kedro.egg-info/entry_points.txt +3 -0
- {argo_kedro-0.1.5 → argo_kedro-0.1.7}/argo_kedro.egg-info/requires.txt +1 -0
- {argo_kedro-0.1.5 → argo_kedro-0.1.7}/pyproject.toml +6 -2
- argo_kedro-0.1.7/tests/__init__.py +0 -0
- argo_kedro-0.1.7/tests/cli/__init__.py +0 -0
- argo_kedro-0.1.7/tests/cli/test_cli.py +206 -0
- argo_kedro-0.1.7/tests/pipeline/__init__.py +0 -0
- argo_kedro-0.1.7/tests/pipeline/test_fused_pipeline.py +45 -0
- {argo_kedro-0.1.5 → argo_kedro-0.1.7}/uv.lock +23 -0
- argo_kedro-0.1.5/argo_kedro/framework/cli/cli.py +0 -242
- argo_kedro-0.1.5/argo_kedro/pipeline/__init__.py +0 -3
- {argo_kedro-0.1.5 → argo_kedro-0.1.7}/.gitignore +0 -0
- {argo_kedro-0.1.5 → argo_kedro-0.1.7}/.python-version +0 -0
- {argo_kedro-0.1.5 → argo_kedro-0.1.7}/LICENSE +0 -0
- {argo_kedro-0.1.5 → argo_kedro-0.1.7}/MANIFEST.in +0 -0
- {argo_kedro-0.1.5 → argo_kedro-0.1.7}/README.md +0 -0
- {argo_kedro-0.1.5 → argo_kedro-0.1.7}/argo_kedro/framework/__init__.py +0 -0
- {argo_kedro-0.1.5 → argo_kedro-0.1.7}/argo_kedro/framework/cli/__init__.py +0 -0
- {argo_kedro-0.1.5 → argo_kedro-0.1.7}/argo_kedro/runners/__init__.py +0 -0
- {argo_kedro-0.1.5 → argo_kedro-0.1.7}/argo_kedro/runners/fuse_runner.py +0 -0
- {argo_kedro-0.1.5 → argo_kedro-0.1.7}/argo_kedro.egg-info/dependency_links.txt +0 -0
- {argo_kedro-0.1.5 → argo_kedro-0.1.7}/argo_kedro.egg-info/top_level.txt +0 -0
- {argo_kedro-0.1.5 → argo_kedro-0.1.7}/locally_building_guide.md +0 -0
- {argo_kedro-0.1.5 → argo_kedro-0.1.7}/setup.cfg +0 -0
{argo_kedro-0.1.5 → argo_kedro-0.1.7}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: argo-kedro
-Version: 0.1.5
+Version: 0.1.7
 Summary: Kedro plugin for running pipelines on Argo Workflows
 Author-email: Laurens Vijnck <laurens@everycure.org>, Nelson Alfonso <nelson@everycure.org>
 License: MIT
@@ -26,6 +26,7 @@ Requires-Dist: kedro
 Requires-Dist: pyyaml>=6.0.2
 Requires-Dist: jinja2>=3.0.0
 Requires-Dist: kubernetes>=35.0.0
+Requires-Dist: pydantic>=2.0.0
 Dynamic: license-file
 
 # argo-kedro
argo_kedro-0.1.7/argo_kedro/framework/cli/cli.py (added)

@@ -0,0 +1,400 @@
+import re
+from pathlib import Path
+from typing import Any, Dict, List, Iterable, Union
+from logging import getLogger
+
+import click
+import yaml
+from kubernetes import config
+from kubernetes.dynamic import DynamicClient
+from jinja2 import Environment, FileSystemLoader
+from kedro.framework.cli.utils import CONTEXT_SETTINGS, KedroCliError
+from kedro.framework.project import pipelines, settings
+from kedro.framework.session import KedroSession
+from kedro.framework.startup import bootstrap_project
+from kedro.utils import find_kedro_project, is_kedro_project
+from kedro.framework.cli.project import (
+    ASYNC_ARG_HELP,
+    CONF_SOURCE_HELP,
+    FROM_INPUTS_HELP,
+    FROM_NODES_HELP,
+    LOAD_VERSION_HELP,
+    NODE_ARG_HELP,
+    PARAMS_ARG_HELP,
+    PIPELINE_ARG_HELP,
+    RUNNER_ARG_HELP,
+    TAG_ARG_HELP,
+    TO_NODES_HELP,
+    TO_OUTPUTS_HELP,
+    project_group,
+)
+from kedro.framework.project import pipelines as kedro_pipelines
+from kedro.pipeline import Pipeline
+from kedro.pipeline.node import Node
+from kedro.runner.sequential_runner import SequentialRunner
+from argo_kedro.runners.fuse_runner import FusedRunner
+from argo_kedro.framework.hooks.argo_hook import MachineType
+from argo_kedro.pipeline.node import ArgoNode
+
+LOGGER = getLogger(__name__)
+ARGO_TEMPLATES_DIR_PATH = Path(__file__).parent.parent.parent / "templates"
+
+
+def render_jinja_template(
+    src: Union[str, Path], **kwargs
+) -> str:
+    """This function copies a file and renders the
+    tags (identified by {{ my_tag }}) with the values provided in kwargs.
+
+    Arguments:
+        src {Union[str, Path]} -- The path to the template which should be rendered
+
+    Returns:
+        str -- A string that contains the file content with replaced tags.
+    """
+    src = Path(src)
+    template_loader = FileSystemLoader(searchpath=src.parent.as_posix())
+    template_env = Environment(loader=template_loader, keep_trailing_newline=True)
+    template = template_env.get_template(src.name)
+    return template.render(**kwargs)
+
+
+def write_jinja_template(
+    src: Union[str, Path], dst: Union[str, Path], **kwargs
+) -> None:
+    """Write a template file and replace its jinja tags
+    (identified by {{ my_tag }}) with the values provided in kwargs.
+
+    Arguments:
+        src {Union[str, Path]} -- Path to the template which should be rendered
+        dst {Union[str, Path]} -- Path where the rendered template should be saved
+    """
+    dst = Path(dst)
+    parsed_template = render_jinja_template(src, **kwargs)
+    with open(dst, "w") as file_handler:
+        file_handler.write(parsed_template)
+
+
+@click.group(context_settings=CONTEXT_SETTINGS)
+def cli():
+    pass
+
+@cli.command(name="run")
+@click.option("--pipeline", "-p", type=str, default="__default__", help="Name of the pipeline to execute")
+@click.option("--env", "-e", type=str, default=None, help="Kedro environment to run the pipeline in")
+@click.option("--config", "-c", type=str, multiple=True, help="Extra config to pass to KedroContext")
+@click.option("--params", type=str, multiple=True, help="Override parameters")
+@click.option("--tags", "-t", type=str, multiple=True, help=TAG_ARG_HELP)
+@click.option("--nodes", "-n", type=str, multiple=True, help="Run only nodes with specified names")
+@click.option("--to-nodes", type=str, multiple=True, help="Run a sub-pipeline up to certain nodes")
+@click.option("--from-nodes", type=str, multiple=True, help="Run a sub-pipeline starting from certain nodes")
+@click.option("--from-inputs", type=str, multiple=True, help="Run a sub-pipeline starting from nodes that produce these inputs")
+@click.option("--to-outputs", type=str, multiple=True, help="Run a sub-pipeline up to nodes that produce these outputs")
+@click.option("--load-version", type=str, multiple=True, help="Specify a particular dataset version")
+@click.option("--namespaces", type=str, multiple=True, help="Namespaces of the pipeline")
+@click.pass_obj
+def _run_command_impl(
+    ctx,
+    pipeline: str,
+    env: str,
+    config: tuple,
+    params: tuple,
+    tags: list[str],
+    nodes: tuple,
+    to_nodes: tuple,
+    from_nodes: tuple,
+    from_inputs: tuple,
+    to_outputs: tuple,
+    load_version: tuple,
+    namespaces: Iterable[str],
+):
+    """Run the pipeline with the FusedRunner."""
+
+    LOGGER.warning("Using plugin entrypoint")
+
+    load_versions = None
+    if load_version:
+        load_versions = {}
+        for version_spec in load_version:
+            if ":" in version_spec:
+                dataset, version = version_spec.split(":", 1)
+                load_versions[dataset] = version
+
+    conf_source = getattr(ctx, "conf_source", None)
+    env_value = env or getattr(ctx, "env", None)
+
+    with KedroSession.create(
+        env=env_value,
+        conf_source=conf_source,
+    ) as session:
+
+        session.run(
+            pipeline_name=pipeline,
+            tags=tags,
+            runner=FusedRunner(pipeline_name=pipeline),
+            node_names=list(nodes) if nodes else None,
+            from_nodes=list(from_nodes) if from_nodes else None,
+            to_nodes=list(to_nodes) if to_nodes else None,
+            from_inputs=list(from_inputs) if from_inputs else None,
+            to_outputs=list(to_outputs) if to_outputs else None,
+            load_versions=load_versions,
+            namespaces=namespaces,
+        )
+
+class KedroClickGroup(click.Group):
+    def reset_commands(self):
+        self.commands = {}
+
+        # add commands on the fly based on conditions
+        if is_kedro_project(find_kedro_project(Path.cwd())):
+            self.add_command(init)
+            self.add_command(submit)
+
+    def list_commands(self, ctx):
+        self.reset_commands()
+        commands_list = sorted(self.commands)
+        return commands_list
+
+    def get_command(self, ctx, cmd_name):
+        self.reset_commands()
+        return self.commands.get(cmd_name)
+
+@click.group(name="argo")
+def commands():
+    pass
+
+@commands.command(name="argo", cls=KedroClickGroup)
+def argo_commands():
+    """Use argo-specific commands inside a kedro project."""
+    pass  # pragma: no cover
+
+@argo_commands.command()
+@click.option(
+    "--env",
+    "-e",
+    default="base",
+    help="The name of the kedro environment where the 'argo.yml' should be created. Defaults to 'base'",
+)
+@click.option(
+    "--force",
+    "-f",
+    is_flag=True,
+    default=False,
+    help="Update the template without any checks.",
+)
+@click.option(
+    "--silent",
+    "-s",
+    is_flag=True,
+    default=False,
+    help="Should a message be logged when files are modified?",
+)
+def init(env: str, force: bool, silent: bool):
+    """Updates the template of a kedro project.
+    Running this command is mandatory to use argo-kedro.
+    This adds "conf/base/argo.yml": a configuration file
+    used for run parametrization when calling the "kedro run" command.
+    """
+
+    # get constants
+    argo_yml = "argo.yml"
+    project_path = find_kedro_project(Path.cwd()) or Path.cwd()
+    project_metadata = bootstrap_project(project_path)
+    argo_yml_path = project_path / settings.CONF_SOURCE / env / argo_yml
+
+    # argo.yml is just a static file,
+    # but the name of the experiment is set to be the same as the project
+    if argo_yml_path.is_file() and not force:
+        click.secho(
+            click.style(
+                f"An 'argo.yml' already exists at '{argo_yml_path}'. You can use the ``--force`` option to override it.",
+                fg="red",
+            )
+        )
+    else:
+        try:
+            write_jinja_template(
+                src=ARGO_TEMPLATES_DIR_PATH / argo_yml,
+                is_cookiecutter=False,
+                dst=argo_yml_path,
+                python_package=project_metadata.package_name,
+            )
+            if not silent:
+                click.secho(
+                    click.style(
+                        f"'{settings.CONF_SOURCE}/{env}/{argo_yml}' successfully updated.",
+                        fg="green",
+                    )
+                )
+        except FileNotFoundError:
+            click.secho(
+                click.style(
+                    f"No env '{env}' found. Please check this folder exists inside '{settings.CONF_SOURCE}' folder.",
+                    fg="red",
+                )
+            )
+
+@argo_commands.command(name="submit")
+@click.option("--pipeline", "-p", type=str, default="__default__", help="Specify which pipeline to execute")
+@click.option("--environment", "-e", type=str, default="base", help="Kedro environment to execute in")
+@click.option("--image", type=str, required=True, help="Image to execute")
+@click.pass_obj
+def submit(
+    ctx,
+    pipeline: str,
+    image: str,
+    environment: str
+):
+    """Submit the pipeline to Argo."""
+    LOGGER.info("Loading spec template...")
+
+    loader = FileSystemLoader(searchpath=ARGO_TEMPLATES_DIR_PATH)
+    template_env = Environment(loader=loader, trim_blocks=True, lstrip_blocks=True)
+    template = template_env.get_template("argo_wf_spec.tmpl")
+
+    LOGGER.info("Rendering Argo spec...")
+
+    project_path = find_kedro_project(Path.cwd()) or Path.cwd()
+    bootstrap_project(project_path)
+    with KedroSession.create(
+        project_path=project_path,
+        env=environment,
+    ) as session:
+        context = session.load_context()
+        pipeline_tasks = get_argo_dag(
+            kedro_pipelines[pipeline],
+            machine_types=context.argo.machine_types,
+            default_machine_type=context.argo.default_machine_type
+        )
+
+        # Render the template
+        rendered_template = template.render(
+            pipeline_tasks=[task.to_dict() for task in pipeline_tasks.values()],
+            pipeline_name=pipeline,
+            image=image,
+            namespace=context.argo.namespace,
+            environment=environment
+        )
+
+        # Load as yaml
+        yaml_data = yaml.safe_load(rendered_template)
+        yaml_without_anchors = yaml.dump(yaml_data, sort_keys=False, default_flow_style=False)
+        save_argo_template(
+            yaml_without_anchors,
+        )
+
+        # Use kubeconfig to submit to kubernetes
+        config.load_kube_config()
+        client = DynamicClient(config.new_client_from_config())
+
+        resource = client.resources.get(
+            api_version=yaml_data["apiVersion"],
+            kind=yaml_data["kind"],
+        )
+
+        response = resource.create(
+            body=yaml_data,
+            namespace=context.argo.namespace
+        )
+
+        workflow_name = response.metadata.name
+        LOGGER.info(f"Workflow submitted successfully: {workflow_name}")
+        LOGGER.info(f"View workflow at: https://argo.ai-platform.dev.everycure.org/workflows/{context.argo.namespace}/{workflow_name}")
+
+        return workflow_name
+
+
+def save_argo_template(argo_template: str) -> str:
+    file_path = Path("templates") / "argo-workflow-template.yml"
+    file_path.parent.mkdir(parents=True, exist_ok=True)
+    with open(file_path, "w") as f:
+        f.write(argo_template)
+    return str(file_path)
+
+
+class ArgoTask:
+    """Class to model an Argo task.
+
+    Argo's operating model differs slightly from Kedro's, i.e., while Kedro uses dataset
+    dependencies to model relationships, Argo uses task dependencies."""
+
+    def __init__(self, node: Node, machine_type: MachineType):
+        self._node = node
+        self._parents = []
+        self._machine_type = machine_type
+
+    @property
+    def node(self):
+        return self._node
+
+    def add_parents(self, nodes: List[Node]):
+        self._parents.extend(nodes)
+
+    def to_dict(self):
+        return {
+            "name": clean_name(self._node.name),
+            "nodes": self._node.name,
+            "deps": [clean_name(parent.name) for parent in sorted(self._parents)],
+            "mem": self._machine_type.mem,
+            "cpu": self._machine_type.cpu,
+            "num_gpu": self._machine_type.num_gpu,
+        }
+
+
+def get_argo_dag(
+    pipeline: Pipeline,
+    machine_types: dict[str, MachineType],
+    default_machine_type: str,
+) -> List[Dict[str, Any]]:
+    """Function to convert the Kedro pipeline into Argo tasks. The function
+    iterates the nodes of the pipeline and generates Argo tasks with dependencies.
+    These dependencies are inferred based on the input and output datasets for
+    each node.
+
+    NOTE: This function is agnostic to the fact that nodes might be fused. The nodes
+    returned as part of the pipeline may optionally contain FusedNodes, which have the correct
+    inputs and outputs from the perspective of the Argo task.
+    """
+    tasks = {}
+
+    # The `grouped_nodes` property returns the nodes list, in topological order,
+    # allowing us to easily translate the Kedro DAG to an Argo WF.
+    for group in pipeline.grouped_nodes:
+        for target_node in group:
+            try:
+                task = ArgoTask(target_node, machine_types[target_node.machine_type] if isinstance(target_node, ArgoNode) and target_node.machine_type is not None else machine_types[default_machine_type])
+            except KeyError as e:
+                LOGGER.error(f"Machine type not found for node `{target_node.name}`")
+                raise KeyError(f"Machine type `{target_node.machine_type}` not found for node `{target_node.name}`")
+
+            task.add_parents(
+                [
+                    parent.node
+                    for parent in tasks.values()
+                    if set(clean_dependencies(target_node.inputs)) & set(clean_dependencies(parent.node.outputs))
+                ]
+            )
+
+            tasks[target_node.name] = task
+
+    return tasks
+
+
+def clean_name(name: str) -> str:
+    """Function to clean the node name.
+
+    Args:
+        name: name of the node
+    Returns:
+        Clean node name, according to Argo's requirements
+    """
+    return re.sub(r"[\W_]+", "-", name).strip("-")
+
+
+def clean_dependencies(elements) -> List[str]:
+    """Function to clean node dependencies.
+
+    Operates by removing `params:` entries from the list and dismissing
+    the transcoding operator.
+    """
+    return [el.split("@")[0] for el in elements if not el.startswith("params:")]
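The dependency inference in `get_argo_dag` rests on the two helpers at the bottom of cli.py. A standalone sketch of their behaviour (the example dataset and node names are illustrative, not taken from the package):

import re

def clean_name(name: str) -> str:
    # Collapse every run of non-alphanumeric characters (and underscores) to "-",
    # then trim, yielding an Argo-safe task name.
    return re.sub(r"[\W_]+", "-", name).strip("-")

def clean_dependencies(elements):
    # Drop `params:` inputs and the "@" transcoding suffix before matching datasets.
    return [el.split("@")[0] for el in elements if not el.startswith("params:")]

assert clean_name("preprocess_fun") == "preprocess-fun"
assert clean_dependencies(["data@spark", "params:alpha"]) == ["data"]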
argo_kedro-0.1.7/argo_kedro/framework/hooks/argo_hook.py (added)

@@ -0,0 +1,66 @@
+
+import os
+import re
+from logging import Logger, getLogger
+from pathlib import Path
+from tempfile import TemporaryDirectory
+from typing import Any, Union
+
+from kedro.config import MissingConfigException
+from kedro.framework.context import KedroContext
+from kedro.framework.hooks import hook_impl
+from kedro.framework.startup import _get_project_metadata
+from kedro.io import CatalogProtocol, DataCatalog
+from kedro.pipeline import Pipeline
+from kedro.pipeline.node import Node
+from omegaconf import OmegaConf
+
+
+from pydantic import BaseModel
+
+class MachineType(BaseModel):
+    mem: int
+    cpu: int
+    num_gpu: int
+
+class ArgoConfig(BaseModel):
+    namespace: str
+    machine_types: dict[str, MachineType]
+    default_machine_type: str
+
+
+class ArgoHook:
+    @property
+    def _logger(self) -> Logger:
+        return getLogger(__name__)
+
+    @hook_impl
+    def after_context_created(
+        self,
+        context: KedroContext,
+    ) -> None:
+        """Hook to be invoked after a `KedroContext` is created. This is the earliest
+        hook triggered within a Kedro run. The `KedroContext` stores useful information
+        such as `credentials`, `config_loader` and `env`.
+        Args:
+            context: The context that was created.
+        """
+        try:
+            if "argo" not in context.config_loader.config_patterns.keys():
+                context.config_loader.config_patterns.update(
+                    {"argo": ["argo*", "argo*/**", "**/argo*"]}
+                )
+            conf_argo_yml = context.config_loader["argo"]
+        except MissingConfigException:
+            self._logger.warning(
+                "No 'argo.yml' config file found in environment. Default configuration will be used. Use the ``kedro argo init`` command in the CLI to customize the configuration."
+            )
+            # we create an empty dict to have the same behaviour when the argo.yml
+            # is commented out. In this situation there is no MissingConfigException,
+            # but we get an empty dict
+            conf_argo_yml = {}
+
+        conf_argo_yml = ArgoConfig.model_validate(conf_argo_yml)
+        context.__setattr__("argo", conf_argo_yml)
+
+argo_hook = ArgoHook()
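The `ArgoConfig` model above implies a `conf/<env>/argo.yml` carrying a namespace, a `machine_types` map, and a default key. A minimal sketch of a payload that validates against it; all concrete values here are assumptions, not taken from the shipped template:

from argo_kedro.framework.hooks.argo_hook import ArgoConfig

# Illustrative values only; nested dicts are coerced into MachineType models.
conf = ArgoConfig.model_validate({
    "namespace": "argo-workflows",                        # assumed namespace
    "machine_types": {
        "default": {"mem": 16, "cpu": 2, "num_gpu": 0},   # assumed sizing
        "gpu-small": {"mem": 32, "cpu": 8, "num_gpu": 1}, # assumed sizing
    },
    "default_machine_type": "default",
})
assert conf.machine_types["gpu-small"].num_gpu == 1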
{argo_kedro-0.1.5 → argo_kedro-0.1.7}/argo_kedro/pipeline/fused_pipeline.py

@@ -1,14 +1,16 @@
 from typing import Iterable, List
-from kedro.pipeline import Pipeline
+from kedro.pipeline import Pipeline
 from functools import cached_property
+from argo_kedro.pipeline.node import ArgoNode
+from kedro.pipeline.node import Node
 
-class FusedNode(Node):
+class FusedNode(ArgoNode):
     """FusedNode is an extension of Kedro's internal node. The FusedNode
     wraps a set of nodes, and correctly sets its `inputs` and `outputs`,
     allowing it to act as a single unit for execution.
     """
 
-    def __init__(self, nodes: List[Node], name: str):
+    def __init__(self, nodes: List[Node], name: str, machine_type: str | None = None):
         self._nodes = nodes
         self._name = name
         self._namespace = None
@@ -17,6 +19,7 @@ class FusedNode(Node):
         self._confirms = []
         self._func = lambda: None
         self._tags = []
+        self._machine_type = machine_type
 
         for node in nodes:
             self._inputs.extend(node.inputs)
@@ -49,10 +52,12 @@ class FusedPipeline(Pipeline):
         name: str,
         *,
         tags: str | Iterable[str] | None = None,
+        machine_type: str | None = None,
     ):
         self._name = name
+        self._machine_type = machine_type
         super().__init__(nodes, tags=tags)
 
     @property
     def nodes(self) -> list[Node]:
-        return [FusedNode(self._nodes, name=self._name)]
+        return [FusedNode(self._nodes, name=self._name, machine_type=self._machine_type)]
argo_kedro-0.1.7/argo_kedro/pipeline/node.py (added)

@@ -0,0 +1,26 @@
+from kedro.pipeline import Node
+from typing import Callable, Iterable
+
+class ArgoNode(Node):
+    """ArgoNode is an extension of the Kedro node class, aimed at allowing
+    the node to be allocated to a specific machine type.
+    """
+    def __init__(
+        self,
+        func: Callable,
+        inputs: str | list[str] | dict[str, str] | None,
+        outputs: str | list[str] | dict[str, str] | None,
+        *,
+        name: str | None = None,
+        machine_type: str | None = None,
+        tags: str | Iterable[str] | None = None,
+        confirms: str | list[str] | None = None,
+        namespace: str | None = None,
+    ):
+
+        super().__init__(func, inputs, outputs, name=name, tags=tags, confirms=confirms, namespace=namespace)
+        self._machine_type = machine_type
+
+    @property
+    def machine_type(self) -> str:
+        return self._machine_type
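Together with the `machine_type` argument on `FusedPipeline`, this is the user-facing API the release adds. A short usage sketch, mirroring the fixtures in the tests further down:

from argo_kedro.pipeline.node import ArgoNode

n = ArgoNode(
    func=lambda x: x,
    inputs="raw_data",
    outputs="data",
    name="preprocess_fun",
    machine_type="n1-standard-4",  # key looked up in argo.yml's machine_types map
)
assert n.machine_type == "n1-standard-4"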
{argo_kedro-0.1.5 → argo_kedro-0.1.7}/argo_kedro/templates/argo_wf_spec.tmpl

@@ -18,6 +18,25 @@ spec:
       parameters:
       - name: pipeline
       - name: kedro_nodes
+    podSpecPatch: |
+      containers:
+        - name: main
+          # Add tolerations for large memory nodes and GPU nodes
+          resources:
+            requests:
+              memory: {% raw %} "{{inputs.parameters.mem}}Gi"
+              {% endraw %}
+              cpu: {% raw %} "{{inputs.parameters.cpu}}"
+              {% endraw %}
+              nvidia.com/gpu: {% raw %} "{{inputs.parameters.num_gpu}}"
+              {% endraw %}
+            limits:
+              memory: {% raw %} "{{inputs.parameters.mem}}Gi"
+              {% endraw %}
+              cpu: {% raw %} "{{inputs.parameters.cpu}}"
+              {% endraw %}
+              nvidia.com/gpu: {% raw %} "{{inputs.parameters.num_gpu}}"
+              {% endraw %}
     container:
       image: {{ image }}
       command: ["kedro"]
@@ -49,5 +68,10 @@ spec:
           value: {{ pipeline_name }}
         - name: kedro_nodes
           value: {{ task.nodes }}
-
+        - name: num_gpu
+          value: {{ task.num_gpu }}
+        - name: mem
+          value: {{ task.mem }}
+        - name: cpu
+          value: {{ task.cpu }}
 {% endfor %}
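The `mem`, `cpu` and `num_gpu` parameters consumed by the `podSpecPatch` above are produced per task by `ArgoTask.to_dict()` in cli.py. For reference, the shape of one rendered task (values taken from the test expectations further down):

task = {
    "name": "train-fun",         # Argo-safe task name from clean_name()
    "nodes": "train_fun",        # Kedro node selector passed to `kedro run`
    "deps": ["preprocess-fun"],  # upstream Argo tasks
    "mem": 16,                   # Gi of memory requested and limited
    "cpu": 8,
    "num_gpu": 0,
}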
{argo_kedro-0.1.5 → argo_kedro-0.1.7}/argo_kedro.egg-info/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: argo-kedro
-Version: 0.1.5
+Version: 0.1.7
 Summary: Kedro plugin for running pipelines on Argo Workflows
 Author-email: Laurens Vijnck <laurens@everycure.org>, Nelson Alfonso <nelson@everycure.org>
 License: MIT
@@ -26,6 +26,7 @@ Requires-Dist: kedro
 Requires-Dist: pyyaml>=6.0.2
 Requires-Dist: jinja2>=3.0.0
 Requires-Dist: kubernetes>=35.0.0
+Requires-Dist: pydantic>=2.0.0
 Dynamic: license-file
 
 # argo-kedro
{argo_kedro-0.1.5 → argo_kedro-0.1.7}/argo_kedro.egg-info/SOURCES.txt

@@ -15,8 +15,16 @@ argo_kedro.egg-info/top_level.txt
 argo_kedro/framework/__init__.py
 argo_kedro/framework/cli/__init__.py
 argo_kedro/framework/cli/cli.py
+argo_kedro/framework/hooks/argo_hook.py
 argo_kedro/pipeline/__init__.py
 argo_kedro/pipeline/fused_pipeline.py
+argo_kedro/pipeline/node.py
 argo_kedro/runners/__init__.py
 argo_kedro/runners/fuse_runner.py
-argo_kedro/templates/
+argo_kedro/templates/argo.yml
+argo_kedro/templates/argo_wf_spec.tmpl
+tests/__init__.py
+tests/cli/__init__.py
+tests/cli/test_cli.py
+tests/pipeline/__init__.py
+tests/pipeline/test_fused_pipeline.py
{argo_kedro-0.1.5 → argo_kedro-0.1.7}/pyproject.toml

@@ -5,7 +5,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "argo-kedro"
-version = "0.1.5"
+version = "0.1.7"
 description = "Kedro plugin for running pipelines on Argo Workflows"
 readme = "README.md"
 requires-python = ">=3.10"
@@ -33,6 +33,7 @@ dependencies = [
     "pyyaml>=6.0.2",
     "jinja2>=3.0.0",
     "kubernetes>=35.0.0",
+    "pydantic>=2.0.0",
 ]
 
 [project.urls]
@@ -46,6 +47,9 @@ run = "argo_kedro.framework.cli.cli:cli"
 [project.entry-points."kedro.project_commands"]
 argo = "argo_kedro.framework.cli.cli:commands"
 
+[project.entry-points."kedro.hooks"]
+argo_hook = "argo_kedro.framework.hooks.argo_hook:argo_hook"
+
 [tool.setuptools]
 include-package-data = true
 
@@ -53,4 +57,4 @@ include = ["argo_kedro*"]
 include = ["argo_kedro*"]
 
 [tool.setuptools.package-data]
-argo_kedro = ["**/*.tmpl"]
+argo_kedro = ["**/*.tmpl", "**/*.yml"]
argo_kedro-0.1.7/tests/__init__.py: file without changes (empty file)
argo_kedro-0.1.7/tests/cli/__init__.py: file without changes (empty file)
argo_kedro-0.1.7/tests/cli/test_cli.py (added)

@@ -0,0 +1,206 @@
+import pytest
+
+from kedro.pipeline import Pipeline, node
+from argo_kedro.pipeline import FusedPipeline, ArgoNode
+from argo_kedro.framework.cli.cli import get_argo_dag, MachineType
+
+@pytest.fixture
+def machine_types() -> dict[str, MachineType]:
+    return {
+        "default": MachineType(mem=16, cpu=2, num_gpu=0),
+        "n1-standard-4": MachineType(mem=16, cpu=4, num_gpu=0),
+        "n1-standard-8": MachineType(mem=16, cpu=8, num_gpu=0),
+    }
+
+@pytest.fixture
+def default_machine_type() -> str:
+    return "default"
+
+@pytest.fixture
+def pipeline() -> Pipeline:
+    return Pipeline(
+        [
+            ArgoNode(
+                func=lambda x: x,
+                inputs="raw_data",
+                outputs="data",
+                tags=["preprocessing"],
+                name="preprocess_fun",
+                machine_type="n1-standard-4",
+            ),
+            ArgoNode(
+                func=lambda x: x,
+                inputs="data",
+                outputs="model",
+                tags=["training"],
+                name="train_fun",
+                machine_type="n1-standard-8",
+            ),
+        ]
+    )
+
+@pytest.fixture
+def fused_pipeline() -> Pipeline:
+    return Pipeline(
+        [
+            node(
+                func=lambda x: x,
+                inputs="raw_data",
+                outputs="data",
+                tags=["preprocessing"],
+                name="preprocess_fun",
+            ),
+            FusedPipeline(
+                [
+                    node(
+                        func=lambda x: x,
+                        inputs="data",
+                        outputs="model",
+                        tags=["training"],
+                        name="train_fun",
+                    ),
+                    node(
+                        func=lambda x: x,
+                        inputs="model",
+                        outputs="predictions",
+                        tags=["predictions"],
+                        name="create_predictions",
+                    ),
+                ],
+                name="fused_modelling",
+                machine_type="n1-standard-8",
+            ),
+        ]
+    )
+
+
+@pytest.fixture
+def fused_pipeline_complex() -> Pipeline:
+    return Pipeline(
+        [
+            node(
+                func=lambda x: x,
+                inputs="raw_data",
+                outputs="data",
+                tags=["preprocessing"],
+                name="preprocess_fun",
+            ),
+            node(
+                func=lambda x: x,
+                inputs="raw_customers",
+                outputs="customers",
+                tags=["preprocessing"],
+                name="preprocess_customers",
+            ),
+            FusedPipeline(
+                [
+                    node(
+                        func=lambda x: x,
+                        inputs="data",
+                        outputs="model",
+                        tags=["training"],
+                        name="train_fun",
+                    ),
+                    node(
+                        func=lambda x, y: x,
+                        inputs=["model", "customers"],
+                        outputs="predictions",
+                        tags=["predictions"],
+                        name="create_predictions",
+                    ),
+                ],
+                name="fused_modelling",
+                machine_type="n1-standard-8",
+            ),
+        ]
+    )
+
+
+def test_get_argo_dag(pipeline: Pipeline, machine_types: dict[str, MachineType], default_machine_type: str):
+
+    # When generating the argo DAG
+    argo_dag = get_argo_dag(pipeline, machine_types, default_machine_type)
+    expected = {
+        "preprocess_fun": {
+            "name": "preprocess-fun",
+            "nodes": "preprocess_fun",
+            "deps": [],
+            "mem": 16,
+            "cpu": 4,
+            "num_gpu": 0,
+        },
+        "train_fun": {
+            "name": "train-fun",
+            "nodes": "train_fun",
+            "deps": ["preprocess-fun"],
+            "mem": 16,
+            "cpu": 8,
+            "num_gpu": 0,
+        }
+    }
+
+    # Assert resulting argo dag is correct
+    assert {key: task.to_dict() for key, task in argo_dag.items()} == expected
+
+
+def test_get_argo_dag_fused(fused_pipeline: Pipeline, machine_types: dict[str, MachineType], default_machine_type: str):
+
+    # When generating the argo DAG
+    argo_dag = get_argo_dag(fused_pipeline, machine_types, default_machine_type)
+    expected = {
+        "preprocess_fun": {
+            "name": "preprocess-fun",
+            "nodes": "preprocess_fun",
+            "deps": [],
+            "mem": 16,
+            "cpu": 2,
+            "num_gpu": 0,
+        },
+        "fused_modelling": {
+            "name": "fused-modelling",
+            "nodes": "fused_modelling",
+            "deps": ["preprocess-fun"],
+            "mem": 16,
+            "cpu": 8,
+            "num_gpu": 0,
+        }
+    }
+
+    # Assert resulting argo dag is correct
+    assert {key: task.to_dict() for key, task in argo_dag.items()} == expected
+
+
+def test_get_argo_dag_fused_complex(fused_pipeline_complex: Pipeline, machine_types: dict[str, MachineType], default_machine_type: str):
+
+    # When generating the argo DAG
+    argo_dag = get_argo_dag(fused_pipeline_complex, machine_types, default_machine_type)
+    expected = {
+        "preprocess_fun": {
+            "name": "preprocess-fun",
+            "nodes": "preprocess_fun",
+            "deps": [],
+            "mem": 16,
+            "cpu": 2,
+            "num_gpu": 0,
+        },
+        "preprocess_customers": {
+            "name": "preprocess-customers",
+            "nodes": "preprocess_customers",
+            "deps": [],
+            "mem": 16,
+            "cpu": 2,
+            "num_gpu": 0,
+        },
+        "fused_modelling": {
+            "name": "fused-modelling",
+            "nodes": "fused_modelling",
+            "deps": ["preprocess-customers", "preprocess-fun"],
+            "mem": 16,
+            "cpu": 8,
+            "num_gpu": 0,
+        }
+    }
+
+    # Assert resulting argo dag is correct
+    assert {key: task.to_dict() for key, task in argo_dag.items()} == expected
+
argo_kedro-0.1.7/tests/pipeline/__init__.py: file without changes (empty file)
argo_kedro-0.1.7/tests/pipeline/test_fused_pipeline.py (added)

@@ -0,0 +1,45 @@
+import pytest
+
+from kedro.pipeline import Pipeline, node
+from argo_kedro.pipeline.fused_pipeline import FusedPipeline, FusedNode
+
+@pytest.fixture
+def pipeline() -> Pipeline:
+    return Pipeline(
+        [
+            node(
+                func=lambda x: x,
+                inputs="raw_data",
+                outputs="data",
+                tags=["preprocessing"],
+                name="preprocess_fun",
+            ),
+            node(
+                func=lambda x: x,
+                inputs="data",
+                outputs="model",
+                tags=["training"],
+                name="train_fun",
+            ),
+        ]
+    )
+
+
+def test_fused_node_inputs(pipeline: Pipeline):
+
+    # Wrap pipeline in FusedNode
+    fused_node = FusedNode(pipeline.nodes, name="fused_node")
+
+    # Assert that the fused node inputs are the pure inputs of the pipeline, i.e.,
+    # all inputs not produced as part of intermediate nodes.
+    assert set(fused_node.inputs) == set(["raw_data"])
+
+
+def test_fused_pipeline_nodes(pipeline: Pipeline):
+
+    # Wrap pipeline in FusedPipeline
+    fused_pipeline = FusedPipeline(pipeline.nodes, name="fused_pipeline")
+
+    # Assert that the pipeline collapses into a single FusedNode
+    assert len(fused_pipeline.nodes) == 1
+    assert isinstance(fused_pipeline.nodes[0], FusedNode)
{argo_kedro-0.1.5 → argo_kedro-0.1.7}/uv.lock (note: the released lockfile contains unresolved merge-conflict markers, shown verbatim below)

@@ -356,6 +356,26 @@ wheels = [
 ]
 
 [[package]]
+<<<<<<< HEAD:kedro-argo/uv.lock
+name = "kedro-argo"
+version = "0.1.0"
+source = { editable = "." }
+dependencies = [
+    { name = "jinja2" },
+    { name = "kedro" },
+    { name = "pyyaml" },
+]
+
+[package.metadata]
+requires-dist = [
+    { name = "jinja2", specifier = ">=3.0.0" },
+    { name = "kedro" },
+    { name = "pyyaml", specifier = ">=6.0.2" },
+]
+
+[[package]]
+=======
+>>>>>>> refs/remotes/origin/main:argo-kedro/uv.lock
 name = "kedro-telemetry"
 version = "0.6.5"
 source = { registry = "https://pypi.org/simple" }
@@ -820,6 +840,8 @@ wheels = [
 ]
 
 [[package]]
+<<<<<<< HEAD:kedro-argo/uv.lock
+=======
 name = "websocket-client"
 version = "1.9.0"
 source = { registry = "https://pypi.org/simple" }
@@ -829,6 +851,7 @@ wheels = [
 ]
 
 [[package]]
+>>>>>>> refs/remotes/origin/main:argo-kedro/uv.lock
 name = "zipp"
 version = "3.23.0"
 source = { registry = "https://pypi.org/simple" }
argo_kedro-0.1.5/argo_kedro/framework/cli/cli.py (removed)

@@ -1,242 +0,0 @@
-import re
-from pathlib import Path
-from typing import Any, Dict, List, Iterable
-from logging import getLogger
-
-import click
-import yaml
-from kubernetes import config
-from kubernetes.dynamic import DynamicClient
-from jinja2 import Environment, FileSystemLoader
-from kedro.framework.cli.utils import CONTEXT_SETTINGS, KedroCliError
-from kedro.framework.session import KedroSession
-from kedro.framework.cli.project import (
-    ASYNC_ARG_HELP,
-    CONF_SOURCE_HELP,
-    FROM_INPUTS_HELP,
-    FROM_NODES_HELP,
-    LOAD_VERSION_HELP,
-    NODE_ARG_HELP,
-    PARAMS_ARG_HELP,
-    PIPELINE_ARG_HELP,
-    RUNNER_ARG_HELP,
-    TAG_ARG_HELP,
-    TO_NODES_HELP,
-    TO_OUTPUTS_HELP,
-    project_group,
-)
-from kedro.framework.project import pipelines as kedro_pipelines
-from kedro.pipeline import Pipeline
-from kedro.pipeline.node import Node
-from kedro.runner.sequential_runner import SequentialRunner
-from argo_kedro.runners.fuse_runner import FusedRunner
-
-LOGGER = getLogger(__name__)
-ARGO_TEMPLATES_DIR_PATH = Path(__file__).parent.parent.parent / "templates"
-
-
-@click.group(context_settings=CONTEXT_SETTINGS)
-def cli():
-    pass
-
-@cli.command(name="run")
-@click.option("--pipeline", "-p", type=str, default="__default__", help="Name of the pipeline to execute")
-@click.option("--env", "-e", type=str, default=None, help="Kedro environment to run the pipeline in")
-@click.option("--config", "-c", type=str, multiple=True, help="Extra config to pass to KedroContext")
-@click.option("--params", type=str, multiple=True, help="Override parameters")
-@click.option("--tags", "-t", type=str, multiple=True, help=TAG_ARG_HELP)
-@click.option("--nodes", "-n", type=str, multiple=True, help="Run only nodes with specified names")
-@click.option("--to-nodes", type=str, multiple=True, help="Run a sub-pipeline up to certain nodes")
-@click.option("--from-nodes", type=str, multiple=True, help="Run a sub-pipeline starting from certain nodes")
-@click.option("--from-inputs", type=str, multiple=True, help="Run a sub-pipeline starting from nodes that produce these inputs")
-@click.option("--to-outputs", type=str, multiple=True, help="Run a sub-pipeline up to nodes that produce these outputs")
-@click.option("--load-version", type=str, multiple=True, help="Specify a particular dataset version")
-@click.option("--namespaces", type=str, multiple=True, help="Namespaces of the pipeline")
-@click.pass_obj
-def _run_command_impl(
-    ctx,
-    pipeline: str,
-    env: str,
-    config: tuple,
-    params: tuple,
-    tags: list[str],
-    nodes: tuple,
-    to_nodes: tuple,
-    from_nodes: tuple,
-    from_inputs: tuple,
-    to_outputs: tuple,
-    load_version: tuple,
-    namespaces: Iterable[str],
-):
-    """Run the pipeline with the FusedRunner."""
-
-    LOGGER.warning("Using plugin entrypoint")
-
-    load_versions = None
-    if load_version:
-        load_versions = {}
-        for version_spec in load_version:
-            if ":" in version_spec:
-                dataset, version = version_spec.split(":", 1)
-                load_versions[dataset] = version
-
-    conf_source = getattr(ctx, "conf_source", None)
-    env_value = env or getattr(ctx, "env", None)
-
-    with KedroSession.create(
-        env=env_value,
-        conf_source=conf_source,
-    ) as session:
-
-        session.run(
-            pipeline_name=pipeline,
-            tags=tags,
-            runner=FusedRunner(pipeline_name=pipeline),
-            node_names=list(nodes) if nodes else None,
-            from_nodes=list(from_nodes) if from_nodes else None,
-            to_nodes=list(to_nodes) if to_nodes else None,
-            from_inputs=list(from_inputs) if from_inputs else None,
-            to_outputs=list(to_outputs) if to_outputs else None,
-            load_versions=load_versions,
-            namespaces=namespaces,
-        )
-
-@click.group(name="argo")
-def commands():
-    pass
-
-@commands.command(name="submit")
-@click.option("--pipeline", "-p", type=str, default="__default__", help="Specify which pipeline to execute")
-@click.option("--environment", "-e", type=str, default="base", help="Kedro environment to execute in")
-@click.option("--image", type=str, required=True, help="Image to execute")
-@click.option("--namespace", "-n", type=str, required=True, help="Namespace to execute in")
-@click.pass_obj
-def submit(
-    ctx,
-    pipeline: str,
-    image: str,
-    namespace: str,
-    environment: str
-):
-    """Submit the pipeline to Argo."""
-    LOGGER.info("Loading spec template...")
-
-    loader = FileSystemLoader(searchpath=ARGO_TEMPLATES_DIR_PATH)
-    template_env = Environment(loader=loader, trim_blocks=True, lstrip_blocks=True)
-    template = template_env.get_template("argo_wf_spec.tmpl")
-
-    pipeline_tasks = get_argo_dag(kedro_pipelines[pipeline])
-
-    LOGGER.info("Rendering Argo spec...")
-
-    # Render the template
-    rendered_template = template.render(
-        pipeline_tasks=[task.to_dict() for task in pipeline_tasks.values()],
-        pipeline_name=pipeline,
-        image=image,
-        namespace=namespace,
-        environment=environment
-    )
-
-    # Load as yaml
-    yaml_data = yaml.safe_load(rendered_template)
-    yaml_without_anchors = yaml.dump(yaml_data, sort_keys=False, default_flow_style=False)
-    save_argo_template(
-        yaml_without_anchors,
-    )
-
-    # Use kubeconfig to submit to kubernetes
-    config.load_kube_config()
-    client = DynamicClient(config.new_client_from_config())
-
-    resource = client.resources.get(
-        api_version=yaml_data["apiVersion"],
-        kind=yaml_data["kind"],
-    )
-
-    resource.create(
-        body=yaml_data,
-        namespace=namespace
-    )
-
-
-def save_argo_template(argo_template: str) -> str:
-    file_path = Path("templates") / "argo-workflow-template.yml"
-    with open(file_path, "w") as f:
-        f.write(argo_template)
-    return str(file_path)
-
-
-class ArgoTask:
-    """Class to model an Argo task.
-
-    Argo's operating model differs slightly from Kedro's, i.e., while Kedro uses dataset
-    dependencies to model relationships, Argo uses task dependencies."""
-
-    def __init__(self, node: Node):
-        self._node = node
-        self._parents = []
-
-    @property
-    def node(self):
-        return self._node
-
-    def add_parents(self, nodes: List[Node]):
-        self._parents.extend(nodes)
-
-    def to_dict(self):
-        return {
-            "name": clean_name(self._node.name),
-            "nodes": self._node.name,
-            "deps": [clean_name(parent.name) for parent in sorted(self._parents)],
-        }
-
-
-def get_argo_dag(pipeline: Pipeline) -> List[Dict[str, Any]]:
-    """Function to convert the Kedro pipeline into Argo tasks. The function
-    iterates the nodes of the pipeline and generates Argo tasks with dependencies.
-    These dependencies are inferred based on the input and output datasets for
-    each node.
-
-    NOTE: This function is agnostic to the fact that nodes might be fused. The nodes
-    returned as part of the pipeline may optionally contain FusedNodes, which have the correct
-    inputs and outputs from the perspective of the Argo task.
-    """
-    tasks = {}
-
-    # The `grouped_nodes` property returns the nodes list, in topological order,
-    # allowing us to easily translate the Kedro DAG to an Argo WF.
-    for group in pipeline.grouped_nodes:
-        for target_node in group:
-            task = ArgoTask(target_node)
-            task.add_parents(
-                [
-                    parent.node
-                    for parent in tasks.values()
-                    if set(clean_dependencies(target_node.inputs)) & set(clean_dependencies(parent.node.outputs))
-                ]
-            )
-
-            tasks[target_node.name] = task
-
-    return tasks
-
-
-def clean_name(name: str) -> str:
-    """Function to clean the node name.
-
-    Args:
-        name: name of the node
-    Returns:
-        Clean node name, according to Argo's requirements
-    """
-    return re.sub(r"[\W_]+", "-", name).strip("-")
-
-
-def clean_dependencies(elements) -> List[str]:
-    """Function to clean node dependencies.
-
-    Operates by removing `params:` entries from the list and dismissing
-    the transcoding operator.
-    """
-    return [el.split("@")[0] for el in elements if not el.startswith("params:")]
{argo_kedro-0.1.5 → argo_kedro-0.1.7}/.gitignore: file without changes
{argo_kedro-0.1.5 → argo_kedro-0.1.7}/.python-version: file without changes
{argo_kedro-0.1.5 → argo_kedro-0.1.7}/LICENSE: file without changes
{argo_kedro-0.1.5 → argo_kedro-0.1.7}/MANIFEST.in: file without changes
{argo_kedro-0.1.5 → argo_kedro-0.1.7}/README.md: file without changes
{argo_kedro-0.1.5 → argo_kedro-0.1.7}/argo_kedro/framework/__init__.py: file without changes
{argo_kedro-0.1.5 → argo_kedro-0.1.7}/argo_kedro/framework/cli/__init__.py: file without changes
{argo_kedro-0.1.5 → argo_kedro-0.1.7}/argo_kedro/runners/__init__.py: file without changes
{argo_kedro-0.1.5 → argo_kedro-0.1.7}/argo_kedro/runners/fuse_runner.py: file without changes
{argo_kedro-0.1.5 → argo_kedro-0.1.7}/argo_kedro.egg-info/dependency_links.txt: file without changes
{argo_kedro-0.1.5 → argo_kedro-0.1.7}/argo_kedro.egg-info/top_level.txt: file without changes
{argo_kedro-0.1.5 → argo_kedro-0.1.7}/locally_building_guide.md: file without changes
{argo_kedro-0.1.5 → argo_kedro-0.1.7}/setup.cfg: file without changes