argo-kedro 0.1.2__py3-none-any.whl → 0.1.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
argo_kedro/framework/cli/cli.py
@@ -0,0 +1,242 @@
+ import re
+ from pathlib import Path
+ from typing import Any, Dict, Iterable, List
+ from logging import getLogger
+
+ import click
+ import yaml
+ from kubernetes import config
+ from kubernetes.dynamic import DynamicClient
+ from jinja2 import Environment, FileSystemLoader
+ from kedro.framework.cli.utils import CONTEXT_SETTINGS, KedroCliError
+ from kedro.framework.session import KedroSession
+ from kedro.framework.cli.project import (
+     ASYNC_ARG_HELP,
+     CONF_SOURCE_HELP,
+     FROM_INPUTS_HELP,
+     FROM_NODES_HELP,
+     LOAD_VERSION_HELP,
+     NODE_ARG_HELP,
+     PARAMS_ARG_HELP,
+     PIPELINE_ARG_HELP,
+     RUNNER_ARG_HELP,
+     TAG_ARG_HELP,
+     TO_NODES_HELP,
+     TO_OUTPUTS_HELP,
+     project_group,
+ )
+ from kedro.framework.project import pipelines as kedro_pipelines
+ from kedro.pipeline import Pipeline
+ from kedro.pipeline.node import Node
+ from kedro.runner.sequential_runner import SequentialRunner
+ from argo_kedro.runners.fuse_runner import FusedRunner
+
+ LOGGER = getLogger(__name__)
+ ARGO_TEMPLATES_DIR_PATH = Path(__file__).parent.parent.parent / "templates"
+
+
+ @click.group(context_settings=CONTEXT_SETTINGS)
+ def cli():
+     pass
+
+
+ @cli.command(name="run")
+ @click.option("--pipeline", "-p", type=str, default="__default__", help="Name of the pipeline to execute")
+ @click.option("--env", "-e", type=str, default=None, help="Kedro environment to run the pipeline in")
+ @click.option("--config", "-c", type=str, multiple=True, help="Extra config to pass to KedroContext")
+ @click.option("--params", type=str, multiple=True, help="Override parameters")
+ @click.option("--tags", "-t", type=str, multiple=True, help=TAG_ARG_HELP)
+ @click.option("--nodes", "-n", type=str, multiple=True, help="Run only nodes with specified names")
+ @click.option("--to-nodes", type=str, multiple=True, help="Run a sub-pipeline up to certain nodes")
+ @click.option("--from-nodes", type=str, multiple=True, help="Run a sub-pipeline starting from certain nodes")
+ @click.option("--from-inputs", type=str, multiple=True, help="Run a sub-pipeline starting from nodes that produce these inputs")
+ @click.option("--to-outputs", type=str, multiple=True, help="Run a sub-pipeline up to nodes that produce these outputs")
+ @click.option("--load-version", type=str, multiple=True, help="Specify a particular dataset version")
+ @click.option("--namespaces", type=str, multiple=True, help="Namespaces of the pipeline")
+ @click.pass_obj
+ def _run_command_impl(
+     ctx,
+     pipeline: str,
+     env: str,
+     config: tuple,
+     params: tuple,
+     tags: tuple,
+     nodes: tuple,
+     to_nodes: tuple,
+     from_nodes: tuple,
+     from_inputs: tuple,
+     to_outputs: tuple,
+     load_version: tuple,
+     namespaces: Iterable[str],
+ ):
+     """Run the pipeline with the FusedRunner."""
+     LOGGER.warning("Using plugin entrypoint")
+
+     load_versions = None
+     if load_version:
+         load_versions = {}
+         for version_spec in load_version:
+             if ":" in version_spec:
+                 dataset, version = version_spec.split(":", 1)
+                 load_versions[dataset] = version
+
+     conf_source = getattr(ctx, "conf_source", None)
+     env_value = env or getattr(ctx, "env", None)
+
+     with KedroSession.create(
+         env=env_value,
+         conf_source=conf_source,
+     ) as session:
+         session.run(
+             pipeline_name=pipeline,
+             tags=tags,
+             runner=FusedRunner(pipeline_name=pipeline),
+             node_names=list(nodes) if nodes else None,
+             from_nodes=list(from_nodes) if from_nodes else None,
+             to_nodes=list(to_nodes) if to_nodes else None,
+             from_inputs=list(from_inputs) if from_inputs else None,
+             to_outputs=list(to_outputs) if to_outputs else None,
+             load_versions=load_versions,
+             namespaces=namespaces,
+         )
+
+
+ @click.group(name="argo")
+ def commands():
+     pass
+
+
+ @commands.command(name="submit")
+ @click.option("--pipeline", "-p", type=str, default="__default__", help="Specify which pipeline to execute")
+ @click.option("--environment", "-e", type=str, default="base", help="Kedro environment to execute in")
+ @click.option("--image", type=str, required=True, help="Image to execute")
+ @click.option("--namespace", "-n", type=str, required=True, help="Namespace to execute in")
+ @click.pass_obj
+ def submit(
+     ctx,
+     pipeline: str,
+     image: str,
+     namespace: str,
+     environment: str,
+ ):
+     """Submit the pipeline to Argo."""
+     LOGGER.info("Loading spec template...")
+
+     loader = FileSystemLoader(searchpath=ARGO_TEMPLATES_DIR_PATH)
+     template_env = Environment(loader=loader, trim_blocks=True, lstrip_blocks=True)
+     template = template_env.get_template("argo_wf_spec.tmpl")
+
+     pipeline_tasks = get_argo_dag(kedro_pipelines[pipeline])
+
+     LOGGER.info("Rendering Argo spec...")
+
+     # Render the template
+     rendered_template = template.render(
+         pipeline_tasks=[task.to_dict() for task in pipeline_tasks.values()],
+         pipeline_name=pipeline,
+         image=image,
+         namespace=namespace,
+         environment=environment,
+     )
+
+     # Load as YAML and re-dump to resolve any anchors
+     yaml_data = yaml.safe_load(rendered_template)
+     yaml_without_anchors = yaml.dump(yaml_data, sort_keys=False, default_flow_style=False)
+     save_argo_template(yaml_without_anchors)
+
+     # Use kubeconfig to submit to Kubernetes
+     config.load_kube_config()
+     client = DynamicClient(config.new_client_from_config())
+
+     resource = client.resources.get(
+         api_version=yaml_data["apiVersion"],
+         kind=yaml_data["kind"],
+     )
+
+     resource.create(
+         body=yaml_data,
+         namespace=namespace,
+     )
+
+
+ def save_argo_template(argo_template: str) -> str:
+     file_path = Path("templates") / "argo-workflow-template.yml"
+     file_path.parent.mkdir(parents=True, exist_ok=True)  # ensure the local templates/ dir exists
+     with open(file_path, "w") as f:
+         f.write(argo_template)
+     return str(file_path)
+
+
+ class ArgoTask:
+     """Class to model an Argo task.
+
+     Argo's operating model differs slightly from Kedro's: while Kedro uses dataset
+     dependencies to model relationships, Argo uses task dependencies."""
+
+     def __init__(self, node: Node):
+         self._node = node
+         self._parents = []
+
+     @property
+     def node(self):
+         return self._node
+
+     def add_parents(self, nodes: List[Node]):
+         self._parents.extend(nodes)
+
+     def to_dict(self):
+         return {
+             "name": clean_name(self._node.name),
+             "nodes": self._node.name,
+             "deps": [clean_name(parent.name) for parent in sorted(self._parents)],
+         }
+
+
+ def get_argo_dag(pipeline: Pipeline) -> Dict[str, ArgoTask]:
+     """Convert the Kedro pipeline into Argo tasks. The function
+     iterates the nodes of the pipeline and generates Argo tasks with dependencies.
+     These dependencies are inferred from the input and output datasets of
+     each node.
+
+     NOTE: This function is agnostic to the fact that nodes might be fused. The nodes
+     returned as part of the pipeline may optionally contain FusedNodes, which have correct
+     inputs and outputs from the perspective of the Argo task.
+     """
+     tasks = {}
+
+     # The `grouped_nodes` property returns the nodes list in topological order,
+     # allowing us to easily translate the Kedro DAG to an Argo workflow.
+     for group in pipeline.grouped_nodes:
+         for target_node in group:
+             task = ArgoTask(target_node)
+             task.add_parents(
+                 [
+                     parent.node
+                     for parent in tasks.values()
+                     if set(clean_dependencies(target_node.inputs)) & set(clean_dependencies(parent.node.outputs))
+                 ]
+             )
+
+             tasks[target_node.name] = task
+
+     return tasks
+
+
+ def clean_name(name: str) -> str:
+     """Clean the node name.
+
+     Args:
+         name: name of the node
+     Returns:
+         Clean node name, according to Argo's naming requirements
+     """
+     return re.sub(r"[\W_]+", "-", name).strip("-")
+
+
+ def clean_dependencies(elements: Iterable[str]) -> List[str]:
+     """Clean node dependencies.
+
+     Operates by removing `params:` entries from the list and dropping
+     the transcoding suffix.
+     """
+     return [el.split("@")[0] for el in elements if not el.startswith("params:")]
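To make the task conversion concrete, here is a minimal sketch of what `get_argo_dag` and `ArgoTask.to_dict` would produce for a hypothetical two-node pipeline (the node functions and names below are illustrative, not part of the package):

    from kedro.pipeline import Pipeline, node
    from argo_kedro.framework.cli.cli import get_argo_dag

    # Hypothetical stand-in functions for a two-node pipeline.
    def load_data():
        return {"x": 1}

    def train_model(raw_data):
        return raw_data

    demo_pipeline = Pipeline([
        node(load_data, inputs=None, outputs="raw_data", name="load_data"),
        node(train_model, inputs="raw_data", outputs="model", name="train_model"),
    ])

    tasks = get_argo_dag(demo_pipeline)
    print([task.to_dict() for task in tasks.values()])
    # Expected shape, after clean_name() sanitises the names:
    # [{'name': 'load-data', 'nodes': 'load_data', 'deps': []},
    #  {'name': 'train-model', 'nodes': 'train_model', 'deps': ['load-data']}]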
argo_kedro/pipeline/__init__.py
@@ -0,0 +1,3 @@
+ from .fused_pipeline import FusedPipeline
+
+ __all__ = ["FusedPipeline"]
argo_kedro/pipeline/fused_pipeline.py
@@ -0,0 +1,58 @@
+ from functools import cached_property
+ from typing import Iterable, List
+
+ from kedro.pipeline import Pipeline, Node
+
+
+ class FusedNode(Node):
+     """FusedNode is an extension of Kedro's internal node. The FusedNode
+     wraps a set of nodes and correctly sets its `inputs` and `outputs`,
+     allowing it to act as a single unit of execution.
+     """
+
+     def __init__(self, nodes: List[Node], name: str):
+         # NOTE: deliberately bypasses Node.__init__ and populates the
+         # private attributes directly.
+         self._nodes = nodes
+         self._name = name
+         self._namespace = None
+         self._inputs = []
+         self._outputs = []
+         self._confirms = []
+         self._func = lambda: None
+         self._tags = []
+
+         for node in nodes:
+             self._inputs.extend(node.inputs)
+             self._outputs.extend(node.outputs)
+             self._tags.extend(node._tags)
+
+         # NOTE: Exclude outputs produced by the intermediate nodes
+         for dataset in self._outputs:
+             if dataset in self._inputs:
+                 self._inputs.remove(dataset)
+
+         self._tags = list(set(self._tags))
+
+     @cached_property
+     def inputs(self) -> list[str]:
+         return self._inputs  # TODO: Remove transcoding?
+
+
+ class FusedPipeline(Pipeline):
+     """Fused pipeline allows for wrapping nodes for execution by the underlying
+     pipeline execution framework.
+
+     This is needed because Kedro immediately translates a pipeline into a flat list
+     of nodes to execute, discarding any nested pipeline structure. The FusedPipeline
+     instead produces a _single_ FusedNode that contains the wrapped nodes."""
+
+     def __init__(
+         self,
+         nodes: Iterable[Node | Pipeline],
+         name: str,
+         *,
+         tags: str | Iterable[str] | None = None,
+     ):
+         self._name = name
+         super().__init__(nodes, tags=tags)
+
+     @property
+     def nodes(self) -> list[Node]:
+         return [FusedNode(self._nodes, name=self._name)]
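A quick sketch of the fusing behaviour (the node functions and dataset names are hypothetical): a dataset that is both produced and consumed inside the group drops out of the fused node's inputs, so the group exposes only its external boundary:

    from kedro.pipeline import node
    from argo_kedro.pipeline import FusedPipeline

    # Hypothetical stand-in functions.
    def step_a(raw):
        return raw

    def step_b(intermediate):
        return intermediate

    fused = FusedPipeline(
        [
            node(step_a, inputs="raw", outputs="intermediate", name="a"),
            node(step_b, inputs="intermediate", outputs="final", name="b"),
        ],
        name="fused_a_b",
    )

    print(len(fused.nodes))       # 1 -- a single FusedNode wrapping both nodes
    print(fused.nodes[0].inputs)  # ['raw'] -- 'intermediate' was removed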
argo_kedro/runners/__init__.py
@@ -0,0 +1,3 @@
+ from .fuse_runner import FusedRunner
+
+ __all__ = ["FusedRunner"]
argo_kedro/runners/fuse_runner.py
@@ -0,0 +1,92 @@
+ from logging import getLogger
+
+ from pluggy import PluginManager
+
+ from kedro.io import DataCatalog
+ from kedro.io.memory_dataset import MemoryDataset
+ from kedro.framework.project import pipelines
+ from kedro.pipeline import Pipeline
+ from kedro.runner.sequential_runner import SequentialRunner
+
+ from argo_kedro.pipeline.fused_pipeline import FusedNode
+
+ LOGGER = getLogger(__name__)
+
+
+ class FusedRunner(SequentialRunner):
+     """Fused runner is an extension of the SequentialRunner that
+     unpacks each FusedNode back into its contained nodes for
+     execution."""
+
+     def __init__(
+         self,
+         is_async: bool = False,
+         pipeline_name: str | None = None,
+     ):
+         """Instantiate the runner.
+
+         The runner requires the name of the pipeline under execution to correctly handle
+         node fusing, as each fused group is executed as a single unit. To decide which
+         datasets may stay in memory, the runner needs to know the pipeline execution
+         boundary.
+
+         Args:
+             is_async: If True, the node inputs and outputs are loaded and saved
+                 asynchronously with threads. Defaults to False.
+             pipeline_name: Name of the pipeline to run.
+         """
+         super().__init__(is_async=is_async)
+         self._pipeline_name = pipeline_name
+
+     def _run(
+         self,
+         pipeline: Pipeline,
+         catalog: DataCatalog,
+         hook_manager: PluginManager,
+         session_id: str | None = None,
+     ) -> None:
+         nodes = pipeline.nodes
+
+         LOGGER.warning(f"Running pipeline: {self._pipeline_name}")
+
+         for node in nodes:
+             if isinstance(node, FusedNode):
+                 fused_pipeline = Pipeline(node._nodes)
+
+                 # Datasets produced inside the fused group but consumed by a node
+                 # outside it must be persisted, so treat them as group outputs.
+                 outputs = fused_pipeline.outputs()
+                 for dataset in fused_pipeline.datasets():
+                     found = False
+                     for pipeline_node in pipelines[self._pipeline_name].nodes:
+                         if node.name != pipeline_node.name and dataset in pipeline_node.inputs:
+                             found = True
+                             break
+
+                     if found:
+                         LOGGER.info(f"{dataset} found as input to another pipeline node")
+                         outputs.add(dataset)
+
+                 # Everything internal to the fused group stays in memory.
+                 for dataset in fused_pipeline.datasets().difference(fused_pipeline.inputs().union(outputs)):
+                     catalog._datasets[dataset] = MemoryDataset()
+
+         # Invoke the parent runner on the unpacked nodes
+         super()._run(
+             Pipeline([Pipeline(node._nodes) if isinstance(node, FusedNode) else node for node in nodes]),
+             catalog,
+             hook_manager,
+             session_id,
+         )
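Mirroring the plugin's own `run` command above, the runner is handed to `session.run`; a minimal sketch of direct use inside a Kedro project (the pipeline name is assumed to be the default):

    from kedro.framework.session import KedroSession
    from argo_kedro.runners import FusedRunner

    with KedroSession.create() as session:
        session.run(
            pipeline_name="__default__",
            runner=FusedRunner(pipeline_name="__default__"),
        )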
argo_kedro/templates/argo_wf_spec.tmpl
@@ -0,0 +1,53 @@
+ {# argo_kedro/templates/argo_wf_spec.tmpl #}
+ apiVersion: argoproj.io/v1alpha1
+ kind: Workflow
+ metadata:
+   generateName: workflow-
+   namespace: {{ namespace }}
+ spec:
+   workflowMetadata:
+     labels:
+       plugin: argo-kedro
+   entrypoint: "pipeline"
+   templates:
+     - name: kedro
+       metadata:
+         labels:
+           app: argo-kedro
+       inputs:
+         parameters:
+           - name: pipeline
+           - name: kedro_nodes
+       container:
+         image: {{ image }}
+         command: ["kedro"]
+         imagePullPolicy: Always
+         args:
+           - "run"
+           - "--pipeline"
+           - "{{ '{{inputs.parameters.pipeline}}' }}"
+           - "--nodes"
+           - "{{ '{{inputs.parameters.kedro_nodes}}' }}"
+           - "--env"
+           - "{{ environment }}"
+
+     - name: pipeline
+       dag:
+         tasks:
+           {% for task in pipeline_tasks %}
+           - name: {{ task.name }}
+             template: {{ task.get('template', 'kedro') }}
+             {% if task.deps %}
+             dependencies:
+               {% for dep in task.deps %}
+               - {{ dep }}
+               {% endfor %}
+             {% endif %}
+             arguments:
+               parameters:
+                 - name: pipeline
+                   value: {{ pipeline_name }}
+                 - name: kedro_nodes
+                   value: {{ task.nodes }}
+           {% endfor %}
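The nested quoting on the `args` entries above is the subtle part: the outer expression is evaluated by Jinja at submit time and yields the inner string verbatim, so the `{{inputs.parameters.*}}` expressions survive for Argo to resolve at runtime. A minimal sketch of that behaviour:

    from jinja2 import Template

    # The inner '{{...}}' is a Jinja string literal, rendered as-is.
    print(Template("{{ '{{inputs.parameters.pipeline}}' }}").render())
    # -> {{inputs.parameters.pipeline}}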
argo_kedro-0.1.3.dist-info/METADATA
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: argo-kedro
- Version: 0.1.2
+ Version: 0.1.3
  Summary: Kedro plugin for running pipelines on Argo Workflows
  Author-email: Laurens Vijnck <laurens@everycure.org>, Nelson Alfonso <nelson@everycure.org>
  License: MIT
argo_kedro-0.1.3.dist-info/RECORD
@@ -0,0 +1,14 @@
+ argo_kedro/framework/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ argo_kedro/framework/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ argo_kedro/framework/cli/cli.py,sha256=x1wZfDJ3-GyxhUDnmdSzcmpVLrQu1SowZH9Wnjl0mz0,8176
+ argo_kedro/pipeline/__init__.py,sha256=eADPWCo5qxWr3nWXJJK7yugfJ37-zGYdY7frE-8dLcs,72
+ argo_kedro/pipeline/fused_pipeline.py,sha256=FHK5ZMQi21dHYGGewU-4QN31JwtiWbhtPiGrnC9QtpE,1844
+ argo_kedro/runners/__init__.py,sha256=AfU9FbRebpfTYnliocXtdwALpfVlh19WXcrEBh4Wb78,63
+ argo_kedro/runners/fuse_runner.py,sha256=K-OmciE8hLicMNiaLe5SjR4GwBH4Ud3mtNDknvzhTFA,3177
+ argo_kedro/templates/argo_wf_spec.tmpl,sha256=7gLBGnA_d8bwhwOAMGgU_IgWIjpD8atDEGidrG5BNKU,1218
+ argo_kedro-0.1.3.dist-info/licenses/LICENSE,sha256=deLLtAUKpK9aD3f3sgr6-FP6M1K5Mh9ai0-EBBUZMqA,1080
+ argo_kedro-0.1.3.dist-info/METADATA,sha256=y4UcEXsyETGe8MYdboIyiOd2tkMU2hhAj5S9vKMh3Po,3272
+ argo_kedro-0.1.3.dist-info/WHEEL,sha256=qELbo2s1Yzl39ZmrAibXA2jjPLUYfnVhUNTlyF1rq0Y,92
+ argo_kedro-0.1.3.dist-info/entry_points.txt,sha256=czlBy9HPiG00bIpokpPOaE2EFr9YlWRSDhWPbLDcEYU,134
+ argo_kedro-0.1.3.dist-info/top_level.txt,sha256=bkDBnht8zOdNxOcy4MwQU2MoRz5-eOww8MVzW2CLEdE,11
+ argo_kedro-0.1.3.dist-info/RECORD,,
argo_kedro-0.1.3.dist-info/WHEEL
@@ -1,5 +1,5 @@
  Wheel-Version: 1.0
- Generator: setuptools (80.9.0)
+ Generator: setuptools (80.10.1)
  Root-Is-Purelib: true
  Tag: py3-none-any
 
argo_kedro-0.1.3.dist-info/entry_points.txt
@@ -0,0 +1,5 @@
+ [kedro.global_commands]
+ run = argo_kedro.framework.cli.cli:cli
+
+ [kedro.project_commands]
+ argo = argo_kedro.framework.cli.cli:commands
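With these entry points installed, Kedro picks up the plugin's `run` override globally and the `argo` command group inside a project; a typical invocation would look roughly like this (image and namespace values hypothetical):

    kedro run --pipeline __default__
    kedro argo submit --pipeline __default__ --image registry.example.com/my-project:latest --namespace argo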
argo_kedro-0.1.3.dist-info/top_level.txt
@@ -0,0 +1 @@
+ argo_kedro
argo_kedro-0.1.2.dist-info/RECORD
@@ -1,6 +0,0 @@
- argo_kedro-0.1.2.dist-info/licenses/LICENSE,sha256=deLLtAUKpK9aD3f3sgr6-FP6M1K5Mh9ai0-EBBUZMqA,1080
- argo_kedro-0.1.2.dist-info/METADATA,sha256=AxKrCOSc0__esa0Fp7xiVWDih1QAzslbBEXJeHtn8aI,3272
- argo_kedro-0.1.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
- argo_kedro-0.1.2.dist-info/entry_points.txt,sha256=_v1LY-D5TaFe5teVUEGpSGSYElj2FoV592cxE_ewwv4,134
- argo_kedro-0.1.2.dist-info/top_level.txt,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
- argo_kedro-0.1.2.dist-info/RECORD,,
argo_kedro-0.1.2.dist-info/entry_points.txt
@@ -1,5 +0,0 @@
- [kedro.global_commands]
- run = kedro_argo.framework.cli.cli:cli
-
- [kedro.project_commands]
- argo = kedro_argo.framework.cli.cli:commands
argo_kedro-0.1.2.dist-info/top_level.txt
@@ -1 +0,0 @@
-