easylink 0.1.5__py3-none-any.whl → 0.1.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- easylink/_version.py +1 -1
- easylink/cli.py +18 -9
- easylink/graph_components.py +12 -2
- easylink/implementation.py +2 -0
- easylink/pipeline.py +92 -34
- easylink/pipeline_graph.py +112 -27
- easylink/pipeline_schema_constants/__init__.py +5 -2
- easylink/pipeline_schema_constants/development.py +11 -2
- easylink/pipeline_schema_constants/testing.py +135 -0
- easylink/rule.py +282 -22
- easylink/runner.py +1 -0
- easylink/step.py +68 -4
- easylink/utilities/aggregator_utils.py +31 -0
- easylink/utilities/data_utils.py +1 -0
- easylink/utilities/general_utils.py +1 -0
- easylink/utilities/splitter_utils.py +71 -0
- {easylink-0.1.5.dist-info → easylink-0.1.7.dist-info}/METADATA +1 -1
- {easylink-0.1.5.dist-info → easylink-0.1.7.dist-info}/RECORD +21 -19
- {easylink-0.1.5.dist-info → easylink-0.1.7.dist-info}/WHEEL +1 -1
- {easylink-0.1.5.dist-info → easylink-0.1.7.dist-info}/entry_points.txt +0 -0
- {easylink-0.1.5.dist-info → easylink-0.1.7.dist-info}/top_level.txt +0 -0
easylink/_version.py
CHANGED
@@ -1 +1 @@
-__version__ = "0.1.5"
+__version__ = "0.1.7"
easylink/cli.py
CHANGED
@@ -1,3 +1,4 @@
+# mypy: ignore-errors
 """
 ======================
 Command Line Interface
@@ -86,6 +87,16 @@ SHARED_OPTIONS = [
         default=False,
         help="Do not save the results in a timestamped sub-directory of ``--output-dir``.",
     ),
+    click.option(
+        "-v", "--verbose", count=True, help="Increase logging verbosity.", hidden=True
+    ),
+    click.option(
+        "--pdb",
+        "with_debugger",
+        is_flag=True,
+        help="Drop into python debugger if an error occurs.",
+        hidden=True,
+    ),
 ]

@@ -128,14 +139,6 @@ def easylink():
         "the pipeline will be run locally."
     ),
 )
-@click.option("-v", "--verbose", count=True, help="Increase logging verbosity.", hidden=True)
-@click.option(
-    "--pdb",
-    "with_debugger",
-    is_flag=True,
-    help="Drop into python debugger if an error occurs.",
-    hidden=True,
-)
 def run(
     pipeline_specification: str,
     input_data: str,
@@ -177,17 +180,23 @@ def generate_dag(
|
|
177
180
|
input_data: str,
|
178
181
|
output_dir: str | None,
|
179
182
|
no_timestamp: bool,
|
183
|
+
verbose: int,
|
184
|
+
with_debugger: bool,
|
180
185
|
) -> None:
|
181
186
|
"""Generates an image of the proposed pipeline directed acyclic graph (DAG).
|
182
187
|
|
183
188
|
This command only generates the DAG image of the pipeline; it does not actually
|
184
189
|
run it. To run the pipeline, use the ``easylink run`` command.
|
185
190
|
"""
|
191
|
+
configure_logging_to_terminal(verbose)
|
186
192
|
logger.info("Generating DAG")
|
187
193
|
results_dir = get_results_directory(output_dir, no_timestamp).as_posix()
|
188
194
|
logger.info(f"Results directory: {results_dir}")
|
189
195
|
# TODO [MIC-4493]: Add configuration validation
|
190
|
-
|
196
|
+
main = handle_exceptions(
|
197
|
+
func=runner.main, exceptions_logger=logger, with_debugger=with_debugger
|
198
|
+
)
|
199
|
+
main(
|
191
200
|
command="generate_dag",
|
192
201
|
pipeline_specification=pipeline_specification,
|
193
202
|
input_data=input_data,
|
easylink/graph_components.py
CHANGED
@@ -13,7 +13,7 @@ from __future__ import annotations

 from abc import ABC, abstractmethod
 from collections.abc import Callable
 from dataclasses import dataclass
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING, Any

 import networkx as nx

@@ -45,8 +45,13 @@ class InputSlot:
     """A function that validates the input data being passed into the pipeline via
     this ``InputSlot``. If the data is invalid, the function should raise an exception
     with a descriptive error message which will then be reported to the user.
-    **Note that the function must be defined in the** :mod:`easylink.utilities.validation_utils`
+    **Note that the function *must* be defined in the** :mod:`easylink.utilities.validation_utils`
     **module!**"""
+    splitter: Callable[[list[str], str, Any], None] | None = None
+    """A function that splits the incoming data to this ``InputSlot`` into smaller
+    pieces. The primary purpose of this functionality is to run sections of the
+    pipeline in an embarrassingly parallel manner. **Note that the function *must*
+    be defined in the** :mod:`easylink.utilities.splitter_utils` **module!**"""


 @dataclass(frozen=True)
@@ -70,6 +75,11 @@ class OutputSlot:

     name: str
     """The name of the ``OutputSlot``."""
+    aggregator: Callable[[list[str], str], None] = None
+    """A function that aggregates all of the generated data to be passed out via this
+    ``OutputSlot``. The primary purpose of this functionality is to run sections
+    of the pipeline in an embarrassingly parallel manner. **Note that the function
+    *must* be defined in the** :py:mod:`easylink.utilities.aggregator_utils` **module!**"""


 @dataclass(frozen=True)
easylink/implementation.py
CHANGED
@@ -45,6 +45,7 @@ class Implementation:
         implementation_config: LayeredConfigTree,
         input_slots: Iterable["InputSlot"] = (),
         output_slots: Iterable["OutputSlot"] = (),
+        is_embarrassingly_parallel: bool = False,
     ):
         self.name = implementation_config.name
         """The name of this ``Implementation``."""
@@ -63,6 +64,7 @@ class Implementation:
         implemented by this particular ``Implementation``."""
         self.requires_spark = self._metadata.get("requires_spark", False)
         """Whether this ``Implementation`` requires a Spark environment."""
+        self.is_embarrassingly_parallel = is_embarrassingly_parallel

     def __repr__(self) -> str:
         return f"Implementation.{self.name}"
easylink/pipeline.py
CHANGED
@@ -16,7 +16,13 @@ from loguru import logger

 from easylink.configuration import Config
 from easylink.pipeline_graph import PipelineGraph
-from easylink.rule import ImplementedRule, InputValidationRule, TargetRule
+from easylink.rule import (
+    AggregationRule,
+    CheckpointRule,
+    ImplementedRule,
+    InputValidationRule,
+    TargetRule,
+)
 from easylink.utilities.general_utils import exit_with_validation_error
 from easylink.utilities.paths import SPARK_SNAKEFILE
 from easylink.utilities.validation_utils import validate_input_file_dummy
@@ -40,13 +46,17 @@ class Pipeline:
         The :class:`~easylink.pipeline_graph.PipelineGraph` object.
     spark_is_required
         A boolean indicating whether the pipeline requires Spark.
+    any_embarrassingly_parallel
+        A boolean indicating whether any implementation in the pipeline is to be
+        run in an embarrassingly parallel manner.

     """

     def __init__(self, config: Config):
         self.config = config
         self.pipeline_graph = PipelineGraph(config)
-        self.spark_is_required = self.pipeline_graph.spark_is_required
+        self.spark_is_required = self.pipeline_graph.spark_is_required
+        self.any_embarrassingly_parallel = self.pipeline_graph.any_embarrassingly_parallel

         # TODO [MIC-4880]: refactor into validation object
         self._validate()
@@ -79,10 +89,10 @@ class Pipeline:
             logger.warning("Snakefile already exists, overwriting.")
             self.snakefile_path.unlink()
         self._write_imports()
-        self._write_spark_config()
+        self._write_wildcard_constraints()
+        self._write_spark_config()
         self._write_target_rules()
-
-        self._write_spark_module()
+        self._write_spark_module()
         for node in self.pipeline_graph.implementation_nodes:
             self._write_implementation_rules(node)
         return self.snakefile_path
@@ -121,26 +131,35 @@ class Pipeline:
         return errors

     def _write_imports(self) -> None:
-        with open(self.snakefile_path, "a") as f:
-            f.write("from easylink.utilities import validation_utils\n")
-
-    def _write_spark_config(self) -> None:
-        """Writes configuration settings to the Snakefile.
-
-        Notes
-        -----
-        This is currently only applicable for spark-dependent pipelines.
-        """
-        with open(self.snakefile_path, "a") as f:
-            f.write(
-                f"\nscattergather:\n\tnum_workers={self.config.spark_resources['num_workers']},"
-            )
+        if not self.any_embarrassingly_parallel:
+            imports = "from easylink.utilities import validation_utils\n"
+        else:
+            imports = """import glob
+import os
+
+from snakemake.exceptions import IncompleteCheckpointException
+from snakemake.io import checkpoint_target
+
+from easylink.utilities import aggregator_utils, splitter_utils, validation_utils\n"""
+        with open(self.snakefile_path, "a") as f:
+            f.write(imports)
+
+    def _write_wildcard_constraints(self) -> None:
+        if self.any_embarrassingly_parallel:
+            with open(self.snakefile_path, "a") as f:
+                f.write(
+                    """
+wildcard_constraints:
+    # never include '/' since those are reserved for filepaths
+    chunk="[^/]+",\n"""
+                )

     def _write_target_rules(self) -> None:
-        """Writes the rule for the final output and its validation."""
+        """Writes the rule for the final output and its validation.
+
+        The input files to the target rule (i.e. the result node) are the final
+        output themselves.
+        """
         final_output, _ = self.pipeline_graph.get_io_filepaths("results")
         validator_file = str("input_validations/final_validator")
         # Snakemake resolves the DAG based on the first rule, so we put the target
@@ -152,7 +171,7 @@ class Pipeline:
         )
         final_validation = InputValidationRule(
             name="results",
-            slot_name="main_input",
+            input_slot_name="main_input",
             input=final_output,
             output=validator_file,
             validator=validate_input_file_dummy,
@@ -160,12 +179,26 @@ class Pipeline:
         target_rule.write_to_snakefile(self.snakefile_path)
         final_validation.write_to_snakefile(self.snakefile_path)

+    def _write_spark_config(self) -> None:
+        """Writes configuration settings to the Snakefile.
+
+        Notes
+        -----
+        This is currently only applicable for spark-dependent pipelines.
+        """
+        if self.spark_is_required:
+            with open(self.snakefile_path, "a") as f:
+                f.write(
+                    f"\nscattergather:\n\tnum_workers={self.config.spark_resources['num_workers']},"
+                )
+
     def _write_spark_module(self) -> None:
         """Inserts the ``easylink.utilities.spark.smk`` Snakemake module into the Snakefile."""
+        if not self.spark_is_required:
+            return
         slurm_resources = self.config.slurm_resources
         spark_resources = self.config.spark_resources
-
-        module = f"""
+        module = f"""
 module spark_cluster:
     snakefile: '{SPARK_SNAKEFILE}'
     config: config
@@ -173,8 +206,8 @@ module spark_cluster:
 use rule * from spark_cluster
 use rule terminate_spark from spark_cluster with:
     input: rules.all.input.final_output"""
-
-
+        if self.config.computing_environment == "slurm":
+            module += f"""
 use rule start_spark_master from spark_cluster with:
     resources:
         slurm_account={slurm_resources['slurm_account']},
@@ -195,21 +228,49 @@ use rule start_spark_worker from spark_cluster with:
             terminate_file_name=rules.terminate_spark.output,
             user=os.environ["USER"],
             cores={spark_resources['cpus_per_task']},
-            memory={spark_resources['mem_mb']}"""
-        with open(self.snakefile_path, "a") as f:
+            memory={spark_resources['mem_mb']}"""
+
+        with open(self.snakefile_path, "a") as f:
             f.write(module)

     def _write_implementation_rules(self, node_name: str) -> None:
         """Writes the rules for each :class:`~easylink.implementation.Implementation`.

+        This method writes *all* rules required for a given ``Implementation``,
+        e.g. splitters and aggregators (if necessary), validations, and the actual
+        rule to run the container itself.
+
         Parameters
         ----------
         node_name
             The name of the ``Implementation`` to write the rule(s) for.
         """
-        input_slots = self.pipeline_graph.get_input_slot_attributes(node_name)
+
+        input_slots, output_slots = self.pipeline_graph.get_io_slot_attributes(node_name)
+        validation_files, validation_rules = self._get_validations(node_name, input_slots)
+        for validation_rule in validation_rules:
+            validation_rule.write_to_snakefile(self.snakefile_path)
+
         _input_files, output_files = self.pipeline_graph.get_io_filepaths(node_name)
-        implementation = self.pipeline_graph.nodes[node_name]["implementation"]
+        is_embarrassingly_parallel = self.pipeline_graph.get_whether_embarrassingly_parallel(
+            node_name
+        )
+        if is_embarrassingly_parallel:
+            CheckpointRule(
+                name=node_name,
+                input_slots=input_slots,
+                validations=validation_files,
+                output=output_files,
+            ).write_to_snakefile(self.snakefile_path)
+            for name, attrs in output_slots.items():
+                AggregationRule(
+                    name=node_name,
+                    input_slots=input_slots,
+                    output_slot_name=name,
+                    output_slot=attrs,
+                ).write_to_snakefile(self.snakefile_path)
+
+        implementation = self.pipeline_graph.nodes[node_name]["implementation"]
         diagnostics_dir = Path("diagnostics") / node_name
         diagnostics_dir.mkdir(parents=True, exist_ok=True)
         resources = (
@@ -217,8 +278,7 @@ use rule start_spark_worker from spark_cluster with:
             if self.config.computing_environment == "slurm"
             else None
         )
-
-        implementation_rule = ImplementedRule(
+        ImplementedRule(
             name=node_name,
             step_name=" and ".join(implementation.metadata_steps),
             implementation_name=implementation.name,
@@ -231,10 +291,8 @@ use rule start_spark_worker from spark_cluster with:
             image_path=implementation.singularity_image_path,
             script_cmd=implementation.script_cmd,
             requires_spark=implementation.requires_spark,
-        )
-        for validation_rule in validation_rules:
-            validation_rule.write_to_snakefile(self.snakefile_path)
-        implementation_rule.write_to_snakefile(self.snakefile_path)
+            is_embarrassingly_parallel=is_embarrassingly_parallel,
+        ).write_to_snakefile(self.snakefile_path)

     @staticmethod
     def _get_validations(
@@ -262,7 +320,7 @@ use rule start_spark_worker from spark_cluster with:
         validation_rules.append(
             InputValidationRule(
                 name=node_name,
-                slot_name=input_slot_name,
+                input_slot_name=input_slot_name,
                 input=input_slot_attrs["filepaths"],
                 output=validation_file,
                 validator=input_slot_attrs["validator"],
easylink/pipeline_graph.py
CHANGED
@@ -45,6 +45,8 @@ class PipelineGraph(ImplementationGraph):
     ----------
     config
         The :class:`~easylink.configuration.Config` object.
+    freeze
+        Whether to freeze the graph after construction.

     Notes
     -----
@@ -57,11 +59,44 @@ class PipelineGraph(ImplementationGraph):
     ``Implementations`` to run.
     """

-    def __init__(self, config: Config) -> None:
+    def __init__(self, config: Config, freeze: bool = True) -> None:
         super().__init__(incoming_graph_data=config.schema.get_implementation_graph())
         self._merge_combined_implementations(config)
         self._update_slot_filepaths(config)
-        self = nx.freeze(self)
+        if freeze:
+            self = nx.freeze(self)
+
+    @property
+    def spark_is_required(self) -> bool:
+        """Whether or not any :class:`~easylink.implementation.Implementation` requires spark."""
+        return any([implementation.requires_spark for implementation in self.implementations])
+
+    @property
+    def any_embarrassingly_parallel(self) -> bool:
+        """Whether or not any :class:`~easylink.implementation.Implementation` is
+        to be run in an embarrassingly parallel way."""
+        return any(
+            [
+                self.get_whether_embarrassingly_parallel(node)
+                for node in self.implementation_nodes
+            ]
+        )
+
+    def get_whether_embarrassingly_parallel(self, node: str) -> bool:
+        """Determines whether a node is to be run in an embarrassingly parallel way.
+
+        Parameters
+        ----------
+        node
+            The name of the node to check.
+
+        Returns
+        -------
+        A boolean indicating whether the node is to be run in an embarrassingly
+        parallel way.
+        """
+        return self.nodes[node]["implementation"].is_embarrassingly_parallel

     def get_io_filepaths(self, node: str) -> tuple[list[str], list[str]]:
         """Gets all of a node's input and output filepaths from its edges.
@@ -93,38 +128,40 @@ class PipelineGraph(ImplementationGraph):
         )
         return input_files, output_files

-    def spark_is_required(self) -> bool:
-        """Whether or not the pipeline requires Spark.
-
-        Checks whether any of the pipeline's ``Implementations``
-        require spark resources.
-
-        Returns
-        -------
-        A boolean indicating whether the pipeline requires Spark.
-        """
-        return any([implementation.requires_spark for implementation in self.implementations])
-
-    def get_input_slot_attributes(self, node: str) -> dict[str, dict[str, str | list[str]]]:
-        """Gets all of a node's input slot attributes from edges.
+    def get_io_slot_attributes(
+        self, node: str
+    ) -> tuple[dict[str, dict[str, str | list[str]]], dict[str, dict[str, str | list[str]]]]:
+        """Gets all of a node's i/o slot attributes from edges.

         Parameters
         ----------
         node
-            The node name to get input slot attributes for.
+            The node name to get slot attributes for.

         Returns
         -------
-        A dictionary mapping input slot names to their attributes.
+        A tuple of mappings of node name to slot attributes.
         """
         input_slots = [
             edge_attrs["input_slot"] for _, _, edge_attrs in self.in_edges(node, data=True)
         ]
-        filepaths_by_slot = [
+        input_filepaths_by_slot = [
             list(edge_attrs["filepaths"])
             for _, _, edge_attrs in self.in_edges(node, data=True)
         ]
-        return self._deduplicate_input_slots(input_slots, filepaths_by_slot)
+        input_slot_attrs = self._deduplicate_input_slots(input_slots, input_filepaths_by_slot)
+
+        output_slots = [
+            edge_attrs["output_slot"] for _, _, edge_attrs in self.out_edges(node, data=True)
+        ]
+        output_filepaths_by_slot = [
+            list(edge_attrs["filepaths"])
+            for _, _, edge_attrs in self.out_edges(node, data=True)
+        ]
+        output_slot_attrs = self._deduplicate_output_slots(
+            output_slots, output_filepaths_by_slot
+        )
+        return input_slot_attrs, output_slot_attrs

     ##################
     # Helper Methods #
@@ -285,6 +322,15 @@ class PipelineGraph(ImplementationGraph):
         :class:`OutputSlots<easylink.graph_components.OutputSlot>`, and
         :class:`~easylink.graph_components.EdgeParams` needed to construct the
         combined implementation.
+
+        Notes
+        -----
+        When combining implementations results in a node with multiple slots with
+        the same name and/or environment variable, the slots are made unique
+        by prepending the :class:`~easylink.step.Step` name to the slot name as well
+        as to the environment variable. This is necessary to prevent collisions
+        with a combined implementation that takes multiple environment variables that
+        have the same name.
         """
         slot_types = ["input_slot", "output_slot"]
         combined_slots_by_type = combined_input_slots, combined_output_slots = set(), set()
@@ -292,7 +338,8 @@ class PipelineGraph(ImplementationGraph):
         transform_mappings = (InputSlotMapping, OutputSlotMapping)

         combined_edges = set()
-
+        # FIXME [MIC-5848]: test coverage is lacking when two output slots have the same name,
+        # i.e. combining two steps that have same-named output slots
         for slot_type, combined_slots, edges_by_slot, transform_mapping in zip(
             slot_types, combined_slots_by_type, edges_by_slot_and_type, transform_mappings
         ):
@@ -402,8 +449,9 @@ class PipelineGraph(ImplementationGraph):
     def _update_slot_filepaths(self, config: Config) -> None:
         """Fills graph edges with appropriate filepath information.

-        This method updates the ``Step`` slot information with actual filepaths.
-
+        This method updates the :class:`~easylink.step.Step` slot information with
+        actual filepaths. This can't happen earlier in the process because we
+        don't know node names until now (which are required for the filepaths).

         Parameters
         ----------
@@ -424,7 +472,8 @@ class PipelineGraph(ImplementationGraph):

         # Update implementation nodes with yaml metadata
         for node in self.implementation_nodes:
-            imp_outputs = self.nodes[node]["implementation"].outputs
+            implementation = self.nodes[node]["implementation"]
+            imp_outputs = implementation.outputs
             for src, sink, edge_attrs in self.out_edges(node, data=True):
                 for edge_idx in self[node][sink]:
                     self[src][sink][edge_idx]["filepaths"] = (
@@ -436,10 +485,10 @@ class PipelineGraph(ImplementationGraph):
         )

     @staticmethod
-    def
+    def _deduplicate_input_slots(
         input_slots: list[InputSlot], filepaths_by_slot: list[str]
     ) -> dict[str, dict[str, str | list[str]]]:
-        """
+        """Deduplicates input slots into a dictionary with filepaths.

         Parameters
         ----------
@@ -460,10 +509,11 @@ class PipelineGraph(ImplementationGraph):
         """
         condensed_slot_dict = {}
         for input_slot, filepaths in zip(input_slots, filepaths_by_slot):
-            slot_name, env_var, validator = (
+            slot_name, env_var, validator, splitter = (
                 input_slot.name,
                 input_slot.env_var,
                 input_slot.validator,
+                input_slot.splitter,
             )
             if slot_name in condensed_slot_dict:
                 if env_var != condensed_slot_dict[slot_name]["env_var"]:
@@ -476,11 +526,46 @@ class PipelineGraph(ImplementationGraph):
                         f"Duplicate input slots named '{slot_name}' have different validators: "
                        f"'{validator.__name__}' and '{condensed_slot_validator.__name__}'."
                     )
+                # Add the new filepaths to the existing slot
                 condensed_slot_dict[slot_name]["filepaths"].extend(filepaths)
             else:
                 condensed_slot_dict[slot_name] = {
                     "env_var": env_var,
                     "validator": validator,
                     "filepaths": filepaths,
+                    "splitter": splitter,
+                }
+        return condensed_slot_dict
+
+    @staticmethod
+    def _deduplicate_output_slots(
+        output_slots: list[OutputSlot], filepaths_by_slot: list[str]
+    ) -> dict[str, dict[str, str | list[str]]]:
+        """Deduplicates output slots into a dictionary with filepaths.
+
+        Parameters
+        ----------
+        output_slots
+            The :class:`OutputSlots<easylink.graph_components.OutputSlot>` to deduplicate.
+        filepaths_by_slot
+            The filepaths associated with each ``OutputSlot``.
+
+        Returns
+        -------
+        A dictionary mapping ``OutputSlot`` names to their attributes and filepaths.
+        """
+        condensed_slot_dict = {}
+        for output_slot, filepaths in zip(output_slots, filepaths_by_slot):
+            slot_name, aggregator = (
+                output_slot.name,
+                output_slot.aggregator,
+            )
+            if slot_name in condensed_slot_dict:
+                # Add the new filepaths to the existing slot
+                condensed_slot_dict[slot_name]["filepaths"].extend(filepaths)
+            else:
+                condensed_slot_dict[slot_name] = {
+                    "filepaths": filepaths,
+                    "aggregator": aggregator,
                 }
         return condensed_slot_dict
easylink/pipeline_schema_constants/__init__.py
CHANGED
@@ -6,7 +6,10 @@ ALLOWED_SCHEMA_PARAMS = {

 TESTING_SCHEMA_PARAMS = {
     "integration": testing.SINGLE_STEP_SCHEMA_PARAMS,
-    "…
-    "…
+    "combine_bad_topology": testing.BAD_COMBINED_TOPOLOGY_SCHEMA_PARAMS,
+    "combine_bad_implementation_names": testing.BAD_COMBINED_TOPOLOGY_SCHEMA_PARAMS,
     "nested_templated_steps": testing.NESTED_TEMPLATED_STEPS_SCHEMA_PARAMS,
+    "combine_with_iteration": testing.COMBINE_WITH_ITERATION_SCHEMA_PARAMS,
+    "combine_with_iteration_cycle": testing.COMBINE_WITH_ITERATION_SCHEMA_PARAMS,
+    "combine_with_extra_node": testing.TRIPLE_STEP_SCHEMA_PARAMS,
 }
easylink/pipeline_schema_constants/development.py
CHANGED
@@ -13,6 +13,7 @@ from easylink.graph_components import (
 )
 from easylink.step import (
     ChoiceStep,
+    EmbarrassinglyParallelStep,
     HierarchicalStep,
     InputStep,
     LoopStep,
@@ -20,6 +21,8 @@ from easylink.step import (
     ParallelStep,
     Step,
 )
+from easylink.utilities.aggregator_utils import concatenate_datasets
+from easylink.utilities.splitter_utils import split_data_by_size
 from easylink.utilities.validation_utils import validate_input_file_dummy

 NODES = [
@@ -49,16 +52,22 @@ NODES = [
         output_slots=[OutputSlot("step_2_main_output")],
     ),
     LoopStep(
-        template_step=Step(
+        template_step=EmbarrassinglyParallelStep(
             step_name="step_3",
             input_slots=[
                 InputSlot(
                     name="step_3_main_input",
                     env_var="DUMMY_CONTAINER_MAIN_INPUT_FILE_PATHS",
                     validator=validate_input_file_dummy,
+                    splitter=split_data_by_size,
+                ),
+            ],
+            output_slots=[
+                OutputSlot(
+                    name="step_3_main_output",
+                    aggregator=concatenate_datasets,
                 ),
             ],
-            output_slots=[OutputSlot("step_3_main_output")],
         ),
         self_edges=[
             EdgeParams(
|