easylink 0.1.13__py3-none-any.whl → 0.1.15__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- easylink/_version.py +1 -1
- easylink/configuration.py +5 -5
- easylink/graph_components.py +48 -51
- easylink/implementation.py +70 -10
- easylink/pipeline.py +127 -24
- easylink/pipeline_graph.py +46 -26
- easylink/pipeline_schema_constants/__init__.py +11 -7
- easylink/pipeline_schema_constants/development.py +2 -23
- easylink/pipeline_schema_constants/testing.py +243 -17
- easylink/rule.py +60 -140
- easylink/runner.py +14 -9
- easylink/step.py +397 -143
- easylink/utilities/spark.smk +2 -2
- easylink/utilities/splitter_utils.py +35 -0
- {easylink-0.1.13.dist-info → easylink-0.1.15.dist-info}/METADATA +22 -14
- {easylink-0.1.13.dist-info → easylink-0.1.15.dist-info}/RECORD +19 -19
- {easylink-0.1.13.dist-info → easylink-0.1.15.dist-info}/WHEEL +1 -1
- {easylink-0.1.13.dist-info → easylink-0.1.15.dist-info}/entry_points.txt +0 -0
- {easylink-0.1.13.dist-info → easylink-0.1.15.dist-info}/top_level.txt +0 -0
easylink/step.py
CHANGED
@@ -14,7 +14,7 @@ from __future__ import annotations
|
|
14
14
|
import copy
|
15
15
|
from abc import ABC, abstractmethod
|
16
16
|
from collections import defaultdict
|
17
|
-
from collections.abc import Iterable
|
17
|
+
from collections.abc import Callable, Iterable
|
18
18
|
|
19
19
|
from layered_config_tree import LayeredConfigTree
|
20
20
|
|
@@ -30,7 +30,9 @@ from easylink.graph_components import (
|
|
30
30
|
)
|
31
31
|
from easylink.implementation import (
|
32
32
|
Implementation,
|
33
|
+
NullAggregatorImplementation,
|
33
34
|
NullImplementation,
|
35
|
+
NullSplitterImplementation,
|
34
36
|
PartialImplementation,
|
35
37
|
)
|
36
38
|
from easylink.utilities import paths
|
@@ -69,6 +71,8 @@ class Step:
|
|
69
71
|
The :class:`InputSlotMapping<easylink.graph_components.InputSlotMapping>` of this ``Step``.
|
70
72
|
output_slot_mappings
|
71
73
|
The :class:`OutputSlotMapping<easylink.graph_components.OutputSlotMapping>` of this ``Step``.
|
74
|
+
is_embarrassingly_parallel
|
75
|
+
Whether or not this ``Step`` is to be run in an embarrassingly parallel manner.
|
72
76
|
|
73
77
|
Notes
|
74
78
|
-----
|
@@ -81,7 +85,7 @@ class Step:
|
|
81
85
|
|
82
86
|
def __init__(
|
83
87
|
self,
|
84
|
-
step_name: str,
|
88
|
+
step_name: str | None,
|
85
89
|
name: str | None = None,
|
86
90
|
input_slots: Iterable[InputSlot] = (),
|
87
91
|
output_slots: Iterable[OutputSlot] = (),
|
@@ -89,10 +93,12 @@ class Step:
|
|
89
93
|
output_slot_mappings: Iterable[OutputSlotMapping] = (),
|
90
94
|
is_embarrassingly_parallel: bool = False,
|
91
95
|
) -> None:
|
96
|
+
if not step_name and not name:
|
97
|
+
raise ValueError("All Steps must contain a step_name, name, or both.")
|
92
98
|
self.step_name = step_name
|
93
99
|
"""The name of the pipeline step in the ``PipelineSchema``. It must also match
|
94
100
|
the key in the implementation metadata file to be used to run this ``Step``."""
|
95
|
-
self.
|
101
|
+
self._name = name if name else step_name
|
96
102
|
"""The name of this ``Step's`` node in its :class:`easylink.graph_components.StepGraph`.
|
97
103
|
This can be different from the ``step_name`` due to the need for disambiguation
|
98
104
|
during the process of flattening the ``StepGraph``, e.g. unrolling loops, etc.
|
@@ -115,6 +121,20 @@ class Step:
|
|
115
121
|
self._configuration_state = None
|
116
122
|
"""This ``Step's`` :class:`~easylink.step.ConfigurationState`."""
|
117
123
|
|
124
|
+
@property
|
125
|
+
def name(self):
|
126
|
+
"""The name of this ``Step's`` node in its :class:`easylink.graph_components.StepGraph`.
|
127
|
+
This can be different from the ``step_name`` due to the need for disambiguation
|
128
|
+
during the process of flattening the ``StepGraph``, e.g. unrolling loops, etc.
|
129
|
+
For example, if step 1 is looped multiple times, each node would have a
|
130
|
+
``step_name`` of, perhaps, "step_1" but unique ``names`` ("step_1_loop_1", etc)."""
|
131
|
+
return self._name
|
132
|
+
|
133
|
+
@name.setter
|
134
|
+
def name(self, value: str):
|
135
|
+
"""Sets the ``name`` of this ``Step``."""
|
136
|
+
self._name = value
|
137
|
+
|
118
138
|
@property
|
119
139
|
def config_key(self):
|
120
140
|
"""The configuration key pertinent to this type of ``Step``."""
|
@@ -161,8 +181,9 @@ class Step:
|
|
161
181
|
node_names = []
|
162
182
|
step_names = []
|
163
183
|
while step:
|
164
|
-
|
165
|
-
|
184
|
+
if step.step_name:
|
185
|
+
node_names.append(step.name)
|
186
|
+
step_names.append(step.step_name)
|
166
187
|
step = step.parent_step
|
167
188
|
|
168
189
|
prefix = []
|
@@ -333,13 +354,11 @@ class Step:
|
|
333
354
|
}
|
334
355
|
|
335
356
|
|
336
|
-
class
|
337
|
-
"""A special case type of :class:`Step`
|
357
|
+
class StandaloneStep(Step, ABC):
|
358
|
+
"""A special case type of :class:`Step` that is not implemented on the pipeline.
|
338
359
|
|
339
|
-
|
340
|
-
|
341
|
-
classes. These are not typical ``Steps`` in that they do not represent a unit
|
342
|
-
of work to be performed in the pipeline (i.e. there is no container to run) and,
|
360
|
+
These are not typical ``Steps`` in that they do not represent a unit of work
|
361
|
+
to be performed in the pipeline (i.e. there is no container to run) and,
|
343
362
|
thus, are not implemented by an :class:`~easylink.implementation.Implementation`.
|
344
363
|
|
345
364
|
See :class:`Step` for inherited attributes.
|
@@ -348,32 +367,47 @@ class IOStep(Step):
|
|
348
367
|
|
349
368
|
@property
|
350
369
|
def implementation_node_name(self) -> str:
|
351
|
-
"""Dummy name to allow ``
|
370
|
+
"""Dummy name to allow ``StandaloneSteps`` to be used interchangeably with other ``Steps``.
|
352
371
|
|
353
|
-
Unlike other types of ``Steps``, ``
|
372
|
+
Unlike other types of ``Steps``, ``StandaloneSteps`` are not actually implemented
|
354
373
|
via an :class:`~easylink.implementation.Implementation` and thus do not
|
355
374
|
require a different node name than its own ``Step`` name. This property
|
356
|
-
only exists so that ``
|
375
|
+
only exists so that ``StandaloneSteps`` can be used interchangeably with other
|
357
376
|
``Steps`` in the codebase.
|
358
377
|
|
359
378
|
Returns
|
360
379
|
-------
|
361
|
-
The ``
|
380
|
+
The ``StandaloneStep's`` name.
|
362
381
|
"""
|
363
382
|
return self.name
|
364
383
|
|
384
|
+
@abstractmethod
|
385
|
+
def add_nodes_to_implementation_graph(
|
386
|
+
self, implementation_graph: ImplementationGraph
|
387
|
+
) -> None:
|
388
|
+
"""Adds this ``StandaloneStep's`` ``Implementation`` as a node to the :class:`~easylink.graph_components.ImplementationGraph`.
|
389
|
+
|
390
|
+
Notes
|
391
|
+
-----
|
392
|
+
Unlike other types of ``Steps``, ``StandaloneSteps`` are not actually implemented
|
393
|
+
via an :class:`~easylink.implementation.Implementation`. As such, we
|
394
|
+
leverage the :class:`~easylink.implementation.NullImplementation` class
|
395
|
+
to generate the graph node.
|
396
|
+
"""
|
397
|
+
pass
|
398
|
+
|
365
399
|
def validate_step(
|
366
400
|
self,
|
367
401
|
step_config: LayeredConfigTree,
|
368
402
|
combined_implementations: LayeredConfigTree,
|
369
403
|
input_data_config: LayeredConfigTree,
|
370
404
|
) -> dict[str, list[str]]:
|
371
|
-
"""Dummy validation method to allow ``
|
405
|
+
"""Dummy validation method to allow ``StandaloneSteps`` to be used interchangeably with other ``Steps``.
|
372
406
|
|
373
|
-
Unlike other types of ``Steps``, ``
|
407
|
+
Unlike other types of ``Steps``, ``StandaloneSteps`` are not actually implemented
|
374
408
|
via an :class:`~easylink.implementation.Implementation` and thus do not
|
375
409
|
require any sort of validation since no new data is created. This method
|
376
|
-
only exists so that ``
|
410
|
+
only exists so that ``StandaloneSteps`` can be used interchangeably with other
|
377
411
|
``Steps`` in the codebase.
|
378
412
|
|
379
413
|
Returns
|
@@ -404,18 +438,31 @@ class IOStep(Step):
|
|
404
438
|
self, step_config, combined_implementations, input_data_config
|
405
439
|
)
|
406
440
|
|
441
|
+
def add_edges_to_implementation_graph(self, implementation_graph):
|
442
|
+
"""Overwrites the super ``Step``'s method to do nothing.
|
443
|
+
|
444
|
+
``StandaloneSteps`` do not have edges within them in the ``ImplementationGraph``,
|
445
|
+
since they are represented by a single ``NullImplementation`` node, and so we
|
446
|
+
simply pass.
|
447
|
+
"""
|
448
|
+
pass
|
449
|
+
|
450
|
+
|
451
|
+
class IOStep(StandaloneStep):
|
452
|
+
"""A type of :class:`StandaloneStep` used to represent incoming and outgoing data.
|
453
|
+
|
454
|
+
``IOSteps`` are used to handle the incoming and outgoing data to the pipeline;
|
455
|
+
they are inherited by concrete :class:`InputStep` and :class:`OutputStep`
|
456
|
+
classes.
|
457
|
+
|
458
|
+
See :class:`Step` for inherited attributes.
|
459
|
+
|
460
|
+
"""
|
461
|
+
|
407
462
|
def add_nodes_to_implementation_graph(
|
408
463
|
self, implementation_graph: ImplementationGraph
|
409
464
|
) -> None:
|
410
|
-
"""Adds
|
411
|
-
|
412
|
-
Notes
|
413
|
-
-----
|
414
|
-
Unlike other types of ``Steps``, ``IOSteps`` are not actually implemented
|
415
|
-
via an :class:`~easylink.implementation.Implementation`. As such, we
|
416
|
-
leverage the :class:`~easylink.implementation.NullImplementation` class
|
417
|
-
to generate the graph node.
|
418
|
-
"""
|
465
|
+
"""Adds a :class:`~easylink.implementation.NullImplementation` node to the :class:`~easylink.graph_components.ImplementationGraph`."""
|
419
466
|
implementation_graph.add_node_from_implementation(
|
420
467
|
self.name,
|
421
468
|
implementation=NullImplementation(
|
@@ -423,18 +470,9 @@ class IOStep(Step):
|
|
423
470
|
),
|
424
471
|
)
|
425
472
|
|
426
|
-
def add_edges_to_implementation_graph(self, implementation_graph):
|
427
|
-
"""Adds the edges of this ``Step's`` ``Implementation`` to the ``ImplementationGraph``.
|
428
|
-
|
429
|
-
``IOSteps`` do not have edges within them in the ``ImplementationGraph``,
|
430
|
-
since they are represented by a single ``NullImplementation`` node, and so we
|
431
|
-
simply pass.
|
432
|
-
"""
|
433
|
-
pass
|
434
|
-
|
435
473
|
|
436
474
|
class InputStep(IOStep):
|
437
|
-
"""A special case type of :class:`
|
475
|
+
"""A special case type of :class:`IOStep` used to represent incoming data.
|
438
476
|
|
439
477
|
An ``InputStep`` is used to pass data into the pipeline. Since we do not know
|
440
478
|
what the data to pass into the pipeline will be a priori, we instantiate an
|
@@ -442,6 +480,7 @@ class InputStep(IOStep):
|
|
442
480
|
*all* data defined in the input data specification file.
|
443
481
|
|
444
482
|
See :class:`IOStep` for inherited attributes.
|
483
|
+
|
445
484
|
"""
|
446
485
|
|
447
486
|
def __init__(self) -> None:
|
@@ -478,7 +517,7 @@ class InputStep(IOStep):
|
|
478
517
|
|
479
518
|
|
480
519
|
class OutputStep(IOStep):
|
481
|
-
"""A special case type of :class:`
|
520
|
+
"""A special case type of :class:`IOStep` used to represent final results data.
|
482
521
|
|
483
522
|
An ``OutputStep`` is used to write the `Snakemake <https://snakemake.readthedocs.io/en/stable/>`_
|
484
523
|
Snakefile target rule in the :meth:`easylink.pipeline.Pipeline.build_snakefile`
|
@@ -511,10 +550,11 @@ class HierarchicalStep(Step):
|
|
511
550
|
step_graph
|
512
551
|
The :class:`~easylink.graph_components.StepGraph` i.e. the directed acyclic
|
513
552
|
graph (DAG) of sub-nodes and their edges that make up this ``HierarchicalStep``.
|
514
|
-
|
515
|
-
Whether or not the ``HierarchicalStep`` is
|
516
|
-
attribute to allow for back-end ``HierarchicalStep``
|
517
|
-
|
553
|
+
directly_implemented
|
554
|
+
Whether or not the ``HierarchicalStep`` is implemented directly from the user.
|
555
|
+
It is a convenience attribute to allow for back-end ``HierarchicalStep``
|
556
|
+
construction (i.e. ones that do not have a corresponding user-provided
|
557
|
+
'substeps' configuration key).
|
518
558
|
|
519
559
|
"""
|
520
560
|
|
@@ -528,7 +568,7 @@ class HierarchicalStep(Step):
|
|
528
568
|
edges=(),
|
529
569
|
input_slot_mappings=(),
|
530
570
|
output_slot_mappings=(),
|
531
|
-
|
571
|
+
directly_implemented=True,
|
532
572
|
):
|
533
573
|
super().__init__(
|
534
574
|
step_name,
|
@@ -547,7 +587,7 @@ class HierarchicalStep(Step):
|
|
547
587
|
self.step_graph = self._get_step_graph(nodes, edges)
|
548
588
|
"""The :class:`~easylink.graph_components.StepGraph` i.e. the directed acyclic
|
549
589
|
graph (DAG) of sub-nodes and their edges that make up this ``HierarchicalStep``."""
|
550
|
-
self.
|
590
|
+
self.directly_implemented = directly_implemented
|
551
591
|
"""Whether or not the ``HierarchicalStep`` is implemented directly from the user. It is a
|
552
592
|
convenience attribute to allow for back-end ``HierarchicalStep`` construction (i.e. ones
|
553
593
|
that do not have a corresponding user-provided 'substeps' configuration key)."""
|
@@ -595,7 +635,7 @@ class HierarchicalStep(Step):
|
|
595
635
|
all issues in one pass. In these cases, new errors may be found after the
|
596
636
|
initial ones are handled.
|
597
637
|
"""
|
598
|
-
if self.
|
638
|
+
if self.directly_implemented:
|
599
639
|
if self.config_key in step_config:
|
600
640
|
step_config = step_config[self.config_key]
|
601
641
|
else:
|
@@ -616,7 +656,7 @@ class HierarchicalStep(Step):
|
|
616
656
|
"""Sets the configuration state.
|
617
657
|
|
618
658
|
The configuration state of a ``HierarchicalStep`` depends on (1) whether
|
619
|
-
or not it is :attr:`
|
659
|
+
or not it is :attr:`directly_implemented` and (2) whether or not the
|
620
660
|
:attr:`config_key` exists in the pipeline specification file.
|
621
661
|
|
622
662
|
Parameters
|
@@ -629,7 +669,7 @@ class HierarchicalStep(Step):
|
|
629
669
|
input_data_config
|
630
670
|
The input data configuration for the entire pipeline.
|
631
671
|
"""
|
632
|
-
if self.
|
672
|
+
if self.directly_implemented:
|
633
673
|
if self.config_key in step_config:
|
634
674
|
step_config = step_config[self.config_key]
|
635
675
|
configuration_state_type = NonLeafConfigurationState
|
@@ -780,7 +820,7 @@ class TemplatedStep(Step, ABC):
|
|
780
820
|
"""Validates the ``TemplatedStep``.
|
781
821
|
|
782
822
|
Regardless of whether or not a :attr:`Step.config_key` is set, we always
|
783
|
-
validate the
|
823
|
+
validate the base ``Step`` used to create the ``TemplatedStep``. If a
|
784
824
|
``config_key`` is indeed set (that is, there is some multiplicity), we
|
785
825
|
complete additional validations.
|
786
826
|
|
@@ -889,14 +929,16 @@ class TemplatedStep(Step, ABC):
|
|
889
929
|
self.step_graph.add_node_from_step(self.template_step)
|
890
930
|
# Update the slot mappings with renamed children
|
891
931
|
input_mappings = [
|
892
|
-
InputSlotMapping(slot, self.name, slot)
|
932
|
+
InputSlotMapping(slot, self.template_step.name, slot)
|
933
|
+
for slot in self.input_slots
|
893
934
|
]
|
894
935
|
output_mappings = [
|
895
|
-
OutputSlotMapping(slot, self.name, slot)
|
936
|
+
OutputSlotMapping(slot, self.template_step.name, slot)
|
937
|
+
for slot in self.output_slots
|
896
938
|
]
|
897
939
|
self.slot_mappings = {"input": input_mappings, "output": output_mappings}
|
898
940
|
# Add the key back to the expanded config
|
899
|
-
expanded_config = LayeredConfigTree({self.name: step_config})
|
941
|
+
expanded_config = LayeredConfigTree({self.template_step.name: step_config})
|
900
942
|
else:
|
901
943
|
expanded_config = self._get_config(step_config)
|
902
944
|
num_repeats = len(expanded_config)
|
@@ -1146,7 +1188,7 @@ class ParallelStep(TemplatedStep):
|
|
1146
1188
|
|
1147
1189
|
|
1148
1190
|
class EmbarrassinglyParallelStep(Step):
|
1149
|
-
"""A
|
1191
|
+
"""A :class:`Step` that is run in parallel on the backend.
|
1150
1192
|
|
1151
1193
|
An ``EmbarrassinglyParallelStep`` is different than a :class:`ParallelStep`
|
1152
1194
|
in that it is not configured by the user to be run in parallel - it completely
|
@@ -1159,29 +1201,47 @@ class EmbarrassinglyParallelStep(Step):
|
|
1159
1201
|
step
|
1160
1202
|
The ``Step`` to be run in an embarrassingly parallel manner. To run multiple
|
1161
1203
|
steps in parallel, use a :class:`HierarchicalStep`.
|
1204
|
+
slot_splitter_mapping
|
1205
|
+
A mapping of the :class:`~easylink.graph_components.InputSlot` name to split
|
1206
|
+
to the actual splitter function to be used.
|
1207
|
+
slot_aggregator_mapping
|
1208
|
+
A mapping of all :class:`~easylink.graph_components.OutputSlot` names to
|
1209
|
+
be aggregated and the actual aggregator function to be used.
|
1162
1210
|
|
1163
1211
|
"""
|
1164
1212
|
|
1165
1213
|
def __init__(
|
1166
1214
|
self,
|
1167
1215
|
step: Step,
|
1168
|
-
|
1169
|
-
|
1170
|
-
input_slot_mappings: Iterable[InputSlotMapping],
|
1171
|
-
output_slot_mappings: Iterable[OutputSlotMapping],
|
1216
|
+
slot_splitter_mapping: dict[str, Callable],
|
1217
|
+
slot_aggregator_mapping: dict[str, Callable],
|
1172
1218
|
) -> None:
|
1173
1219
|
super().__init__(
|
1174
|
-
|
1175
|
-
step.name,
|
1176
|
-
input_slots,
|
1177
|
-
output_slots,
|
1178
|
-
input_slot_mappings,
|
1179
|
-
output_slot_mappings,
|
1220
|
+
step_name=None,
|
1221
|
+
name=step.name,
|
1180
1222
|
is_embarrassingly_parallel=True,
|
1181
1223
|
)
|
1224
|
+
self.slot_splitter_mapping = slot_splitter_mapping
|
1225
|
+
"""A mapping of the :class:`~easylink.graph_components.InputSlot` name to split
|
1226
|
+
to the actual splitter function to be used."""
|
1227
|
+
self.slot_aggregator_mapping = slot_aggregator_mapping
|
1228
|
+
"""A mapping of all :class:`~easylink.graph_components.OutputSlot` names to
|
1229
|
+
be aggregated and the actual aggregator function to be used."""
|
1182
1230
|
self.step_graph = None
|
1183
1231
|
self.step = step
|
1232
|
+
self.step.set_parent_step(self)
|
1233
|
+
self.input_slots = self.step.input_slots
|
1234
|
+
self.output_slots = self.step.output_slots
|
1184
1235
|
self._validate()
|
1236
|
+
# NOTE: We validated that the slot_splitter_mapping has only one item in self._validate()
|
1237
|
+
self.split_slot_name = list(self.slot_splitter_mapping.keys())[0]
|
1238
|
+
"""The name of the ``InputSlot`` to be split."""
|
1239
|
+
|
1240
|
+
@Step.name.setter
|
1241
|
+
def name(self, value: str) -> None:
|
1242
|
+
"""Changes the name of the ``EmbarrassinglyParallelStep`` and the underlying :class:`Step` to the given value."""
|
1243
|
+
self._name = value
|
1244
|
+
self.step._name = value
|
1185
1245
|
|
1186
1246
|
def _validate(self) -> None:
|
1187
1247
|
"""Validates the ``EmbarrassinglyParallelStep``.
|
@@ -1189,31 +1249,36 @@ class EmbarrassinglyParallelStep(Step):
|
|
1189
1249
|
``EmbarrassinglyParallelSteps`` are not configured by the user to be run
|
1190
1250
|
in parallel. Since it happens on the back end, we need to do somewhat unique
|
1191
1251
|
validations during construction. Specifically,
|
1192
|
-
- one and only one :class:`~easylink.graph_components.InputSlot` *must*
|
1193
|
-
a
|
1194
|
-
- all :class:`OutputSlots<easylink.graph_components.OutputSlot>` *must*
|
1195
|
-
|
1252
|
+
- one and only one :class:`~easylink.graph_components.InputSlot` *must*
|
1253
|
+
be mapped to a splitter method.
|
1254
|
+
- all :class:`OutputSlots<easylink.graph_components.OutputSlot>` *must*
|
1255
|
+
be mapped to aggregator methods.
|
1196
1256
|
"""
|
1197
1257
|
errors = []
|
1198
|
-
|
1199
|
-
|
1200
|
-
|
1201
|
-
|
1202
|
-
|
1203
|
-
|
1204
|
-
|
1258
|
+
|
1259
|
+
# check that only one input slot has a splitter assigned
|
1260
|
+
if len(self.slot_splitter_mapping) != 1:
|
1261
|
+
errors.append(
|
1262
|
+
f"EmbarrassinglyParallelStep '{self.step_name}' is attempting to define "
|
1263
|
+
f"{len(self.slot_splitter_mapping)} splitters when only one should be defined."
|
1264
|
+
)
|
1265
|
+
if len(self.slot_splitter_mapping) == 0:
|
1205
1266
|
errors.append(
|
1206
1267
|
f"EmbarrassinglyParallelStep '{self.step_name}' does not have any input slots with a "
|
1207
1268
|
"splitter method assigned; one and only one input slot must have a splitter."
|
1208
1269
|
)
|
1209
|
-
if len(
|
1270
|
+
if len(self.slot_splitter_mapping) > 1:
|
1210
1271
|
errors.append(
|
1211
1272
|
f"EmbarrassinglyParallelStep '{self.step_name}' has multiple input slots with "
|
1212
1273
|
"splitter methods assigned; one and only one input slot must have a splitter.\n"
|
1213
|
-
f"Input slots with splitters: {
|
1274
|
+
f"Input slots with splitters: {list(self.slot_splitter_mapping)}"
|
1214
1275
|
)
|
1276
|
+
|
1277
|
+
# check that all output slots have an aggregator assigned
|
1215
1278
|
missing_aggregators = [
|
1216
|
-
slot.name
|
1279
|
+
slot.name
|
1280
|
+
for slot in self.output_slots.values()
|
1281
|
+
if slot.name not in self.slot_aggregator_mapping
|
1217
1282
|
]
|
1218
1283
|
if len(missing_aggregators) != 0:
|
1219
1284
|
errors.append(
|
@@ -1223,6 +1288,49 @@ class EmbarrassinglyParallelStep(Step):
|
|
1223
1288
|
if errors:
|
1224
1289
|
raise ValueError("\n".join(errors))
|
1225
1290
|
|
1291
|
+
def validate_step(
|
1292
|
+
self,
|
1293
|
+
step_config: LayeredConfigTree,
|
1294
|
+
combined_implementations: LayeredConfigTree,
|
1295
|
+
input_data_config: LayeredConfigTree,
|
1296
|
+
) -> dict[str, list[str]]:
|
1297
|
+
"""Validates the ``EmbarrassinglyParallelStep``.
|
1298
|
+
|
1299
|
+
An ``EmbarrassinglyParallelStep`` performs no configuration checks of its
|
1300
|
+
own here; validation is delegated entirely to the underlying ``Step`` that
|
1301
|
+
is to be run in parallel, which is passed the same ``step_config``,
|
1302
|
+
``combined_implementations``, and ``input_data_config`` arguments.
|
1303
|
+
|
1304
|
+
Parameters
|
1305
|
+
----------
|
1306
|
+
step_config
|
1307
|
+
The internal configuration of this ``Step``, i.e. it should not include
|
1308
|
+
the ``Step's`` name.
|
1309
|
+
combined_implementations
|
1310
|
+
The configuration for any implementations to be combined.
|
1311
|
+
input_data_config
|
1312
|
+
The input data configuration for the entire pipeline.
|
1313
|
+
|
1314
|
+
Returns
|
1315
|
+
-------
|
1316
|
+
A dictionary of errors as returned by the underlying ``Step's`` validation,
|
1317
|
+
where the keys are step names and the values are lists of error messages
|
1318
|
+
associated with the given step.
|
1319
|
+
|
1320
|
+
Notes
|
1321
|
+
-----
|
1322
|
+
If the ``EmbarrassinglyParallelStep`` does not validate (i.e. errors are found and the returned
|
1323
|
+
dictionary is non-empty), the tool will exit and the pipeline will not run.
|
1324
|
+
|
1325
|
+
We attempt to batch error messages as much as possible, but there may be
|
1326
|
+
times where the configuration is so ill-formed that we are unable to handle
|
1327
|
+
all issues in one pass. In these cases, new errors may be found after the
|
1328
|
+
initial ones are handled.
|
1329
|
+
"""
|
1330
|
+
return self.step.validate_step(
|
1331
|
+
step_config, combined_implementations, input_data_config
|
1332
|
+
)
|
1333
|
+
|
1226
1334
|
def set_configuration_state(
|
1227
1335
|
self,
|
1228
1336
|
step_config: LayeredConfigTree,
|
@@ -1245,28 +1353,223 @@ class EmbarrassinglyParallelStep(Step):
|
|
1245
1353
|
input_data_config
|
1246
1354
|
The input data configuration for the entire pipeline.
|
1247
1355
|
"""
|
1248
|
-
|
1249
|
-
|
1250
|
-
|
1251
|
-
self.
|
1252
|
-
|
1253
|
-
|
1254
|
-
|
1255
|
-
|
1256
|
-
|
1257
|
-
|
1258
|
-
|
1259
|
-
|
1260
|
-
|
1261
|
-
|
1356
|
+
splitter_node_name = f"{self.name}_{self.split_slot_name}_split"
|
1357
|
+
splitter_step = SplitterStep(
|
1358
|
+
splitter_node_name,
|
1359
|
+
split_slot=self.input_slots[self.split_slot_name],
|
1360
|
+
splitter_func_name=self.slot_splitter_mapping[self.split_slot_name].__name__,
|
1361
|
+
)
|
1362
|
+
aggregator_node_name = f"{self.name}_aggregate"
|
1363
|
+
if len(self.output_slots) > 1:
|
1364
|
+
raise NotImplementedError(
|
1365
|
+
"FIXME [MIC-5883] Multiple output slots/files of EmbarrassinglyParallelSteps not yet supported"
|
1366
|
+
)
|
1367
|
+
output_slot = list(self.output_slots.values())[0]
|
1368
|
+
aggregator_step = AggregatorStep(
|
1369
|
+
aggregator_node_name,
|
1370
|
+
output_slot=output_slot,
|
1371
|
+
aggregator_func_name=self.slot_aggregator_mapping[output_slot.name].__name__,
|
1372
|
+
splitter_node_name=splitter_node_name,
|
1373
|
+
)
|
1374
|
+
self._update_step_graph(splitter_step, aggregator_step)
|
1375
|
+
self._update_slot_mappings(splitter_step, aggregator_step)
|
1262
1376
|
# Add the key back to the expanded config
|
1263
|
-
expanded_config = LayeredConfigTree({self.name: step_config})
|
1264
|
-
|
1377
|
+
expanded_config = LayeredConfigTree({self.step.name: step_config})
|
1265
1378
|
# EmbarrassinglyParallelSteps are by definition non-leaf steps
|
1266
1379
|
self._configuration_state = NonLeafConfigurationState(
|
1267
1380
|
self, expanded_config, combined_implementations, input_data_config
|
1268
1381
|
)
|
1269
1382
|
|
1383
|
+
def _update_step_graph(
|
1384
|
+
self, splitter_step: SplitterStep, aggregator_step: AggregatorStep
|
1385
|
+
) -> StepGraph:
|
1386
|
+
"""Updates the :class:`~easylink.graph_components.StepGraph` to include the splitting and aggregating nodes.
|
1387
|
+
|
1388
|
+
This strings exactly three nodes together: the :class:`SplitterStep` that does
|
1389
|
+
the splitting of the input data, the actual :class:`Step` to be run in parallel,
|
1390
|
+
and the :class:`AggregatorStep` that aggregates the output data, i.e.
|
1391
|
+
``SplitterStep`` -> ``Step`` -> ``AggregatorStep``.
|
1392
|
+
|
1393
|
+
Notes
|
1394
|
+
-----
|
1395
|
+
The ``SplitterStep`` and ``AggregatorStep`` are backed by versions of
|
1396
|
+
:class:`NullImplementations<easylink.implementation.NullImplementation>`,
|
1397
|
+
i.e. they do *not* actually require containers to run.
|
1398
|
+
|
1399
|
+
Parameters
|
1400
|
+
----------
|
1401
|
+
splitter_step
|
1402
|
+
The :class:`SplitterStep` that does the splitting of the input data.
|
1403
|
+
aggregator_step
|
1404
|
+
The :class:`AggregatorStep` that aggregates the output data.
|
1405
|
+
|
1406
|
+
Returns
|
1407
|
+
-------
|
1408
|
+
The updated ``StepGraph`` that includes ``SplitterStep``, ``Step``,
|
1409
|
+
and ``AggregatorStep`` nodes.
|
1410
|
+
"""
|
1411
|
+
self.step_graph = StepGraph()
|
1412
|
+
for node in [splitter_step, self.step, aggregator_step]:
|
1413
|
+
self.step_graph.add_node_from_step(node)
|
1414
|
+
|
1415
|
+
# Add SplitterStep -> Step edge
|
1416
|
+
self.step_graph.add_edge_from_params(
|
1417
|
+
EdgeParams(
|
1418
|
+
source_node=splitter_step.name,
|
1419
|
+
target_node=self.step.name,
|
1420
|
+
input_slot=self.split_slot_name,
|
1421
|
+
output_slot=list(splitter_step.output_slots.keys())[0],
|
1422
|
+
)
|
1423
|
+
)
|
1424
|
+
# Add the Step -> AggregatorStep edge
|
1425
|
+
if len(self.step.output_slots) > 1:
|
1426
|
+
raise NotImplementedError(
|
1427
|
+
"EmbarrassinglyParallelStep does not support multiple output slots."
|
1428
|
+
)
|
1429
|
+
self.step_graph.add_edge_from_params(
|
1430
|
+
EdgeParams(
|
1431
|
+
source_node=self.step.name,
|
1432
|
+
target_node=aggregator_step.name,
|
1433
|
+
input_slot=list(aggregator_step.input_slots.keys())[0],
|
1434
|
+
output_slot=list(self.step.output_slots.keys())[0],
|
1435
|
+
)
|
1436
|
+
)
|
1437
|
+
|
1438
|
+
def _update_slot_mappings(
|
1439
|
+
self, splitter_step: SplitterStep, aggregator_step: AggregatorStep
|
1440
|
+
) -> None:
|
1441
|
+
"""Updates the :class:`SlotMappings<easylink.graph_components.SlotMapping>`.
|
1442
|
+
|
1443
|
+
This updates the slot mappings so that the ``Step's`` inputs are redirected
|
1444
|
+
to the ``SplitterStep`` and the outputs are redirected to the ``AggregatorStep``.
|
1445
|
+
|
1446
|
+
Parameters
|
1447
|
+
----------
|
1448
|
+
splitter_step
|
1449
|
+
The :class:`SplitterStep` that does the splitting of the input data.
|
1450
|
+
aggregator_step
|
1451
|
+
The :class:`AggregatorStep` that aggregates the output data.
|
1452
|
+
|
1453
|
+
Returns
|
1454
|
+
-------
|
1455
|
+
Updated ``SlotMappings`` that account for ``SplitterStep`` and ``AggregatorStep``.
|
1456
|
+
"""
|
1457
|
+
# map the split input slot
|
1458
|
+
split_slot_name = list(splitter_step.input_slots.keys())[0]
|
1459
|
+
input_mappings = [
|
1460
|
+
InputSlotMapping(split_slot_name, splitter_step.name, split_slot_name)
|
1461
|
+
]
|
1462
|
+
# map remaining input slots
|
1463
|
+
for input_slot in [slot for slot in self.input_slots if slot != split_slot_name]:
|
1464
|
+
input_mappings.append(InputSlotMapping(input_slot, self.step.name, input_slot))
|
1465
|
+
# map the output slots
|
1466
|
+
output_mappings = [
|
1467
|
+
OutputSlotMapping(slot, aggregator_step.name, slot) for slot in self.output_slots
|
1468
|
+
]
|
1469
|
+
self.slot_mappings = {"input": input_mappings, "output": output_mappings}
|
1470
|
+
|
1471
|
+
|
1472
|
+
class SplitterStep(StandaloneStep):
|
1473
|
+
"""A :class:`StandaloneStep` that splits an :class:`~easylink.graph_components.InputSlot` for parallel processing.
|
1474
|
+
|
1475
|
+
A ``SplitterStep`` is intended to be used in conjunction with a corresponding
|
1476
|
+
:class:`AggregatorStep` and only during construction of an :class:`EmbarrassinglyParallelStep`.
|
1477
|
+
|
1478
|
+
See :class:`Step` for inherited attributes.
|
1479
|
+
|
1480
|
+
Parameters
|
1481
|
+
----------
|
1482
|
+
split_slot
|
1483
|
+
The :class:`~easylink.graph_components.InputSlot` to be split.
|
1484
|
+
splitter_func_name
|
1485
|
+
The name of the splitter function to be used.
|
1486
|
+
|
1487
|
+
"""
|
1488
|
+
|
1489
|
+
def __init__(self, name: str, split_slot: InputSlot, splitter_func_name: str) -> None:
|
1490
|
+
# Remove the env_var (not an implemented step) and validator (will be validated
|
1491
|
+
# after the splitting during input to the actual step to run)
|
1492
|
+
input_slot = copy.deepcopy(split_slot)
|
1493
|
+
input_slot.env_var = None
|
1494
|
+
input_slot.validator = None
|
1495
|
+
super().__init__(
|
1496
|
+
name, input_slots=[input_slot], output_slots=[OutputSlot(f"{name}_main_output")]
|
1497
|
+
)
|
1498
|
+
self.splitter_func_name = splitter_func_name
|
1499
|
+
"""The name of the splitter function to be used."""
|
1500
|
+
|
1501
|
+
def add_nodes_to_implementation_graph(
|
1502
|
+
self, implementation_graph: ImplementationGraph
|
1503
|
+
) -> None:
|
1504
|
+
"""Adds a :class:`~easylink.implementation.NullSplitterImplementation` node to the :class:`~easylink.graph_components.ImplementationGraph`."""
|
1505
|
+
implementation_graph.add_node_from_implementation(
|
1506
|
+
self.name,
|
1507
|
+
implementation=NullSplitterImplementation(
|
1508
|
+
self.name,
|
1509
|
+
self.input_slots.values(),
|
1510
|
+
self.output_slots.values(),
|
1511
|
+
self.splitter_func_name,
|
1512
|
+
),
|
1513
|
+
)
|
1514
|
+
|
1515
|
+
|
1516
|
+
class AggregatorStep(StandaloneStep):
    """A :class:`StandaloneStep` that aggregates :class:`OutputSlots<easylink.graph_components.OutputSlot>` after parallel processing.

    An ``AggregatorStep`` is intended to be used in conjunction with a corresponding
    :class:`SplitterStep` and only during construction of an :class:`EmbarrassinglyParallelStep`.

    See :class:`Step` for inherited attributes.

    Parameters
    ----------
    name
        The name of this step. The single input slot is named
        ``"{name}_main_input"`` and carries no environment variable or validator.
    output_slot
        The :class:`~easylink.graph_components.OutputSlot` used as this step's
        only output slot.
    aggregator_func_name
        The name of the aggregator function to be used.
    splitter_node_name
        The name of the ``SplitterStep`` and its corresponding
        :class:`~easylink.implementation.NullSplitterImplementation` that this ``AggregatorStep``
        is associated with.

    """

    def __init__(
        self,
        name: str,
        output_slot: OutputSlot,
        aggregator_func_name: str,
        splitter_node_name: str,
    ) -> None:
        super().__init__(
            name,
            input_slots=[
                InputSlot(
                    f"{name}_main_input",
                    env_var=None,
                    validator=None,
                )
            ],
            output_slots=[output_slot],
        )
        self.aggregator_func_name = aggregator_func_name
        """The name of the aggregator function to be used."""
        self.splitter_node_name = splitter_node_name
        """The name of the ``SplitterStep`` and its corresponding
        :class:`~easylink.implementation.NullSplitterImplementation` that this ``AggregatorStep``
        is associated with."""

    def add_nodes_to_implementation_graph(
        self, implementation_graph: ImplementationGraph
    ) -> None:
        """Adds a :class:`~easylink.implementation.NullAggregatorImplementation` node to the :class:`~easylink.graph_components.ImplementationGraph`.

        The node is registered under this step's own name.
        """
        implementation_graph.add_node_from_implementation(
            self.name,
            implementation=NullAggregatorImplementation(
                self.name,
                self.input_slots.values(),
                self.output_slots.values(),
                self.aggregator_func_name,
                self.splitter_node_name,
            ),
        )
|
1572
|
+
|
1270
1573
|
|
1271
1574
|
class ChoiceStep(Step):
|
1272
1575
|
"""A type of :class:`Step` that allows for choosing from a set of options.
|
@@ -1680,57 +1983,8 @@ class NonLeafConfigurationState(ConfigurationState):
|
|
1680
1983
|
substep = self._step.step_graph.nodes[node]["step"]
|
1681
1984
|
if self._step.is_embarrassingly_parallel:
|
1682
1985
|
substep.is_embarrassingly_parallel = True
|
1683
|
-
self._propagate_splitter_aggregators(self._step, substep)
|
1684
1986
|
substep.add_nodes_to_implementation_graph(implementation_graph)
|
1685
1987
|
|
1686
|
-
@staticmethod
|
1687
|
-
def _propagate_splitter_aggregators(parent: Step, child: Step):
|
1688
|
-
"""Propagates splitters and aggregators to child ``Steps``.
|
1689
|
-
|
1690
|
-
This method adds the :meth:`~easylink.graph_components.InputSlot.splitter`
|
1691
|
-
and :meth:`~easylink.graph_components.OutputSlot.aggregator` methods from a
|
1692
|
-
parent ``Step's`` :class:`~easylink.graph_components.InputSlot` and
|
1693
|
-
:class:`OutputSlots<easylink.graph_components.OutputSlot>` to the corresponding
|
1694
|
-
child steps' slots.
|
1695
|
-
|
1696
|
-
Parameters
|
1697
|
-
----------
|
1698
|
-
parent
|
1699
|
-
The parent ``Step`` whose ``splitter`` and ``aggregator`` methods are
|
1700
|
-
to be propagated to the appropriate child ``Step``.
|
1701
|
-
child
|
1702
|
-
A child ``Step`` to potentially have its parent's ``splitter`` and
|
1703
|
-
``aggregators`` assigned to its ``InputSlot`` and ``OutputSlots``,
|
1704
|
-
respectively.
|
1705
|
-
"""
|
1706
|
-
for parent_input_slot_name, parent_input_slot in parent.input_slots.items():
|
1707
|
-
if parent_input_slot.splitter:
|
1708
|
-
# Extract the appropriate child slot name from the mapping
|
1709
|
-
mappings_with_splitter = [
|
1710
|
-
mapping
|
1711
|
-
for mapping in parent.slot_mappings["input"]
|
1712
|
-
if mapping.parent_slot == parent_input_slot_name
|
1713
|
-
]
|
1714
|
-
for mapping in mappings_with_splitter:
|
1715
|
-
child_node = mapping.child_node
|
1716
|
-
child_slot = mapping.child_slot
|
1717
|
-
# Assign the splitter to the appropriate child slot
|
1718
|
-
if child_slot in child.input_slots and child_node == child.name:
|
1719
|
-
child.input_slots[child_slot].splitter = parent_input_slot.splitter
|
1720
|
-
for parent_output_slot_name, parent_output_slot in parent.output_slots.items():
|
1721
|
-
# Extract the appropriate child slot name from the mapping
|
1722
|
-
mappings_from_parent = [
|
1723
|
-
mapping
|
1724
|
-
for mapping in parent.slot_mappings["output"]
|
1725
|
-
if mapping.parent_slot == parent_output_slot_name
|
1726
|
-
]
|
1727
|
-
for mapping in mappings_from_parent:
|
1728
|
-
child_node = mapping.child_node
|
1729
|
-
child_slot = mapping.child_slot
|
1730
|
-
# Assign the aggregator to the appropriate child slot
|
1731
|
-
if child_slot in child.output_slots and child_node == child.name:
|
1732
|
-
child.output_slots[child_slot].aggregator = parent_output_slot.aggregator
|
1733
|
-
|
1734
1988
|
def add_edges_to_implementation_graph(
|
1735
1989
|
self, implementation_graph: ImplementationGraph
|
1736
1990
|
) -> None:
|
@@ -1842,10 +2096,10 @@ class NonLeafConfigurationState(ConfigurationState):
|
|
1842
2096
|
"""
|
1843
2097
|
for sub_node in self._step.step_graph.nodes:
|
1844
2098
|
sub_step = self._step.step_graph.nodes[sub_node]["step"]
|
1845
|
-
#
|
2099
|
+
# IOSteps, SplitterSteps, and AggregatorSteps never appear explicitly in the configuration
|
1846
2100
|
step_config = (
|
1847
2101
|
self.step_config
|
1848
|
-
if isinstance(sub_step, IOStep)
|
2102
|
+
if isinstance(sub_step, (IOStep, SplitterStep, AggregatorStep))
|
1849
2103
|
else self.step_config[sub_step.name]
|
1850
2104
|
)
|
1851
2105
|
sub_step.set_configuration_state(
|