easylink 0.1.13__py3-none-any.whl → 0.1.15__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
easylink/step.py CHANGED
@@ -14,7 +14,7 @@ from __future__ import annotations
14
14
  import copy
15
15
  from abc import ABC, abstractmethod
16
16
  from collections import defaultdict
17
- from collections.abc import Iterable
17
+ from collections.abc import Callable, Iterable
18
18
 
19
19
  from layered_config_tree import LayeredConfigTree
20
20
 
@@ -30,7 +30,9 @@ from easylink.graph_components import (
30
30
  )
31
31
  from easylink.implementation import (
32
32
  Implementation,
33
+ NullAggregatorImplementation,
33
34
  NullImplementation,
35
+ NullSplitterImplementation,
34
36
  PartialImplementation,
35
37
  )
36
38
  from easylink.utilities import paths
@@ -69,6 +71,8 @@ class Step:
69
71
  The :class:`InputSlotMapping<easylink.graph_components.InputSlotMapping>` of this ``Step``.
70
72
  output_slot_mappings
71
73
  The :class:`OutputSlotMapping<easylink.graph_components.OutputSlotMapping>` of this ``Step``.
74
+ is_embarrassingly_parallel
75
+ Whether or not this ``Step`` is to be run in an embarrassingly parallel manner.
72
76
 
73
77
  Notes
74
78
  -----
@@ -81,7 +85,7 @@ class Step:
81
85
 
82
86
  def __init__(
83
87
  self,
84
- step_name: str,
88
+ step_name: str | None,
85
89
  name: str | None = None,
86
90
  input_slots: Iterable[InputSlot] = (),
87
91
  output_slots: Iterable[OutputSlot] = (),
@@ -89,10 +93,12 @@ class Step:
89
93
  output_slot_mappings: Iterable[OutputSlotMapping] = (),
90
94
  is_embarrassingly_parallel: bool = False,
91
95
  ) -> None:
96
+ if not step_name and not name:
97
+ raise ValueError("All Steps must contain a step_name, name, or both.")
92
98
  self.step_name = step_name
93
99
  """The name of the pipeline step in the ``PipelineSchema``. It must also match
94
100
  the key in the implementation metadata file to be used to run this ``Step``."""
95
- self.name = name if name else step_name
101
+ self._name = name if name else step_name
96
102
  """The name of this ``Step's`` node in its :class:`easylink.graph_components.StepGraph`.
97
103
  This can be different from the ``step_name`` due to the need for disambiguation
98
104
  during the process of flattening the ``StepGraph``, e.g. unrolling loops, etc.
@@ -115,6 +121,20 @@ class Step:
115
121
  self._configuration_state = None
116
122
  """This ``Step's`` :class:`~easylink.step.ConfigurationState`."""
117
123
 
124
+ @property
125
+ def name(self):
126
+ """The name of this ``Step's`` node in its :class:`easylink.graph_components.StepGraph`.
127
+ This can be different from the ``step_name`` due to the need for disambiguation
128
+ during the process of flattening the ``StepGraph``, e.g. unrolling loops, etc.
129
+ For example, if step 1 is looped multiple times, each node would have a
130
+ ``step_name`` of, perhaps, "step_1" but unique ``names`` ("step_1_loop_1", etc)."""
131
+ return self._name
132
+
133
+ @name.setter
134
+ def name(self, value: str):
135
+ """Sets the ``name`` of this ``Step``."""
136
+ self._name = value
137
+
118
138
  @property
119
139
  def config_key(self):
120
140
  """The configuration key pertinent to this type of ``Step``."""
@@ -161,8 +181,9 @@ class Step:
161
181
  node_names = []
162
182
  step_names = []
163
183
  while step:
164
- node_names.append(step.name)
165
- step_names.append(step.step_name)
184
+ if step.step_name:
185
+ node_names.append(step.name)
186
+ step_names.append(step.step_name)
166
187
  step = step.parent_step
167
188
 
168
189
  prefix = []
@@ -333,13 +354,11 @@ class Step:
333
354
  }
334
355
 
335
356
 
336
- class IOStep(Step):
337
- """A special case type of :class:`Step` used to represent incoming and outgoing data.
357
+ class StandaloneStep(Step, ABC):
358
+ """A special case type of :class:`Step` that is not implemented on the pipeline.
338
359
 
339
- ``IOSteps`` are used to handle the incoming and outgoing data to the pipeline;
340
- they are inherited by concrete :class:`InputStep` and :class:`OutputStep`
341
- classes. These are not typical ``Steps`` in that they do not represent a unit
342
- of work to be performed in the pipeline (i.e. there is no container to run) and,
360
+ These are not typical ``Steps`` in that they do not represent a unit of work
361
+ to be performed in the pipeline (i.e. there is no container to run) and,
343
362
  thus, are not implemented by an :class:`~easylink.implementation.Implementation`.
344
363
 
345
364
  See :class:`Step` for inherited attributes.
@@ -348,32 +367,47 @@ class IOStep(Step):
348
367
 
349
368
  @property
350
369
  def implementation_node_name(self) -> str:
351
- """Dummy name to allow ``IOSteps`` to be used interchangeably with other ``Steps``.
370
+ """Dummy name to allow ``StandaloneSteps`` to be used interchangeably with other ``Steps``.
352
371
 
353
- Unlike other types of ``Steps``, ``IOSteps`` are not actually implemented
372
+ Unlike other types of ``Steps``, ``StandaloneSteps`` are not actually implemented
354
373
  via an :class:`~easylink.implementation.Implementation` and thus do not
355
374
  require a different node name than its own ``Step`` name. This property
356
- only exists so that ``IOSteps`` can be used interchangeably with other
375
+ only exists so that ``StandaloneSteps`` can be used interchangeably with other
357
376
  ``Steps`` in the codebase.
358
377
 
359
378
  Returns
360
379
  -------
361
- The ``IOStep's`` name.
380
+ The ``StandaloneStep's`` name.
362
381
  """
363
382
  return self.name
364
383
 
384
+ @abstractmethod
385
+ def add_nodes_to_implementation_graph(
386
+ self, implementation_graph: ImplementationGraph
387
+ ) -> None:
388
+ """Adds this ``StandaloneStep's`` ``Implementation`` as a node to the :class:`~easylink.graph_components.ImplementationGraph`.
389
+
390
+ Notes
391
+ -----
392
+ Unlike other types of ``Steps``, ``StandaloneSteps`` are not actually implemented
393
+ via an :class:`~easylink.implementation.Implementation`. As such, we
394
+ leverage the :class:`~easylink.implementation.NullImplementation` class
395
+ to generate the graph node.
396
+ """
397
+ pass
398
+
365
399
  def validate_step(
366
400
  self,
367
401
  step_config: LayeredConfigTree,
368
402
  combined_implementations: LayeredConfigTree,
369
403
  input_data_config: LayeredConfigTree,
370
404
  ) -> dict[str, list[str]]:
371
- """Dummy validation method to allow ``IOSteps`` to be used interchangeably with other ``Steps``.
405
+ """Dummy validation method to allow ``StandaloneSteps`` to be used interchangeably with other ``Steps``.
372
406
 
373
- Unlike other types of ``Steps``, ``IOSteps`` are not actually implemented
407
+ Unlike other types of ``Steps``, ``StandaloneSteps`` are not actually implemented
374
408
  via an :class:`~easylink.implementation.Implementation` and thus do not
375
409
  require any sort of validation since no new data is created. This method
376
- only exists so that ``IOSteps`` can be used interchangeably with other
410
+ only exists so that ``StandaloneSteps`` can be used interchangeably with other
377
411
  ``Steps`` in the codebase.
378
412
 
379
413
  Returns
@@ -404,18 +438,31 @@ class IOStep(Step):
404
438
  self, step_config, combined_implementations, input_data_config
405
439
  )
406
440
 
441
+ def add_edges_to_implementation_graph(self, implementation_graph):
442
+ """Overrides the super ``Step``'s method to do nothing.
443
+
444
+ ``StandaloneSteps`` do not have edges within them in the ``ImplementationGraph``,
445
+ since they are represented by a single ``NullImplementation`` node, and so we
446
+ simply pass.
447
+ """
448
+ pass
449
+
450
+
451
+ class IOStep(StandaloneStep):
452
+ """A type of :class:`StandaloneStep` used to represent incoming and outgoing data.
453
+
454
+ ``IOSteps`` are used to handle the incoming and outgoing data to the pipeline;
455
+ they are inherited by concrete :class:`InputStep` and :class:`OutputStep`
456
+ classes.
457
+
458
+ See :class:`Step` for inherited attributes.
459
+
460
+ """
461
+
407
462
  def add_nodes_to_implementation_graph(
408
463
  self, implementation_graph: ImplementationGraph
409
464
  ) -> None:
410
- """Adds this ``IOStep's`` ``Implementation`` as a node to the :class:`~easylink.graph_components.ImplementationGraph`.
411
-
412
- Notes
413
- -----
414
- Unlike other types of ``Steps``, ``IOSteps`` are not actually implemented
415
- via an :class:`~easylink.implementation.Implementation`. As such, we
416
- leverage the :class:`~easylink.implementation.NullImplementation` class
417
- to generate the graph node.
418
- """
465
+ """Adds a :class:`~easylink.implementation.NullImplementation` node to the :class:`~easylink.graph_components.ImplementationGraph`."""
419
466
  implementation_graph.add_node_from_implementation(
420
467
  self.name,
421
468
  implementation=NullImplementation(
@@ -423,18 +470,9 @@ class IOStep(Step):
423
470
  ),
424
471
  )
425
472
 
426
- def add_edges_to_implementation_graph(self, implementation_graph):
427
- """Adds the edges of this ``Step's`` ``Implementation`` to the ``ImplementationGraph``.
428
-
429
- ``IOSteps`` do not have edges within them in the ``ImplementationGraph``,
430
- since they are represented by a single ``NullImplementation`` node, and so we
431
- simply pass.
432
- """
433
- pass
434
-
435
473
 
436
474
  class InputStep(IOStep):
437
- """A special case type of :class:`Step` used to represent incoming data.
475
+ """A special case type of :class:`IOStep` used to represent incoming data.
438
476
 
439
477
  An ``InputStep`` is used to pass data into the pipeline. Since we do not know
440
478
  what the data to pass into the pipeline will be a priori, we instantiate an
@@ -442,6 +480,7 @@ class InputStep(IOStep):
442
480
  *all* data defined in the input data specification file.
443
481
 
444
482
  See :class:`IOStep` for inherited attributes.
483
+
445
484
  """
446
485
 
447
486
  def __init__(self) -> None:
@@ -478,7 +517,7 @@ class InputStep(IOStep):
478
517
 
479
518
 
480
519
  class OutputStep(IOStep):
481
- """A special case type of :class:`Step` used to represent final results data.
520
+ """A special case type of :class:`IOStep` used to represent final results data.
482
521
 
483
522
  An ``OutputStep`` is used to write the `Snakemake <https://snakemake.readthedocs.io/en/stable/>`_
484
523
  Snakefile target rule in the :meth:`easylink.pipeline.Pipeline.build_snakefile`
@@ -511,10 +550,11 @@ class HierarchicalStep(Step):
511
550
  step_graph
512
551
  The :class:`~easylink.graph_components.StepGraph` i.e. the directed acyclic
513
552
  graph (DAG) of sub-nodes and their edges that make up this ``HierarchicalStep``.
514
- user_configurable
515
- Whether or not the ``HierarchicalStep`` is user-configurable. It is a convenience
516
- attribute to allow for back-end ``HierarchicalStep`` creation that are not
517
- user-facing (i.e. they do not need to provide a 'substeps' configuration key).
553
+ directly_implemented
554
+ Whether or not the ``HierarchicalStep`` is implemented directly from the user.
555
+ It is a convenience attribute to allow for back-end ``HierarchicalStep``
556
+ construction (i.e. ones that do not have a corresponding user-provided
557
+ 'substeps' configuration key).
518
558
 
519
559
  """
520
560
 
@@ -528,7 +568,7 @@ class HierarchicalStep(Step):
528
568
  edges=(),
529
569
  input_slot_mappings=(),
530
570
  output_slot_mappings=(),
531
- user_configurable=True,
571
+ directly_implemented=True,
532
572
  ):
533
573
  super().__init__(
534
574
  step_name,
@@ -547,7 +587,7 @@ class HierarchicalStep(Step):
547
587
  self.step_graph = self._get_step_graph(nodes, edges)
548
588
  """The :class:`~easylink.graph_components.StepGraph` i.e. the directed acyclic
549
589
  graph (DAG) of sub-nodes and their edges that make up this ``HierarchicalStep``."""
550
- self.user_configurable = user_configurable
590
+ self.directly_implemented = directly_implemented
551
591
  """Whether or not the ``HierarchicalStep`` is user-configurable. It is a convenience
552
592
  attribute to allow for back-end ``HierarchicalStep`` creation that are not
553
593
  user-facing (i.e. they do not need to provide a 'substeps' configuration key)."""
@@ -595,7 +635,7 @@ class HierarchicalStep(Step):
595
635
  all issues in one pass. In these cases, new errors may be found after the
596
636
  initial ones are handled.
597
637
  """
598
- if self.user_configurable:
638
+ if self.directly_implemented:
599
639
  if self.config_key in step_config:
600
640
  step_config = step_config[self.config_key]
601
641
  else:
@@ -616,7 +656,7 @@ class HierarchicalStep(Step):
616
656
  """Sets the configuration state.
617
657
 
618
658
  The configuration state of a ``HierarchicalStep`` depends on (1) whether
619
- or not it is :attr:`user_configurable` and (2) whether or not the
659
+ or not it is :attr:`directly_implemented` and (2) whether or not the
620
660
  :attr:`config_key` exists in the pipeline specification file.
621
661
 
622
662
  Parameters
@@ -629,7 +669,7 @@ class HierarchicalStep(Step):
629
669
  input_data_config
630
670
  The input data configuration for the entire pipeline.
631
671
  """
632
- if self.user_configurable:
672
+ if self.directly_implemented:
633
673
  if self.config_key in step_config:
634
674
  step_config = step_config[self.config_key]
635
675
  configuration_state_type = NonLeafConfigurationState
@@ -780,7 +820,7 @@ class TemplatedStep(Step, ABC):
780
820
  """Validates the ``TemplatedStep``.
781
821
 
782
822
  Regardless of whether or not a :attr:`Step.config_key` is set, we always
783
- validate the the base ``Step`` used to create the ``TemplatedStep``. If a
823
+ validate the base ``Step`` used to create the ``TemplatedStep``. If a
784
824
  ``config_key`` is indeed set (that is, there is some multiplicity), we
785
825
  complete additional validations.
786
826
 
@@ -889,14 +929,16 @@ class TemplatedStep(Step, ABC):
889
929
  self.step_graph.add_node_from_step(self.template_step)
890
930
  # Update the slot mappings with renamed children
891
931
  input_mappings = [
892
- InputSlotMapping(slot, self.name, slot) for slot in self.input_slots
932
+ InputSlotMapping(slot, self.template_step.name, slot)
933
+ for slot in self.input_slots
893
934
  ]
894
935
  output_mappings = [
895
- OutputSlotMapping(slot, self.name, slot) for slot in self.output_slots
936
+ OutputSlotMapping(slot, self.template_step.name, slot)
937
+ for slot in self.output_slots
896
938
  ]
897
939
  self.slot_mappings = {"input": input_mappings, "output": output_mappings}
898
940
  # Add the key back to the expanded config
899
- expanded_config = LayeredConfigTree({self.name: step_config})
941
+ expanded_config = LayeredConfigTree({self.template_step.name: step_config})
900
942
  else:
901
943
  expanded_config = self._get_config(step_config)
902
944
  num_repeats = len(expanded_config)
@@ -1146,7 +1188,7 @@ class ParallelStep(TemplatedStep):
1146
1188
 
1147
1189
 
1148
1190
  class EmbarrassinglyParallelStep(Step):
1149
- """A step that is run in parallel on the backend.
1191
+ """A :class:`Step` that is run in parallel on the backend.
1150
1192
 
1151
1193
  An ``EmbarrassinglyParallelStep`` is different than a :class:`ParallelStep`
1152
1194
  in that it is not configured by the user to be run in parallel - it completely
@@ -1159,29 +1201,47 @@ class EmbarrassinglyParallelStep(Step):
1159
1201
  step
1160
1202
  The ``Step`` to be run in an embarrassingly parallel manner. To run multiple
1161
1203
  steps in parallel, use a :class:`HierarchicalStep`.
1204
+ slot_splitter_mapping
1205
+ A mapping of the :class:`~easylink.graph_components.InputSlot` name to split
1206
+ to the actual splitter function to be used.
1207
+ slot_aggregator_mapping
1208
+ A mapping of all :class:`~easylink.graph_components.OutputSlot` names to
1209
+ be aggregated and the actual aggregator function to be used.
1162
1210
 
1163
1211
  """
1164
1212
 
1165
1213
  def __init__(
1166
1214
  self,
1167
1215
  step: Step,
1168
- input_slots: Iterable[InputSlot],
1169
- output_slots: Iterable[OutputSlot],
1170
- input_slot_mappings: Iterable[InputSlotMapping],
1171
- output_slot_mappings: Iterable[OutputSlotMapping],
1216
+ slot_splitter_mapping: dict[str, Callable],
1217
+ slot_aggregator_mapping: dict[str, Callable],
1172
1218
  ) -> None:
1173
1219
  super().__init__(
1174
- step.step_name,
1175
- step.name,
1176
- input_slots,
1177
- output_slots,
1178
- input_slot_mappings,
1179
- output_slot_mappings,
1220
+ step_name=None,
1221
+ name=step.name,
1180
1222
  is_embarrassingly_parallel=True,
1181
1223
  )
1224
+ self.slot_splitter_mapping = slot_splitter_mapping
1225
+ """A mapping of the :class:`~easylink.graph_components.InputSlot` name to split
1226
+ to the actual splitter function to be used."""
1227
+ self.slot_aggregator_mapping = slot_aggregator_mapping
1228
+ """A mapping of all :class:`~easylink.graph_components.OutputSlot` names to
1229
+ be aggregated and the actual aggregator function to be used."""
1182
1230
  self.step_graph = None
1183
1231
  self.step = step
1232
+ self.step.set_parent_step(self)
1233
+ self.input_slots = self.step.input_slots
1234
+ self.output_slots = self.step.output_slots
1184
1235
  self._validate()
1236
+ # NOTE: We validated that the slot_splitter_mapping has only one item in self._validate()
1237
+ self.split_slot_name = list(self.slot_splitter_mapping.keys())[0]
1238
+ """The name of the ``InputSlot`` to be split."""
1239
+
1240
+ @Step.name.setter
1241
+ def name(self, value: str) -> None:
1242
+ """Changes the name of the ``EmbarrassinglyParallelStep`` and the underlying :class:`Step` to the given value."""
1243
+ self._name = value
1244
+ self.step._name = value
1185
1245
 
1186
1246
  def _validate(self) -> None:
1187
1247
  """Validates the ``EmbarrassinglyParallelStep``.
@@ -1189,31 +1249,36 @@ class EmbarrassinglyParallelStep(Step):
1189
1249
  ``EmbarrassinglyParallelSteps`` are not configured by the user to be run
1190
1250
  in parallel. Since it happens on the back end, we need to do somewhat unique
1191
1251
  validations during construction. Specifically,
1192
- - one and only one :class:`~easylink.graph_components.InputSlot` *must* include
1193
- a :attr:`~easylink.graph_components.InputSlot.splitter` method.
1194
- - all :class:`OutputSlots<easylink.graph_components.OutputSlot>` *must* include
1195
- an :attr:`~easylink.graph_components.OutputSlot.aggregator` method.
1252
+ - one and only one :class:`~easylink.graph_components.InputSlot` *must*
1253
+ be mapped to a splitter method.
1254
+ - all :class:`OutputSlots<easylink.graph_components.OutputSlot>` *must*
1255
+ be mapped to aggregator methods.
1196
1256
  """
1197
1257
  errors = []
1198
- # assert that only one input slot has a splitter assigned
1199
- splitters = {
1200
- slot.name: slot.splitter.__name__
1201
- for slot in self.input_slots.values()
1202
- if slot.splitter
1203
- }
1204
- if len(splitters) == 0:
1258
+
1259
+ # check that only one input slot has a splitter assigned
1260
+ if len(self.slot_splitter_mapping) != 1:
1261
+ errors.append(
1262
+ f"EmbarrassinglyParallelStep '{self.step_name}' is attempting to define "
1263
+ f"{len(self.slot_splitter_mapping)} splitters when only one should be defined."
1264
+ )
1265
+ if len(self.slot_splitter_mapping) == 0:
1205
1266
  errors.append(
1206
1267
  f"EmbarrassinglyParallelStep '{self.step_name}' does not have any input slots with a "
1207
1268
  "splitter method assigned; one and only one input slot must have a splitter."
1208
1269
  )
1209
- if len(splitters) > 1:
1270
+ if len(self.slot_splitter_mapping) > 1:
1210
1271
  errors.append(
1211
1272
  f"EmbarrassinglyParallelStep '{self.step_name}' has multiple input slots with "
1212
1273
  "splitter methods assigned; one and only one input slot must have a splitter.\n"
1213
- f"Input slots with splitters: {splitters}"
1274
+ f"Input slots with splitters: {list(self.slot_splitter_mapping)}"
1214
1275
  )
1276
+
1277
+ # check that all output slots have an aggregator assigned
1215
1278
  missing_aggregators = [
1216
- slot.name for slot in self.output_slots.values() if not slot.aggregator
1279
+ slot.name
1280
+ for slot in self.output_slots.values()
1281
+ if slot.name not in self.slot_aggregator_mapping
1217
1282
  ]
1218
1283
  if len(missing_aggregators) != 0:
1219
1284
  errors.append(
@@ -1223,6 +1288,49 @@ class EmbarrassinglyParallelStep(Step):
1223
1288
  if errors:
1224
1289
  raise ValueError("\n".join(errors))
1225
1290
 
1291
+ def validate_step(
1292
+ self,
1293
+ step_config: LayeredConfigTree,
1294
+ combined_implementations: LayeredConfigTree,
1295
+ input_data_config: LayeredConfigTree,
1296
+ ) -> dict[str, list[str]]:
1297
+ """Validates the ``EmbarrassinglyParallelStep``.
1298
+
1299
+ An ``EmbarrassinglyParallelStep`` has no validation logic of its own; it
1300
+ delegates to the ``validate_step`` method of the underlying ``Step`` that
1301
+ it wraps (i.e. the ``Step`` to be run in an embarrassingly parallel manner)
1302
+ and returns whatever errors that ``Step`` reports.
1303
+
1304
+ Parameters
1305
+ ----------
1306
+ step_config
1307
+ The internal configuration of this ``Step``, i.e. it should not include
1308
+ the ``Step's`` name.
1309
+ combined_implementations
1310
+ The configuration for any implementations to be combined.
1311
+ input_data_config
1312
+ The input data configuration for the entire pipeline.
1313
+
1314
+ Returns
1315
+ -------
1316
+ A dictionary of errors, as returned by the wrapped ``Step``, where the keys
1317
+ are ``Step`` names and the values are lists of error messages associated
1318
+ with the given ``Step``.
1319
+
1320
+ Notes
1321
+ -----
1322
+ If the ``EmbarrassinglyParallelStep`` does not validate (i.e. errors are found and the returned
1323
+ dictionary is non-empty), the tool will exit and the pipeline will not run.
1324
+
1325
+ We attempt to batch error messages as much as possible, but there may be
1326
+ times where the configuration is so ill-formed that we are unable to handle
1327
+ all issues in one pass. In these cases, new errors may be found after the
1328
+ initial ones are handled.
1329
+ """
1330
+ return self.step.validate_step(
1331
+ step_config, combined_implementations, input_data_config
1332
+ )
1333
+
1226
1334
  def set_configuration_state(
1227
1335
  self,
1228
1336
  step_config: LayeredConfigTree,
@@ -1245,28 +1353,223 @@ class EmbarrassinglyParallelStep(Step):
1245
1353
  input_data_config
1246
1354
  The input data configuration for the entire pipeline.
1247
1355
  """
1248
- if self.step.name != self.name:
1249
- # Update the step name if the parent got renamed, e.g. a parent LoopStep
1250
- # 'step_1' that got expanded to 'step_1_loop_1', etc.
1251
- self.step.name = self.name
1252
- input_mappings = [
1253
- InputSlotMapping(slot, self.name, slot) for slot in self.input_slots
1254
- ]
1255
- output_mappings = [
1256
- OutputSlotMapping(slot, self.name, slot) for slot in self.output_slots
1257
- ]
1258
- self.slot_mappings = {"input": input_mappings, "output": output_mappings}
1259
- # Generate step graph from the single ``step`` attr
1260
- self.step_graph = StepGraph()
1261
- self.step_graph.add_node_from_step(self.step)
1356
+ splitter_node_name = f"{self.name}_{self.split_slot_name}_split"
1357
+ splitter_step = SplitterStep(
1358
+ splitter_node_name,
1359
+ split_slot=self.input_slots[self.split_slot_name],
1360
+ splitter_func_name=self.slot_splitter_mapping[self.split_slot_name].__name__,
1361
+ )
1362
+ aggregator_node_name = f"{self.name}_aggregate"
1363
+ if len(self.output_slots) > 1:
1364
+ raise NotImplementedError(
1365
+ "FIXME [MIC-5883] Multiple output slots/files of EmbarrassinglyParallelSteps not yet supported"
1366
+ )
1367
+ output_slot = list(self.output_slots.values())[0]
1368
+ aggregator_step = AggregatorStep(
1369
+ aggregator_node_name,
1370
+ output_slot=output_slot,
1371
+ aggregator_func_name=self.slot_aggregator_mapping[output_slot.name].__name__,
1372
+ splitter_node_name=splitter_node_name,
1373
+ )
1374
+ self._update_step_graph(splitter_step, aggregator_step)
1375
+ self._update_slot_mappings(splitter_step, aggregator_step)
1262
1376
  # Add the key back to the expanded config
1263
- expanded_config = LayeredConfigTree({self.name: step_config})
1264
-
1377
+ expanded_config = LayeredConfigTree({self.step.name: step_config})
1265
1378
  # EmbarrassinglyParallelSteps are by definition non-leaf steps
1266
1379
  self._configuration_state = NonLeafConfigurationState(
1267
1380
  self, expanded_config, combined_implementations, input_data_config
1268
1381
  )
1269
1382
 
1383
+ def _update_step_graph(
1384
+ self, splitter_step: SplitterStep, aggregator_step: AggregatorStep
1385
+ ) -> StepGraph:
1386
+ """Updates the :class:`~easylink.graph_components.StepGraph` to include the splitting and aggregating nodes.
1387
+
1388
+ This strings exactly three nodes together: the :class:`SplitterStep` that does
1389
+ the splitting of the input data, the actual :class:`Step` to be run in parallel,
1390
+ and the :class:`AggregatorStep` that aggregates the output data, i.e.
1391
+ ``SplitterStep`` -> ``Step`` -> ``AggregatorStep``.
1392
+
1393
+ Notes
1394
+ -----
1395
+ The ``SplitterStep`` and ``AggregatorStep`` are backed by versions of
1396
+ :class:`NullImplementations<easylink.implementation.NullImplementation>`,
1397
+ i.e. they do *not* actually require containers to run.
1398
+
1399
+ Parameters
1400
+ ----------
1401
+ splitter_step
1402
+ The :class:`SplitterStep` that does the splitting of the input data.
1403
+ aggregator_step
1404
+ The :class:`AggregatorStep` that aggregates the output data.
1405
+
1406
+ Returns
1407
+ -------
1408
+ The updated ``StepGraph`` that includes ``SplitterStep``, ``Step``,
1409
+ and ``AggregatorStep`` nodes.
1410
+ """
1411
+ self.step_graph = StepGraph()
1412
+ for node in [splitter_step, self.step, aggregator_step]:
1413
+ self.step_graph.add_node_from_step(node)
1414
+
1415
+ # Add SplitterStep -> Step edge
1416
+ self.step_graph.add_edge_from_params(
1417
+ EdgeParams(
1418
+ source_node=splitter_step.name,
1419
+ target_node=self.step.name,
1420
+ input_slot=self.split_slot_name,
1421
+ output_slot=list(splitter_step.output_slots.keys())[0],
1422
+ )
1423
+ )
1424
+ # Add the Step -> AggregatorStep edge
1425
+ if len(self.step.output_slots) > 1:
1426
+ raise NotImplementedError(
1427
+ "EmbarrassinglyParallelStep does not support multiple output slots."
1428
+ )
1429
+ self.step_graph.add_edge_from_params(
1430
+ EdgeParams(
1431
+ source_node=self.step.name,
1432
+ target_node=aggregator_step.name,
1433
+ input_slot=list(aggregator_step.input_slots.keys())[0],
1434
+ output_slot=list(self.step.output_slots.keys())[0],
1435
+ )
1436
+ )
1437
+
1438
+ def _update_slot_mappings(
1439
+ self, splitter_step: SplitterStep, aggregator_step: AggregatorStep
1440
+ ) -> None:
1441
+ """Updates the :class:`SlotMappings<easylink.graph_components.SlotMapping>`.
1442
+
1443
+ This updates the slot mappings so that the ``Step's`` inputs are redirected
1444
+ to the ``SplitterStep`` and the outputs are redirected to the ``AggregatorStep``.
1445
+
1446
+ Parameters
1447
+ ----------
1448
+ splitter_step
1449
+ The :class:`SplitterStep` that does the splitting of the input data.
1450
+ aggregator_step
1451
+ The :class:`AggregatorStep` that aggregates the output data.
1452
+
1453
+ Returns
1454
+ -------
1455
+ Updated ``SlotMappings`` that account for ``SplitterStep`` and ``AggregatorStep``.
1456
+ """
1457
+ # map the split input slot
1458
+ split_slot_name = list(splitter_step.input_slots.keys())[0]
1459
+ input_mappings = [
1460
+ InputSlotMapping(split_slot_name, splitter_step.name, split_slot_name)
1461
+ ]
1462
+ # map remaining input slots
1463
+ for input_slot in [slot for slot in self.input_slots if slot != split_slot_name]:
1464
+ input_mappings.append(InputSlotMapping(input_slot, self.step.name, input_slot))
1465
+ # map the output slots
1466
+ output_mappings = [
1467
+ OutputSlotMapping(slot, aggregator_step.name, slot) for slot in self.output_slots
1468
+ ]
1469
+ self.slot_mappings = {"input": input_mappings, "output": output_mappings}
1470
+
1471
+
1472
+ class SplitterStep(StandaloneStep):
1473
+ """A :class:`StandaloneStep` that splits an :class:`~easylink.graph_components.InputSlot` for parallel processing.
1474
+
1475
+ A ``SplitterStep`` is intended to be used in conjunction with a corresponding
1476
+ :class:`AggregatorStep` and only during construction of an :class:`EmbarrassinglyParallelStep`.
1477
+
1478
+ See :class:`Step` for inherited attributes.
1479
+
1480
+ Parameters
1481
+ ----------
1482
+ split_slot
1483
+ The ``InputSlot`` to be split.
1484
+ splitter_func_name
1485
+ The name of the splitter function to be used.
1486
+
1487
+ """
1488
+
1489
+ def __init__(self, name: str, split_slot: InputSlot, splitter_func_name: str) -> None:
1490
+ # Remove the env_var (not an implemented step) and validator (will be validated
1491
+ # after the splitting during input to the actual step to run)
1492
+ input_slot = copy.deepcopy(split_slot)
1493
+ input_slot.env_var = None
1494
+ input_slot.validator = None
1495
+ super().__init__(
1496
+ name, input_slots=[input_slot], output_slots=[OutputSlot(f"{name}_main_output")]
1497
+ )
1498
+ self.splitter_func_name = splitter_func_name
1499
+ """The name of the splitter function to be used."""
1500
+
1501
+ def add_nodes_to_implementation_graph(
1502
+ self, implementation_graph: ImplementationGraph
1503
+ ) -> None:
1504
+ """Adds a :class:`~easylink.implementation.NullSplitterImplementation` node to the :class:`~easylink.graph_components.ImplementationGraph`."""
1505
+ implementation_graph.add_node_from_implementation(
1506
+ self.name,
1507
+ implementation=NullSplitterImplementation(
1508
+ self.name,
1509
+ self.input_slots.values(),
1510
+ self.output_slots.values(),
1511
+ self.splitter_func_name,
1512
+ ),
1513
+ )
1514
+
1515
+
1516
+ class AggregatorStep(StandaloneStep):
1517
+ def __init__(
1518
+ self,
1519
+ name: str,
1520
+ output_slot: OutputSlot,
1521
+ aggregator_func_name: str,
1522
+ splitter_node_name: str,
1523
+ ) -> None:
1524
+ """A :class:`StandaloneStep` that aggregates :class:`OutputSlots<easylink.graph_components.OutputSlot>` after parallel processing.
1525
+
1526
+ An ``AggregatorStep`` is intended to be used in conjunction with a corresponding
1527
+ :class:`SplitterStep` and only during construction of an :class:`EmbarrassinglyParallelStep`.
1528
+
1529
+ See :class:`Step` for inherited attributes.
1530
+
1531
+ Parameters
1532
+ ----------
1533
+ aggregator_func_name
1534
+ The name of the aggregator function to be used.
1535
+ splitter_node_name
1536
+ The name of the ``SplitterStep`` and its corresponding
1537
+ :class:`~easylink.implementation.NullSplitterImplementation` that this ``AggregatorStep``
1538
+ is associated with.
1539
+ """
1540
+ super().__init__(
1541
+ name,
1542
+ input_slots=[
1543
+ InputSlot(
1544
+ f"{name}_main_input",
1545
+ env_var=None,
1546
+ validator=None,
1547
+ )
1548
+ ],
1549
+ output_slots=[output_slot],
1550
+ )
1551
+ self.aggregator_func_name = aggregator_func_name
1552
+ """The name of the aggregator function to be used."""
1553
+ self.splitter_node_name = splitter_node_name
1554
+ """The name of the ``SplitterStep`` and its corresponding
1555
+ :class:`~easylink.implementation.NullSplitterImplementation` that this ``AggregatorStep``
1556
+ is associated with."""
1557
+
1558
+ def add_nodes_to_implementation_graph(
1559
+ self, implementation_graph: ImplementationGraph
1560
+ ) -> None:
1561
+ """Adds a :class:`~easylink.implementation.NullAggregatorImplementation` node to the :class:`~easylink.graph_components.ImplementationGraph`."""
1562
+ implementation_graph.add_node_from_implementation(
1563
+ self.name,
1564
+ implementation=NullAggregatorImplementation(
1565
+ self.name,
1566
+ self.input_slots.values(),
1567
+ self.output_slots.values(),
1568
+ self.aggregator_func_name,
1569
+ self.splitter_node_name,
1570
+ ),
1571
+ )
1572
+
1270
1573
 
1271
1574
  class ChoiceStep(Step):
1272
1575
  """A type of :class:`Step` that allows for choosing from a set of options.
@@ -1680,57 +1983,8 @@ class NonLeafConfigurationState(ConfigurationState):
1680
1983
  substep = self._step.step_graph.nodes[node]["step"]
1681
1984
  if self._step.is_embarrassingly_parallel:
1682
1985
  substep.is_embarrassingly_parallel = True
1683
- self._propagate_splitter_aggregators(self._step, substep)
1684
1986
  substep.add_nodes_to_implementation_graph(implementation_graph)
1685
1987
 
1686
- @staticmethod
1687
- def _propagate_splitter_aggregators(parent: Step, child: Step):
1688
- """Propagates splitters and aggregators to child ``Steps``.
1689
-
1690
- This method adds the :meth:`~easylink.graph_components.InputSlot.splitter`
1691
- and :meth:`~easylink.graph_components.OutputSlot.aggregator` methods from a
1692
- parent ``Step's`` :class:`~easylink.graph_components.InputSlot` and
1693
- :class:`OutputSlots<easylink.graph_components.OutputSlot>` to the corresponding
1694
- child steps' slots.
1695
-
1696
- Parameters
1697
- ----------
1698
- parent
1699
- The parent ``Step`` whose ``splitter`` and ``aggregator`` methods are
1700
- to be propagated to the appropriate child ``Step``.
1701
- child
1702
- A child ``Step`` to potentially have its parent's ``splitter`` and
1703
- ``aggregators`` assigned to its ``InputSlot`` and ``OutputSlots``,
1704
- respectively.
1705
- """
1706
- for parent_input_slot_name, parent_input_slot in parent.input_slots.items():
1707
- if parent_input_slot.splitter:
1708
- # Extract the appropriate child slot name from the mapping
1709
- mappings_with_splitter = [
1710
- mapping
1711
- for mapping in parent.slot_mappings["input"]
1712
- if mapping.parent_slot == parent_input_slot_name
1713
- ]
1714
- for mapping in mappings_with_splitter:
1715
- child_node = mapping.child_node
1716
- child_slot = mapping.child_slot
1717
- # Assign the splitter to the appropriate child slot
1718
- if child_slot in child.input_slots and child_node == child.name:
1719
- child.input_slots[child_slot].splitter = parent_input_slot.splitter
1720
- for parent_output_slot_name, parent_output_slot in parent.output_slots.items():
1721
- # Extract the appropriate child slot name from the mapping
1722
- mappings_from_parent = [
1723
- mapping
1724
- for mapping in parent.slot_mappings["output"]
1725
- if mapping.parent_slot == parent_output_slot_name
1726
- ]
1727
- for mapping in mappings_from_parent:
1728
- child_node = mapping.child_node
1729
- child_slot = mapping.child_slot
1730
- # Assign the aggregator to the appropriate child slot
1731
- if child_slot in child.output_slots and child_node == child.name:
1732
- child.output_slots[child_slot].aggregator = parent_output_slot.aggregator
1733
-
1734
1988
  def add_edges_to_implementation_graph(
1735
1989
  self, implementation_graph: ImplementationGraph
1736
1990
  ) -> None:
@@ -1842,10 +2096,10 @@ class NonLeafConfigurationState(ConfigurationState):
1842
2096
  """
1843
2097
  for sub_node in self._step.step_graph.nodes:
1844
2098
  sub_step = self._step.step_graph.nodes[sub_node]["step"]
1845
- # IOStep names never appear in configuration
2099
+ # IOSteps, SplitterSteps, and AggregatorSteps never appear explicitly in the configuration
1846
2100
  step_config = (
1847
2101
  self.step_config
1848
- if isinstance(sub_step, IOStep)
2102
+ if isinstance(sub_step, (IOStep, SplitterStep, AggregatorStep))
1849
2103
  else self.step_config[sub_step.name]
1850
2104
  )
1851
2105
  sub_step.set_configuration_state(