easylink 0.1.12__py3-none-any.whl → 0.1.14__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -74,29 +74,8 @@ NODES = [
                     ),
                 ],
             ),
-            input_slots=[
-                InputSlot(
-                    name="step_3_main_input",
-                    env_var="DUMMY_CONTAINER_MAIN_INPUT_FILE_PATHS",
-                    validator=validate_input_file_dummy,
-                    splitter=split_data_by_size,
-                ),
-            ],
-            output_slots=[OutputSlot("step_3_main_output", aggregator=concatenate_datasets)],
-            input_slot_mappings=[
-                InputSlotMapping(
-                    parent_slot="step_3_main_input",
-                    child_node="step_3",
-                    child_slot="step_3_main_input",
-                ),
-            ],
-            output_slot_mappings=[
-                OutputSlotMapping(
-                    parent_slot="step_3_main_output",
-                    child_node="step_3",
-                    child_slot="step_3_main_output",
-                ),
-            ],
+            slot_splitter_mapping={"step_3_main_input": split_data_by_size},
+            slot_aggregator_mapping={"step_3_main_output": concatenate_datasets},
         ),
         self_edges=[
             EdgeParams(
@@ -16,6 +16,7 @@ from easylink.graph_components import (
     OutputSlotMapping,
 )
 from easylink.step import (
+    EmbarrassinglyParallelStep,
     HierarchicalStep,
     InputStep,
     LoopStep,
@@ -23,9 +24,11 @@ from easylink.step import (
     ParallelStep,
     Step,
 )
+from easylink.utilities.aggregator_utils import concatenate_datasets
+from easylink.utilities.splitter_utils import split_data_in_two
 from easylink.utilities.validation_utils import validate_input_file_dummy

-SINGLE_STEP_NODES = [
+NODES_ONE_STEP = [
     InputStep(),
     Step(
         step_name="step_1",
@@ -44,7 +47,7 @@ SINGLE_STEP_NODES = [
         ],
     ),
 ]
-SINGLE_STEP_EDGES = [
+EDGES_ONE_STEP = [
     EdgeParams(
         source_node="input_data",
         target_node="step_1",
@@ -58,10 +61,10 @@ SINGLE_STEP_EDGES = [
         input_slot="result",
     ),
 ]
+SCHEMA_PARAMS_ONE_STEP = (NODES_ONE_STEP, EDGES_ONE_STEP)

-SINGLE_STEP_SCHEMA_PARAMS = (SINGLE_STEP_NODES, SINGLE_STEP_EDGES)

-TRIPLE_STEP_NODES = [
+NODES_THREE_STEPS = [
     InputStep(),
     Step(
         step_name="step_1",
@@ -102,7 +105,7 @@ TRIPLE_STEP_NODES = [
         ],
     ),
 ]
-TRIPLE_STEP_EDGES = [
+EDGES_THREE_STEPS = [
     EdgeParams(
         source_node="input_data",
         target_node="step_1",
@@ -128,11 +131,10 @@ TRIPLE_STEP_EDGES = [
         input_slot="result",
     ),
 ]
+SCHEMA_PARAMS_THREE_STEPS = (NODES_THREE_STEPS, EDGES_THREE_STEPS)

-TRIPLE_STEP_SCHEMA_PARAMS = (TRIPLE_STEP_NODES, TRIPLE_STEP_EDGES)

-
-BAD_COMBINED_TOPOLOGY_NODES = [
+NODES_BAD_COMBINED_TOPOLOGY = [
     InputStep(),
     LoopStep(
         template_step=HierarchicalStep(
@@ -207,11 +209,10 @@ BAD_COMBINED_TOPOLOGY_NODES = [
         ],
     ),
 ]
-
-BAD_COMBINED_TOPOLOGY_SCHEMA_PARAMS = (BAD_COMBINED_TOPOLOGY_NODES, SINGLE_STEP_EDGES)
+SCHEMA_PARAMS_BAD_COMBINED_TOPOLOGY = (NODES_BAD_COMBINED_TOPOLOGY, EDGES_ONE_STEP)


-NESTED_TEMPLATED_STEPS_NODES = [
+NODES_NESTED_TEMPLATED_STEPS = [
     InputStep(),
     LoopStep(
         template_step=ParallelStep(
@@ -288,12 +289,10 @@ NESTED_TEMPLATED_STEPS_NODES = [
         ],
     ),
 ]
+SCHEMA_PARAMS_NESTED_TEMPLATED_STEPS = (NODES_NESTED_TEMPLATED_STEPS, EDGES_ONE_STEP)


-NESTED_TEMPLATED_STEPS_SCHEMA_PARAMS = (NESTED_TEMPLATED_STEPS_NODES, SINGLE_STEP_EDGES)
-
-
-COMBINE_WITH_ITERATION_NODES = [
+NODES_COMBINE_WITH_ITERATION = [
     InputStep(),
     LoopStep(
         template_step=Step(
@@ -333,7 +332,7 @@ COMBINE_WITH_ITERATION_NODES = [
         ],
     ),
 ]
-DOUBLE_STEP_EDGES = [
+EDGES_TWO_STEPS = [
     EdgeParams(
         source_node="input_data",
         target_node="step_1",
@@ -353,6 +352,233 @@ DOUBLE_STEP_EDGES = [
         input_slot="result",
     ),
 ]
+SCHEMA_PARAMS_COMBINE_WITH_ITERATION = (NODES_COMBINE_WITH_ITERATION, EDGES_TWO_STEPS)
+
+
+NODES_LOOPING_EP_STEP = [
+    InputStep(),
+    LoopStep(
+        template_step=EmbarrassinglyParallelStep(
+            step=Step(
+                step_name="step_1",
+                input_slots=[
+                    InputSlot(
+                        name="step_1_main_input",
+                        env_var="DUMMY_CONTAINER_MAIN_INPUT_FILE_PATHS",
+                        validator=validate_input_file_dummy,
+                    ),
+                ],
+                output_slots=[
+                    OutputSlot(
+                        name="step_1_main_output",
+                    ),
+                ],
+            ),
+            slot_splitter_mapping={"step_1_main_input": split_data_in_two},
+            slot_aggregator_mapping={"step_1_main_output": concatenate_datasets},
+        ),
+        self_edges=[
+            EdgeParams(
+                source_node="step_1",
+                target_node="step_1",
+                output_slot="step_1_main_output",
+                input_slot="step_1_main_input",
+            )
+        ],
+    ),
+    OutputStep(
+        input_slots=[
+            InputSlot(name="result", env_var=None, validator=validate_input_file_dummy)
+        ]
+    ),
+]
+SCHEMA_PARAMS_LOOPING_EP_STEP = (NODES_LOOPING_EP_STEP, EDGES_ONE_STEP)
+
+
+NODES_EP_PARALLEL_STEP = [
+    InputStep(),
+    EmbarrassinglyParallelStep(
+        step=ParallelStep(
+            template_step=Step(
+                step_name="step_1",
+                input_slots=[
+                    InputSlot(
+                        name="step_1_main_input",
+                        env_var="DUMMY_CONTAINER_MAIN_INPUT_FILE_PATHS",
+                        validator=validate_input_file_dummy,
+                    ),
+                ],
+                output_slots=[
+                    OutputSlot(
+                        name="step_1_main_output",
+                    ),
+                ],
+            ),
+        ),
+        slot_splitter_mapping={"step_1_main_input": split_data_in_two},
+        slot_aggregator_mapping={"step_1_main_output": concatenate_datasets},
+    ),
+    OutputStep(
+        input_slots=[
+            InputSlot(name="result", env_var=None, validator=validate_input_file_dummy)
+        ]
+    ),
+]
+SCHEMA_PARAMS_EP_PARALLEL_STEP = (NODES_EP_PARALLEL_STEP, EDGES_ONE_STEP)
+
+
+NODES_EP_LOOP_STEP = [
+    InputStep(),
+    EmbarrassinglyParallelStep(
+        step=LoopStep(
+            template_step=Step(
+                step_name="step_1",
+                input_slots=[
+                    InputSlot(
+                        name="step_1_main_input",
+                        env_var="DUMMY_CONTAINER_MAIN_INPUT_FILE_PATHS",
+                        validator=validate_input_file_dummy,
+                    ),
+                ],
+                output_slots=[
+                    OutputSlot(
+                        name="step_1_main_output",
+                    ),
+                ],
+            ),
+            self_edges=[
+                EdgeParams(
+                    source_node="step_1",
+                    target_node="step_1",
+                    output_slot="step_1_main_output",
+                    input_slot="step_1_main_input",
+                ),
+            ],
+        ),
+        slot_splitter_mapping={"step_1_main_input": split_data_in_two},
+        slot_aggregator_mapping={"step_1_main_output": concatenate_datasets},
+    ),
+    OutputStep(
+        input_slots=[
+            InputSlot(name="result", env_var=None, validator=validate_input_file_dummy)
+        ]
+    ),
+]
+SCHEMA_PARAMS_EP_LOOP_STEP = (NODES_EP_LOOP_STEP, EDGES_ONE_STEP)


-COMBINE_WITH_ITERATION_SCHEMA_PARAMS = (COMBINE_WITH_ITERATION_NODES, DOUBLE_STEP_EDGES)
+NODES_EP_HIERARCHICAL_STEP = [
+    InputStep(),
+    EmbarrassinglyParallelStep(
+        step=HierarchicalStep(
+            step_name="step_1",
+            input_slots=[
+                InputSlot(
+                    name="step_1_main_input",
+                    env_var="DUMMY_CONTAINER_MAIN_INPUT_FILE_PATHS",
+                    validator=validate_input_file_dummy,
+                ),
+                InputSlot(
+                    name="step_1_secondary_input",
+                    env_var="DUMMY_CONTAINER_SECONDARY_INPUT_FILE_PATHS",
+                    validator=validate_input_file_dummy,
+                ),
+            ],
+            output_slots=[OutputSlot("step_1_main_output")],
+            nodes=[
+                Step(
+                    step_name="step_1a",
+                    input_slots=[
+                        InputSlot(
+                            name="step_1a_main_input",
+                            env_var="DUMMY_CONTAINER_MAIN_INPUT_FILE_PATHS",
+                            validator=validate_input_file_dummy,
+                        ),
+                        InputSlot(
+                            name="step_1a_secondary_input",
+                            env_var="DUMMY_CONTAINER_SECONDARY_INPUT_FILE_PATHS",
+                            validator=validate_input_file_dummy,
+                        ),
+                    ],
+                    output_slots=[OutputSlot("step_1a_main_output")],
+                ),
+                Step(
+                    step_name="step_1b",
+                    input_slots=[
+                        InputSlot(
+                            name="step_1b_main_input",
+                            env_var="DUMMY_CONTAINER_MAIN_INPUT_FILE_PATHS",
+                            validator=validate_input_file_dummy,
+                        ),
+                        InputSlot(
+                            name="step_1b_secondary_input",
+                            env_var="DUMMY_CONTAINER_SECONDARY_INPUT_FILE_PATHS",
+                            validator=validate_input_file_dummy,
+                        ),
+                    ],
+                    output_slots=[OutputSlot("step_1b_main_output")],
+                ),
+            ],
+            edges=[
+                EdgeParams(
+                    source_node="step_1a",
+                    target_node="step_1b",
+                    output_slot="step_1a_main_output",
+                    input_slot="step_1b_main_input",
+                ),
+            ],
+            input_slot_mappings=[
+                InputSlotMapping(
+                    parent_slot="step_1_main_input",
+                    child_node="step_1a",
+                    child_slot="step_1a_main_input",
+                ),
+                InputSlotMapping(
+                    parent_slot="step_1_secondary_input",
+                    child_node="step_1a",
+                    child_slot="step_1a_secondary_input",
+                ),
+                InputSlotMapping(
+                    parent_slot="step_1_secondary_input",
+                    child_node="step_1b",
+                    child_slot="step_1b_secondary_input",
+                ),
+            ],
+            output_slot_mappings=[
+                OutputSlotMapping(
+                    parent_slot="step_1_main_output",
+                    child_node="step_1b",
+                    child_slot="step_1b_main_output",
+                ),
+            ],
+        ),
+        slot_splitter_mapping={"step_1_main_input": split_data_in_two},
+        slot_aggregator_mapping={"step_1_main_output": concatenate_datasets},
+    ),
+    OutputStep(
+        input_slots=[
+            InputSlot(name="result", env_var=None, validator=validate_input_file_dummy)
+        ]
+    ),
+]
+EDGES_ONE_STEP_TWO_ISLOTS = [
+    EdgeParams(
+        source_node="input_data",
+        target_node="step_1",
+        output_slot="all",
+        input_slot="step_1_main_input",
+    ),
+    EdgeParams(
+        source_node="input_data",
+        target_node="step_1",
+        output_slot="all",
+        input_slot="step_1_secondary_input",
+    ),
+    EdgeParams(
+        source_node="step_1",
+        target_node="results",
+        output_slot="step_1_main_output",
+        input_slot="result",
+    ),
+]
+SCHEMA_PARAMS_EP_HIERARCHICAL_STEP = (NODES_EP_HIERARCHICAL_STEP, EDGES_ONE_STEP_TWO_ISLOTS)
easylink/rule.py CHANGED
@@ -41,15 +41,6 @@ class Rule(ABC):
         """
         pass

-    @staticmethod
-    def get_input_slots_to_split(input_slots) -> list[str]:
-        input_slots_to_split = [
-            slot_name
-            for slot_name, slot_attrs in input_slots.items()
-            if slot_attrs.get("splitter", None)
-        ]
-        return input_slots_to_split
-

 @dataclass
 class TargetRule(Rule):
@@ -125,23 +116,15 @@ class ImplementedRule(Rule):

     def build_rule(self) -> str:
         """Builds the Snakemake rule for this ``Implementation``."""
+        if self.is_embarrassingly_parallel and len(self.output) > 1:
+            raise NotImplementedError(
+                "Multiple output slots/files of EmbarrassinglyParallelSteps not yet supported"
+            )
         return self._build_io() + self._build_resources() + self._build_shell_cmd()

     def _build_io(self) -> str:
         """Builds the input/output portion of the rule."""
-        if self.is_embarrassingly_parallel:
-            # Processed chunks are sent to a 'processed' subdir
-            output_files = [
-                os.path.dirname(file_path)
-                + "/processed/{chunk}/"
-                + os.path.basename(file_path)
-                for file_path in self.output
-            ]
-            log_path_chunk_adder = "-{chunk}"
-        else:
-            output_files = self.output
-            log_path_chunk_adder = ""
-
+        log_path_chunk_adder = "-{chunk}" if self.is_embarrassingly_parallel else ""
         io_str = (
             f"""
 rule:
@@ -149,7 +132,7 @@ rule:
     message: "Running {self.step_name} implementation: {self.implementation_name}" """
             + self._build_input()
             + f"""
-    output: {output_files}
+    output: {self.output}
     log: "{self.diagnostics_dir}/{self.name}-output{log_path_chunk_adder}.log"
     container: "{self.image_path}" """
         )
@@ -158,33 +141,11 @@ rule:
     def _build_input(self) -> str:
         input_str = f"""
     input:"""
-        input_slots_to_split = self.get_input_slots_to_split(self.input_slots)
         for slot, attrs in self.input_slots.items():
             env_var = attrs["env_var"].lower()
-            if len(input_slots_to_split) > 1:
-                raise NotImplementedError(
-                    "FIXME [MIC-5883] Multiple input slots to split not yet supported"
-                )
-            if self.is_embarrassingly_parallel and slot == input_slots_to_split[0]:
-                # The input to this is the input_chunks subdir from the checkpoint
-                # rule (which is built by modifying the output of the overall implementation)
-                if len(self.output) > 1:
-                    raise NotImplementedError(
-                        "FIXME [MIC-5883] Multiple output slots/files of EmbarrassinglyParallelSteps not yet supported"
-                    )
-                input_files = [
-                    os.path.dirname(self.output[0])
-                    + "/input_chunks/{chunk}/"
-                    + os.path.basename(self.output[0])
-                ]
-            else:
-                input_files = attrs["filepaths"]
-            input_str += f"""
-        {env_var}={input_files},"""
-        if not self.is_embarrassingly_parallel:
-            # validations were already handled in the checkpoint rule - no need
-            # to validate the individual chunks
             input_str += f"""
+        {env_var}={attrs["filepaths"]},"""
+        input_str += f"""
         validations={self.validations},"""
         if self.requires_spark:
             input_str += f"""
@@ -210,38 +171,19 @@ rule:
         # output_paths = ",".join(self.output)
         # wildcards_subdir = "/".join([f"{{wildcards.{wc}}}" for wc in self.wildcards])
         # and then in shell cmd: export DUMMY_CONTAINER_OUTPUT_PATHS={output_paths}/{wildcards_subdir}
-        if self.is_embarrassingly_parallel:
-            if len(self.output) > 1:
-                raise NotImplementedError(
-                    "FIXME [MIC-5883] Multiple output slots/files of EmbarrassinglyParallelSteps not yet supported"
-                )
-            output_files = (
-                os.path.dirname(self.output[0])
-                + "/processed/{wildcards.chunk}/"
-                + os.path.basename(self.output[0])
-            )
-        else:
-            output_files = ",".join(self.output)
+
+        # snakemake shell commands require wildcards to be prefaced with 'wildcards.'
+        output_files = ",".join(self.output).replace("{chunk}", "{wildcards.chunk}")
         shell_cmd = f"""
     shell:
         '''
         export DUMMY_CONTAINER_OUTPUT_PATHS={output_files}
         export DUMMY_CONTAINER_DIAGNOSTICS_DIRECTORY={self.diagnostics_dir}"""
-        for input_slot_name, input_slot_attrs in self.input_slots.items():
-            input_slots_to_split = self.get_input_slots_to_split(self.input_slots)
-            if len(input_slots_to_split) > 1:
-                raise NotImplementedError(
-                    "FIXME [MIC-5883] Multiple input slots to split not yet supported"
-                )
-            if input_slot_name in input_slots_to_split:
-                # The inputs to this come from the input_chunks subdir
-                input_files = (
-                    os.path.dirname(self.output[0])
-                    + "/input_chunks/{wildcards.chunk}/"
-                    + os.path.basename(self.output[0])
-                )
-            else:
-                input_files = ",".join(input_slot_attrs["filepaths"])
+        for input_slot_attrs in self.input_slots.values():
+            # snakemake shell commands require wildcards to be prefaced with 'wildcards.'
+            input_files = ",".join(input_slot_attrs["filepaths"]).replace(
+                "{chunk}", "{wildcards.chunk}"
+            )
             shell_cmd += f"""
         export {input_slot_attrs["env_var"]}={input_files}"""
         if self.requires_spark:
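Note on the hunk above: instead of re-deriving chunked paths, the new `_build_shell_cmd` leans on a single textual substitution, since Snakemake shell commands need `{chunk}` written as `{wildcards.chunk}` to resolve at run time. A minimal sketch of that substitution (the path here is hypothetical, for illustration only):

    # hypothetical output path containing the "{chunk}" wildcard
    output = ["intermediate/step_1/processed/{chunk}/result.parquet"]
    output_files = ",".join(output).replace("{chunk}", "{wildcards.chunk}")
    assert output_files == "intermediate/step_1/processed/{wildcards.chunk}/result.parquet"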
@@ -278,7 +220,7 @@ class InputValidationRule(Rule):
     """List of filepaths to validate."""
     output: str
     """Filepath of validation output. It must be used as an input for next rule."""
-    validator: Callable
+    validator: Callable | None
     """Callable that takes a filepath as input. Raises an error if invalid."""

     def build_rule(self) -> str:
@@ -329,12 +271,14 @@ class CheckpointRule(Rule):

     name: str
     """Name of the rule."""
-    input_slots: dict[str, dict[str, str | list[str]]]
-    """This ``Implementation's`` input slot attributes."""
-    validations: list[str]
-    """Validation files from previous rule."""
-    output: list[str]
+    input_files: list[str]
+    """The input filepaths."""
+    splitter_func_name: str
+    """The splitter function's name."""
+    output_dir: str
     """Output directory path. It must be used as an input for next rule."""
+    checkpoint_filepath: str
+    """Path to the checkpoint file. This is only needed for the bugfix workaround."""

     def build_rule(self) -> str:
         """Builds the Snakemake rule for this checkpoint.
@@ -344,29 +288,20 @@ class CheckpointRule(Rule):
         files into chunks. Note that the output of this rule is a Snakemake ``directory``
         object as opposed to a specific file like typical rules have.
         """
-        # Replace the output filepath with an input_chunks subdir
-        output_dir = os.path.dirname(self.output[0]) + "/input_chunks"
-        input_slots_to_split = self.get_input_slots_to_split(self.input_slots)
-        if len(input_slots_to_split) > 1:
-            raise NotImplementedError(
-                "FIXME [MIC-5883] Multiple input slots to split not yet supported"
-            )
-        input_slot_to_split = input_slots_to_split[0]
         checkpoint = f"""
 checkpoint:
-    name: "split_{self.name}_{input_slot_to_split}"
+    name: "{self.name}"
     input:
-        files={self.input_slots[input_slot_to_split]['filepaths']},
-        validations={self.validations},
+        files={self.input_files},
     output:
-        output_dir=directory("{output_dir}"),
-        checkpoint_file=touch("{output_dir}/checkpoint.txt"),
+        output_dir=directory("{self.output_dir}"),
+        checkpoint_file=touch("{self.checkpoint_filepath}"),
     params:
         input_files=lambda wildcards, input: ",".join(input.files),
     localrule: True
-    message: "Splitting {self.name} {input_slot_to_split} into chunks"
+    message: "Splitting {self.name} into chunks"
     run:
-        splitter_utils.{self.input_slots[input_slot_to_split]["splitter"].__name__}(
+        splitter_utils.{self.splitter_func_name}(
             input_files=list(input.files),
             output_dir=output.output_dir,
             desired_chunk_size_mb=0.1,
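With the hunk above, `CheckpointRule` is parameterized by plain fields rather than slot attributes. A hypothetical instantiation under the new field list (all paths invented for illustration; `split_data_in_two` is one of the splitters imported from `easylink.utilities.splitter_utils` elsewhere in this diff):

    rule = CheckpointRule(
        name="checkpoint_step_1",  # hypothetical rule name
        input_files=["input_data/file1.parquet"],  # hypothetical input
        splitter_func_name="split_data_in_two",
        output_dir="intermediate/step_1/input_chunks",  # hypothetical path
        checkpoint_filepath="intermediate/step_1/input_chunks/checkpoint.txt",
    )
    snakefile_text = rule.build_rule()  # returns the checkpoint block as a string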
@@ -385,12 +320,16 @@ class AggregationRule(Rule):

     name: str
     """Name of the rule."""
-    input_slots: dict[str, dict[str, str | list[str]]]
-    """This ``Implementation's`` input slot attributes."""
-    output_slot_name: str
-    """Name of the :class:`~easylink.graph_components.OutputSlot`."""
-    output_slot: dict[str, str | list[str]]
-    """The output slot attributes to create this rule for."""
+    input_files: list[str]
+    """The input processed chunk files to aggregate."""
+    aggregated_output_file: str
+    """The final aggregated results file."""
+    aggregator_func_name: str
+    """The name of the aggregation function to run."""
+    checkpoint_filepath: str
+    """Path to the checkpoint file. This is only needed for the bugfix workaround."""
+    checkpoint_rule_name: str
+    """Name of the checkpoint rule."""

     def build_rule(self) -> str:
         """Builds the Snakemake rule for this aggregator.
@@ -421,56 +360,37 @@ class AggregationRule(Rule):

     def _define_input_function(self):
         """Builds the `input function <https://snakemake.readthedocs.io/en/stable/snakefiles/rules.html#input-functions>`_."""
-        if len(self.output_slot["filepaths"]) > 1:
-            raise NotImplementedError(
-                "FIXME [MIC-5883] Multiple output slots/files of EmbarrassinglyParallelSteps not yet supported"
-            )
-        if len(self.output_slot["filepaths"]) > 1:
-            raise NotImplementedError(
-                "FIXME [MIC-5883] Multiple slots/files of EmbarrassinglyParallelSteps not yet supported"
-            )
-        output_filepath = self.output_slot["filepaths"][0]
-        checkpoint_file_path = (
-            os.path.dirname(output_filepath) + "/input_chunks/checkpoint.txt"
-        )
-        input_slots_to_split = self.get_input_slots_to_split(self.input_slots)
-        if len(input_slots_to_split) > 1:
-            raise NotImplementedError(
-                "FIXME [MIC-5883] Multiple input slots to split not yet supported"
-            )
-        input_slot_to_split = input_slots_to_split[0]
-        checkpoint_name = f"checkpoints.split_{self.name}_{input_slot_to_split}"
-        output_files = (
-            os.path.dirname(output_filepath)
-            + "/processed/{chunk}/"
-            + os.path.basename(output_filepath)
-        )
+        # NOTE: In the f-string below, we serialize the list `self.input_files`
+        # into a string which must later be executed as python code (by snakemake).
+        # Let's expand the list into a string representation of a python list so that
+        # we explicitly rely on `eval(repr(self.input_files)) == self.input_files`.
+        input_files_list_str = repr(self.input_files)
         func = f"""
-def get_aggregation_inputs_{self.name}_{self.output_slot_name}(wildcards):
-    checkpoint_file = "{checkpoint_file_path}"
+def get_aggregation_inputs_{self.name}(wildcards):
+    checkpoint_file = "{self.checkpoint_filepath}"
     if not os.path.exists(checkpoint_file):
-        output, _ = {checkpoint_name}.rule.expand_output(wildcards)
-        raise IncompleteCheckpointException({checkpoint_name}.rule, checkpoint_target(output[0]))
-    checkpoint_output = glob.glob(f"{{{checkpoint_name}.get(**wildcards).output.output_dir}}/*/")
+        output, _ = {self.checkpoint_rule_name}.rule.expand_output(wildcards)
+        raise IncompleteCheckpointException({self.checkpoint_rule_name}.rule, checkpoint_target(output[0]))
+    checkpoint_output = glob.glob(f"{{{self.checkpoint_rule_name}.get(**wildcards).output.output_dir}}/*/")
     chunks = [Path(filepath).parts[-1] for filepath in checkpoint_output]
-    return expand(
-        "{output_files}",
-        chunk=chunks
-    )"""
+    input_files = []
+    for filepath in {input_files_list_str}:
+        input_files.extend(expand(filepath, chunk=chunks))
+    return input_files"""
         return func

     def _define_aggregator_rule(self):
         """Builds the rule that runs the aggregation."""
         rule = f"""
 rule:
-    name: "aggregate_{self.name}_{self.output_slot_name}"
-    input: get_aggregation_inputs_{self.name}_{self.output_slot_name}
-    output: {self.output_slot["filepaths"]}
+    name: "{self.name}"
+    input: get_aggregation_inputs_{self.name}
+    output: {[self.aggregated_output_file]}
     localrule: True
-    message: "Aggregating {self.name} {self.output_slot_name}"
+    message: "Aggregating {self.name}"
     run:
-        aggregator_utils.{self.output_slot["aggregator"].__name__}(
+        aggregator_utils.{self.aggregator_func_name}(
             input_files=list(input),
-            output_filepath="{self.output_slot["filepaths"][0]}",
+            output_filepath="{self.aggregated_output_file}",
         )"""
         return rule
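The NOTE in `_define_input_function` above hinges on `repr` producing source text that evaluates back to the original list, since the serialized list is pasted into the generated Snakefile and re-parsed as Python. A minimal check of that invariant (path invented for illustration):

    input_files = ["intermediate/step_1/processed/{chunk}/result.parquet"]
    assert eval(repr(input_files)) == input_files  # what the generated code relies on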