easylink 0.1.6__py3-none-any.whl → 0.1.7__py3-none-any.whl

This diff shows the changes between two publicly released versions of this package, as published to a supported registry. It is provided for informational purposes only.
@@ -57,6 +57,76 @@ SINGLE_STEP_EDGES = [
 
 SINGLE_STEP_SCHEMA_PARAMS = (SINGLE_STEP_NODES, SINGLE_STEP_EDGES)
 
+TRIPLE_STEP_NODES = [
+    InputStep(),
+    Step(
+        step_name="step_1",
+        input_slots=[
+            InputSlot(
+                name="step_1_main_input",
+                env_var="DUMMY_CONTAINER_MAIN_INPUT_FILE_PATHS",
+                validator=validate_input_file_dummy,
+            )
+        ],
+        output_slots=[OutputSlot("step_1_main_output")],
+    ),
+    Step(
+        step_name="step_2",
+        input_slots=[
+            InputSlot(
+                name="step_2_main_input",
+                env_var="DUMMY_CONTAINER_MAIN_INPUT_FILE_PATHS",
+                validator=validate_input_file_dummy,
+            )
+        ],
+        output_slots=[OutputSlot("step_2_main_output")],
+    ),
+    Step(
+        step_name="step_3",
+        input_slots=[
+            InputSlot(
+                name="step_3_main_input",
+                env_var="DUMMY_CONTAINER_MAIN_INPUT_FILE_PATHS",
+                validator=validate_input_file_dummy,
+            )
+        ],
+        output_slots=[OutputSlot("step_3_main_output")],
+    ),
+    OutputStep(
+        input_slots=[
+            InputSlot(name="result", env_var=None, validator=validate_input_file_dummy)
+        ],
+    ),
+]
+TRIPLE_STEP_EDGES = [
+    EdgeParams(
+        source_node="input_data",
+        target_node="step_1",
+        output_slot="all",
+        input_slot="step_1_main_input",
+    ),
+    EdgeParams(
+        source_node="step_1",
+        target_node="step_2",
+        output_slot="step_1_main_output",
+        input_slot="step_2_main_input",
+    ),
+    EdgeParams(
+        source_node="step_2",
+        target_node="step_3",
+        output_slot="step_2_main_output",
+        input_slot="step_3_main_input",
+    ),
+    EdgeParams(
+        source_node="step_3",
+        target_node="results",
+        output_slot="step_3_main_output",
+        input_slot="result",
+    ),
+]
+
+TRIPLE_STEP_SCHEMA_PARAMS = (TRIPLE_STEP_NODES, TRIPLE_STEP_EDGES)
+
 
 BAD_COMBINED_TOPOLOGY_NODES = [
     InputStep(),
@@ -217,3 +287,68 @@ NESTED_TEMPLATED_STEPS_NODES = [
 
 
 NESTED_TEMPLATED_STEPS_SCHEMA_PARAMS = (NESTED_TEMPLATED_STEPS_NODES, SINGLE_STEP_EDGES)
+
+
+COMBINE_WITH_ITERATION_NODES = [
+    InputStep(),
+    LoopStep(
+        template_step=Step(
+            step_name="step_1",
+            input_slots=[
+                InputSlot(
+                    name="step_1_main_input",
+                    env_var="DUMMY_CONTAINER_MAIN_INPUT_FILE_PATHS",
+                    validator=validate_input_file_dummy,
+                )
+            ],
+            output_slots=[OutputSlot("step_1_main_output")],
+        ),
+        self_edges=[
+            EdgeParams(
+                source_node="step_1",
+                target_node="step_1",
+                output_slot="step_1_main_output",
+                input_slot="step_1_main_input",
+            ),
+        ],
+    ),
+    Step(
+        step_name="step_2",
+        input_slots=[
+            InputSlot(
+                name="step_2_main_input",
+                env_var="DUMMY_CONTAINER_MAIN_INPUT_FILE_PATHS",
+                validator=validate_input_file_dummy,
+            )
+        ],
+        output_slots=[OutputSlot("step_2_main_output")],
+    ),
+    OutputStep(
+        input_slots=[
+            InputSlot(name="result", env_var=None, validator=validate_input_file_dummy)
+        ],
+    ),
+]
+DOUBLE_STEP_EDGES = [
+    EdgeParams(
+        source_node="input_data",
+        target_node="step_1",
+        output_slot="all",
+        input_slot="step_1_main_input",
+    ),
+    EdgeParams(
+        source_node="step_1",
+        target_node="step_2",
+        output_slot="step_1_main_output",
+        input_slot="step_2_main_input",
+    ),
+    EdgeParams(
+        source_node="step_2",
+        target_node="results",
+        output_slot="step_2_main_output",
+        input_slot="result",
+    ),
+]
+
+
+COMBINE_WITH_ITERATION_SCHEMA_PARAMS = (COMBINE_WITH_ITERATION_NODES, DOUBLE_STEP_EDGES)
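
Note: each `*_SCHEMA_PARAMS` tuple above simply pairs a node list with an edge list; together they describe a directed acyclic pipeline graph running from the `input_data` node to the `results` node. As a minimal illustrative sketch (not easylink code — it models the `EdgeParams` entries of `TRIPLE_STEP_EDGES` as bare source/target pairs and uses networkx only to show the topology):

import networkx as nx

# Source/target pairs mirroring TRIPLE_STEP_EDGES above
edges = [
    ("input_data", "step_1"),
    ("step_1", "step_2"),
    ("step_2", "step_3"),
    ("step_3", "results"),
]
graph = nx.DiGraph(edges)
# The schema encodes a simple linear chain: one path from input to results
assert nx.is_directed_acyclic_graph(graph)
print(nx.shortest_path(graph, "input_data", "results"))
# ['input_data', 'step_1', 'step_2', 'step_3', 'results']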
easylink/rule.py CHANGED
@@ -31,16 +31,25 @@ class Rule(ABC):
         Path to the Snakefile to write the rule to.
         """
         with open(snakefile_path, "a") as f:
-            f.write(self._build_rule())
+            f.write(self.build_rule())
 
     @abstractmethod
-    def _build_rule(self) -> str:
+    def build_rule(self) -> str:
         """Builds the snakemake rule to be written to the Snakefile.
 
         This is an abstract method and must be implemented by concrete instances.
         """
         pass
 
+    @staticmethod
+    def get_input_slots_to_split(input_slots) -> list[str]:
+        input_slots_to_split = [
+            slot_name
+            for slot_name, slot_attrs in input_slots.items()
+            if slot_attrs.get("splitter", None)
+        ]
+        return input_slots_to_split
+
 
 @dataclass
 class TargetRule(Rule):
@@ -56,7 +65,7 @@ class TargetRule(Rule):
     requires_spark: bool
     """Whether or not this rule requires a Spark environment to run."""
 
-    def _build_rule(self) -> str:
+    def build_rule(self) -> str:
         """Builds the Snakemake rule for the final output of the pipeline."""
         outputs = [os.path.basename(file_path) for file_path in self.target_files]
         rulestring = f"""
@@ -110,38 +119,77 @@ class ImplementedRule(Rule):
     """Command to execute."""
     requires_spark: bool
     """Whether or not this ``Implementation`` requires a Spark environment."""
+    is_embarrassingly_parallel: bool = False
+    """Whether or not this ``Implementation`` is to be run in an embarrassingly
+    parallel way."""
 
-    def _build_rule(self) -> str:
+    def build_rule(self) -> str:
         """Builds the Snakemake rule for this ``Implementation``."""
-        return self._build_io() + self._build_resources() + self._build_shell_command()
+        return self._build_io() + self._build_resources() + self._build_shell_cmd()
 
     def _build_io(self) -> str:
         """Builds the input/output portion of the rule."""
-        return (
+        if self.is_embarrassingly_parallel:
+            # Processed chunks are sent to a 'processed' subdir
+            output_files = [
+                os.path.dirname(file_path)
+                + "/processed/{chunk}/"
+                + os.path.basename(file_path)
+                for file_path in self.output
+            ]
+            log_path_chunk_adder = "-{chunk}"
+        else:
+            output_files = self.output
+            log_path_chunk_adder = ""
+
+        io_str = (
             f"""
 rule:
     name: "{self.name}"
     message: "Running {self.step_name} implementation: {self.implementation_name}" """
             + self._build_input()
             + f"""
-    output: {self.output}
-    log: "{self.diagnostics_dir}/{self.name}-output.log"
+    output: {output_files}
+    log: "{self.diagnostics_dir}/{self.name}-output{log_path_chunk_adder}.log"
     container: "{self.image_path}" """
         )
+        return io_str
 
     def _build_input(self) -> str:
         input_str = f"""
     input:"""
-        for slot_attrs in self.input_slots.values():
+        input_slots_to_split = self.get_input_slots_to_split(self.input_slots)
+        for slot, attrs in self.input_slots.items():
+            env_var = attrs["env_var"].lower()
+            if len(input_slots_to_split) > 1:
+                raise NotImplementedError(
+                    "FIXME [MIC-5883] Multiple input slots to split not yet supported"
+                )
+            if self.is_embarrassingly_parallel and slot == input_slots_to_split[0]:
+                # The input to this is the input_chunks subdir from the checkpoint
+                # rule (which is built by modifying the output of the overall implementation)
+                if len(self.output) > 1:
+                    raise NotImplementedError(
+                        "FIXME [MIC-5883] Multiple output slots/files of EmbarrassinglyParallelSteps not yet supported"
+                    )
+                input_files = [
+                    os.path.dirname(self.output[0])
+                    + "/input_chunks/{chunk}/"
+                    + os.path.basename(self.output[0])
+                ]
+            else:
+                input_files = attrs["filepaths"]
+            input_str += f"""
+        {env_var}={input_files},"""
+        if not self.is_embarrassingly_parallel:
+            # validations were already handled in the checkpoint rule - no need
+            # to validate the individual chunks
             input_str += f"""
-        {slot_attrs["env_var"].lower()}={slot_attrs["filepaths"]},"""
-        input_str += f"""
-        validations={self.validations}, """
+        validations={self.validations},"""
         if self.requires_spark:
             input_str += f"""
         master_trigger=gather.num_workers(rules.wait_for_spark_worker.output),
-        master_url=rules.wait_for_spark_master.output,
-        """
+        master_url=rules.wait_for_spark_master.output,"""
         return input_str
 
     def _build_resources(self) -> str:
@@ -156,16 +204,46 @@ rule:
         cpus_per_task={self.resources['cpus_per_task']},
         slurm_extra="--output '{self.diagnostics_dir}/{self.name}-slurm-%j.log'" """
 
-    def _build_shell_command(self) -> str:
+    def _build_shell_cmd(self) -> str:
         """Builds the shell command portion of the rule."""
+        # TODO [MIC-5787]: handle multiple wildcards, e.g.
+        # output_paths = ",".join(self.output)
+        # wildcards_subdir = "/".join([f"{{wildcards.{wc}}}" for wc in self.wildcards])
+        # and then in shell cmd: export DUMMY_CONTAINER_OUTPUT_PATHS={output_paths}/{wildcards_subdir}
+        if self.is_embarrassingly_parallel:
+            if len(self.output) > 1:
+                raise NotImplementedError(
+                    "FIXME [MIC-5883] Multiple output slots/files of EmbarrassinglyParallelSteps not yet supported"
+                )
+            output_files = (
+                os.path.dirname(self.output[0])
+                + "/processed/{wildcards.chunk}/"
+                + os.path.basename(self.output[0])
+            )
+        else:
+            output_files = ",".join(self.output)
         shell_cmd = f"""
     shell:
         '''
-        export DUMMY_CONTAINER_OUTPUT_PATHS={",".join(self.output)}
+        export DUMMY_CONTAINER_OUTPUT_PATHS={output_files}
         export DUMMY_CONTAINER_DIAGNOSTICS_DIRECTORY={self.diagnostics_dir}"""
-        for slot_attrs in self.input_slots.values():
+        for input_slot_name, input_slot_attrs in self.input_slots.items():
+            input_slots_to_split = self.get_input_slots_to_split(self.input_slots)
+            if len(input_slots_to_split) > 1:
+                raise NotImplementedError(
+                    "FIXME [MIC-5883] Multiple input slots to split not yet supported"
+                )
+            if input_slot_name in input_slots_to_split:
+                # The inputs to this come from the input_chunks subdir
+                input_files = (
+                    os.path.dirname(self.output[0])
+                    + "/input_chunks/{wildcards.chunk}/"
+                    + os.path.basename(self.output[0])
+                )
+            else:
+                input_files = ",".join(input_slot_attrs["filepaths"])
             shell_cmd += f"""
-        export {slot_attrs["env_var"]}={",".join(slot_attrs["filepaths"])}"""
+        export {input_slot_attrs["env_var"]}={input_files}"""
         if self.requires_spark:
             shell_cmd += f"""
         read -r DUMMY_CONTAINER_SPARK_MASTER_URL < {{input.master_url}}
@@ -194,7 +272,7 @@ class InputValidationRule(Rule):
 
     name: str
     """Name of the rule."""
-    slot_name: str
+    input_slot_name: str
     """Name of the ``InputSlot``."""
     input: list[str]
     """List of filepaths to validate."""
@@ -203,14 +281,196 @@ class InputValidationRule(Rule):
     validator: Callable
     """Callable that takes a filepath as input. Raises an error if invalid."""
 
-    def _build_rule(self) -> str:
+    def build_rule(self) -> str:
+        """Builds the Snakemake rule for this validation.
+
+        This rule runs the appropriate validator function on each input file and
+        also creates an empty file at the end. This empty file is used by Snakemake
+        to build the graph edge from this rule to the next (since the validations
+        themselves don't generate any output).
+        """
         return f"""
 rule:
-    name: "{self.name}_{self.slot_name}_validator"
+    name: "{self.name}_{self.input_slot_name}_validator"
     input: {self.input}
     output: touch("{self.output}")
     localrule: True
-    message: "Validating {self.name} input slot {self.slot_name}"
+    message: "Validating {self.name} input slot {self.input_slot_name}"
     run:
         for f in input:
             validation_utils.{self.validator.__name__}(f)"""
+
+
+@dataclass
+class CheckpointRule(Rule):
+    """A :class:`Rule` that defines a checkpoint.
+
+    When running an :class:`~easylink.implementation.Implementation` in an embarrassingly
+    parallel way, we do not know until runtime how many parallel jobs there will
+    be (e.g. we don't know beforehand how many chunks a large incoming dataset will
+    be split into since the incoming dataset isn't created until runtime). The
+    Snakemake mechanism to handle this dynamic nature is a
+    `checkpoint <https://snakemake.readthedocs.io/en/stable/snakefiles/rules.html#data-dependent-conditional-execution/>`_
+    rule along with a directory as output.
+
+    Notes
+    -----
+    There is a known `Snakemake bug <https://github.com/snakemake/snakemake/issues/3036>`_
+    which prevents the use of multiple checkpoints in a single Snakefile. We
+    work around this by generating an empty checkpoint.txt file as part of this
+    rule. If this file does not yet exist when trying to run the :class:`AggregationRule`,
+    it means that the checkpoint has not yet been executed for the
+    particular wildcard value(s). In this case, we manually raise a Snakemake
+    ``IncompleteCheckpointException`` which Snakemake automatically handles
+    and leads to a re-evaluation after the checkpoint has successfully passed.
+
+    TODO [MIC-5658]: Thoroughly test this workaround when implementing caching.
+    """
+
+    name: str
+    """Name of the rule."""
+    input_slots: dict[str, dict[str, str | list[str]]]
+    """This ``Implementation's`` input slot attributes."""
+    validations: list[str]
+    """Validation files from the previous rule."""
+    output: list[str]
+    """Output directory path. It must be used as an input for the next rule."""
+
+    def build_rule(self) -> str:
+        """Builds the Snakemake rule for this checkpoint.
+
+        Checkpoint rules are a special type of rule in Snakemake that allow for dynamic
+        generation of output files. This rule is responsible for splitting the input
+        files into chunks. Note that the output of this rule is a Snakemake ``directory``
+        object as opposed to a specific file like typical rules have.
+        """
+        # Replace the output filepath with an input_chunks subdir
+        output_dir = os.path.dirname(self.output[0]) + "/input_chunks"
+        input_slots_to_split = self.get_input_slots_to_split(self.input_slots)
+        if len(input_slots_to_split) > 1:
+            raise NotImplementedError(
+                "FIXME [MIC-5883] Multiple input slots to split not yet supported"
+            )
+        input_slot_to_split = input_slots_to_split[0]
+        checkpoint = f"""
+checkpoint:
+    name: "split_{self.name}_{input_slot_to_split}"
+    input:
+        files={self.input_slots[input_slot_to_split]['filepaths']},
+        validations={self.validations},
+    output:
+        output_dir=directory("{output_dir}"),
+        checkpoint_file=touch("{output_dir}/checkpoint.txt"),
+    params:
+        input_files=lambda wildcards, input: ",".join(input.files),
+    localrule: True
+    message: "Splitting {self.name} {input_slot_to_split} into chunks"
+    run:
+        splitter_utils.{self.input_slots[input_slot_to_split]["splitter"].__name__}(
+            input_files=list(input.files),
+            output_dir=output.output_dir,
+            desired_chunk_size_mb=0.1,
+        )"""
+        return checkpoint
+
+
+@dataclass
+class AggregationRule(Rule):
+    """A :class:`Rule` that aggregates the processed chunks of output data.
+
+    When running an :class:`~easylink.implementation.Implementation` in an embarrassingly
+    parallel way, we need to aggregate the output files from each parallel job
+    into a single output file.
+    """
+
+    name: str
+    """Name of the rule."""
+    input_slots: dict[str, dict[str, str | list[str]]]
+    """This ``Implementation's`` input slot attributes."""
+    output_slot_name: str
+    """Name of the :class:`~easylink.graph_components.OutputSlot`."""
+    output_slot: dict[str, str | list[str]]
+    """The output slot attributes to create this rule for."""
+
+    def build_rule(self) -> str:
+        """Builds the Snakemake rule for this aggregator.
+
+        When running an :class:`~easylink.step.EmbarrassinglyParallelStep`, we need
+        to aggregate the output files from each parallel job into a single output file.
+        This rule relies on a dynamically generated aggregation function which returns
+        all of the **processed** chunks (from running the ``EmbarrassinglyParallelStep's``
+        container in parallel) and uses them as inputs to the actual aggregation
+        rule.
+
+        Notes
+        -----
+        There is a known `Snakemake bug <https://github.com/snakemake/snakemake/issues/3036>`_
+        which prevents the use of multiple checkpoints in a single Snakefile. We
+        work around this by generating an empty checkpoint.txt file in the
+        :class:`~CheckpointRule`. If this file does not yet exist when trying to
+        aggregate, it means that the checkpoint has not yet been executed for the
+        particular wildcard value(s). In this case, we manually raise a Snakemake
+        ``IncompleteCheckpointException`` which `Snakemake automatically handles
+        <https://snakemake.readthedocs.io/en/stable/snakefiles/rules.html#data-dependent-conditional-execution>`_
+        and leads to a re-evaluation after the checkpoint has successfully passed,
+        i.e. we replicate `Snakemake's behavior <https://github.com/snakemake/snakemake/blob/04f89d330dd94baa51f41bc796392f85bccbd231/snakemake/checkpoints.py#L42>`_.
+        """
+        input_function = self._define_input_function()
+        rule = self._define_aggregator_rule()
+        return input_function + rule
+
+    def _define_input_function(self):
+        """Builds the `input function <https://snakemake.readthedocs.io/en/stable/snakefiles/rules.html#input-functions>`_."""
+        if len(self.output_slot["filepaths"]) > 1:
+            raise NotImplementedError(
+                "FIXME [MIC-5883] Multiple output slots/files of EmbarrassinglyParallelSteps not yet supported"
+            )
+        if len(self.output_slot["filepaths"]) > 1:
+            raise NotImplementedError(
+                "FIXME [MIC-5883] Multiple slots/files of EmbarrassinglyParallelSteps not yet supported"
+            )
+        output_filepath = self.output_slot["filepaths"][0]
+        checkpoint_file_path = (
+            os.path.dirname(output_filepath) + "/input_chunks/checkpoint.txt"
+        )
+        input_slots_to_split = self.get_input_slots_to_split(self.input_slots)
+        if len(input_slots_to_split) > 1:
+            raise NotImplementedError(
+                "FIXME [MIC-5883] Multiple input slots to split not yet supported"
+            )
+        input_slot_to_split = input_slots_to_split[0]
+        checkpoint_name = f"checkpoints.split_{self.name}_{input_slot_to_split}"
+        output_files = (
+            os.path.dirname(output_filepath)
+            + "/processed/{chunk}/"
+            + os.path.basename(output_filepath)
+        )
+        func = f"""
+def get_aggregation_inputs_{self.name}_{self.output_slot_name}(wildcards):
+    checkpoint_file = "{checkpoint_file_path}"
+    if not os.path.exists(checkpoint_file):
+        output, _ = {checkpoint_name}.rule.expand_output(wildcards)
+        raise IncompleteCheckpointException({checkpoint_name}.rule, checkpoint_target(output[0]))
+    checkpoint_output = glob.glob(f"{{{checkpoint_name}.get(**wildcards).output.output_dir}}/*/")
+    chunks = [Path(filepath).parts[-1] for filepath in checkpoint_output]
+    return expand(
+        "{output_files}",
+        chunk=chunks
+    )"""
+        return func
+
+    def _define_aggregator_rule(self):
+        """Builds the rule that runs the aggregation."""
+        rule = f"""
+rule:
+    name: "aggregate_{self.name}_{self.output_slot_name}"
+    input: get_aggregation_inputs_{self.name}_{self.output_slot_name}
+    output: {self.output_slot["filepaths"]}
+    localrule: True
+    message: "Aggregating {self.name} {self.output_slot_name}"
+    run:
+        aggregator_utils.{self.output_slot["aggregator"].__name__}(
+            input_files=list(input),
+            output_filepath="{self.output_slot["filepaths"][0]}",
+        )"""
+        return rule
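
To see what the new `Rule.get_input_slots_to_split` helper does with the `input_slots` mappings used throughout these rules, here is a small self-contained sketch (the slot names and attribute values are illustrative, not taken from a real pipeline):

# Hypothetical input_slots mapping in the shape the rules above expect;
# only the first slot carries a "splitter" callable.
input_slots = {
    "step_1_main_input": {
        "env_var": "DUMMY_CONTAINER_MAIN_INPUT_FILE_PATHS",
        "filepaths": ["intermediate/step_1/result.parquet"],
        "splitter": lambda **kwargs: None,  # stand-in for a real splitter
    },
    "step_1_secondary_input": {
        "env_var": "DUMMY_CONTAINER_SECONDARY_INPUT_FILE_PATHS",
        "filepaths": ["input_data/file1.parquet"],
        # no "splitter" key: this slot is never chunked
    },
}

# Same logic as Rule.get_input_slots_to_split above
input_slots_to_split = [
    slot_name
    for slot_name, slot_attrs in input_slots.items()
    if slot_attrs.get("splitter", None)
]
print(input_slots_to_split)  # ['step_1_main_input']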
easylink/runner.py CHANGED
@@ -113,6 +113,7 @@ def main(
     ]
     argv.extend(environment_args)
     logger.info(f"Running Snakemake")
+    logger.debug(f"Snakemake arguments: {argv}")
     snake_main(argv)
 
 
easylink/step.py CHANGED
@@ -1064,6 +1064,65 @@ class ParallelStep(TemplatedStep):
         return {"input": input_mappings, "output": output_mappings}
 
 
+class EmbarrassinglyParallelStep(Step):
+    """A step that is run in parallel on the backend.
+
+    An ``EmbarrassinglyParallelStep`` differs from a :class:`ParallelStep`
+    in that it is not configured by the user to be run in parallel - it happens
+    entirely on the back end for performance reasons. As such, note that it inherits
+    from :class:`Step` instead of :class:`TemplatedStep`.
+    """
+
+    def __init__(
+        self,
+        step_name: str,
+        input_slots: Iterable[InputSlot],
+        output_slots: Iterable[OutputSlot],
+    ) -> None:
+        super().__init__(step_name, input_slots=input_slots, output_slots=output_slots)
+        self._validate()
+
+    def _validate(self) -> None:
+        """Validates the ``EmbarrassinglyParallelStep``.
+
+        ``EmbarrassinglyParallelSteps`` are not configured by the user to be run
+        in parallel. Since this happens on the back end, we need to do somewhat unique
+        validations during construction. Specifically,
+        - one and only one :class:`~easylink.graph_components.InputSlot` *must* include
+          a :attr:`~easylink.graph_components.InputSlot.splitter` method.
+        - all :class:`OutputSlots<easylink.graph_components.OutputSlot>` *must* include
+          an :attr:`~easylink.graph_components.OutputSlot.aggregator` method.
+        """
+        errors = []
+        # assert that one and only one input slot has a splitter assigned
+        splitters = {
+            slot.name: slot.splitter.__name__
+            for slot in self.input_slots.values()
+            if slot.splitter
+        }
+        if len(splitters) == 0:
+            errors.append(
+                f"EmbarrassinglyParallelStep '{self.step_name}' does not have any input slots with a "
+                "splitter method assigned; one and only one input slot must have a splitter."
+            )
+        if len(splitters) > 1:
+            errors.append(
+                f"EmbarrassinglyParallelStep '{self.step_name}' has multiple input slots with "
+                "splitter methods assigned; one and only one input slot must have a splitter.\n"
+                f"Input slots with splitters: {splitters}"
+            )
+        missing_aggregators = [
+            slot.name for slot in self.output_slots.values() if not slot.aggregator
+        ]
+        if len(missing_aggregators) != 0:
+            errors.append(
+                f"EmbarrassinglyParallelStep '{self.step_name}' has output slots without "
+                f"aggregator methods assigned: {missing_aggregators}"
+            )
+        if errors:
+            raise ValueError("\n".join(errors))
+
+
 class ChoiceStep(Step):
     """A type of :class:`Step` that allows for choosing between multiple paths.
 
@@ -1361,6 +1420,11 @@ class LeafConfigurationState(ConfigurationState):
         implementation_graph = ImplementationGraph()
         implementation_node_name = self._step.implementation_node_name
         if self.is_combined:
+            if isinstance(self._step, EmbarrassinglyParallelStep):
+                raise NotImplementedError(
+                    "Combining implementations with embarrassingly parallel steps "
+                    "is not yet supported."
+                )
             implementation = PartialImplementation(
                 combined_name=self.pipeline_config[COMBINED_IMPLEMENTATION_KEY],
                 schema_step=self._step.step_name,
@@ -1373,6 +1437,7 @@ class LeafConfigurationState(ConfigurationState):
                 implementation_config=self.implementation_config,
                 input_slots=self._step.input_slots.values(),
                 output_slots=self._step.output_slots.values(),
+                is_embarrassingly_parallel=isinstance(self._step, EmbarrassinglyParallelStep),
             )
             implementation_graph.add_node_from_implementation(
                 implementation_node_name,
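
The `_validate` hook above means a misconfigured `EmbarrassinglyParallelStep` fails at construction time rather than at pipeline runtime. A hedged usage sketch (it assumes `InputSlot` and `OutputSlot` accept `splitter`/`aggregator` keyword arguments, as the attribute references in the docstring suggest; all names and callables below are illustrative):

from easylink.graph_components import InputSlot, OutputSlot
from easylink.step import EmbarrassinglyParallelStep

def dummy_validator(filepath: str) -> None: ...
def dummy_splitter(input_files, output_dir, desired_chunk_size_mb) -> None: ...
def dummy_aggregator(input_files, output_filepath) -> None: ...

# Exactly one input slot carries a splitter and every output slot carries
# an aggregator, so _validate() passes; drop either and ValueError is raised.
step = EmbarrassinglyParallelStep(
    step_name="step_1",
    input_slots=[
        InputSlot(
            name="step_1_main_input",
            env_var="DUMMY_CONTAINER_MAIN_INPUT_FILE_PATHS",
            validator=dummy_validator,
            splitter=dummy_splitter,
        )
    ],
    output_slots=[OutputSlot("step_1_main_output", aggregator=dummy_aggregator)],
)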
easylink/utilities/aggregator_utils.py ADDED
@@ -0,0 +1,31 @@
+"""
+==========================
+Data Aggregating Utilities
+==========================
+
+This module contains utility functions for aggregating datasets. One primary use
+case for this is to combine the results of running sections of the pipeline in an
+embarrassingly parallel manner.
+
+Note that it is critical that all data aggregating utility functions are defined
+in this module; easylink will not be able to find them otherwise.
+"""
+
+import pandas as pd
+from loguru import logger
+
+
+def concatenate_datasets(input_files: list[str], output_filepath: str) -> None:
+    """Concatenates multiple datasets into a single one.
+
+    Parameters
+    ----------
+    input_files
+        A list of input file paths to be concatenated.
+    output_filepath
+        The output filepath.
+    """
+    logger.info(f"Concatenating {len(input_files)} datasets")
+    dfs = [pd.read_parquet(filepath) for filepath in input_files]
+    df = pd.concat(dfs, ignore_index=True)
+    df.to_parquet(output_filepath)
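
A quick usage sketch for the new aggregator utility (the file paths are illustrative, and the `easylink.utilities.aggregator_utils` module path is assumed from the package layout):

import pandas as pd
from easylink.utilities.aggregator_utils import concatenate_datasets

# Write two small parquet chunks, then aggregate them into a single file.
pd.DataFrame({"id": [1, 2]}).to_parquet("chunk_0.parquet")
pd.DataFrame({"id": [3, 4]}).to_parquet("chunk_1.parquet")

concatenate_datasets(
    input_files=["chunk_0.parquet", "chunk_1.parquet"],
    output_filepath="result.parquet",
)
assert len(pd.read_parquet("result.parquet")) == 4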
@@ -1,3 +1,4 @@
+# mypy: ignore-errors
 import os
 import shutil
 from datetime import datetime
@@ -1,3 +1,4 @@
+# mypy: ignore-errors
 import errno
 import functools
 import shutil