easylink 0.1.6__py3-none-any.whl → 0.1.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- easylink/_version.py +1 -1
- easylink/cli.py +18 -9
- easylink/graph_components.py +12 -2
- easylink/implementation.py +2 -0
- easylink/pipeline.py +92 -34
- easylink/pipeline_graph.py +112 -27
- easylink/pipeline_schema_constants/__init__.py +3 -0
- easylink/pipeline_schema_constants/development.py +11 -2
- easylink/pipeline_schema_constants/testing.py +135 -0
- easylink/rule.py +282 -22
- easylink/runner.py +1 -0
- easylink/step.py +65 -0
- easylink/utilities/aggregator_utils.py +31 -0
- easylink/utilities/data_utils.py +1 -0
- easylink/utilities/general_utils.py +1 -0
- easylink/utilities/splitter_utils.py +71 -0
- {easylink-0.1.6.dist-info → easylink-0.1.7.dist-info}/METADATA +1 -1
- {easylink-0.1.6.dist-info → easylink-0.1.7.dist-info}/RECORD +21 -19
- {easylink-0.1.6.dist-info → easylink-0.1.7.dist-info}/WHEEL +1 -1
- {easylink-0.1.6.dist-info → easylink-0.1.7.dist-info}/entry_points.txt +0 -0
- {easylink-0.1.6.dist-info → easylink-0.1.7.dist-info}/top_level.txt +0 -0
easylink/pipeline_schema_constants/testing.py
CHANGED
@@ -57,6 +57,76 @@ SINGLE_STEP_EDGES = [
 
 SINGLE_STEP_SCHEMA_PARAMS = (SINGLE_STEP_NODES, SINGLE_STEP_EDGES)
 
+TRIPLE_STEP_NODES = [
+    InputStep(),
+    Step(
+        step_name="step_1",
+        input_slots=[
+            InputSlot(
+                name="step_1_main_input",
+                env_var="DUMMY_CONTAINER_MAIN_INPUT_FILE_PATHS",
+                validator=validate_input_file_dummy,
+            )
+        ],
+        output_slots=[OutputSlot("step_1_main_output")],
+    ),
+    Step(
+        step_name="step_2",
+        input_slots=[
+            InputSlot(
+                name="step_2_main_input",
+                env_var="DUMMY_CONTAINER_MAIN_INPUT_FILE_PATHS",
+                validator=validate_input_file_dummy,
+            )
+        ],
+        output_slots=[OutputSlot("step_2_main_output")],
+    ),
+    Step(
+        step_name="step_3",
+        input_slots=[
+            InputSlot(
+                name="step_3_main_input",
+                env_var="DUMMY_CONTAINER_MAIN_INPUT_FILE_PATHS",
+                validator=validate_input_file_dummy,
+            )
+        ],
+        output_slots=[OutputSlot("step_3_main_output")],
+    ),
+    OutputStep(
+        input_slots=[
+            InputSlot(name="result", env_var=None, validator=validate_input_file_dummy)
+        ],
+    ),
+]
+TRIPLE_STEP_EDGES = [
+    EdgeParams(
+        source_node="input_data",
+        target_node="step_1",
+        output_slot="all",
+        input_slot="step_1_main_input",
+    ),
+    EdgeParams(
+        source_node="step_1",
+        target_node="step_2",
+        output_slot="step_1_main_output",
+        input_slot="step_2_main_input",
+    ),
+    EdgeParams(
+        source_node="step_2",
+        target_node="step_3",
+        output_slot="step_2_main_output",
+        input_slot="step_3_main_input",
+    ),
+    EdgeParams(
+        source_node="step_3",
+        target_node="results",
+        output_slot="step_3_main_output",
+        input_slot="result",
+    ),
+]
+
+TRIPLE_STEP_SCHEMA_PARAMS = (TRIPLE_STEP_NODES, TRIPLE_STEP_EDGES)
+
 
 BAD_COMBINED_TOPOLOGY_NODES = [
     InputStep(),
@@ -217,3 +287,68 @@ NESTED_TEMPLATED_STEPS_NODES = [
 
 
 NESTED_TEMPLATED_STEPS_SCHEMA_PARAMS = (NESTED_TEMPLATED_STEPS_NODES, SINGLE_STEP_EDGES)
+
+
+COMBINE_WITH_ITERATION_NODES = [
+    InputStep(),
+    LoopStep(
+        template_step=Step(
+            step_name="step_1",
+            input_slots=[
+                InputSlot(
+                    name="step_1_main_input",
+                    env_var="DUMMY_CONTAINER_MAIN_INPUT_FILE_PATHS",
+                    validator=validate_input_file_dummy,
+                )
+            ],
+            output_slots=[OutputSlot("step_1_main_output")],
+        ),
+        self_edges=[
+            EdgeParams(
+                source_node="step_1",
+                target_node="step_1",
+                output_slot="step_1_main_output",
+                input_slot="step_1_main_input",
+            ),
+        ],
+    ),
+    Step(
+        step_name="step_2",
+        input_slots=[
+            InputSlot(
+                name="step_2_main_input",
+                env_var="DUMMY_CONTAINER_MAIN_INPUT_FILE_PATHS",
+                validator=validate_input_file_dummy,
+            )
+        ],
+        output_slots=[OutputSlot("step_2_main_output")],
+    ),
+    OutputStep(
+        input_slots=[
+            InputSlot(name="result", env_var=None, validator=validate_input_file_dummy)
+        ],
+    ),
+]
+DOUBLE_STEP_EDGES = [
+    EdgeParams(
+        source_node="input_data",
+        target_node="step_1",
+        output_slot="all",
+        input_slot="step_1_main_input",
+    ),
+    EdgeParams(
+        source_node="step_1",
+        target_node="step_2",
+        output_slot="step_1_main_output",
+        input_slot="step_2_main_input",
+    ),
+    EdgeParams(
+        source_node="step_2",
+        target_node="results",
+        output_slot="step_2_main_output",
+        input_slot="result",
+    ),
+]
+
+
+COMBINE_WITH_ITERATION_SCHEMA_PARAMS = (COMBINE_WITH_ITERATION_NODES, DOUBLE_STEP_EDGES)
easylink/rule.py
CHANGED
@@ -31,16 +31,25 @@ class Rule(ABC):
         Path to the Snakefile to write the rule to.
         """
         with open(snakefile_path, "a") as f:
-            f.write(self.
+            f.write(self.build_rule())
 
     @abstractmethod
-    def
+    def build_rule(self) -> str:
         """Builds the snakemake rule to be written to the Snakefile.
 
         This is an abstract method and must be implemented by concrete instances.
         """
         pass
 
+    @staticmethod
+    def get_input_slots_to_split(input_slots) -> list[str]:
+        input_slots_to_split = [
+            slot_name
+            for slot_name, slot_attrs in input_slots.items()
+            if slot_attrs.get("splitter", None)
+        ]
+        return input_slots_to_split
+
 
 @dataclass
 class TargetRule(Rule):
@@ -56,7 +65,7 @@ class TargetRule(Rule):
     requires_spark: bool
     """Whether or not this rule requires a Spark environment to run."""
 
-    def
+    def build_rule(self) -> str:
         """Builds the Snakemake rule for the final output of the pipeline."""
         outputs = [os.path.basename(file_path) for file_path in self.target_files]
         rulestring = f"""
@@ -110,38 +119,77 @@ class ImplementedRule(Rule):
     """Command to execute."""
     requires_spark: bool
     """Whether or not this ``Implementation`` requires a Spark environment."""
+    is_embarrassingly_parallel: bool = False
+    """Whether or not this ``Implementation`` is to be run in an embarrassingly
+    parallel way."""
 
-    def
+    def build_rule(self) -> str:
         """Builds the Snakemake rule for this ``Implementation``."""
-        return self._build_io() + self._build_resources() + self.
+        return self._build_io() + self._build_resources() + self._build_shell_cmd()
 
     def _build_io(self) -> str:
         """Builds the input/output portion of the rule."""
-
+        if self.is_embarrassingly_parallel:
+            # Processed chunks are sent to a 'processed' subdir
+            output_files = [
+                os.path.dirname(file_path)
+                + "/processed/{chunk}/"
+                + os.path.basename(file_path)
+                for file_path in self.output
+            ]
+            log_path_chunk_adder = "-{chunk}"
+        else:
+            output_files = self.output
+            log_path_chunk_adder = ""
+
+        io_str = (
             f"""
 rule:
     name: "{self.name}"
     message: "Running {self.step_name} implementation: {self.implementation_name}" """
             + self._build_input()
             + f"""
-    output: {
-    log: "{self.diagnostics_dir}/{self.name}-output.log"
+    output: {output_files}
+    log: "{self.diagnostics_dir}/{self.name}-output{log_path_chunk_adder}.log"
     container: "{self.image_path}" """
         )
+        return io_str
 
     def _build_input(self) -> str:
         input_str = f"""
     input:"""
-
+        input_slots_to_split = self.get_input_slots_to_split(self.input_slots)
+        for slot, attrs in self.input_slots.items():
+            env_var = attrs["env_var"].lower()
+            if len(input_slots_to_split) > 1:
+                raise NotImplementedError(
+                    "FIXME [MIC-5883] Multiple input slots to split not yet supported"
+                )
+            if self.is_embarrassingly_parallel and slot == input_slots_to_split[0]:
+                # The input to this is the input_chunks subdir from the checkpoint
+                # rule (which is built by modifying the output of the overall implementation)
+                if len(self.output) > 1:
+                    raise NotImplementedError(
+                        "FIXME [MIC-5883] Multiple output slots/files of EmbarrassinglyParallelSteps not yet supported"
+                    )
+                input_files = [
+                    os.path.dirname(self.output[0])
+                    + "/input_chunks/{chunk}/"
+                    + os.path.basename(self.output[0])
+                ]
+            else:
+                input_files = attrs["filepaths"]
+            input_str += f"""
+        {env_var}={input_files},"""
+        if not self.is_embarrassingly_parallel:
+            # validations were already handled in the checkpoint rule - no need
+            # to validate the individual chunks
             input_str += f"""
-        {
-        input_str += f"""
-        validations={self.validations}, """
+        validations={self.validations},"""
         if self.requires_spark:
             input_str += f"""
         master_trigger=gather.num_workers(rules.wait_for_spark_worker.output),
-        master_url=rules.wait_for_spark_master.output,
-        """
+        master_url=rules.wait_for_spark_master.output,"""
         return input_str
 
     def _build_resources(self) -> str:
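The chunk-aware path handling above is easiest to see with concrete values. A small sketch (paths invented) of the rewrites `_build_io` and `_build_input` perform when `is_embarrassingly_parallel` is set:

```python
import os

# Hypothetical single declared output of an implementation.
output = ["intermediate/step_1_python_pandas/result.parquet"]

# _build_io: each output lands in a 'processed/{chunk}' subdirectory, where {chunk}
# is a Snakemake wildcard resolved at runtime.
chunked_outputs = [
    os.path.dirname(fp) + "/processed/{chunk}/" + os.path.basename(fp) for fp in output
]
print(chunked_outputs)
# ['intermediate/step_1_python_pandas/processed/{chunk}/result.parquet']

# _build_input: the slot being split reads from the 'input_chunks/{chunk}' subdirectory
# that the checkpoint rule (defined later in this module) populates.
chunked_inputs = [
    os.path.dirname(output[0]) + "/input_chunks/{chunk}/" + os.path.basename(output[0])
]
print(chunked_inputs)
# ['intermediate/step_1_python_pandas/input_chunks/{chunk}/result.parquet']
```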
@@ -156,16 +204,46 @@ rule:
         cpus_per_task={self.resources['cpus_per_task']},
         slurm_extra="--output '{self.diagnostics_dir}/{self.name}-slurm-%j.log'" """
 
-    def
+    def _build_shell_cmd(self) -> str:
         """Builds the shell command portion of the rule."""
+        # TODO [MIC-5787]: handle multiple wildcards, e.g.
+        # output_paths = ",".join(self.output)
+        # wildcards_subdir = "/".join([f"{{wildcards.{wc}}}" for wc in self.wildcards])
+        # and then in shell cmd: export DUMMY_CONTAINER_OUTPUT_PATHS={output_paths}/{wildcards_subdir}
+        if self.is_embarrassingly_parallel:
+            if len(self.output) > 1:
+                raise NotImplementedError(
+                    "FIXME [MIC-5883] Multiple output slots/files of EmbarrassinglyParallelSteps not yet supported"
+                )
+            output_files = (
+                os.path.dirname(self.output[0])
+                + "/processed/{wildcards.chunk}/"
+                + os.path.basename(self.output[0])
+            )
+        else:
+            output_files = ",".join(self.output)
         shell_cmd = f"""
     shell:
         '''
-        export DUMMY_CONTAINER_OUTPUT_PATHS={
+        export DUMMY_CONTAINER_OUTPUT_PATHS={output_files}
         export DUMMY_CONTAINER_DIAGNOSTICS_DIRECTORY={self.diagnostics_dir}"""
-        for
+        for input_slot_name, input_slot_attrs in self.input_slots.items():
+            input_slots_to_split = self.get_input_slots_to_split(self.input_slots)
+            if len(input_slots_to_split) > 1:
+                raise NotImplementedError(
+                    "FIXME [MIC-5883] Multiple input slots to split not yet supported"
+                )
+            if input_slot_name in input_slots_to_split:
+                # The inputs to this come from the input_chunks subdir
+                input_files = (
+                    os.path.dirname(self.output[0])
+                    + "/input_chunks/{wildcards.chunk}/"
+                    + os.path.basename(self.output[0])
+                )
+            else:
+                input_files = ",".join(input_slot_attrs["filepaths"])
             shell_cmd += f"""
-        export {
+        export {input_slot_attrs["env_var"]}={input_files}"""
         if self.requires_spark:
             shell_cmd += f"""
         read -r DUMMY_CONTAINER_SPARK_MASTER_URL < {{input.master_url}}
@@ -194,7 +272,7 @@ class InputValidationRule(Rule):
 
     name: str
     """Name of the rule."""
-
+    input_slot_name: str
     """Name of the ``InputSlot``."""
     input: list[str]
     """List of filepaths to validate."""
@@ -203,14 +281,196 @@ class InputValidationRule(Rule):
     validator: Callable
     """Callable that takes a filepath as input. Raises an error if invalid."""
 
-    def
+    def build_rule(self) -> str:
+        """Builds the Snakemake rule for this validation.
+
+        This rule runs the appropriate validator function on each input file as well
+        as creates an empty file at the end. This empty file is used by Snakemake
+        to build the graph edge from this rule to the next (since the validations
+        themselves don't generate any output).
+        """
         return f"""
 rule:
-    name: "{self.name}_{self.
+    name: "{self.name}_{self.input_slot_name}_validator"
     input: {self.input}
     output: touch("{self.output}")
     localrule: True
-    message: "Validating {self.name} input slot {self.
+    message: "Validating {self.name} input slot {self.input_slot_name}"
     run:
         for f in input:
             validation_utils.{self.validator.__name__}(f)"""
+
+
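For context, a hedged sketch of rendering one of these validation rules; the names and paths are invented, and a local stub stands in for the real validator (at runtime the generated rule expects the validator to be importable from the pipeline's validation utilities):

```python
from easylink.rule import InputValidationRule


def toy_validator(filepath: str) -> None:
    """Stand-in validator; a real one raises if the file is invalid."""


rule = InputValidationRule(
    name="step_1_python_pandas",
    input_slot_name="step_1_main_input",
    input=["input_data/file1.parquet"],
    output="intermediate/step_1_python_pandas/input_validations/step_1_main_input_validator",
    validator=toy_validator,
)

# Prints a localrule that validates each input file and touch()es an empty marker
# file, which downstream rules depend on to form the graph edge.
print(rule.build_rule())
```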
+@dataclass
+class CheckpointRule(Rule):
+    """A :class:`Rule` that defines a checkpoint.
+
+    When running an :class:`~easylink.implementation.Implementation` in an embarrassingly
+    parallel way, we do not know until runtime how many parallel jobs there will
+    be (e.g. we don't know beforehand how many chunks a large incoming dataset will
+    be split into since the incoming dataset isn't created until runtime). The
+    snakemake mechanism to handle this dynamic nature is a
+    `checkpoint <https://snakemake.readthedocs.io/en/stable/snakefiles/rules.html#data-dependent-conditional-execution/>`_
+    rule along with a directory as output.
+
+    Notes
+    -----
+    There is a known `Snakemake bug <https://github.com/snakemake/snakemake/issues/3036>`_
+    which prevents the use of multiple checkpoints in a single Snakefile. We
+    work around this by generating an empty checkpoint.txt file as part of this
+    rule. If this file does not yet exist when trying to run the :class:`AggregationRule`,
+    it means that the checkpoint has not yet been executed for the
+    particular wildcard value(s). In this case, we manually raise a Snakemake
+    ``IncompleteCheckpointException`` which Snakemake automatically handles
+    and leads to a re-evaluation after the checkpoint has successfully passed.
+
+    TODO [MIC-5658]: Thoroughly test this workaround when implementing caching.
+    """
+
+    name: str
+    """Name of the rule."""
+    input_slots: dict[str, dict[str, str | list[str]]]
+    """This ``Implementation's`` input slot attributes."""
+    validations: list[str]
+    """Validation files from previous rule."""
+    output: list[str]
+    """Output directory path. It must be used as an input for the next rule."""
+
+    def build_rule(self) -> str:
+        """Builds the Snakemake rule for this checkpoint.
+
+        Checkpoint rules are a special type of rule in Snakemake that allow for dynamic
+        generation of output files. This rule is responsible for splitting the input
+        files into chunks. Note that the output of this rule is a Snakemake ``directory``
+        object as opposed to a specific file like typical rules have.
+        """
+        # Replace the output filepath with an input_chunks subdir
+        output_dir = os.path.dirname(self.output[0]) + "/input_chunks"
+        input_slots_to_split = self.get_input_slots_to_split(self.input_slots)
+        if len(input_slots_to_split) > 1:
+            raise NotImplementedError(
+                "FIXME [MIC-5883] Multiple input slots to split not yet supported"
+            )
+        input_slot_to_split = input_slots_to_split[0]
+        checkpoint = f"""
+checkpoint:
+    name: "split_{self.name}_{input_slot_to_split}"
+    input:
+        files={self.input_slots[input_slot_to_split]['filepaths']},
+        validations={self.validations},
+    output:
+        output_dir=directory("{output_dir}"),
+        checkpoint_file=touch("{output_dir}/checkpoint.txt"),
+    params:
+        input_files=lambda wildcards, input: ",".join(input.files),
+    localrule: True
+    message: "Splitting {self.name} {input_slot_to_split} into chunks"
+    run:
+        splitter_utils.{self.input_slots[input_slot_to_split]["splitter"].__name__}(
+            input_files=list(input.files),
+            output_dir=output.output_dir,
+            desired_chunk_size_mb=0.1,
+        )"""
+        return checkpoint
+
+
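The generated checkpoint calls a splitter from easylink/utilities/splitter_utils.py (added in this release; its hunk is not shown here) with the keyword arguments visible above. A hedged sketch of a splitter with that call signature; the function name and chunking strategy are illustrative, not necessarily what ships in 0.1.7:

```python
import math
from pathlib import Path

import pandas as pd


def split_data_by_size_sketch(
    input_files: list[str], output_dir: str, desired_chunk_size_mb: float
) -> None:
    """Illustrative splitter matching the signature the checkpoint rule expects."""
    df = pd.concat([pd.read_parquet(f) for f in input_files], ignore_index=True)
    # Estimate how many chunks are needed to hit the requested chunk size.
    total_mb = df.memory_usage(deep=True).sum() / 1024**2
    n_chunks = max(1, math.ceil(total_mb / desired_chunk_size_mb))
    rows_per_chunk = max(1, math.ceil(len(df) / n_chunks))
    for i in range(n_chunks):
        chunk = df.iloc[i * rows_per_chunk : (i + 1) * rows_per_chunk]
        chunk_dir = Path(output_dir) / f"chunk_{i}"
        chunk_dir.mkdir(parents=True, exist_ok=True)
        # File name is illustrative; the real splitters define the layout the
        # downstream chunk-processing and aggregation rules expect.
        chunk.to_parquet(chunk_dir / "result.parquet")
```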
+@dataclass
+class AggregationRule(Rule):
+    """A :class:`Rule` that aggregates the processed chunks of output data.
+
+    When running an :class:`~easylink.implementation.Implementation` in an embarrassingly
+    parallel way, we need to aggregate the output files from each parallel job
+    into a single output file.
+    """
+
+    name: str
+    """Name of the rule."""
+    input_slots: dict[str, dict[str, str | list[str]]]
+    """This ``Implementation's`` input slot attributes."""
+    output_slot_name: str
+    """Name of the :class:`~easylink.graph_components.OutputSlot`."""
+    output_slot: dict[str, str | list[str]]
+    """The output slot attributes to create this rule for."""
+
+    def build_rule(self) -> str:
+        """Builds the Snakemake rule for this aggregator.
+
+        When running an :class:`~easylink.step.EmbarrassinglyParallelStep`, we need
+        to aggregate the output files from each parallel job into a single output file.
+        This rule relies on a dynamically generated aggregation function which returns
+        all of the **processed** chunks (from running the ``EmbarrassinglyParallelStep's``
+        container in parallel) and uses them as inputs to the actual aggregation
+        rule.
+
+        Notes
+        -----
+        There is a known `Snakemake bug <https://github.com/snakemake/snakemake/issues/3036>`_
+        which prevents the use of multiple checkpoints in a single Snakefile. We
+        work around this by generating an empty checkpoint.txt file in the
+        :class:`~CheckpointRule`. If this file does not yet exist when trying to
+        aggregate, it means that the checkpoint has not yet been executed for the
+        particular wildcard value(s). In this case, we manually raise a Snakemake
+        ``IncompleteCheckpointException`` which `Snakemake automatically handles
+        <https://snakemake.readthedocs.io/en/stable/snakefiles/rules.html#data-dependent-conditional-execution>`_
+        and leads to a re-evaluation after the checkpoint has successfully passed,
+        i.e. we replicate `Snakemake's behavior <https://github.com/snakemake/snakemake/blob/04f89d330dd94baa51f41bc796392f85bccbd231/snakemake/checkpoints.py#L42>`_.
+        """
+        input_function = self._define_input_function()
+        rule = self._define_aggregator_rule()
+        return input_function + rule
+
+    def _define_input_function(self):
+        """Builds the `input function <https://snakemake.readthedocs.io/en/stable/snakefiles/rules.html#input-functions>`_."""
+        if len(self.output_slot["filepaths"]) > 1:
+            raise NotImplementedError(
+                "FIXME [MIC-5883] Multiple output slots/files of EmbarrassinglyParallelSteps not yet supported"
+            )
+        if len(self.output_slot["filepaths"]) > 1:
+            raise NotImplementedError(
+                "FIXME [MIC-5883] Multiple slots/files of EmbarrassinglyParallelSteps not yet supported"
+            )
+        output_filepath = self.output_slot["filepaths"][0]
+        checkpoint_file_path = (
+            os.path.dirname(output_filepath) + "/input_chunks/checkpoint.txt"
+        )
+        input_slots_to_split = self.get_input_slots_to_split(self.input_slots)
+        if len(input_slots_to_split) > 1:
+            raise NotImplementedError(
+                "FIXME [MIC-5883] Multiple input slots to split not yet supported"
+            )
+        input_slot_to_split = input_slots_to_split[0]
+        checkpoint_name = f"checkpoints.split_{self.name}_{input_slot_to_split}"
+        output_files = (
+            os.path.dirname(output_filepath)
+            + "/processed/{chunk}/"
+            + os.path.basename(output_filepath)
+        )
+        func = f"""
+def get_aggregation_inputs_{self.name}_{self.output_slot_name}(wildcards):
+    checkpoint_file = "{checkpoint_file_path}"
+    if not os.path.exists(checkpoint_file):
+        output, _ = {checkpoint_name}.rule.expand_output(wildcards)
+        raise IncompleteCheckpointException({checkpoint_name}.rule, checkpoint_target(output[0]))
+    checkpoint_output = glob.glob(f"{{{checkpoint_name}.get(**wildcards).output.output_dir}}/*/")
+    chunks = [Path(filepath).parts[-1] for filepath in checkpoint_output]
+    return expand(
+        "{output_files}",
+        chunk=chunks
+    )"""
+        return func
+
+    def _define_aggregator_rule(self):
+        """Builds the rule that runs the aggregation."""
+        rule = f"""
+rule:
+    name: "aggregate_{self.name}_{self.output_slot_name}"
+    input: get_aggregation_inputs_{self.name}_{self.output_slot_name}
+    output: {self.output_slot["filepaths"]}
+    localrule: True
+    message: "Aggregating {self.name} {self.output_slot_name}"
+    run:
+        aggregator_utils.{self.output_slot["aggregator"].__name__}(
+            input_files=list(input),
+            output_filepath="{self.output_slot["filepaths"][0]}",
+        )"""
+        return rule
easylink/runner.py
CHANGED
easylink/step.py
CHANGED
@@ -1064,6 +1064,65 @@ class ParallelStep(TemplatedStep):
         return {"input": input_mappings, "output": output_mappings}
 
 
+class EmbarrassinglyParallelStep(Step):
+    """A step that is run in parallel on the backend.
+
+    An ``EmbarrassinglyParallelStep`` is different than a :class:`ParallelStep`
+    in that it is not configured by the user to be run in parallel - it completely
+    happens on the back end for performance reasons. As such, note that it inherits
+    from :class:`Step` instead of :class:`TemplatedStep`.
+    """
+
+    def __init__(
+        self,
+        step_name: str,
+        input_slots: Iterable[InputSlot],
+        output_slots: Iterable[OutputSlot],
+    ) -> None:
+        super().__init__(step_name, input_slots=input_slots, output_slots=output_slots)
+        self._validate()
+
+    def _validate(self) -> None:
+        """Validates the ``EmbarrassinglyParallelStep``.
+
+        ``EmbarrassinglyParallelSteps`` are not configured by the user to be run
+        in parallel. Since it happens on the back end, we need to do somewhat unique
+        validations during construction. Specifically,
+        - one and only one :class:`~easylink.graph_components.InputSlot` *must* include
+          a :attr:`~easylink.graph_components.InputSlot.splitter` method.
+        - all :class:`OutputSlots<easylink.graph_components.OutputSlot>` *must* include
+          an :attr:`~easylink.graph_components.OutputSlot.aggregator` method.
+        """
+        errors = []
+        # assert that only one input slot has a splitter assigned
+        splitters = {
+            slot.name: slot.splitter.__name__
+            for slot in self.input_slots.values()
+            if slot.splitter
+        }
+        if len(splitters) == 0:
+            errors.append(
+                f"EmbarrassinglyParallelStep '{self.step_name}' does not have any input slots with a "
+                "splitter method assigned; one and only one input slot must have a splitter."
+            )
+        if len(splitters) > 1:
+            errors.append(
+                f"EmbarrassinglyParallelStep '{self.step_name}' has multiple input slots with "
+                "splitter methods assigned; one and only one input slot must have a splitter.\n"
+                f"Input slots with splitters: {splitters}"
+            )
+        missing_aggregators = [
+            slot.name for slot in self.output_slots.values() if not slot.aggregator
+        ]
+        if len(missing_aggregators) != 0:
+            errors.append(
+                f"EmbarrassinglyParallelStep '{self.step_name}' has output slots without "
+                f"aggregator methods assigned: {missing_aggregators}"
+            )
+        if errors:
+            raise ValueError("\n".join(errors))
+
+
 class ChoiceStep(Step):
     """A type of :class:`Step` that allows for choosing between multiple paths.
 
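A hedged construction example for the new step type. It assumes `InputSlot`/`OutputSlot` (from `easylink.graph_components`, also modified in this release) accept `splitter`/`aggregator` arguments; the hunks above only show those attributes being read, so treat the keyword names as assumptions, and the callables here are stand-ins:

```python
from easylink.graph_components import InputSlot, OutputSlot
from easylink.step import EmbarrassinglyParallelStep


def toy_validator(filepath: str) -> None:
    """Stand-in validator."""


def toy_splitter(*args, **kwargs):
    """Stand-in splitter (see easylink.utilities.splitter_utils)."""


def toy_aggregator(*args, **kwargs):
    """Stand-in aggregator (see easylink.utilities.aggregator_utils)."""


# Exactly one input slot carries a splitter and every output slot an aggregator,
# so _validate() passes; dropping either would raise a ValueError.
step = EmbarrassinglyParallelStep(
    step_name="step_1",
    input_slots=[
        InputSlot(
            name="step_1_main_input",
            env_var="DUMMY_CONTAINER_MAIN_INPUT_FILE_PATHS",
            validator=toy_validator,
            splitter=toy_splitter,  # assumed keyword; not shown in this diff
        )
    ],
    output_slots=[
        OutputSlot("step_1_main_output", aggregator=toy_aggregator)  # assumed keyword
    ],
)
```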
@@ -1361,6 +1420,11 @@ class LeafConfigurationState(ConfigurationState):
         implementation_graph = ImplementationGraph()
         implementation_node_name = self._step.implementation_node_name
         if self.is_combined:
+            if isinstance(self._step, EmbarrassinglyParallelStep):
+                raise NotImplementedError(
+                    "Combining implementations with embarrassingly parallel steps "
+                    "is not yet supported."
+                )
             implementation = PartialImplementation(
                 combined_name=self.pipeline_config[COMBINED_IMPLEMENTATION_KEY],
                 schema_step=self._step.step_name,
@@ -1373,6 +1437,7 @@ class LeafConfigurationState(ConfigurationState):
                 implementation_config=self.implementation_config,
                 input_slots=self._step.input_slots.values(),
                 output_slots=self._step.output_slots.values(),
+                is_embarrassingly_parallel=isinstance(self._step, EmbarrassinglyParallelStep),
             )
             implementation_graph.add_node_from_implementation(
                 implementation_node_name,
easylink/utilities/aggregator_utils.py
ADDED
@@ -0,0 +1,31 @@
+"""
+==========================
+Data Aggregating Utilities
+==========================
+
+This module contains utility functions for aggregating datasets. One primary use
+case for this is to combine the results of running sections of the pipeline in an
+embarrassingly parallel manner.
+
+Note that it is critical that all data aggregating utility functions are defined
+in this module; easylink will not be able to find them otherwise.
+"""
+
+import pandas as pd
+from loguru import logger
+
+
+def concatenate_datasets(input_files: list[str], output_filepath: str) -> None:
+    """Concatenates multiple datasets into a single one.
+
+    Parameters
+    ----------
+    input_files
+        A list of input file paths to be concatenated.
+    output_filepath
+        The output filepath.
+    """
+    logger.info(f"Concatenating {len(input_files)} datasets")
+    dfs = [pd.read_parquet(df) for df in input_files]
+    df = pd.concat(dfs, ignore_index=True)
+    df.to_parquet(output_filepath)
easylink/utilities/data_utils.py
CHANGED