easylink 0.1.12__py3-none-any.whl → 0.1.14__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- easylink/_version.py +1 -1
- easylink/configuration.py +5 -5
- easylink/graph_components.py +48 -51
- easylink/implementation.py +70 -10
- easylink/pipeline.py +127 -24
- easylink/pipeline_graph.py +46 -26
- easylink/pipeline_schema_constants/__init__.py +11 -7
- easylink/pipeline_schema_constants/development.py +2 -23
- easylink/pipeline_schema_constants/testing.py +243 -17
- easylink/rule.py +60 -140
- easylink/runner.py +14 -9
- easylink/step.py +397 -143
- easylink/utilities/splitter_utils.py +35 -0
- {easylink-0.1.12.dist-info → easylink-0.1.14.dist-info}/METADATA +22 -14
- {easylink-0.1.12.dist-info → easylink-0.1.14.dist-info}/RECORD +18 -18
- {easylink-0.1.12.dist-info → easylink-0.1.14.dist-info}/WHEEL +1 -1
- {easylink-0.1.12.dist-info → easylink-0.1.14.dist-info}/entry_points.txt +0 -0
- {easylink-0.1.12.dist-info → easylink-0.1.14.dist-info}/top_level.txt +0 -0
@@ -74,29 +74,8 @@ NODES = [
|
|
74
74
|
),
|
75
75
|
],
|
76
76
|
),
|
77
|
-
|
78
|
-
|
79
|
-
name="step_3_main_input",
|
80
|
-
env_var="DUMMY_CONTAINER_MAIN_INPUT_FILE_PATHS",
|
81
|
-
validator=validate_input_file_dummy,
|
82
|
-
splitter=split_data_by_size,
|
83
|
-
),
|
84
|
-
],
|
85
|
-
output_slots=[OutputSlot("step_3_main_output", aggregator=concatenate_datasets)],
|
86
|
-
input_slot_mappings=[
|
87
|
-
InputSlotMapping(
|
88
|
-
parent_slot="step_3_main_input",
|
89
|
-
child_node="step_3",
|
90
|
-
child_slot="step_3_main_input",
|
91
|
-
),
|
92
|
-
],
|
93
|
-
output_slot_mappings=[
|
94
|
-
OutputSlotMapping(
|
95
|
-
parent_slot="step_3_main_output",
|
96
|
-
child_node="step_3",
|
97
|
-
child_slot="step_3_main_output",
|
98
|
-
),
|
99
|
-
],
|
77
|
+
slot_splitter_mapping={"step_3_main_input": split_data_by_size},
|
78
|
+
slot_aggregator_mapping={"step_3_main_output": concatenate_datasets},
|
100
79
|
),
|
101
80
|
self_edges=[
|
102
81
|
EdgeParams(
|
@@ -16,6 +16,7 @@ from easylink.graph_components import (
|
|
16
16
|
OutputSlotMapping,
|
17
17
|
)
|
18
18
|
from easylink.step import (
|
19
|
+
EmbarrassinglyParallelStep,
|
19
20
|
HierarchicalStep,
|
20
21
|
InputStep,
|
21
22
|
LoopStep,
|
@@ -23,9 +24,11 @@ from easylink.step import (
|
|
23
24
|
ParallelStep,
|
24
25
|
Step,
|
25
26
|
)
|
27
|
+
from easylink.utilities.aggregator_utils import concatenate_datasets
|
28
|
+
from easylink.utilities.splitter_utils import split_data_in_two
|
26
29
|
from easylink.utilities.validation_utils import validate_input_file_dummy
|
27
30
|
|
28
|
-
|
31
|
+
NODES_ONE_STEP = [
|
29
32
|
InputStep(),
|
30
33
|
Step(
|
31
34
|
step_name="step_1",
|
@@ -44,7 +47,7 @@ SINGLE_STEP_NODES = [
|
|
44
47
|
],
|
45
48
|
),
|
46
49
|
]
|
47
|
-
|
50
|
+
EDGES_ONE_STEP = [
|
48
51
|
EdgeParams(
|
49
52
|
source_node="input_data",
|
50
53
|
target_node="step_1",
|
@@ -58,10 +61,10 @@ SINGLE_STEP_EDGES = [
|
|
58
61
|
input_slot="result",
|
59
62
|
),
|
60
63
|
]
|
64
|
+
SCHEMA_PARAMS_ONE_STEP = (NODES_ONE_STEP, EDGES_ONE_STEP)
|
61
65
|
|
62
|
-
SINGLE_STEP_SCHEMA_PARAMS = (SINGLE_STEP_NODES, SINGLE_STEP_EDGES)
|
63
66
|
|
64
|
-
|
67
|
+
NODES_THREE_STEPS = [
|
65
68
|
InputStep(),
|
66
69
|
Step(
|
67
70
|
step_name="step_1",
|
@@ -102,7 +105,7 @@ TRIPLE_STEP_NODES = [
|
|
102
105
|
],
|
103
106
|
),
|
104
107
|
]
|
105
|
-
|
108
|
+
EDGES_THREE_STEPS = [
|
106
109
|
EdgeParams(
|
107
110
|
source_node="input_data",
|
108
111
|
target_node="step_1",
|
@@ -128,11 +131,10 @@ TRIPLE_STEP_EDGES = [
|
|
128
131
|
input_slot="result",
|
129
132
|
),
|
130
133
|
]
|
134
|
+
SCHEMA_PARAMS_THREE_STEPS = (NODES_THREE_STEPS, EDGES_THREE_STEPS)
|
131
135
|
|
132
|
-
TRIPLE_STEP_SCHEMA_PARAMS = (TRIPLE_STEP_NODES, TRIPLE_STEP_EDGES)
|
133
136
|
|
134
|
-
|
135
|
-
BAD_COMBINED_TOPOLOGY_NODES = [
|
137
|
+
NODES_BAD_COMBINED_TOPOLOGY = [
|
136
138
|
InputStep(),
|
137
139
|
LoopStep(
|
138
140
|
template_step=HierarchicalStep(
|
@@ -207,11 +209,10 @@ BAD_COMBINED_TOPOLOGY_NODES = [
|
|
207
209
|
],
|
208
210
|
),
|
209
211
|
]
|
210
|
-
|
211
|
-
BAD_COMBINED_TOPOLOGY_SCHEMA_PARAMS = (BAD_COMBINED_TOPOLOGY_NODES, SINGLE_STEP_EDGES)
|
212
|
+
SCHEMA_PARAMS_BAD_COMBINED_TOPOLOGY = (NODES_BAD_COMBINED_TOPOLOGY, EDGES_ONE_STEP)
|
212
213
|
|
213
214
|
|
214
|
-
|
215
|
+
NODES_NESTED_TEMPLATED_STEPS = [
|
215
216
|
InputStep(),
|
216
217
|
LoopStep(
|
217
218
|
template_step=ParallelStep(
|
@@ -288,12 +289,10 @@ NESTED_TEMPLATED_STEPS_NODES = [
|
|
288
289
|
],
|
289
290
|
),
|
290
291
|
]
|
292
|
+
SCHEMA_PARAMS_NESTED_TEMPLATED_STEPS = (NODES_NESTED_TEMPLATED_STEPS, EDGES_ONE_STEP)
|
291
293
|
|
292
294
|
|
293
|
-
|
294
|
-
|
295
|
-
|
296
|
-
COMBINE_WITH_ITERATION_NODES = [
|
295
|
+
NODES_COMBINE_WITH_ITERATION = [
|
297
296
|
InputStep(),
|
298
297
|
LoopStep(
|
299
298
|
template_step=Step(
|
@@ -333,7 +332,7 @@ COMBINE_WITH_ITERATION_NODES = [
|
|
333
332
|
],
|
334
333
|
),
|
335
334
|
]
|
336
|
-
|
335
|
+
EDGES_TWO_STEPS = [
|
337
336
|
EdgeParams(
|
338
337
|
source_node="input_data",
|
339
338
|
target_node="step_1",
|
@@ -353,6 +352,233 @@ DOUBLE_STEP_EDGES = [
|
|
353
352
|
input_slot="result",
|
354
353
|
),
|
355
354
|
]
|
355
|
+
SCHEMA_PARAMS_COMBINE_WITH_ITERATION = (NODES_COMBINE_WITH_ITERATION, EDGES_TWO_STEPS)
|
356
|
+
|
357
|
+
|
358
|
+
NODES_LOOPING_EP_STEP = [
|
359
|
+
InputStep(),
|
360
|
+
LoopStep(
|
361
|
+
template_step=EmbarrassinglyParallelStep(
|
362
|
+
step=Step(
|
363
|
+
step_name="step_1",
|
364
|
+
input_slots=[
|
365
|
+
InputSlot(
|
366
|
+
name="step_1_main_input",
|
367
|
+
env_var="DUMMY_CONTAINER_MAIN_INPUT_FILE_PATHS",
|
368
|
+
validator=validate_input_file_dummy,
|
369
|
+
),
|
370
|
+
],
|
371
|
+
output_slots=[
|
372
|
+
OutputSlot(
|
373
|
+
name="step_1_main_output",
|
374
|
+
),
|
375
|
+
],
|
376
|
+
),
|
377
|
+
slot_splitter_mapping={"step_1_main_input": split_data_in_two},
|
378
|
+
slot_aggregator_mapping={"step_1_main_output": concatenate_datasets},
|
379
|
+
),
|
380
|
+
self_edges=[
|
381
|
+
EdgeParams(
|
382
|
+
source_node="step_1",
|
383
|
+
target_node="step_1",
|
384
|
+
output_slot="step_1_main_output",
|
385
|
+
input_slot="step_1_main_input",
|
386
|
+
)
|
387
|
+
],
|
388
|
+
),
|
389
|
+
OutputStep(
|
390
|
+
input_slots=[
|
391
|
+
InputSlot(name="result", env_var=None, validator=validate_input_file_dummy)
|
392
|
+
]
|
393
|
+
),
|
394
|
+
]
|
395
|
+
SCHEMA_PARAMS_LOOPING_EP_STEP = (NODES_LOOPING_EP_STEP, EDGES_ONE_STEP)
|
396
|
+
|
397
|
+
|
398
|
+
NODES_EP_PARALLEL_STEP = [
|
399
|
+
InputStep(),
|
400
|
+
EmbarrassinglyParallelStep(
|
401
|
+
step=ParallelStep(
|
402
|
+
template_step=Step(
|
403
|
+
step_name="step_1",
|
404
|
+
input_slots=[
|
405
|
+
InputSlot(
|
406
|
+
name="step_1_main_input",
|
407
|
+
env_var="DUMMY_CONTAINER_MAIN_INPUT_FILE_PATHS",
|
408
|
+
validator=validate_input_file_dummy,
|
409
|
+
),
|
410
|
+
],
|
411
|
+
output_slots=[
|
412
|
+
OutputSlot(
|
413
|
+
name="step_1_main_output",
|
414
|
+
),
|
415
|
+
],
|
416
|
+
),
|
417
|
+
),
|
418
|
+
slot_splitter_mapping={"step_1_main_input": split_data_in_two},
|
419
|
+
slot_aggregator_mapping={"step_1_main_output": concatenate_datasets},
|
420
|
+
),
|
421
|
+
OutputStep(
|
422
|
+
input_slots=[
|
423
|
+
InputSlot(name="result", env_var=None, validator=validate_input_file_dummy)
|
424
|
+
]
|
425
|
+
),
|
426
|
+
]
|
427
|
+
SCHEMA_PARAMS_EP_PARALLEL_STEP = (NODES_EP_PARALLEL_STEP, EDGES_ONE_STEP)
|
428
|
+
|
429
|
+
|
430
|
+
NODES_EP_LOOP_STEP = [
|
431
|
+
InputStep(),
|
432
|
+
EmbarrassinglyParallelStep(
|
433
|
+
step=LoopStep(
|
434
|
+
template_step=Step(
|
435
|
+
step_name="step_1",
|
436
|
+
input_slots=[
|
437
|
+
InputSlot(
|
438
|
+
name="step_1_main_input",
|
439
|
+
env_var="DUMMY_CONTAINER_MAIN_INPUT_FILE_PATHS",
|
440
|
+
validator=validate_input_file_dummy,
|
441
|
+
),
|
442
|
+
],
|
443
|
+
output_slots=[
|
444
|
+
OutputSlot(
|
445
|
+
name="step_1_main_output",
|
446
|
+
),
|
447
|
+
],
|
448
|
+
),
|
449
|
+
self_edges=[
|
450
|
+
EdgeParams(
|
451
|
+
source_node="step_1",
|
452
|
+
target_node="step_1",
|
453
|
+
output_slot="step_1_main_output",
|
454
|
+
input_slot="step_1_main_input",
|
455
|
+
),
|
456
|
+
],
|
457
|
+
),
|
458
|
+
slot_splitter_mapping={"step_1_main_input": split_data_in_two},
|
459
|
+
slot_aggregator_mapping={"step_1_main_output": concatenate_datasets},
|
460
|
+
),
|
461
|
+
OutputStep(
|
462
|
+
input_slots=[
|
463
|
+
InputSlot(name="result", env_var=None, validator=validate_input_file_dummy)
|
464
|
+
]
|
465
|
+
),
|
466
|
+
]
|
467
|
+
SCHEMA_PARAMS_EP_LOOP_STEP = (NODES_EP_LOOP_STEP, EDGES_ONE_STEP)
|
356
468
|
|
357
469
|
|
358
|
-
|
470
|
+
NODES_EP_HIERARCHICAL_STEP = [
|
471
|
+
InputStep(),
|
472
|
+
EmbarrassinglyParallelStep(
|
473
|
+
step=HierarchicalStep(
|
474
|
+
step_name="step_1",
|
475
|
+
input_slots=[
|
476
|
+
InputSlot(
|
477
|
+
name="step_1_main_input",
|
478
|
+
env_var="DUMMY_CONTAINER_MAIN_INPUT_FILE_PATHS",
|
479
|
+
validator=validate_input_file_dummy,
|
480
|
+
),
|
481
|
+
InputSlot(
|
482
|
+
name="step_1_secondary_input",
|
483
|
+
env_var="DUMMY_CONTAINER_SECONDARY_INPUT_FILE_PATHS",
|
484
|
+
validator=validate_input_file_dummy,
|
485
|
+
),
|
486
|
+
],
|
487
|
+
output_slots=[OutputSlot("step_1_main_output")],
|
488
|
+
nodes=[
|
489
|
+
Step(
|
490
|
+
step_name="step_1a",
|
491
|
+
input_slots=[
|
492
|
+
InputSlot(
|
493
|
+
name="step_1a_main_input",
|
494
|
+
env_var="DUMMY_CONTAINER_MAIN_INPUT_FILE_PATHS",
|
495
|
+
validator=validate_input_file_dummy,
|
496
|
+
),
|
497
|
+
InputSlot(
|
498
|
+
name="step_1a_secondary_input",
|
499
|
+
env_var="DUMMY_CONTAINER_SECONDARY_INPUT_FILE_PATHS",
|
500
|
+
validator=validate_input_file_dummy,
|
501
|
+
),
|
502
|
+
],
|
503
|
+
output_slots=[OutputSlot("step_1a_main_output")],
|
504
|
+
),
|
505
|
+
Step(
|
506
|
+
step_name="step_1b",
|
507
|
+
input_slots=[
|
508
|
+
InputSlot(
|
509
|
+
name="step_1b_main_input",
|
510
|
+
env_var="DUMMY_CONTAINER_MAIN_INPUT_FILE_PATHS",
|
511
|
+
validator=validate_input_file_dummy,
|
512
|
+
),
|
513
|
+
InputSlot(
|
514
|
+
name="step_1b_secondary_input",
|
515
|
+
env_var="DUMMY_CONTAINER_SECONDARY_INPUT_FILE_PATHS",
|
516
|
+
validator=validate_input_file_dummy,
|
517
|
+
),
|
518
|
+
],
|
519
|
+
output_slots=[OutputSlot("step_1b_main_output")],
|
520
|
+
),
|
521
|
+
],
|
522
|
+
edges=[
|
523
|
+
EdgeParams(
|
524
|
+
source_node="step_1a",
|
525
|
+
target_node="step_1b",
|
526
|
+
output_slot="step_1a_main_output",
|
527
|
+
input_slot="step_1b_main_input",
|
528
|
+
),
|
529
|
+
],
|
530
|
+
input_slot_mappings=[
|
531
|
+
InputSlotMapping(
|
532
|
+
parent_slot="step_1_main_input",
|
533
|
+
child_node="step_1a",
|
534
|
+
child_slot="step_1a_main_input",
|
535
|
+
),
|
536
|
+
InputSlotMapping(
|
537
|
+
parent_slot="step_1_secondary_input",
|
538
|
+
child_node="step_1a",
|
539
|
+
child_slot="step_1a_secondary_input",
|
540
|
+
),
|
541
|
+
InputSlotMapping(
|
542
|
+
parent_slot="step_1_secondary_input",
|
543
|
+
child_node="step_1b",
|
544
|
+
child_slot="step_1b_secondary_input",
|
545
|
+
),
|
546
|
+
],
|
547
|
+
output_slot_mappings=[
|
548
|
+
OutputSlotMapping(
|
549
|
+
parent_slot="step_1_main_output",
|
550
|
+
child_node="step_1b",
|
551
|
+
child_slot="step_1b_main_output",
|
552
|
+
),
|
553
|
+
],
|
554
|
+
),
|
555
|
+
slot_splitter_mapping={"step_1_main_input": split_data_in_two},
|
556
|
+
slot_aggregator_mapping={"step_1_main_output": concatenate_datasets},
|
557
|
+
),
|
558
|
+
OutputStep(
|
559
|
+
input_slots=[
|
560
|
+
InputSlot(name="result", env_var=None, validator=validate_input_file_dummy)
|
561
|
+
]
|
562
|
+
),
|
563
|
+
]
|
564
|
+
EDGES_ONE_STEP_TWO_ISLOTS = [
|
565
|
+
EdgeParams(
|
566
|
+
source_node="input_data",
|
567
|
+
target_node="step_1",
|
568
|
+
output_slot="all",
|
569
|
+
input_slot="step_1_main_input",
|
570
|
+
),
|
571
|
+
EdgeParams(
|
572
|
+
source_node="input_data",
|
573
|
+
target_node="step_1",
|
574
|
+
output_slot="all",
|
575
|
+
input_slot="step_1_secondary_input",
|
576
|
+
),
|
577
|
+
EdgeParams(
|
578
|
+
source_node="step_1",
|
579
|
+
target_node="results",
|
580
|
+
output_slot="step_1_main_output",
|
581
|
+
input_slot="result",
|
582
|
+
),
|
583
|
+
]
|
584
|
+
SCHEMA_PARAMS_EP_HIERARCHICAL_STEP = (NODES_EP_HIERARCHICAL_STEP, EDGES_ONE_STEP_TWO_ISLOTS)
|
easylink/rule.py
CHANGED
@@ -41,15 +41,6 @@ class Rule(ABC):
|
|
41
41
|
"""
|
42
42
|
pass
|
43
43
|
|
44
|
-
@staticmethod
|
45
|
-
def get_input_slots_to_split(input_slots) -> list[str]:
|
46
|
-
input_slots_to_split = [
|
47
|
-
slot_name
|
48
|
-
for slot_name, slot_attrs in input_slots.items()
|
49
|
-
if slot_attrs.get("splitter", None)
|
50
|
-
]
|
51
|
-
return input_slots_to_split
|
52
|
-
|
53
44
|
|
54
45
|
@dataclass
|
55
46
|
class TargetRule(Rule):
|
@@ -125,23 +116,15 @@ class ImplementedRule(Rule):
|
|
125
116
|
|
126
117
|
def build_rule(self) -> str:
|
127
118
|
"""Builds the Snakemake rule for this ``Implementation``."""
|
119
|
+
if self.is_embarrassingly_parallel and len(self.output) > 1:
|
120
|
+
raise NotImplementedError(
|
121
|
+
"Multiple output slots/files of EmbarrassinglyParallelSteps not yet supported"
|
122
|
+
)
|
128
123
|
return self._build_io() + self._build_resources() + self._build_shell_cmd()
|
129
124
|
|
130
125
|
def _build_io(self) -> str:
|
131
126
|
"""Builds the input/output portion of the rule."""
|
132
|
-
if self.is_embarrassingly_parallel
|
133
|
-
# Processed chunks are sent to a 'processed' subdir
|
134
|
-
output_files = [
|
135
|
-
os.path.dirname(file_path)
|
136
|
-
+ "/processed/{chunk}/"
|
137
|
-
+ os.path.basename(file_path)
|
138
|
-
for file_path in self.output
|
139
|
-
]
|
140
|
-
log_path_chunk_adder = "-{chunk}"
|
141
|
-
else:
|
142
|
-
output_files = self.output
|
143
|
-
log_path_chunk_adder = ""
|
144
|
-
|
127
|
+
log_path_chunk_adder = "-{chunk}" if self.is_embarrassingly_parallel else ""
|
145
128
|
io_str = (
|
146
129
|
f"""
|
147
130
|
rule:
|
@@ -149,7 +132,7 @@ rule:
|
|
149
132
|
message: "Running {self.step_name} implementation: {self.implementation_name}" """
|
150
133
|
+ self._build_input()
|
151
134
|
+ f"""
|
152
|
-
output: {
|
135
|
+
output: {self.output}
|
153
136
|
log: "{self.diagnostics_dir}/{self.name}-output{log_path_chunk_adder}.log"
|
154
137
|
container: "{self.image_path}" """
|
155
138
|
)
|
@@ -158,33 +141,11 @@ rule:
|
|
158
141
|
def _build_input(self) -> str:
|
159
142
|
input_str = f"""
|
160
143
|
input:"""
|
161
|
-
input_slots_to_split = self.get_input_slots_to_split(self.input_slots)
|
162
144
|
for slot, attrs in self.input_slots.items():
|
163
145
|
env_var = attrs["env_var"].lower()
|
164
|
-
if len(input_slots_to_split) > 1:
|
165
|
-
raise NotImplementedError(
|
166
|
-
"FIXME [MIC-5883] Multiple input slots to split not yet supported"
|
167
|
-
)
|
168
|
-
if self.is_embarrassingly_parallel and slot == input_slots_to_split[0]:
|
169
|
-
# The input to this is the input_chunks subdir from the checkpoint
|
170
|
-
# rule (which is built by modifying the output of the overall implementation)
|
171
|
-
if len(self.output) > 1:
|
172
|
-
raise NotImplementedError(
|
173
|
-
"FIXME [MIC-5883] Multiple output slots/files of EmbarrassinglyParallelSteps not yet supported"
|
174
|
-
)
|
175
|
-
input_files = [
|
176
|
-
os.path.dirname(self.output[0])
|
177
|
-
+ "/input_chunks/{chunk}/"
|
178
|
-
+ os.path.basename(self.output[0])
|
179
|
-
]
|
180
|
-
else:
|
181
|
-
input_files = attrs["filepaths"]
|
182
|
-
input_str += f"""
|
183
|
-
{env_var}={input_files},"""
|
184
|
-
if not self.is_embarrassingly_parallel:
|
185
|
-
# validations were already handled in the checkpoint rule - no need
|
186
|
-
# to validate the individual chunks
|
187
146
|
input_str += f"""
|
147
|
+
{env_var}={attrs["filepaths"]},"""
|
148
|
+
input_str += f"""
|
188
149
|
validations={self.validations},"""
|
189
150
|
if self.requires_spark:
|
190
151
|
input_str += f"""
|
@@ -210,38 +171,19 @@ rule:
|
|
210
171
|
# output_paths = ",".join(self.output)
|
211
172
|
# wildcards_subdir = "/".join([f"{{wildcards.{wc}}}" for wc in self.wildcards])
|
212
173
|
# and then in shell cmd: export DUMMY_CONTAINER_OUTPUT_PATHS={output_paths}/{wildcards_subdir}
|
213
|
-
|
214
|
-
|
215
|
-
|
216
|
-
"FIXME [MIC-5883] Multiple output slots/files of EmbarrassinglyParallelSteps not yet supported"
|
217
|
-
)
|
218
|
-
output_files = (
|
219
|
-
os.path.dirname(self.output[0])
|
220
|
-
+ "/processed/{wildcards.chunk}/"
|
221
|
-
+ os.path.basename(self.output[0])
|
222
|
-
)
|
223
|
-
else:
|
224
|
-
output_files = ",".join(self.output)
|
174
|
+
|
175
|
+
# snakemake shell commands require wildcards to be prefaced with 'wildcards.'
|
176
|
+
output_files = ",".join(self.output).replace("{chunk}", "{wildcards.chunk}")
|
225
177
|
shell_cmd = f"""
|
226
178
|
shell:
|
227
179
|
'''
|
228
180
|
export DUMMY_CONTAINER_OUTPUT_PATHS={output_files}
|
229
181
|
export DUMMY_CONTAINER_DIAGNOSTICS_DIRECTORY={self.diagnostics_dir}"""
|
230
|
-
for
|
231
|
-
|
232
|
-
|
233
|
-
|
234
|
-
|
235
|
-
)
|
236
|
-
if input_slot_name in input_slots_to_split:
|
237
|
-
# The inputs to this come from the input_chunks subdir
|
238
|
-
input_files = (
|
239
|
-
os.path.dirname(self.output[0])
|
240
|
-
+ "/input_chunks/{wildcards.chunk}/"
|
241
|
-
+ os.path.basename(self.output[0])
|
242
|
-
)
|
243
|
-
else:
|
244
|
-
input_files = ",".join(input_slot_attrs["filepaths"])
|
182
|
+
for input_slot_attrs in self.input_slots.values():
|
183
|
+
# snakemake shell commands require wildcards to be prefaced with 'wildcards.'
|
184
|
+
input_files = ",".join(input_slot_attrs["filepaths"]).replace(
|
185
|
+
"{chunk}", "{wildcards.chunk}"
|
186
|
+
)
|
245
187
|
shell_cmd += f"""
|
246
188
|
export {input_slot_attrs["env_var"]}={input_files}"""
|
247
189
|
if self.requires_spark:
|
@@ -278,7 +220,7 @@ class InputValidationRule(Rule):
|
|
278
220
|
"""List of filepaths to validate."""
|
279
221
|
output: str
|
280
222
|
"""Filepath of validation output. It must be used as an input for next rule."""
|
281
|
-
validator: Callable
|
223
|
+
validator: Callable | None
|
282
224
|
"""Callable that takes a filepath as input. Raises an error if invalid."""
|
283
225
|
|
284
226
|
def build_rule(self) -> str:
|
@@ -329,12 +271,14 @@ class CheckpointRule(Rule):
|
|
329
271
|
|
330
272
|
name: str
|
331
273
|
"""Name of the rule."""
|
332
|
-
|
333
|
-
"""
|
334
|
-
|
335
|
-
"""
|
336
|
-
|
274
|
+
input_files: list[str]
|
275
|
+
"""The input filepaths."""
|
276
|
+
splitter_func_name: str
|
277
|
+
"""The splitter function's name."""
|
278
|
+
output_dir: str
|
337
279
|
"""Output directory path. It must be used as an input for next rule."""
|
280
|
+
checkpoint_filepath: str
|
281
|
+
"""Path to the checkpoint file. This is only needed for the bugfix workaround."""
|
338
282
|
|
339
283
|
def build_rule(self) -> str:
|
340
284
|
"""Builds the Snakemake rule for this checkpoint.
|
@@ -344,29 +288,20 @@ class CheckpointRule(Rule):
|
|
344
288
|
files into chunks. Note that the output of this rule is a Snakemake ``directory``
|
345
289
|
object as opposed to a specific file like typical rules have.
|
346
290
|
"""
|
347
|
-
# Replace the output filepath with an input_chunks subdir
|
348
|
-
output_dir = os.path.dirname(self.output[0]) + "/input_chunks"
|
349
|
-
input_slots_to_split = self.get_input_slots_to_split(self.input_slots)
|
350
|
-
if len(input_slots_to_split) > 1:
|
351
|
-
raise NotImplementedError(
|
352
|
-
"FIXME [MIC-5883] Multiple input slots to split not yet supported"
|
353
|
-
)
|
354
|
-
input_slot_to_split = input_slots_to_split[0]
|
355
291
|
checkpoint = f"""
|
356
292
|
checkpoint:
|
357
|
-
name: "
|
293
|
+
name: "{self.name}"
|
358
294
|
input:
|
359
|
-
files={self.
|
360
|
-
validations={self.validations},
|
295
|
+
files={self.input_files},
|
361
296
|
output:
|
362
|
-
output_dir=directory("{output_dir}"),
|
363
|
-
checkpoint_file=touch("{
|
297
|
+
output_dir=directory("{self.output_dir}"),
|
298
|
+
checkpoint_file=touch("{self.checkpoint_filepath}"),
|
364
299
|
params:
|
365
300
|
input_files=lambda wildcards, input: ",".join(input.files),
|
366
301
|
localrule: True
|
367
|
-
message: "Splitting {self.name}
|
302
|
+
message: "Splitting {self.name} into chunks"
|
368
303
|
run:
|
369
|
-
splitter_utils.{self.
|
304
|
+
splitter_utils.{self.splitter_func_name}(
|
370
305
|
input_files=list(input.files),
|
371
306
|
output_dir=output.output_dir,
|
372
307
|
desired_chunk_size_mb=0.1,
|
@@ -385,12 +320,16 @@ class AggregationRule(Rule):
|
|
385
320
|
|
386
321
|
name: str
|
387
322
|
"""Name of the rule."""
|
388
|
-
|
389
|
-
"""
|
390
|
-
|
391
|
-
"""
|
392
|
-
|
393
|
-
"""The
|
323
|
+
input_files: list[str]
|
324
|
+
"""The input processed chunk files to aggregate."""
|
325
|
+
aggregated_output_file: str
|
326
|
+
"""The final aggregated results file."""
|
327
|
+
aggregator_func_name: str
|
328
|
+
"""The name of the aggregation function to run."""
|
329
|
+
checkpoint_filepath: str
|
330
|
+
"""Path to the checkpoint file. This is only needed for the bugfix workaround."""
|
331
|
+
checkpoint_rule_name: str
|
332
|
+
"""Name of the checkpoint rule."""
|
394
333
|
|
395
334
|
def build_rule(self) -> str:
|
396
335
|
"""Builds the Snakemake rule for this aggregator.
|
@@ -421,56 +360,37 @@ class AggregationRule(Rule):
|
|
421
360
|
|
422
361
|
def _define_input_function(self):
|
423
362
|
"""Builds the `input function <https://snakemake.readthedocs.io/en/stable/snakefiles/rules.html#input-functions>`_."""
|
424
|
-
|
425
|
-
|
426
|
-
|
427
|
-
|
428
|
-
|
429
|
-
raise NotImplementedError(
|
430
|
-
"FIXME [MIC-5883] Multiple slots/files of EmbarrassinglyParallelSteps not yet supported"
|
431
|
-
)
|
432
|
-
output_filepath = self.output_slot["filepaths"][0]
|
433
|
-
checkpoint_file_path = (
|
434
|
-
os.path.dirname(output_filepath) + "/input_chunks/checkpoint.txt"
|
435
|
-
)
|
436
|
-
input_slots_to_split = self.get_input_slots_to_split(self.input_slots)
|
437
|
-
if len(input_slots_to_split) > 1:
|
438
|
-
raise NotImplementedError(
|
439
|
-
"FIXME [MIC-5883] Multiple input slots to split not yet supported"
|
440
|
-
)
|
441
|
-
input_slot_to_split = input_slots_to_split[0]
|
442
|
-
checkpoint_name = f"checkpoints.split_{self.name}_{input_slot_to_split}"
|
443
|
-
output_files = (
|
444
|
-
os.path.dirname(output_filepath)
|
445
|
-
+ "/processed/{chunk}/"
|
446
|
-
+ os.path.basename(output_filepath)
|
447
|
-
)
|
363
|
+
# NOTE: In the f-string below, we serialize the list `self.input_files`
|
364
|
+
# into a string which must later be executed as python code (by snakemake).
|
365
|
+
# Let's expand the list into a string representation of a python list so that
|
366
|
+
# we explicitly rely on `eval(repr(self.input_files)) == self.input_files`.
|
367
|
+
input_files_list_str = repr(self.input_files)
|
448
368
|
func = f"""
|
449
|
-
def get_aggregation_inputs_{self.name}
|
450
|
-
checkpoint_file = "{
|
369
|
+
def get_aggregation_inputs_{self.name}(wildcards):
|
370
|
+
checkpoint_file = "{self.checkpoint_filepath}"
|
451
371
|
if not os.path.exists(checkpoint_file):
|
452
|
-
output, _ = {
|
453
|
-
raise IncompleteCheckpointException({
|
454
|
-
checkpoint_output = glob.glob(f"{{{
|
372
|
+
output, _ = {self.checkpoint_rule_name}.rule.expand_output(wildcards)
|
373
|
+
raise IncompleteCheckpointException({self.checkpoint_rule_name}.rule, checkpoint_target(output[0]))
|
374
|
+
checkpoint_output = glob.glob(f"{{{self.checkpoint_rule_name}.get(**wildcards).output.output_dir}}/*/")
|
455
375
|
chunks = [Path(filepath).parts[-1] for filepath in checkpoint_output]
|
456
|
-
|
457
|
-
|
458
|
-
chunk=chunks
|
459
|
-
|
376
|
+
input_files = []
|
377
|
+
for filepath in {input_files_list_str}:
|
378
|
+
input_files.extend(expand(filepath, chunk=chunks))
|
379
|
+
return input_files"""
|
460
380
|
return func
|
461
381
|
|
462
382
|
def _define_aggregator_rule(self):
|
463
383
|
"""Builds the rule that runs the aggregation."""
|
464
384
|
rule = f"""
|
465
385
|
rule:
|
466
|
-
name: "
|
467
|
-
input: get_aggregation_inputs_{self.name}
|
468
|
-
output: {self.
|
386
|
+
name: "{self.name}"
|
387
|
+
input: get_aggregation_inputs_{self.name}
|
388
|
+
output: {[self.aggregated_output_file]}
|
469
389
|
localrule: True
|
470
|
-
message: "Aggregating {self.name}
|
390
|
+
message: "Aggregating {self.name}"
|
471
391
|
run:
|
472
|
-
aggregator_utils.{self.
|
392
|
+
aggregator_utils.{self.aggregator_func_name}(
|
473
393
|
input_files=list(input),
|
474
|
-
output_filepath="{self.
|
394
|
+
output_filepath="{self.aggregated_output_file}",
|
475
395
|
)"""
|
476
396
|
return rule
|