easylink 0.1.20__py3-none-any.whl → 0.1.22__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- easylink/_version.py +1 -1
- easylink/configuration.py +4 -3
- easylink/devtools/implementation_creator.py +2 -2
- easylink/implementation.py +2 -2
- easylink/pipeline.py +13 -15
- easylink/pipeline_graph.py +10 -15
- easylink/pipeline_schema.py +3 -3
- easylink/pipeline_schema_constants/__init__.py +5 -4
- easylink/pipeline_schema_constants/development.py +4 -4
- easylink/pipeline_schema_constants/main.py +5 -5
- easylink/pipeline_schema_constants/testing.py +145 -16
- easylink/rule.py +9 -10
- easylink/step.py +79 -48
- easylink/utilities/aggregator_utils.py +2 -2
- easylink/utilities/splitter_utils.py +1 -1
- {easylink-0.1.20.dist-info → easylink-0.1.22.dist-info}/METADATA +1 -1
- {easylink-0.1.20.dist-info → easylink-0.1.22.dist-info}/RECORD +21 -23
- easylink/images/spark_cluster/Dockerfile +0 -16
- easylink/images/spark_cluster/README.md +0 -15
- {easylink-0.1.20.dist-info → easylink-0.1.22.dist-info}/WHEEL +0 -0
- {easylink-0.1.20.dist-info → easylink-0.1.22.dist-info}/entry_points.txt +0 -0
- {easylink-0.1.20.dist-info → easylink-0.1.22.dist-info}/licenses/LICENSE +0 -0
- {easylink-0.1.20.dist-info → easylink-0.1.22.dist-info}/top_level.txt +0 -0
easylink/_version.py
CHANGED
@@ -1 +1 @@
-__version__ = "0.1.20"
+__version__ = "0.1.22"
easylink/configuration.py
CHANGED
@@ -184,7 +184,9 @@ class Config(LayeredConfigTree):
     #################

     def _get_schema(self, schema_name: str = "main") -> PipelineSchema:
-        """
+        """Gets the requested :class:`~easylink.pipeline_schema.PipelineSchema`.
+
+        The schema is only returned if it validates the pipeline configuration.

         Parameters
         ----------
@@ -205,11 +207,10 @@ class Config(LayeredConfigTree):
         Notes
         -----
         This acts as the pipeline configuration file's validation method since
-        we can only
+        we can only validate the ``PipelineSchema`` if that file is valid.

         """
         errors = defaultdict(dict)
-        # Try each schema until one is validated
        schema = PipelineSchema.get_schema(schema_name)
        logs = schema.validate_step(self.pipeline, self.input_data)
        if logs:
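For orientation, a minimal sketch of the validation flow that `_get_schema` wraps. The two `PipelineSchema` calls are taken verbatim from the diff; the wrapper function and the error-dict key are assumptions:

```python
from collections import defaultdict

def validate_pipeline(config, schema_name: str = "main") -> dict:
    # Minimal sketch, assuming a Config-like object with .pipeline and .input_data;
    # the import path matches easylink/pipeline_schema.py above.
    from easylink.pipeline_schema import PipelineSchema

    errors = defaultdict(dict)
    schema = PipelineSchema.get_schema(schema_name)
    logs = schema.validate_step(config.pipeline, config.input_data)
    if logs:
        # how the errors dict is keyed here is an assumption
        errors["PIPELINE ERRORS"] = logs
    return errors
```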
easylink/devtools/implementation_creator.py
CHANGED
@@ -21,8 +21,8 @@ from loguru import logger

 from easylink.pipeline_schema_constants import SCHEMA_PARAMS
 from easylink.step import (
+    AutoParallelStep,
     ChoiceStep,
-    EmbarrassinglyParallelStep,
     HierarchicalStep,
     IOStep,
     Step,
@@ -319,7 +319,7 @@ class ImplementationCreator:
        elif isinstance(node, TemplatedStep):
            _process_step(node.template_step)
            return
-       elif isinstance(node, EmbarrassinglyParallelStep):
+       elif isinstance(node, AutoParallelStep):
            _process_step(node.step)
            return
        elif isinstance(node, ChoiceStep):
easylink/implementation.py
CHANGED
@@ -55,7 +55,7 @@ class Implementation:
         implementation_config: LayeredConfigTree,
         input_slots: Iterable[InputSlot] = (),
         output_slots: Iterable[OutputSlot] = (),
-        is_embarrassingly_parallel: bool = False,
+        is_auto_parallel: bool = False,
     ):
         self.name = implementation_config.name
         """The name of this ``Implementation``."""
@@ -74,7 +74,7 @@ class Implementation:
         implemented by this particular ``Implementation``."""
         self.requires_spark = self._metadata.get("requires_spark", False)
         """Whether this ``Implementation`` requires a Spark environment."""
-        self.is_embarrassingly_parallel = is_embarrassingly_parallel
+        self.is_auto_parallel = is_auto_parallel

     def __repr__(self) -> str:
         return f"Implementation.{self.name}"
easylink/pipeline.py
CHANGED
@@ -45,9 +45,9 @@ class Pipeline:
         The :class:`~easylink.pipeline_graph.PipelineGraph` object.
     spark_is_required
         A boolean indicating whether the pipeline requires Spark.
-    any_embarrassingly_parallel
+    any_auto_parallel
         A boolean indicating whether any implementation in the pipeline is to be
-        run in an embarrassingly parallel way.
+        automatically run in parallel.

    """

@@ -55,7 +55,7 @@ class Pipeline:
         self.config = config
         self.pipeline_graph = PipelineGraph(config)
         self.spark_is_required = self.pipeline_graph.spark_is_required
-        self.any_embarrassingly_parallel = self.pipeline_graph.any_embarrassingly_parallel
+        self.any_auto_parallel = self.pipeline_graph.any_auto_parallel

         # TODO [MIC-4880]: refactor into validation object
         self._validate()
@@ -179,7 +179,7 @@ class Pipeline:
     #################################

     def _write_imports(self) -> None:
-        if not self.any_embarrassingly_parallel:
+        if not self.any_auto_parallel:
             imports = "from easylink.utilities import validation_utils\n"
         else:
             imports = """import glob
@@ -193,7 +193,7 @@ from easylink.utilities import aggregator_utils, splitter_utils, validation_util
             f.write(imports)

     def _write_wildcard_constraints(self) -> None:
-        if self.any_embarrassingly_parallel:
+        if self.any_auto_parallel:
             with open(self.snakefile_path, "a") as f:
                 f.write(
                     """
@@ -301,12 +301,10 @@ use rule start_spark_worker from spark_cluster with:
             The name of the ``Implementation`` to write the rule(s) for.
         """

-        is_embarrassingly_parallel = self.pipeline_graph.get_whether_embarrassingly_parallel(
-            node_name
-        )
+        is_auto_parallel = self.pipeline_graph.get_whether_auto_parallel(node_name)
         input_slots, _output_slots = self.pipeline_graph.get_io_slot_attributes(node_name)
         validation_files, validation_rules = self._get_validations(
-            node_name, input_slots, is_embarrassingly_parallel
+            node_name, input_slots, is_auto_parallel
         )
         for validation_rule in validation_rules:
             validation_rule.write_to_snakefile(self.snakefile_path)
@@ -334,7 +332,7 @@ use rule start_spark_worker from spark_cluster with:
             image_path=self.config.images_dir / implementation.singularity_image_name,
             script_cmd=implementation.script_cmd,
             requires_spark=implementation.requires_spark,
-            is_embarrassingly_parallel=is_embarrassingly_parallel,
+            is_auto_parallel=is_auto_parallel,
         ).write_to_snakefile(self.snakefile_path)

     def _write_checkpoint_rule(self, node_name: str, checkpoint_filepath: str) -> None:
@@ -377,7 +375,7 @@ use rule start_spark_worker from spark_cluster with:
         input_files, output_files = self.pipeline_graph.get_io_filepaths(node_name)
         if len(output_slots) > 1:
             raise NotImplementedError(
-                "FIXME [MIC-5883] Multiple output slots/files of EmbarrassinglyParallelSteps not yet supported"
+                "FIXME [MIC-5883] Multiple output slots/files of AutoParallelSteps not yet supported"
             )
         if len(output_files) > 1:
             raise ValueError(
@@ -388,7 +386,7 @@ use rule start_spark_worker from spark_cluster with:
         output_slot_attrs = list(output_slots.values())[0]
         if len(output_slot_attrs["filepaths"]) > 1:
             raise NotImplementedError(
-                "FIXME [MIC-5883] Multiple output slots/files of EmbarrassinglyParallelSteps not yet supported"
+                "FIXME [MIC-5883] Multiple output slots/files of AutoParallelSteps not yet supported"
             )
         checkpoint_rule_name = f"checkpoints.{implementation.splitter_node_name}"
         AggregationRule(
@@ -404,7 +402,7 @@ use rule start_spark_worker from spark_cluster with:
     def _get_validations(
         node_name: str,
         input_slots: dict[str, dict[str, str | list[str]]],
-        is_embarrassingly_parallel: bool,
+        is_auto_parallel: bool,
     ) -> tuple[list[str], list[InputValidationRule]]:
         """Gets the validation rule and its output filepath for each slot for a given node.

@@ -423,10 +421,10 @@ use rule start_spark_worker from spark_cluster with:
         validation_rules = []

         for input_slot_name, input_slot_attrs in input_slots.items():
-            # embarrassingly parallel implementations rely on snakemake wildcards
+            # auto-parallel implementations rely on snakemake wildcards
             # TODO: [MIC-5787] - need to support multiple wildcards at once
             validation_file = f"input_validations/{node_name}/{input_slot_name}_validator" + (
-                "-{chunk}" if is_embarrassingly_parallel else ""
+                "-{chunk}" if is_auto_parallel else ""
             )
             validation_files.append(validation_file)
             validation_rules.append(
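The `"-{chunk}"` suffix above is the crux of the auto-parallel plumbing: rule paths keep a literal snakemake `{chunk}` wildcard so one rule fans out over however many chunks the splitter produces at runtime. A sketch of just that naming logic, simplified from the diff (the standalone function is illustrative):

```python
def validation_file_name(node_name: str, input_slot_name: str, is_auto_parallel: bool) -> str:
    # auto-parallel rules keep a literal "{chunk}" snakemake wildcard in the path
    suffix = "-{chunk}" if is_auto_parallel else ""
    return f"input_validations/{node_name}/{input_slot_name}_validator{suffix}"
```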
easylink/pipeline_graph.py
CHANGED
@@ -72,31 +72,26 @@ class PipelineGraph(ImplementationGraph):
         return any([implementation.requires_spark for implementation in self.implementations])

     @property
-    def any_embarrassingly_parallel(self) -> bool:
+    def any_auto_parallel(self) -> bool:
         """Whether or not any :class:`~easylink.implementation.Implementation` is
-        to be run in an embarrassingly parallel way."""
+        to be automatically run in parallel."""
         return any(
-            [
-                self.get_whether_embarrassingly_parallel(node)
-                for node in self.implementation_nodes
-            ]
+            [self.get_whether_auto_parallel(node) for node in self.implementation_nodes]
         )

-    def get_whether_embarrassingly_parallel(self, node: str) -> dict[str, bool]:
-        """Determines whether a node is to be run in an embarrassingly parallel way.
+    def get_whether_auto_parallel(self, node: str) -> dict[str, bool]:
+        """Determines whether a node is to be automatically run in parallel.

         Parameters
         ----------
         node
-            The node name to determine whether or not it is to be run in an
-            embarrassingly parallel way.
+            The node name to determine whether or not it is to be automatically run in parallel.

         Returns
         -------
-            A boolean indicating whether the node is to be run in an embarrassingly
-            parallel way.
+            A boolean indicating whether the node is to be automatically run in parallel.
         """
-        return self.nodes[node]["implementation"].is_embarrassingly_parallel
+        return self.nodes[node]["implementation"].is_auto_parallel

     def get_io_filepaths(self, node: str) -> tuple[list[str], list[str]]:
         """Gets all of a node's input and output filepaths from its edges.
@@ -482,9 +477,9 @@ class PipelineGraph(ImplementationGraph):
                     str(
                         Path("intermediate")
                         / node
-                        # embarrassingly parallel implementations rely on snakemake wildcards
+                        # auto-parallel implementations rely on snakemake wildcards
                         # TODO: [MIC-5787] - need to support multiple wildcards at once
-                        / ("{chunk}" if implementation.is_embarrassingly_parallel else "")
+                        / ("{chunk}" if implementation.is_auto_parallel else "")
                         / imp_outputs[edge_attrs["output_slot"].name]
                     ),
                 )
easylink/pipeline_schema.py
CHANGED
@@ -159,10 +159,10 @@ class PipelineSchema(HierarchicalStep):
         )

     @classmethod
-    def get_schema(cls, name: str = "main") ->
-        """Gets
+    def get_schema(cls, name: str = "main") -> "PipelineSchema":
+        """Gets the requested ``PipelineSchema``.

-
+        This ``PipelineSchema`` represents the fully supported pipelines and is
         used to validate the user-requested pipeline.

         Parameters
easylink/pipeline_schema_constants/__init__.py
CHANGED
@@ -23,8 +23,9 @@ SCHEMA_PARAMS = {
     "combine_with_iteration": testing.SCHEMA_PARAMS_COMBINE_WITH_ITERATION,
     "combine_with_iteration_cycle": testing.SCHEMA_PARAMS_COMBINE_WITH_ITERATION,
     "combine_with_extra_node": testing.SCHEMA_PARAMS_THREE_STEPS,
-    "looping_ep_step": testing.SCHEMA_PARAMS_LOOPING_EP_STEP,
-    "ep_parallel_step": testing.SCHEMA_PARAMS_EP_PARALLEL_STEP,
-    "ep_loop_step": testing.SCHEMA_PARAMS_EP_LOOP_STEP,
-    "ep_hierarchical_step": testing.SCHEMA_PARAMS_EP_HIERARCHICAL_STEP,
+    "looping_auto_parallel_step": testing.SCHEMA_PARAMS_LOOPING_AUTO_PARALLEL_STEP,
+    "auto_parallel_cloneable_step": testing.SCHEMA_PARAMS_AUTO_PARALLEL_CLONEABLE_STEP,
+    "auto_parallel_loop_step": testing.SCHEMA_PARAMS_AUTO_PARALLEL_LOOP_STEP,
+    "auto_parallel_hierarchical_step": testing.SCHEMA_PARAMS_AUTO_PARALLEL_HIERARCHICAL_STEP,
+    "default_implementations": testing.SCHEMA_PARAMS_DEFAULT_IMPLEMENTATIONS,
 }
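Each entry maps a schema name to a `(nodes, edges)` tuple (see the `SCHEMA_PARAMS_* = (NODES_*, EDGES_*)` assignments in `testing.py` below), so a consumer can unpack a lookup directly:

```python
from easylink.pipeline_schema_constants import SCHEMA_PARAMS

nodes, edges = SCHEMA_PARAMS["auto_parallel_cloneable_step"]
```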
easylink/pipeline_schema_constants/development.py
CHANGED
@@ -18,13 +18,13 @@ from easylink.graph_components import (
     OutputSlotMapping,
 )
 from easylink.step import (
+    AutoParallelStep,
     ChoiceStep,
-    EmbarrassinglyParallelStep,
+    CloneableStep,
     HierarchicalStep,
     InputStep,
     LoopStep,
     OutputStep,
-    ParallelStep,
     Step,
 )
 from easylink.utilities.aggregator_utils import concatenate_datasets
@@ -33,7 +33,7 @@ from easylink.utilities.validation_utils import validate_input_file_dummy

 NODES = [
     InputStep(),
-    ParallelStep(
+    CloneableStep(
         template_step=Step(
             step_name="step_1",
             input_slots=[
@@ -58,7 +58,7 @@ NODES = [
             output_slots=[OutputSlot("step_2_main_output")],
         ),
         LoopStep(
-            template_step=EmbarrassinglyParallelStep(
+            template_step=AutoParallelStep(
                 step=Step(
                     step_name="step_3",
                     input_slots=[
easylink/pipeline_schema_constants/main.py
CHANGED
@@ -12,11 +12,11 @@ from easylink.graph_components import (
     OutputSlotMapping,
 )
 from easylink.step import (
+    CloneableStep,
     HierarchicalStep,
     InputStep,
     LoopStep,
     OutputStep,
-    ParallelStep,
     Step,
 )
 from easylink.utilities.validation_utils import (
@@ -56,8 +56,8 @@ NODES = [
         ],
         output_slots=[OutputSlot("clusters")],
         nodes=[
-            ParallelStep(
-                # NOTE: Splitters/aggregators on the ParallelStep are implicit!
+            CloneableStep(
+                # NOTE: Splitters/aggregators on the CloneableStep are implicit!
                 template_step=HierarchicalStep(
                     step_name="determining_exclusions_and_removing_records",
                     directly_implemented=False,
@@ -190,7 +190,7 @@ NODES = [
         ],
         output_slots=[OutputSlot("links")],
         nodes=[
-            ParallelStep(
+            CloneableStep(
                 template_step=LoopStep(
                     template_step=Step(
                         step_name="pre-processing",
@@ -265,7 +265,7 @@ NODES = [
             source_node="pre-processing",
             target_node="schema_alignment",
             output_slot="dataset",
-            # NOTE: The implicit ParallelStep aggregator has
+            # NOTE: The implicit CloneableStep aggregator has
             # made this multiple (a list)
             input_slot="datasets",
         ),
easylink/pipeline_schema_constants/testing.py
CHANGED
@@ -16,12 +16,12 @@ from easylink.graph_components import (
     OutputSlotMapping,
 )
 from easylink.step import (
-    EmbarrassinglyParallelStep,
+    AutoParallelStep,
+    CloneableStep,
     HierarchicalStep,
     InputStep,
     LoopStep,
     OutputStep,
-    ParallelStep,
     Step,
 )
 from easylink.utilities.aggregator_utils import concatenate_datasets
@@ -215,7 +215,7 @@ SCHEMA_PARAMS_BAD_COMBINED_TOPOLOGY = (NODES_BAD_COMBINED_TOPOLOGY, EDGES_ONE_ST
 NODES_NESTED_TEMPLATED_STEPS = [
     InputStep(),
     LoopStep(
-        template_step=ParallelStep(
+        template_step=CloneableStep(
             template_step=HierarchicalStep(
                 step_name="step_1",
                 input_slots=[
@@ -355,10 +355,10 @@ EDGES_TWO_STEPS = [
 SCHEMA_PARAMS_COMBINE_WITH_ITERATION = (NODES_COMBINE_WITH_ITERATION, EDGES_TWO_STEPS)


-NODES_LOOPING_EP_STEP = [
+NODES_LOOPING_AUTO_PARALLEL_STEP = [
     InputStep(),
     LoopStep(
-        template_step=EmbarrassinglyParallelStep(
+        template_step=AutoParallelStep(
             step=Step(
                 step_name="step_1",
                 input_slots=[
@@ -392,13 +392,13 @@ NODES_LOOPING_EP_STEP = [
             ]
         ),
 ]
-SCHEMA_PARAMS_LOOPING_EP_STEP = (NODES_LOOPING_EP_STEP, EDGES_ONE_STEP)
+SCHEMA_PARAMS_LOOPING_AUTO_PARALLEL_STEP = (NODES_LOOPING_AUTO_PARALLEL_STEP, EDGES_ONE_STEP)


-NODES_EP_PARALLEL_STEP = [
+NODES_AUTO_PARALLEL_PARALLEL_STEP = [
     InputStep(),
-    EmbarrassinglyParallelStep(
-        step=ParallelStep(
+    AutoParallelStep(
+        step=CloneableStep(
             template_step=Step(
                 step_name="step_1",
                 input_slots=[
@@ -424,12 +424,15 @@ NODES_EP_PARALLEL_STEP = [
             ]
         ),
 ]
-SCHEMA_PARAMS_EP_PARALLEL_STEP = (NODES_EP_PARALLEL_STEP, EDGES_ONE_STEP)
+SCHEMA_PARAMS_AUTO_PARALLEL_CLONEABLE_STEP = (
+    NODES_AUTO_PARALLEL_PARALLEL_STEP,
+    EDGES_ONE_STEP,
+)


-NODES_EP_LOOP_STEP = [
+NODES_AUTO_PARALLEL_LOOP_STEP = [
     InputStep(),
-    EmbarrassinglyParallelStep(
+    AutoParallelStep(
         step=LoopStep(
             template_step=Step(
                 step_name="step_1",
@@ -464,12 +467,12 @@ NODES_EP_LOOP_STEP = [
             ]
         ),
 ]
-SCHEMA_PARAMS_EP_LOOP_STEP = (NODES_EP_LOOP_STEP, EDGES_ONE_STEP)
+SCHEMA_PARAMS_AUTO_PARALLEL_LOOP_STEP = (NODES_AUTO_PARALLEL_LOOP_STEP, EDGES_ONE_STEP)


-NODES_EP_HIERARCHICAL_STEP = [
+NODES_AUTO_PARALLEL_HIERARCHICAL_STEP = [
     InputStep(),
-    EmbarrassinglyParallelStep(
+    AutoParallelStep(
         step=HierarchicalStep(
             step_name="step_1",
             input_slots=[
@@ -581,7 +584,10 @@ EDGES_ONE_STEP_TWO_ISLOTS = [
         input_slot="result",
     ),
 ]
-SCHEMA_PARAMS_EP_HIERARCHICAL_STEP = (NODES_EP_HIERARCHICAL_STEP, EDGES_ONE_STEP_TWO_ISLOTS)
+SCHEMA_PARAMS_AUTO_PARALLEL_HIERARCHICAL_STEP = (
+    NODES_AUTO_PARALLEL_HIERARCHICAL_STEP,
+    EDGES_ONE_STEP_TWO_ISLOTS,
+)

 NODES_OUTPUT_DIR = [
     InputStep(),
@@ -634,3 +640,126 @@ EDGES_OUTPUT_DIR = [
     ),
 ]
 SCHEMA_PARAMS_OUTPUT_DIR = (NODES_OUTPUT_DIR, EDGES_OUTPUT_DIR)
+
+
+NODES_DEFAULT_IMPLEMENTATIONS = [
+    InputStep(),
+    HierarchicalStep(
+        step_name="step_1",
+        input_slots=[
+            InputSlot(
+                name="step_1_main_input",
+                env_var="DUMMY_CONTAINER_MAIN_INPUT_FILE_PATHS",
+                validator=validate_input_file_dummy,
+            ),
+        ],
+        output_slots=[OutputSlot("step_1_main_output")],
+        nodes=[
+            Step(
+                step_name="step_1a",
+                input_slots=[
+                    InputSlot(
+                        name="step_1a_main_input",
+                        env_var="DUMMY_CONTAINER_MAIN_INPUT_FILE_PATHS",
+                        validator=validate_input_file_dummy,
+                    ),
+                ],
+                output_slots=[OutputSlot("step_1a_main_output")],
+                default_implementation="step_1a_python_pandas",
+            ),
+            Step(
+                step_name="step_1b",
+                input_slots=[
+                    InputSlot(
+                        name="step_1b_main_input",
+                        env_var="DUMMY_CONTAINER_MAIN_INPUT_FILE_PATHS",
+                        validator=validate_input_file_dummy,
+                    ),
+                ],
+                output_slots=[OutputSlot("step_1b_main_output")],
+                default_implementation="step_1b_python_pandas",
+            ),
+        ],
+        edges=[
+            EdgeParams(
+                source_node="step_1a",
+                target_node="step_1b",
+                output_slot="step_1a_main_output",
+                input_slot="step_1b_main_input",
+            ),
+        ],
+        input_slot_mappings=[
+            InputSlotMapping(
+                parent_slot="step_1_main_input",
+                child_node="step_1a",
+                child_slot="step_1a_main_input",
+            ),
+        ],
+        output_slot_mappings=[
+            OutputSlotMapping(
+                parent_slot="step_1_main_output",
+                child_node="step_1b",
+                child_slot="step_1b_main_output",
+            ),
+        ],
+        default_implementation="step_1_python_pandas",
+    ),
+    Step(
+        step_name="step_2",
+        input_slots=[
+            InputSlot(
+                name="step_2_main_input",
+                env_var="DUMMY_CONTAINER_MAIN_INPUT_FILE_PATHS",
+                validator=validate_input_file_dummy,
+            )
+        ],
+        output_slots=[OutputSlot("step_2_main_output")],
+        default_implementation="step_2_python_pandas",
+    ),
+    LoopStep(
+        template_step=Step(
+            step_name="step_3",
+            input_slots=[
+                InputSlot(
+                    name="step_3_main_input",
+                    env_var="DUMMY_CONTAINER_MAIN_INPUT_FILE_PATHS",
+                    validator=validate_input_file_dummy,
+                )
+            ],
+            output_slots=[OutputSlot("step_3_main_output")],
+        ),
+        self_edges=[
+            EdgeParams(
+                source_node="step_3",
+                target_node="step_3",
+                output_slot="step_3_main_output",
+                input_slot="step_3_main_input",
+            ),
+        ],
+        default_implementation="step_3_python_pandas",
+    ),
+    CloneableStep(
+        template_step=Step(
+            step_name="step_4",
+            input_slots=[
+                InputSlot(
+                    name="step_4_main_input",
+                    env_var="DUMMY_CONTAINER_MAIN_INPUT_FILE_PATHS",
+                    validator=validate_input_file_dummy,
+                ),
+            ],
+            output_slots=[
+                OutputSlot(
+                    name="step_4_main_output",
+                ),
+            ],
+        ),
+        default_implementation="step_4_python_pandas",
+    ),
+    OutputStep(
+        input_slots=[
+            InputSlot(name="result", env_var=None, validator=validate_input_file_dummy)
+        ],
+    ),
+]
+SCHEMA_PARAMS_DEFAULT_IMPLEMENTATIONS = (NODES_DEFAULT_IMPLEMENTATIONS, EDGES_TWO_STEPS)
easylink/rule.py
CHANGED
@@ -111,21 +111,20 @@ class ImplementedRule(Rule):
     """Command to execute."""
     requires_spark: bool
     """Whether or not this ``Implementation`` requires a Spark environment."""
-    is_embarrassingly_parallel: bool = False
-    """Whether or not this ``Implementation`` is to be run in an embarrassingly
-    parallel way."""
+    is_auto_parallel: bool = False
+    """Whether or not this ``Implementation`` is to be automatically run in parallel."""

     def build_rule(self) -> str:
         """Builds the Snakemake rule for this ``Implementation``."""
-        if self.is_embarrassingly_parallel and len(self.output) > 1:
+        if self.is_auto_parallel and len(self.output) > 1:
             raise NotImplementedError(
-                "Multiple output slots/files of EmbarrassinglyParallelSteps not yet supported"
+                "Multiple output slots/files of AutoParallelSteps not yet supported"
             )
         return self._build_io() + self._build_resources() + self._build_shell_cmd()

     def _build_io(self) -> str:
         """Builds the input/output portion of the rule."""
-        log_path_chunk_adder = "-{chunk}" if self.is_embarrassingly_parallel else ""
+        log_path_chunk_adder = "-{chunk}" if self.is_auto_parallel else ""
         # Handle output files vs directories
         files = [path for path in self.output if Path(path).suffix != ""]
         if len(files) == len(self.output):
@@ -260,7 +259,7 @@ rule:
 class CheckpointRule(Rule):
     """A :class:`Rule` that defines a checkpoint.

-    When running an :class:`~easylink.implementation.Implementation` in an embarrassingly
+    When running an :class:`~easylink.implementation.Implementation` in an auto
     parallel way, we do not know until runtime how many parallel jobs there will
     be (e.g. we don't know beforehand how many chunks a large incoming dataset will
     be split into since the incoming dataset isn't created until runtime). The
@@ -326,7 +325,7 @@ checkpoint:
 class AggregationRule(Rule):
     """A :class:`Rule` that aggregates the processed chunks of output data.

-    When running an :class:`~easylink.implementation.Implementation` in an embarrassingly
+    When running an :class:`~easylink.implementation.Implementation` in an auto
     parallel way, we need to aggregate the output files from each parallel job
     into a single output file.
     """
@@ -347,10 +346,10 @@ class AggregationRule(Rule):
     def build_rule(self) -> str:
         """Builds the Snakemake rule for this aggregator.

-        When running an :class:`~easylink.step.EmbarrassinglyParallelStep`, we need
+        When running an :class:`~easylink.step.AutoParallelStep`, we need
         to aggregate the output files from each parallel job into a single output file.
         This rule relies on a dynamically generated aggregation function which returns
-        all of the **processed** chunks (from running the ``EmbarrassinglyParallelStep's``
+        all of the **processed** chunks (from running the ``AutoParallelStep's``
         container in parallel) and uses them as inputs to the actual aggregation
         rule.

easylink/step.py
CHANGED
@@ -71,8 +71,8 @@ class Step:
         The :class:`InputSlotMapping<easylink.graph_components.InputSlotMapping>` of this ``Step``.
     output_slot_mappings
         The :class:`OutputSlotMapping<easylink.graph_components.OutputSlotMapping>` of this ``Step``.
-    is_embarrassingly_parallel
-        Whether or not this ``Step`` is to be run in an embarrassingly parallel way.
+    is_auto_parallel
+        Whether or not this ``Step`` is to automatically run in parallel.

     Notes
     -----
@@ -91,7 +91,8 @@ class Step:
         output_slots: Iterable[OutputSlot] = (),
         input_slot_mappings: Iterable[InputSlotMapping] = (),
         output_slot_mappings: Iterable[OutputSlotMapping] = (),
-        is_embarrassingly_parallel: bool = False,
+        is_auto_parallel: bool = False,
+        default_implementation: str | None = None,
     ) -> None:
         if not step_name and not name:
             raise ValueError("All Steps must contain a step_name, name, or both.")
@@ -125,8 +126,11 @@ class Step:
         }
         """A combined dictionary containing both the ``InputSlotMappings`` and
         ``OutputSlotMappings`` of this ``Step``."""
-        self.is_embarrassingly_parallel = is_embarrassingly_parallel
-        """Whether or not this ``Step`` is to be run in an embarrassingly parallel way."""
+        self.is_auto_parallel = is_auto_parallel
+        """Whether or not this ``Step`` is to be automatically run in parallel."""
+        self.default_implementation = default_implementation
+        """The default implementation to use for this ``Step`` if the ``Step`` is
+        not explicitly configured in the pipeline specification."""
         self.parent_step = None
         """This ``Step's`` parent ``Step``, if applicable."""
         self._configuration_state = None
@@ -580,6 +584,7 @@ class HierarchicalStep(Step):
         input_slot_mappings=(),
         output_slot_mappings=(),
         directly_implemented=True,
+        default_implementation: str | None = None,
     ):
         super().__init__(
             step_name,
@@ -588,6 +593,7 @@ class HierarchicalStep(Step):
             output_slots,
             input_slot_mappings,
             output_slot_mappings,
+            default_implementation=default_implementation,
         )
         self.nodes = nodes
         """All sub-nodes (i.e. sub-``Steps``) that make up this ``HierarchicalStep``."""
@@ -722,13 +728,19 @@ class HierarchicalStep(Step):
             step = self.step_graph.nodes[node]["step"]
             if isinstance(step, IOStep):
                 continue
+            if step.name not in step_config:
+                default_implementation = self.step_graph.nodes[step.name][
+                    "step"
+                ].default_implementation
+                step_errors = (
+                    {f"step {step.name}": ["The step is not configured."]}
+                    if not default_implementation
+                    else {}
+                )
             else:
-                if step.name not in step_config:
-                    step_errors = {f"step {step.name}": ["The step is not configured."]}
-                else:
-                    step_errors = step.validate_step(
-                        step_config[step.name], combined_implementations, input_data_config
-                    )
+                step_errors = step.validate_step(
+                    step_config[step.name], combined_implementations, input_data_config
+                )
             if step_errors:
                 errors.update(step_errors)
         extra_steps = set(step_config.keys()) - set(self.step_graph.nodes)
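The `default_implementation` fallback validated above is consumed during configuration. A hedged sketch of that behavior, simplified from the `NonLeafConfigurationState.set_configuration_state` change later in this file (the standalone function is illustrative, and the `layered_config_tree` import path is an assumption):

```python
from layered_config_tree import LayeredConfigTree

def resolve_step_config(step_config: LayeredConfigTree, sub_step) -> LayeredConfigTree:
    # Sketch only: if the user omitted the step from the pipeline specification,
    # fall back to its default implementation (validation has already
    # guaranteed that one exists).
    try:
        return step_config[sub_step.name]
    except KeyError:
        return LayeredConfigTree(
            {"implementation": {"name": sub_step.default_implementation}}
        )
```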
@@ -816,7 +828,7 @@ class TemplatedStep(Step, ABC):

     A ``TemplatedStep`` is used to represents a ``Step`` that contains a specified
     amount of multiplicity, such as one that is looped or run in parallel; it is
-    inherited by concrete :class:`LoopStep` and :class:`ParallelStep` instances.
+    inherited by concrete :class:`LoopStep` and :class:`CloneableStep` instances.

     See :class:`Step` for inherited attributes.

@@ -830,12 +842,14 @@ class TemplatedStep(Step, ABC):
     def __init__(
         self,
         template_step: Step,
+        default_implementation: str | None = None,
     ) -> None:
         super().__init__(
             template_step.step_name,
             template_step.name,
             template_step.input_slots.values(),
             template_step.output_slots.values(),
+            default_implementation=default_implementation,
         )
         self.step_graph = None
         """The :class:`~easylink.graph_components.StepGraph` i.e. the directed acyclic
@@ -1110,8 +1124,9 @@ class LoopStep(TemplatedStep):
         self,
         template_step: Step | None = None,
         self_edges: Iterable[EdgeParams] = (),
+        default_implementation: str | None = None,
     ) -> None:
-        super().__init__(template_step)
+        super().__init__(template_step, default_implementation)
         self.self_edges = self_edges
         """:class:`~easylink.graph_components.EdgeParams` that represent self-edges,
         i.e. edges that connect the output of one loop to the input of the next."""
@@ -1206,7 +1221,7 @@ class LoopStep(TemplatedStep):
         return {"input": input_mappings, "output": output_mappings}


-class ParallelStep(TemplatedStep):
+class CloneableStep(TemplatedStep):
     """A type of :class:`TemplatedStep` that creates multiple copies in parallel
     with no dependencies between them.

@@ -1216,13 +1231,13 @@ class ParallelStep(TemplatedStep):

     @property
     def config_key(self):
-        """The pipeline specification key required for a ``ParallelStep``."""
-        return "parallel"
+        """The pipeline specification key required for a ``CloneableStep``."""
+        return "clones"

     @property
     def node_prefix(self):
-        """The prefix to be used in the ``ParallelStep`` node name."""
-        return "parallel_split"
+        """The prefix to be used in the ``CloneableStep`` node name."""
+        return "clone"

     def _update_step_graph(self, num_repeats: int) -> StepGraph:
         """Updates the :class:`~easylink.graph_components.StepGraph` to include parallelization.
@@ -1276,10 +1291,10 @@ class ParallelStep(TemplatedStep):
         return {"input": input_mappings, "output": output_mappings}


-class EmbarrassinglyParallelStep(Step):
+class AutoParallelStep(Step):
     """A :class:`Step` that is run in parallel on the backend.

-    An ``EmbarrassinglyParallelStep`` is different than a :class:`ParallelStep`
+    An ``AutoParallelStep`` is different than a :class:`CloneableStep`
     in that it is not configured by the user to be run in parallel - it completely
     happens on the back end for performance reasons.

@@ -1288,8 +1303,8 @@ class EmbarrassinglyParallelStep(Step):
     Parameters
     ----------
     step
-        The ``Step`` to be run in an embarrassingly parallel way. To run multiple
-        steps in parallel, use a :class:`HierarchicalStep`.
+        The ``Step`` to be automatically run in parallel. To run multiple steps in
+        parallel, use a :class:`HierarchicalStep`.
     slot_splitter_mapping
         A mapping of the :class:`~easylink.graph_components.InputSlot` name to split
         to the actual splitter function to be used.
@@ -1308,7 +1323,7 @@ class EmbarrassinglyParallelStep(Step):
         super().__init__(
             step_name=None,
             name=step.name,
-            is_embarrassingly_parallel=True,
+            is_auto_parallel=True,
         )
         self.slot_splitter_mapping = slot_splitter_mapping
         """A mapping of the :class:`~easylink.graph_components.InputSlot` name to split
@@ -1328,14 +1343,14 @@ class EmbarrassinglyParallelStep(Step):

     @Step.name.setter
     def name(self, value: str) -> None:
-        """Changes the name of the ``EmbarrassinglyParallelStep`` and the underlying :class:`Step` to the given value."""
+        """Changes the name of the ``AutoParallelStep`` and the underlying :class:`Step` to the given value."""
         self._name = value
         self.step._name = value

     def _validate(self) -> None:
-        """Validates the ``EmbarrassinglyParallelStep``.
+        """Validates the ``AutoParallelStep``.

-        ``EmbarrassinglyParallelSteps`` are not configured by the user to be run
+        ``AutoParallelSteps`` are not configured by the user to be run
         in parallel. Since it happens on the back end, we need to do somewhat unique
         validations during construction. Specifically,
         - one and only one :class:`~easylink.graph_components.InputSlot` *must*
@@ -1348,17 +1363,17 @@ class EmbarrassinglyParallelStep(Step):
         # check that only one input slot has a splitter assigned
         if len(self.slot_splitter_mapping) != 1:
             errors.append(
-                f"EmbarrassinglyParallelStep '{self.step_name}' is attempting to define "
+                f"AutoParallelStep '{self.step_name}' is attempting to define "
                 f"{len(self.slot_splitter_mapping)} splitters when only one should be defined."
             )
         if len(self.slot_splitter_mapping) == 0:
             errors.append(
-                f"EmbarrassinglyParallelStep '{self.step_name}' does not have any input slots with a "
+                f"AutoParallelStep '{self.step_name}' does not have any input slots with a "
                 "splitter method assigned; one and only one input slot must have a splitter."
             )
         if len(self.slot_splitter_mapping) > 1:
             errors.append(
-                f"EmbarrassinglyParallelStep '{self.step_name}' has multiple input slots with "
+                f"AutoParallelStep '{self.step_name}' has multiple input slots with "
                 "splitter methods assigned; one and only one input slot must have a splitter.\n"
                 f"Input slots with splitters: {list(self.slot_splitter_mapping)}"
             )
@@ -1371,7 +1386,7 @@ class EmbarrassinglyParallelStep(Step):
         ]
         if len(missing_aggregators) != 0:
             errors.append(
-                f"EmbarrassinglyParallelStep '{self.step_name}' has output slots without "
+                f"AutoParallelStep '{self.step_name}' has output slots without "
                 f"aggregator methods assigned: {missing_aggregators}"
             )
         if errors:
@@ -1451,7 +1466,7 @@ class EmbarrassinglyParallelStep(Step):
         aggregator_node_name = f"{self.name}_aggregate"
         if len(self.output_slots) > 1:
             raise NotImplementedError(
-                "FIXME [MIC-5883] Multiple output slots/files of EmbarrassinglyParallelSteps not yet supported"
+                "FIXME [MIC-5883] Multiple output slots/files of AutoParallelSteps not yet supported"
             )
         output_slot = list(self.output_slots.values())[0]
         aggregator_step = AggregatorStep(
@@ -1464,7 +1479,7 @@ class EmbarrassinglyParallelStep(Step):
         self._update_slot_mappings(splitter_step, aggregator_step)
         # Add the key back to the expanded config
         expanded_config = LayeredConfigTree({self.step.name: step_config})
-        # EmbarrassinglyParallelSteps are by definition non-leaf steps
+        # AutoParallelSteps are by definition non-leaf steps
         self._configuration_state = NonLeafConfigurationState(
             self, expanded_config, combined_implementations, input_data_config
         )
@@ -1513,7 +1528,7 @@ class EmbarrassinglyParallelStep(Step):
         # Add the Step -> AggregatorStep edge
         if len(self.step.output_slots) > 1:
             raise NotImplementedError(
-                "EmbarrassinglyParallelStep does not support multiple output slots."
+                "AutoParallelStep does not support multiple output slots."
             )
         self.step_graph.add_edge_from_params(
             EdgeParams(
@@ -1562,7 +1577,7 @@ class SplitterStep(StandaloneStep):
     """A :class:`StandaloneStep` that splits an :class:`~easylink.graph_components.InputSlot` for parallel processing.

     A ``SplitterStep`` is intended to be used in conjunction with a corresponding
-    :class:`AggregatorStep` and only during construction of an :class:`EmbarrassinglyParallelStep`.
+    :class:`AggregatorStep` and only during construction of an :class:`AutoParallelStep`.

     See :class:`Step` for inherited attributes.

@@ -1613,7 +1628,7 @@ class AggregatorStep(StandaloneStep):
     """A :class:`StandaloneStep` that aggregates :class:`OutputSlots<easylink.graph_components.Outputslot>` after parallel processing.

     An ``AggregatorStep`` is intended to be used in conjunction with a corresponding
-    :class:`SplitterStep` and only during construction of an :class:`EmbarrassinglyParallelStep`.
+    :class:`SplitterStep` and only during construction of an :class:`AutoParallelStep`.

     See :class:`Step` for inherited attributes.

@@ -1918,10 +1933,9 @@ class LeafConfigurationState(ConfigurationState):
         """
         step = self._step
         if self.is_combined:
-            if step.is_embarrassingly_parallel:
+            if step.is_auto_parallel:
                 raise NotImplementedError(
-                    "Combining implementations with embarrassingly parallel steps "
-                    "is not supported."
+                    "Combining implementations with auto-parallel steps is not supported."
                 )
             implementation = PartialImplementation(
                 combined_name=self.step_config[COMBINED_IMPLEMENTATION_KEY],
@@ -1935,7 +1949,7 @@ class LeafConfigurationState(ConfigurationState):
             implementation_config=self.implementation_config,
             input_slots=step.input_slots.values(),
             output_slots=step.output_slots.values(),
-            is_embarrassingly_parallel=step.is_embarrassingly_parallel,
+            is_auto_parallel=step.is_auto_parallel,
         )
         implementation_graph.add_node_from_implementation(
             step.implementation_node_name,
@@ -1985,7 +1999,7 @@ class LeafConfigurationState(ConfigurationState):
             if mapping.parent_slot == edge.input_slot
         ]
         for mapping in mappings:
-            # FIXME [MIC-5771]: Fix ParallelSteps
+            # FIXME [MIC-5771]: Fix CloneableSteps
             if (
                 "input_data_file" in self.step_config
                 and edge.source_node == "pipeline_graph_input_data"
@@ -2070,8 +2084,8 @@ class NonLeafConfigurationState(ConfigurationState):
         """
         for node in self._step.step_graph.nodes:
             substep = self._step.step_graph.nodes[node]["step"]
-            if self._step.is_embarrassingly_parallel:
-                substep.is_embarrassingly_parallel = True
+            if self._step.is_auto_parallel:
+                substep.is_auto_parallel = True
             substep.add_nodes_to_implementation_graph(implementation_graph)

     def add_edges_to_implementation_graph(
@@ -2182,15 +2196,32 @@ class NonLeafConfigurationState(ConfigurationState):

         This method recursively traverses the ``StepGraph`` and sets the configuration
         state for each ``Step`` until reaching all leaf nodes.
+
+        Notes
+        -----
+        If a ``Step`` name is missing from the ``step_config``, we know that it
+        must have a default implementation because we already validated that one
+        exists during :meth:`HierarchicalStep._validate_step_graph`. In that case,
+        we manually instantiate and use a ``step_config`` with the default implementation.
         """
         for sub_node in self._step.step_graph.nodes:
             sub_step = self._step.step_graph.nodes[sub_node]["step"]
-            step_config = (
-                self.step_config
-                if isinstance(sub_step, StandaloneStep)
-                else self.step_config[sub_step.name]
-            )
-
+            try:
+                step_config = (
+                    self.step_config
+                    if isinstance(sub_step, StandaloneStep)
+                    else self.step_config[sub_step.name]
+                )
+            except KeyError:
+                # We know that any missing keys must have a default implementation
+                # (because we have already checked that it exists during validation)
+                step_config = LayeredConfigTree(
+                    {
+                        "implementation": {
+                            "name": sub_step.default_implementation,
+                        }
+                    }
+                )
             sub_step.set_configuration_state(
                 step_config, self.combined_implementations, self.input_data_config
             )
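A user-facing consequence of the `config_key`/`node_prefix` change above: a `CloneableStep` is now configured in the pipeline specification under a `clones` key (and expands into `*_clone_*` node names) rather than the old `parallel` key. A hedged sketch of what a parsed spec fragment might look like; the step and implementation names are hypothetical, and only the `"clones"` key itself is confirmed by this diff:

```python
# Hypothetical parsed pipeline-spec fragment for a CloneableStep:
spec_fragment = {
    "step_1": {
        "clones": [
            {"implementation": {"name": "step_1_python_pandas"}},
            {"implementation": {"name": "step_1_python_pandas"}},
        ]
    }
}
```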
easylink/utilities/aggregator_utils.py
CHANGED
@@ -4,8 +4,8 @@ Data Aggregating Utilities
 ==========================

 This module contains utility functions for aggregating datasets. One primary use
-case for this is combine the results of embarrassingly parallel sections of the
-pipeline.
+case for this is to combine the results of sections that were automatically run
+in parallel.

 Note that it is critical that all data aggregating utility functions are definied
 in this module; easylink will not be able to find them otherwise.
easylink/utilities/splitter_utils.py
CHANGED
@@ -4,7 +4,7 @@ Data Splitting Utilities
 ========================

 This module contains utility functions for splitting datasets into smaller datasets.
-One primary use case for this is to run sections of the pipeline in an embarrassingly
+One primary use case for this is to run sections of the pipeline in an auto
 parallel manner.

 Note that it is critical that all data splitting utility functions are definied
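Taken together, these two modules define the split/aggregate contract for auto-parallel steps. A hedged sketch of that contract: `concatenate_datasets` is a real name (it is imported by `testing.py` above), but its signature and the splitter shown here are assumptions for illustration only:

```python
import pandas as pd

def split_data_in_two(df: pd.DataFrame) -> list[pd.DataFrame]:
    # hypothetical splitter: chunk the incoming dataset for parallel processing
    half = len(df) // 2
    return [df.iloc[:half], df.iloc[half:]]

def concatenate_datasets(dfs: list[pd.DataFrame]) -> pd.DataFrame:
    # assumed shape of the real aggregator: combine processed chunks into one output
    return pd.concat(dfs, ignore_index=True)
```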
{easylink-0.1.20.dist-info → easylink-0.1.22.dist-info}/RECORD
CHANGED
@@ -1,24 +1,22 @@
 easylink/__about__.py,sha256=2-oxCfu9t9yUJouLDwqYRZ0eii8kN25SxRzsawjWjho,440
 easylink/__init__.py,sha256=gGMcIVfiVnHtlDw5mZwhevcDb2wt-kuP6F64gnkFack,159
-easylink/_version.py,sha256=
+easylink/_version.py,sha256=zmP2TRnzKPjZJ1eiBcT-cRInsji6FW-OVD3FafQFCc4,23
 easylink/cli.py,sha256=zQO4lOVoZ3eVgPVWT2sCF4zNoKgiDJP9ReTh2Myr9jc,10307
-easylink/configuration.py,sha256=
+easylink/configuration.py,sha256=TPd3WbqUcJMJDPJuHeo5ZebvZPQrRyfm8-laad2sOFk,12681
 easylink/graph_components.py,sha256=zZDZXg5smReHO3ryQC4pao24wyKXzWDe6jS3C6fM2ak,13892
-easylink/implementation.py,sha256=
+easylink/implementation.py,sha256=lSF37g-aQYgdLI0lgFaDrBkp23te9O9B1V-CmmRtB-0,14514
 easylink/implementation_metadata.yaml,sha256=GoU_aWjVryG8-xjUHkC2nCUeznmYD0BwfJYnNrpZ8P4,10670
-easylink/pipeline.py,sha256=
-easylink/pipeline_graph.py,sha256=
-easylink/pipeline_schema.py,sha256=
-easylink/rule.py,sha256=
+easylink/pipeline.py,sha256=NJyMInbOCjJ_5kRzzuK5AcupvC7ecd5qLOC-f1Gy3Ss,17701
+easylink/pipeline_graph.py,sha256=jtjS7_2IVa189U8cL621600erC_0pa6BKPRRO8VqwiU,23760
+easylink/pipeline_schema.py,sha256=sj6YQqMFUS35RRFWdlrhPlud68d3Rb5y7iKfAEZy1qs,6877
+easylink/rule.py,sha256=QJPmrvQUZPnqGFD9UmMK8imdJ7VODzGlUOSnpJhb9AU,16677
 easylink/runner.py,sha256=Z9GKgiUAWtp0dW1cyAp86MGthIDeABJtHDXUtzv0-kE,6714
-easylink/step.py,sha256=
-easylink/devtools/implementation_creator.py,sha256=
-easylink/images/spark_cluster/Dockerfile,sha256=
-easylink/images/spark_cluster/README.md,sha256=
-easylink/pipeline_schema_constants/__init__.py,sha256=
-easylink/pipeline_schema_constants/development.py,sha256=
-easylink/pipeline_schema_constants/main.py,sha256=9IxAjgQej7AaV-zYZEFhG8U-v_rYBFaPuNS3Y3m4Sho,22929
-easylink/pipeline_schema_constants/testing.py,sha256=UDmVVjI1SiDktMbJ2CrSb7amHSYNwhgqNkXhl4lYxQw,20459
+easylink/step.py,sha256=zQAoz4HlSVvgS7iMlfmCrXluOtPQxbSgPZOeyZwjdpo,91085
+easylink/devtools/implementation_creator.py,sha256=gZZpfpiOOh912nNjR_d5wR0mBO5c09jWMS0FSYRcE1o,19120
+easylink/pipeline_schema_constants/__init__.py,sha256=njUL2v67WFC3dW_FjOXWRLSeOhInox_ZINLEji0_7V8,1523
+easylink/pipeline_schema_constants/development.py,sha256=0x6lWqBmP1K9AebEmeZ4veSnLBcdQcZXzbV6lCU11bc,12622
+easylink/pipeline_schema_constants/main.py,sha256=kcAhdbK_BhS79LUMhKkn-Uttl2vA-MHVX4M1StTloJQ,22934
+easylink/pipeline_schema_constants/testing.py,sha256=ZFD19CpcidZPVUYBvh8LAa5sZEERT2yfoFa-3xmskFs,24595
 easylink/steps/cascading/exclude_clustered.def,sha256=GfoDqO2Vtsh7VI8SwGaJtv_KtKjs-UmBcivqQ7OPkjk,503
 easylink/steps/cascading/exclude_clustered.py,sha256=NSA6GZBzGa7e6CH4tacCGfr0Y9sUM29g9Nf8NquHB44,2612
 easylink/steps/cascading/exclude_none.def,sha256=iFUhUMocxtkA0NErkjVrBxY0MUdS3DIPNsbCpTJRP0k,488
@@ -76,16 +74,16 @@ easylink/steps/splink/splink_evaluating_pairs.py,sha256=JR2qVgb14cNZKozDyOrN11nr
 easylink/steps/splink/splink_links_to_clusters.def,sha256=RurvOYyGjNs9tx64DTXwI-GSgHD4T7SzDfhAH18pTEM,524
 easylink/steps/splink/splink_links_to_clusters.py,sha256=z5ymdYl9ytp1e5MA6vn8wpGRFWVuhh23LqGq8NJJxZQ,1936
 easylink/utilities/__init__.py,sha256=0U33kbv4hoMfFQ_lh5hLwifxRPzOgkLkjKLYxmaK10g,196
-easylink/utilities/aggregator_utils.py,sha256=
+easylink/utilities/aggregator_utils.py,sha256=_DAHRAf9janbDsuj_jnAn5Dzz2s4R5Ni3YeytDpN9UE,954
 easylink/utilities/data_utils.py,sha256=XPRjq3qW_fN0xQ23Jms_xBzpTHbRwqZWDP1AW0nYkP0,6926
 easylink/utilities/general_utils.py,sha256=El1W0nn4P27sRBGotNQb-9du-Gbhk9ggSuu4vmGDfwo,4591
 easylink/utilities/paths.py,sha256=9inDKMPorAaWh5vhjc3j1Tj_aXVKhLdodiJO9H1nNes,947
 easylink/utilities/spark.smk,sha256=kGtpem7LfQc71tMh5WAYaqKnHQKFvcdhPQSdumOP70k,5799
-easylink/utilities/splitter_utils.py,sha256=
+easylink/utilities/splitter_utils.py,sha256=KXiVUYJ9TGxCQmrwos18pB1sxG_0Ay67qoDJT6vem2o,3828
 easylink/utilities/validation_utils.py,sha256=DBJB2TLVklgYw1WaaPse9vqtwPLMGmZNYM2cbCZsoHI,18417
-easylink-0.1.20.dist-info/licenses/LICENSE,sha256=
-easylink-0.1.20.dist-info/METADATA,sha256=
-easylink-0.1.20.dist-info/WHEEL,sha256=
-easylink-0.1.20.dist-info/entry_points.txt,sha256=
-easylink-0.1.20.dist-info/top_level.txt,sha256=
-easylink-0.1.20.dist-info/RECORD,,
+easylink-0.1.22.dist-info/licenses/LICENSE,sha256=z6NBo3hY2Pbh2Wjm9IcHLyQoG-kd816uASh8PbwhTcQ,1530
+easylink-0.1.22.dist-info/METADATA,sha256=hei9KKa0HUgy1Z4aU-nPEAs8KF2_TEe7J0-_esdCG40,3565
+easylink-0.1.22.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+easylink-0.1.22.dist-info/entry_points.txt,sha256=OGMZDFltg3yMboT7XjJt3joiPhRfV_7jnREVtrAIQNU,51
+easylink-0.1.22.dist-info/top_level.txt,sha256=oHcOpcF_jDMWFiJRzfGQvuskENGDjSPC_Agu9Z_Xvik,9
+easylink-0.1.22.dist-info/RECORD,,
easylink/images/spark_cluster/Dockerfile
DELETED
@@ -1,16 +0,0 @@
-# Stage 1: Start with the miniconda3 base image
-FROM continuumio/miniconda3 as conda-base
-
-# Create a new conda environment
-SHELL ["/bin/bash", "--login", "-c"]
-RUN conda init bash \
-    && . ~/.bashrc \
-    && conda create -n spark_cluster python=3.10
-
-# Stage 2: Start with the Apache Spark base image
-FROM apache/spark@sha256:a1dd2487a97fb5e35c5a5b409e830b501a92919029c62f9a559b13c4f5c50f63 as spark-base
-
-COPY --from=conda-base /opt/conda /opt/conda
-
-# Set PATH for conda environment and conda itself
-ENV PATH=/opt/conda/envs/spark_cluster/bin:/opt/conda/condabin:${PATH}
easylink/images/spark_cluster/README.md
DELETED
@@ -1,15 +0,0 @@
-# spark_cluster container
-NOTE: Spinning up a spark cluster using `easylink` currently requires building an image from this directory.
-
-This is done by running the following commands from this directory:
-
-```
-# build the image
-$ sudo docker build -t easylink:sparkbuilder .
-# save as compressed tarball
-$ sudo docker save easylink:sparkbuilder | gzip > spark_cluster.tar.gz
-# remove the image
-$ sudo docker rmi easylink:sparkbuilder
-# convert the image from the docker image
-$ singularity build --force spark_cluster.sif docker-archive://$(pwd)/spark_cluster.tar.gz
-```
|
File without changes
|
File without changes
|
File without changes
|