easylink 0.1.20__py3-none-any.whl → 0.1.22__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
easylink/_version.py CHANGED
@@ -1 +1 @@
- __version__ = "0.1.20"
+ __version__ = "0.1.22"
easylink/configuration.py CHANGED
@@ -184,7 +184,9 @@ class Config(LayeredConfigTree):
  #################

  def _get_schema(self, schema_name: str = "main") -> PipelineSchema:
- """Returns the first :class:`~easylink.pipeline_schema.PipelineSchema` that validates the requested pipeline.
+ """Gets the requested :class:`~easylink.pipeline_schema.PipelineSchema`.
+
+ The schema is only returned if it validates the pipeline configuration.

  Parameters
  ----------
@@ -205,11 +207,10 @@ class Config(LayeredConfigTree):
  Notes
  -----
  This acts as the pipeline configuration file's validation method since
- we can only find a matching ``PipelineSchema`` if that file is valid.
+ we can only validate the ``PipelineSchema`` if that file is valid.

  """
  errors = defaultdict(dict)
- # Try each schema until one is validated
  schema = PipelineSchema.get_schema(schema_name)
  logs = schema.validate_step(self.pipeline, self.input_data)
  if logs:
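A rough sketch of the revised flow (hypothetical call site; `pipeline` and `input_data` stand in for the attributes used above): 0.1.22 fetches a single schema by name and treats its validation logs as pipeline-configuration errors, rather than trying every allowable schema in turn as 0.1.20 did.

```python
# Hedged sketch of Config._get_schema's new behavior, not the verbatim source.
from easylink.pipeline_schema import PipelineSchema

schema = PipelineSchema.get_schema("main")  # now returns one PipelineSchema
logs = schema.validate_step(pipeline, input_data)  # empty logs mean a valid pipeline
if logs:
    ...  # collected into an errors dict and raised as a validation failure
```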
easylink/devtools/implementation_creator.py CHANGED
@@ -21,8 +21,8 @@ from loguru import logger

  from easylink.pipeline_schema_constants import SCHEMA_PARAMS
  from easylink.step import (
+ AutoParallelStep,
  ChoiceStep,
- EmbarrassinglyParallelStep,
  HierarchicalStep,
  IOStep,
  Step,
@@ -319,7 +319,7 @@ class ImplementationCreator:
  elif isinstance(node, TemplatedStep):
  _process_step(node.template_step)
  return
- elif isinstance(node, EmbarrassinglyParallelStep):
+ elif isinstance(node, AutoParallelStep):
  _process_step(node.step)
  return
  elif isinstance(node, ChoiceStep):
easylink/implementation.py CHANGED
@@ -55,7 +55,7 @@ class Implementation:
  implementation_config: LayeredConfigTree,
  input_slots: Iterable[InputSlot] = (),
  output_slots: Iterable[OutputSlot] = (),
- is_embarrassingly_parallel: bool = False,
+ is_auto_parallel: bool = False,
  ):
  self.name = implementation_config.name
  """The name of this ``Implementation``."""
@@ -74,7 +74,7 @@ class Implementation:
  implemented by this particular ``Implementation``."""
  self.requires_spark = self._metadata.get("requires_spark", False)
  """Whether this ``Implementation`` requires a Spark environment."""
- self.is_embarrassingly_parallel = is_embarrassingly_parallel
+ self.is_auto_parallel = is_auto_parallel

  def __repr__(self) -> str:
  return f"Implementation.{self.name}"
easylink/pipeline.py CHANGED
@@ -45,9 +45,9 @@ class Pipeline:
  The :class:`~easylink.pipeline_graph.PipelineGraph` object.
  spark_is_required
  A boolean indicating whether the pipeline requires Spark.
- any_embarrassingly_parallel
+ any_auto_parallel
  A boolean indicating whether any implementation in the pipeline is to be
- run in an embarrassingly parallel manner.
+ automatically run in parallel.

  """

@@ -55,7 +55,7 @@ class Pipeline:
  self.config = config
  self.pipeline_graph = PipelineGraph(config)
  self.spark_is_required = self.pipeline_graph.spark_is_required
- self.any_embarrassingly_parallel = self.pipeline_graph.any_embarrassingly_parallel
+ self.any_auto_parallel = self.pipeline_graph.any_auto_parallel

  # TODO [MIC-4880]: refactor into validation object
  self._validate()
@@ -179,7 +179,7 @@ class Pipeline:
  #################################

  def _write_imports(self) -> None:
- if not self.any_embarrassingly_parallel:
+ if not self.any_auto_parallel:
  imports = "from easylink.utilities import validation_utils\n"
  else:
  imports = """import glob
@@ -193,7 +193,7 @@ from easylink.utilities import aggregator_utils, splitter_utils, validation_util
  f.write(imports)

  def _write_wildcard_constraints(self) -> None:
- if self.any_embarrassingly_parallel:
+ if self.any_auto_parallel:
  with open(self.snakefile_path, "a") as f:
  f.write(
  """
@@ -301,12 +301,10 @@ use rule start_spark_worker from spark_cluster with:
  The name of the ``Implementation`` to write the rule(s) for.
  """

- is_embarrassingly_parallel = self.pipeline_graph.get_whether_embarrassingly_parallel(
- node_name
- )
+ is_auto_parallel = self.pipeline_graph.get_whether_auto_parallel(node_name)
  input_slots, _output_slots = self.pipeline_graph.get_io_slot_attributes(node_name)
  validation_files, validation_rules = self._get_validations(
- node_name, input_slots, is_embarrassingly_parallel
+ node_name, input_slots, is_auto_parallel
  )
  for validation_rule in validation_rules:
  validation_rule.write_to_snakefile(self.snakefile_path)
@@ -334,7 +332,7 @@ use rule start_spark_worker from spark_cluster with:
  image_path=self.config.images_dir / implementation.singularity_image_name,
  script_cmd=implementation.script_cmd,
  requires_spark=implementation.requires_spark,
- is_embarrassingly_parallel=is_embarrassingly_parallel,
+ is_auto_parallel=is_auto_parallel,
  ).write_to_snakefile(self.snakefile_path)

  def _write_checkpoint_rule(self, node_name: str, checkpoint_filepath: str) -> None:
@@ -377,7 +375,7 @@ use rule start_spark_worker from spark_cluster with:
  input_files, output_files = self.pipeline_graph.get_io_filepaths(node_name)
  if len(output_slots) > 1:
  raise NotImplementedError(
- "FIXME [MIC-5883] Multiple output slots/files of EmbarrassinglyParallelSteps not yet supported"
+ "FIXME [MIC-5883] Multiple output slots/files of AutoParallelSteps not yet supported"
  )
  if len(output_files) > 1:
  raise ValueError(
@@ -388,7 +386,7 @@ use rule start_spark_worker from spark_cluster with:
  output_slot_attrs = list(output_slots.values())[0]
  if len(output_slot_attrs["filepaths"]) > 1:
  raise NotImplementedError(
- "FIXME [MIC-5883] Multiple output slots/files of EmbarrassinglyParallelSteps not yet supported"
+ "FIXME [MIC-5883] Multiple output slots/files of AutoParallelSteps not yet supported"
  )
  checkpoint_rule_name = f"checkpoints.{implementation.splitter_node_name}"
  AggregationRule(
@@ -404,7 +402,7 @@ use rule start_spark_worker from spark_cluster with:
  def _get_validations(
  node_name: str,
  input_slots: dict[str, dict[str, str | list[str]]],
- is_embarrassingly_parallel: bool,
+ is_auto_parallel: bool,
  ) -> tuple[list[str], list[InputValidationRule]]:
  """Gets the validation rule and its output filepath for each slot for a given node.

@@ -423,10 +421,10 @@ use rule start_spark_worker from spark_cluster with:
  validation_rules = []

  for input_slot_name, input_slot_attrs in input_slots.items():
- # embarrassingly parallel implementations rely on snakemake wildcards
+ # auto-parallel implementations rely on snakemake wildcards
  # TODO: [MIC-5787] - need to support multiple wildcards at once
  validation_file = f"input_validations/{node_name}/{input_slot_name}_validator" + (
- "-{chunk}" if is_embarrassingly_parallel else ""
+ "-{chunk}" if is_auto_parallel else ""
  )
  validation_files.append(validation_file)
  validation_rules.append(
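The effect of the renamed flag on generated paths, restated as a standalone sketch (node and slot names are examples only):

```python
# Mirrors the naming scheme in _get_validations above: auto-parallel nodes get
# a "-{chunk}" snakemake wildcard suffix on their validator files.
node_name, input_slot_name = "step_3_python_pandas", "step_3_main_input"
is_auto_parallel = True
validation_file = f"input_validations/{node_name}/{input_slot_name}_validator" + (
    "-{chunk}" if is_auto_parallel else ""
)
print(validation_file)
# input_validations/step_3_python_pandas/step_3_main_input_validator-{chunk}
```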
easylink/pipeline_graph.py CHANGED
@@ -72,31 +72,26 @@ class PipelineGraph(ImplementationGraph):
  return any([implementation.requires_spark for implementation in self.implementations])

  @property
- def any_embarrassingly_parallel(self) -> bool:
+ def any_auto_parallel(self) -> bool:
  """Whether or not any :class:`~easylink.implementation.Implementation` is
- to be run in an embarrassingly parallel way."""
+ to be automatically run in parallel."""
  return any(
- [
- self.get_whether_embarrassingly_parallel(node)
- for node in self.implementation_nodes
- ]
+ [self.get_whether_auto_parallel(node) for node in self.implementation_nodes]
  )

- def get_whether_embarrassingly_parallel(self, node: str) -> dict[str, bool]:
- """Determines whether a node is to be run in an embarrassingly parallel way.
+ def get_whether_auto_parallel(self, node: str) -> bool:
+ """Determines whether a node is to be automatically run in parallel.

  Parameters
  ----------
  node
- The node name to determine whether or not it is to be run in an
- embarrassingly parallel way.
+ The node name to determine whether or not it is to be automatically run in parallel.

  Returns
  -------
- A boolean indicating whether the node is to be run in an embarrassingly
- parallel way.
+ A boolean indicating whether the node is to be automatically run in parallel.
  """
- return self.nodes[node]["implementation"].is_embarrassingly_parallel
+ return self.nodes[node]["implementation"].is_auto_parallel

  def get_io_filepaths(self, node: str) -> tuple[list[str], list[str]]:
  """Gets all of a node's input and output filepaths from its edges.
@@ -482,9 +477,9 @@ class PipelineGraph(ImplementationGraph):
  str(
  Path("intermediate")
  / node
- # embarrassingly parallel implementations rely on snakemake wildcards
+ # auto-parallel implementations rely on snakemake wildcards
  # TODO: [MIC-5787] - need to support multiple wildcards at once
- / ("{chunk}" if implementation.is_embarrassingly_parallel else "")
+ / ("{chunk}" if implementation.is_auto_parallel else "")
  / imp_outputs[edge_attrs["output_slot"].name]
  ),
  )
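The same wildcard convention applies to intermediate outputs, per the hunk above; a minimal sketch (names are examples only):

```python
from pathlib import Path

node, output_file = "step_3_python_pandas", "result.parquet"  # example names
is_auto_parallel = True
# auto-parallel nodes insert a "{chunk}" wildcard directory; pathlib drops the
# empty segment in the non-parallel case
path = Path("intermediate") / node / ("{chunk}" if is_auto_parallel else "") / output_file
print(path)  # intermediate/step_3_python_pandas/{chunk}/result.parquet
```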
easylink/pipeline_schema.py CHANGED
@@ -159,10 +159,10 @@ class PipelineSchema(HierarchicalStep):
  )

  @classmethod
- def get_schema(cls, name: str = "main") -> list["PipelineSchema"]:
- """Gets all allowable ``PipelineSchemas``.
+ def get_schema(cls, name: str = "main") -> "PipelineSchema":
+ """Gets the requested ``PipelineSchema``.

- These ``PipelineSchemas`` represent the fully supported pipelines and are
+ This ``PipelineSchema`` represents the fully supported pipelines and is
  used to validate the user-requested pipeline.

  Parameters
easylink/pipeline_schema_constants/__init__.py CHANGED
@@ -23,8 +23,9 @@ SCHEMA_PARAMS = {
  "combine_with_iteration": testing.SCHEMA_PARAMS_COMBINE_WITH_ITERATION,
  "combine_with_iteration_cycle": testing.SCHEMA_PARAMS_COMBINE_WITH_ITERATION,
  "combine_with_extra_node": testing.SCHEMA_PARAMS_THREE_STEPS,
- "looping_ep_step": testing.SCHEMA_PARAMS_LOOPING_EP_STEP,
- "ep_parallel_step": testing.SCHEMA_PARAMS_EP_PARALLEL_STEP,
- "ep_loop_step": testing.SCHEMA_PARAMS_EP_LOOP_STEP,
- "ep_hierarchical_step": testing.SCHEMA_PARAMS_EP_HIERARCHICAL_STEP,
+ "looping_auto_parallel_step": testing.SCHEMA_PARAMS_LOOPING_AUTO_PARALLEL_STEP,
+ "auto_parallel_cloneable_step": testing.SCHEMA_PARAMS_AUTO_PARALLEL_CLONEABLE_STEP,
+ "auto_parallel_loop_step": testing.SCHEMA_PARAMS_AUTO_PARALLEL_LOOP_STEP,
+ "auto_parallel_hierarchical_step": testing.SCHEMA_PARAMS_AUTO_PARALLEL_HIERARCHICAL_STEP,
+ "default_implementations": testing.SCHEMA_PARAMS_DEFAULT_IMPLEMENTATIONS,
  }
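A usage sketch grounded in the mapping above: each entry is a (nodes, edges) tuple that parameterizes a test schema, so the renamed keys resolve exactly as before.

```python
from easylink.pipeline_schema_constants import SCHEMA_PARAMS

nodes, edges = SCHEMA_PARAMS["auto_parallel_loop_step"]  # formerly "ep_loop_step"
```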
easylink/pipeline_schema_constants/development.py CHANGED
@@ -18,13 +18,13 @@ from easylink.graph_components import (
  OutputSlotMapping,
  )
  from easylink.step import (
+ AutoParallelStep,
  ChoiceStep,
- EmbarrassinglyParallelStep,
+ CloneableStep,
  HierarchicalStep,
  InputStep,
  LoopStep,
  OutputStep,
- ParallelStep,
  Step,
  )
  from easylink.utilities.aggregator_utils import concatenate_datasets
@@ -33,7 +33,7 @@ from easylink.utilities.validation_utils import validate_input_file_dummy

  NODES = [
  InputStep(),
- ParallelStep(
+ CloneableStep(
  template_step=Step(
  step_name="step_1",
  input_slots=[
@@ -58,7 +58,7 @@ NODES = [
  output_slots=[OutputSlot("step_2_main_output")],
  ),
  LoopStep(
- template_step=EmbarrassinglyParallelStep(
+ template_step=AutoParallelStep(
  step=Step(
  step_name="step_3",
  input_slots=[
easylink/pipeline_schema_constants/main.py CHANGED
@@ -12,11 +12,11 @@ from easylink.graph_components import (
  OutputSlotMapping,
  )
  from easylink.step import (
+ CloneableStep,
  HierarchicalStep,
  InputStep,
  LoopStep,
  OutputStep,
- ParallelStep,
  Step,
  )
  from easylink.utilities.validation_utils import (
@@ -56,8 +56,8 @@ NODES = [
  ],
  output_slots=[OutputSlot("clusters")],
  nodes=[
- ParallelStep(
- # NOTE: Splitters/aggregators on the ParallelStep are implicit!
+ CloneableStep(
+ # NOTE: Splitters/aggregators on the CloneableStep are implicit!
  template_step=HierarchicalStep(
  step_name="determining_exclusions_and_removing_records",
  directly_implemented=False,
@@ -190,7 +190,7 @@ NODES = [
  ],
  output_slots=[OutputSlot("links")],
  nodes=[
- ParallelStep(
+ CloneableStep(
  template_step=LoopStep(
  template_step=Step(
  step_name="pre-processing",
@@ -265,7 +265,7 @@ NODES = [
  source_node="pre-processing",
  target_node="schema_alignment",
  output_slot="dataset",
- # NOTE: The implicit ParallelStep aggregator has
+ # NOTE: The implicit CloneableStep aggregator has
  # made this multiple (a list)
  input_slot="datasets",
  ),
easylink/pipeline_schema_constants/testing.py CHANGED
@@ -16,12 +16,12 @@ from easylink.graph_components import (
  OutputSlotMapping,
  )
  from easylink.step import (
- EmbarrassinglyParallelStep,
+ AutoParallelStep,
+ CloneableStep,
  HierarchicalStep,
  InputStep,
  LoopStep,
  OutputStep,
- ParallelStep,
  Step,
  )
  from easylink.utilities.aggregator_utils import concatenate_datasets
@@ -215,7 +215,7 @@ SCHEMA_PARAMS_BAD_COMBINED_TOPOLOGY = (NODES_BAD_COMBINED_TOPOLOGY, EDGES_ONE_ST
  NODES_NESTED_TEMPLATED_STEPS = [
  InputStep(),
  LoopStep(
- template_step=ParallelStep(
+ template_step=CloneableStep(
  template_step=HierarchicalStep(
  step_name="step_1",
  input_slots=[
@@ -355,10 +355,10 @@ EDGES_TWO_STEPS = [
  SCHEMA_PARAMS_COMBINE_WITH_ITERATION = (NODES_COMBINE_WITH_ITERATION, EDGES_TWO_STEPS)


- NODES_LOOPING_EP_STEP = [
+ NODES_LOOPING_AUTO_PARALLEL_STEP = [
  InputStep(),
  LoopStep(
- template_step=EmbarrassinglyParallelStep(
+ template_step=AutoParallelStep(
  step=Step(
  step_name="step_1",
  input_slots=[
@@ -392,13 +392,13 @@ NODES_LOOPING_EP_STEP = [
  ]
  ),
  ]
- SCHEMA_PARAMS_LOOPING_EP_STEP = (NODES_LOOPING_EP_STEP, EDGES_ONE_STEP)
+ SCHEMA_PARAMS_LOOPING_AUTO_PARALLEL_STEP = (NODES_LOOPING_AUTO_PARALLEL_STEP, EDGES_ONE_STEP)


- NODES_EP_PARALLEL_STEP = [
+ NODES_AUTO_PARALLEL_PARALLEL_STEP = [
  InputStep(),
- EmbarrassinglyParallelStep(
- step=ParallelStep(
+ AutoParallelStep(
+ step=CloneableStep(
  template_step=Step(
  step_name="step_1",
  input_slots=[
@@ -424,12 +424,15 @@ NODES_EP_PARALLEL_STEP = [
  ]
  ),
  ]
- SCHEMA_PARAMS_EP_PARALLEL_STEP = (NODES_EP_PARALLEL_STEP, EDGES_ONE_STEP)
+ SCHEMA_PARAMS_AUTO_PARALLEL_CLONEABLE_STEP = (
+ NODES_AUTO_PARALLEL_PARALLEL_STEP,
+ EDGES_ONE_STEP,
+ )


- NODES_EP_LOOP_STEP = [
+ NODES_AUTO_PARALLEL_LOOP_STEP = [
  InputStep(),
- EmbarrassinglyParallelStep(
+ AutoParallelStep(
  step=LoopStep(
  template_step=Step(
  step_name="step_1",
@@ -464,12 +467,12 @@ NODES_EP_LOOP_STEP = [
  ]
  ),
  ]
- SCHEMA_PARAMS_EP_LOOP_STEP = (NODES_EP_LOOP_STEP, EDGES_ONE_STEP)
+ SCHEMA_PARAMS_AUTO_PARALLEL_LOOP_STEP = (NODES_AUTO_PARALLEL_LOOP_STEP, EDGES_ONE_STEP)


- NODES_EP_HIERARCHICAL_STEP = [
+ NODES_AUTO_PARALLEL_HIERARCHICAL_STEP = [
  InputStep(),
- EmbarrassinglyParallelStep(
+ AutoParallelStep(
  step=HierarchicalStep(
  step_name="step_1",
  input_slots=[
@@ -581,7 +584,10 @@ EDGES_ONE_STEP_TWO_ISLOTS = [
  input_slot="result",
  ),
  ]
- SCHEMA_PARAMS_EP_HIERARCHICAL_STEP = (NODES_EP_HIERARCHICAL_STEP, EDGES_ONE_STEP_TWO_ISLOTS)
+ SCHEMA_PARAMS_AUTO_PARALLEL_HIERARCHICAL_STEP = (
+ NODES_AUTO_PARALLEL_HIERARCHICAL_STEP,
+ EDGES_ONE_STEP_TWO_ISLOTS,
+ )

  NODES_OUTPUT_DIR = [
  InputStep(),
@@ -634,3 +640,126 @@ EDGES_OUTPUT_DIR = [
  ),
  ]
  SCHEMA_PARAMS_OUTPUT_DIR = (NODES_OUTPUT_DIR, EDGES_OUTPUT_DIR)
+
+
+ NODES_DEFAULT_IMPLEMENTATIONS = [
+ InputStep(),
+ HierarchicalStep(
+ step_name="step_1",
+ input_slots=[
+ InputSlot(
+ name="step_1_main_input",
+ env_var="DUMMY_CONTAINER_MAIN_INPUT_FILE_PATHS",
+ validator=validate_input_file_dummy,
+ ),
+ ],
+ output_slots=[OutputSlot("step_1_main_output")],
+ nodes=[
+ Step(
+ step_name="step_1a",
+ input_slots=[
+ InputSlot(
+ name="step_1a_main_input",
+ env_var="DUMMY_CONTAINER_MAIN_INPUT_FILE_PATHS",
+ validator=validate_input_file_dummy,
+ ),
+ ],
+ output_slots=[OutputSlot("step_1a_main_output")],
+ default_implementation="step_1a_python_pandas",
+ ),
+ Step(
+ step_name="step_1b",
+ input_slots=[
+ InputSlot(
+ name="step_1b_main_input",
+ env_var="DUMMY_CONTAINER_MAIN_INPUT_FILE_PATHS",
+ validator=validate_input_file_dummy,
+ ),
+ ],
+ output_slots=[OutputSlot("step_1b_main_output")],
+ default_implementation="step_1b_python_pandas",
+ ),
+ ],
+ edges=[
+ EdgeParams(
+ source_node="step_1a",
+ target_node="step_1b",
+ output_slot="step_1a_main_output",
+ input_slot="step_1b_main_input",
+ ),
+ ],
+ input_slot_mappings=[
+ InputSlotMapping(
+ parent_slot="step_1_main_input",
+ child_node="step_1a",
+ child_slot="step_1a_main_input",
+ ),
+ ],
+ output_slot_mappings=[
+ OutputSlotMapping(
+ parent_slot="step_1_main_output",
+ child_node="step_1b",
+ child_slot="step_1b_main_output",
+ ),
+ ],
+ default_implementation="step_1_python_pandas",
+ ),
+ Step(
+ step_name="step_2",
+ input_slots=[
+ InputSlot(
+ name="step_2_main_input",
+ env_var="DUMMY_CONTAINER_MAIN_INPUT_FILE_PATHS",
+ validator=validate_input_file_dummy,
+ )
+ ],
+ output_slots=[OutputSlot("step_2_main_output")],
+ default_implementation="step_2_python_pandas",
+ ),
+ LoopStep(
+ template_step=Step(
+ step_name="step_3",
+ input_slots=[
+ InputSlot(
+ name="step_3_main_input",
+ env_var="DUMMY_CONTAINER_MAIN_INPUT_FILE_PATHS",
+ validator=validate_input_file_dummy,
+ )
+ ],
+ output_slots=[OutputSlot("step_3_main_output")],
+ ),
+ self_edges=[
+ EdgeParams(
+ source_node="step_3",
+ target_node="step_3",
+ output_slot="step_3_main_output",
+ input_slot="step_3_main_input",
+ ),
+ ],
+ default_implementation="step_3_python_pandas",
+ ),
+ CloneableStep(
+ template_step=Step(
+ step_name="step_4",
+ input_slots=[
+ InputSlot(
+ name="step_4_main_input",
+ env_var="DUMMY_CONTAINER_MAIN_INPUT_FILE_PATHS",
+ validator=validate_input_file_dummy,
+ ),
+ ],
+ output_slots=[
+ OutputSlot(
+ name="step_4_main_output",
+ ),
+ ],
+ ),
+ default_implementation="step_4_python_pandas",
+ ),
+ OutputStep(
+ input_slots=[
+ InputSlot(name="result", env_var=None, validator=validate_input_file_dummy)
+ ],
+ ),
+ ]
+ SCHEMA_PARAMS_DEFAULT_IMPLEMENTATIONS = (NODES_DEFAULT_IMPLEMENTATIONS, EDGES_TWO_STEPS)
easylink/rule.py CHANGED
@@ -111,21 +111,20 @@ class ImplementedRule(Rule):
  """Command to execute."""
  requires_spark: bool
  """Whether or not this ``Implementation`` requires a Spark environment."""
- is_embarrassingly_parallel: bool = False
- """Whether or not this ``Implementation`` is to be run in an embarrassingly
- parallel way."""
+ is_auto_parallel: bool = False
+ """Whether or not this ``Implementation`` is to be automatically run in parallel."""

  def build_rule(self) -> str:
  """Builds the Snakemake rule for this ``Implementation``."""
- if self.is_embarrassingly_parallel and len(self.output) > 1:
+ if self.is_auto_parallel and len(self.output) > 1:
  raise NotImplementedError(
- "Multiple output slots/files of EmbarrassinglyParallelSteps not yet supported"
+ "Multiple output slots/files of AutoParallelSteps not yet supported"
  )
  return self._build_io() + self._build_resources() + self._build_shell_cmd()

  def _build_io(self) -> str:
  """Builds the input/output portion of the rule."""
- log_path_chunk_adder = "-{chunk}" if self.is_embarrassingly_parallel else ""
+ log_path_chunk_adder = "-{chunk}" if self.is_auto_parallel else ""
  # Handle output files vs directories
  files = [path for path in self.output if Path(path).suffix != ""]
  if len(files) == len(self.output):
@@ -260,7 +259,7 @@ rule:
  class CheckpointRule(Rule):
  """A :class:`Rule` that defines a checkpoint.

- When running an :class:`~easylink.implementation.Implementation` in an embarrassingly
+ When running an :class:`~easylink.implementation.Implementation` in an auto-
  parallel way, we do not know until runtime how many parallel jobs there will
  be (e.g. we don't know beforehand how many chunks a large incoming dataset will
  be split into since the incoming dataset isn't created until runtime). The
@@ -326,7 +325,7 @@ checkpoint:
  class AggregationRule(Rule):
  """A :class:`Rule` that aggregates the processed chunks of output data.

- When running an :class:`~easylink.implementation.Implementation` in an embarrassingly
+ When running an :class:`~easylink.implementation.Implementation` in an auto-
  parallel way, we need to aggregate the output files from each parallel job
  into a single output file.
  """
@@ -347,10 +346,10 @@ class AggregationRule(Rule):
  def build_rule(self) -> str:
  """Builds the Snakemake rule for this aggregator.

- When running an :class:`~easylink.step.EmbarrassinglyParallelStep`, we need
+ When running an :class:`~easylink.step.AutoParallelStep`, we need
  to aggregate the output files from each parallel job into a single output file.
  This rule relies on a dynamically generated aggregation function which returns
- all of the **processed** chunks (from running the ``EmbarrassinglyParallelStep's``
+ all of the **processed** chunks (from running the ``AutoParallelStep's``
  container in parallel) and uses them as inputs to the actual aggregation
  rule.
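A minimal sketch of the dynamically generated aggregation input described above (checkpoint and file names are assumed). In the generated Snakefile, `checkpoints` is Snakemake's global registry, and `.get()` blocks until the splitter checkpoint has produced the chunk directory, whose contents are unknown before runtime:

```python
import glob

def aggregate_input(wildcards):
    # resolve the checkpoint's output directory, only known after it has run
    chunk_dir = checkpoints.step_3_split.get(**wildcards).output[0]
    # return every processed chunk as input to the aggregation rule
    return sorted(glob.glob(f"{chunk_dir}/*/result.parquet"))
```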
easylink/step.py CHANGED
@@ -71,8 +71,8 @@ class Step:
  The :class:`InputSlotMapping<easylink.graph_components.InputSlotMapping>` of this ``Step``.
  output_slot_mappings
  The :class:`OutputSlotMapping<easylink.graph_components.OutputSlotMapping>` of this ``Step``.
- is_embarrassingly_parallel
- Whether or not this ``Step`` is to be run in an embarrassingly parallel manner.
+ is_auto_parallel
+ Whether or not this ``Step`` is to be automatically run in parallel.

  Notes
  -----
@@ -91,7 +91,8 @@ class Step:
  output_slots: Iterable[OutputSlot] = (),
  input_slot_mappings: Iterable[InputSlotMapping] = (),
  output_slot_mappings: Iterable[OutputSlotMapping] = (),
- is_embarrassingly_parallel: bool = False,
+ is_auto_parallel: bool = False,
+ default_implementation: str | None = None,
  ) -> None:
  if not step_name and not name:
  raise ValueError("All Steps must contain a step_name, name, or both.")
@@ -125,8 +126,11 @@ class Step:
  }
  """A combined dictionary containing both the ``InputSlotMappings`` and
  ``OutputSlotMappings`` of this ``Step``."""
- self.is_embarrassingly_parallel = is_embarrassingly_parallel
- """Whether or not this ``Step`` is to be run in an embarrassingly parallel manner."""
+ self.is_auto_parallel = is_auto_parallel
+ """Whether or not this ``Step`` is to be automatically run in parallel."""
+ self.default_implementation = default_implementation
+ """The default implementation to use for this ``Step`` if the ``Step`` is
+ not explicitly configured in the pipeline specification."""
  self.parent_step = None
  """This ``Step's`` parent ``Step``, if applicable."""
  self._configuration_state = None
@@ -580,6 +584,7 @@ class HierarchicalStep(Step):
  input_slot_mappings=(),
  output_slot_mappings=(),
  directly_implemented=True,
+ default_implementation: str | None = None,
  ):
  super().__init__(
  step_name,
@@ -588,6 +593,7 @@ class HierarchicalStep(Step):
  output_slots,
  input_slot_mappings,
  output_slot_mappings,
+ default_implementation=default_implementation,
  )
  self.nodes = nodes
  """All sub-nodes (i.e. sub-``Steps``) that make up this ``HierarchicalStep``."""
@@ -722,13 +728,19 @@ class HierarchicalStep(Step):
  step = self.step_graph.nodes[node]["step"]
  if isinstance(step, IOStep):
  continue
+ if step.name not in step_config:
+ default_implementation = self.step_graph.nodes[step.name][
+ "step"
+ ].default_implementation
+ step_errors = (
+ {f"step {step.name}": ["The step is not configured."]}
+ if not default_implementation
+ else {}
+ )
  else:
- if step.name not in step_config:
- step_errors = {f"step {step.name}": ["The step is not configured."]}
- else:
- step_errors = step.validate_step(
- step_config[step.name], combined_implementations, input_data_config
- )
+ step_errors = step.validate_step(
+ step_config[step.name], combined_implementations, input_data_config
+ )
  if step_errors:
  errors.update(step_errors)
  extra_steps = set(step_config.keys()) - set(self.step_graph.nodes)
@@ -816,7 +828,7 @@ class TemplatedStep(Step, ABC):

  A ``TemplatedStep`` is used to represent a ``Step`` that contains a specified
  amount of multiplicity, such as one that is looped or run in parallel; it is
- inherited by concrete :class:`LoopStep` and :class:`ParallelStep` instances.
+ inherited by concrete :class:`LoopStep` and :class:`CloneableStep` instances.

  See :class:`Step` for inherited attributes.

@@ -830,12 +842,14 @@ class TemplatedStep(Step, ABC):
  def __init__(
  self,
  template_step: Step,
+ default_implementation: str | None = None,
  ) -> None:
  super().__init__(
  template_step.step_name,
  template_step.name,
  template_step.input_slots.values(),
  template_step.output_slots.values(),
+ default_implementation=default_implementation,
  )
  self.step_graph = None
  """The :class:`~easylink.graph_components.StepGraph` i.e. the directed acyclic
@@ -1110,8 +1124,9 @@ class LoopStep(TemplatedStep):
  self,
  template_step: Step | None = None,
  self_edges: Iterable[EdgeParams] = (),
+ default_implementation: str | None = None,
  ) -> None:
- super().__init__(template_step)
+ super().__init__(template_step, default_implementation)
  self.self_edges = self_edges
  """:class:`~easylink.graph_components.EdgeParams` that represent self-edges,
  i.e. edges that connect the output of one loop to the input of the next."""
@@ -1206,7 +1221,7 @@ class LoopStep(TemplatedStep):
  return {"input": input_mappings, "output": output_mappings}


- class ParallelStep(TemplatedStep):
+ class CloneableStep(TemplatedStep):
  """A type of :class:`TemplatedStep` that creates multiple copies in parallel
  with no dependencies between them.

@@ -1216,13 +1231,13 @@ class ParallelStep(TemplatedStep):

  @property
  def config_key(self):
- """The pipeline specification key required for a ``ParallelStep``."""
- return "parallel"
+ """The pipeline specification key required for a ``CloneableStep``."""
+ return "clones"

  @property
  def node_prefix(self):
- """The prefix to be used in the ``ParallelStep`` node name."""
- return "parallel_split"
+ """The prefix to be used in the ``CloneableStep`` node name."""
+ return "clone"

  def _update_step_graph(self, num_repeats: int) -> StepGraph:
  """Updates the :class:`~easylink.graph_components.StepGraph` to include parallelization.
@@ -1276,10 +1291,10 @@ class ParallelStep(TemplatedStep):
  return {"input": input_mappings, "output": output_mappings}


- class EmbarrassinglyParallelStep(Step):
+ class AutoParallelStep(Step):
  """A :class:`Step` that is run in parallel on the backend.

- An ``EmbarrassinglyParallelStep`` is different than a :class:`ParallelStep`
+ An ``AutoParallelStep`` is different from a :class:`CloneableStep`
  in that it is not configured by the user to be run in parallel - it completely
  happens on the back end for performance reasons.

@@ -1288,8 +1303,8 @@ class EmbarrassinglyParallelStep(Step):
  Parameters
  ----------
  step
- The ``Step`` to be run in an embarrassingly parallel manner. To run multiple
- steps in parallel, use a :class:`HierarchicalStep`.
+ The ``Step`` to be automatically run in parallel. To run multiple steps in
+ parallel, use a :class:`HierarchicalStep`.
  slot_splitter_mapping
  A mapping of the :class:`~easylink.graph_components.InputSlot` name to split
  to the actual splitter function to be used.
@@ -1308,7 +1323,7 @@ class EmbarrassinglyParallelStep(Step):
  super().__init__(
  step_name=None,
  name=step.name,
- is_embarrassingly_parallel=True,
+ is_auto_parallel=True,
  )
  self.slot_splitter_mapping = slot_splitter_mapping
  """A mapping of the :class:`~easylink.graph_components.InputSlot` name to split
@@ -1328,14 +1343,14 @@ class EmbarrassinglyParallelStep(Step):

  @Step.name.setter
  def name(self, value: str) -> None:
- """Changes the name of the ``EmbarrassinglyParallelStep`` and the underlying :class:`Step` to the given value."""
+ """Changes the name of the ``AutoParallelStep`` and the underlying :class:`Step` to the given value."""
  self._name = value
  self.step._name = value

  def _validate(self) -> None:
- """Validates the ``EmbarrassinglyParallelStep``.
+ """Validates the ``AutoParallelStep``.

- ``EmbarrassinglyParallelSteps`` are not configured by the user to be run
+ ``AutoParallelSteps`` are not configured by the user to be run
  in parallel. Since it happens on the back end, we need to do somewhat unique
  validations during construction. Specifically,
  - one and only one :class:`~easylink.graph_components.InputSlot` *must*
@@ -1348,17 +1363,17 @@ class EmbarrassinglyParallelStep(Step):
  # check that only one input slot has a splitter assigned
  if len(self.slot_splitter_mapping) != 1:
  errors.append(
- f"EmbarrassinglyParallelStep '{self.step_name}' is attempting to define "
+ f"AutoParallelStep '{self.step_name}' is attempting to define "
  f"{len(self.slot_splitter_mapping)} splitters when only one should be defined."
  )
  if len(self.slot_splitter_mapping) == 0:
  errors.append(
- f"EmbarrassinglyParallelStep '{self.step_name}' does not have any input slots with a "
+ f"AutoParallelStep '{self.step_name}' does not have any input slots with a "
  "splitter method assigned; one and only one input slot must have a splitter."
  )
  if len(self.slot_splitter_mapping) > 1:
  errors.append(
- f"EmbarrassinglyParallelStep '{self.step_name}' has multiple input slots with "
+ f"AutoParallelStep '{self.step_name}' has multiple input slots with "
  "splitter methods assigned; one and only one input slot must have a splitter.\n"
  f"Input slots with splitters: {list(self.slot_splitter_mapping)}"
  )
@@ -1371,7 +1386,7 @@ class EmbarrassinglyParallelStep(Step):
  ]
  if len(missing_aggregators) != 0:
  errors.append(
- f"EmbarrassinglyParallelStep '{self.step_name}' has output slots without "
+ f"AutoParallelStep '{self.step_name}' has output slots without "
  f"aggregator methods assigned: {missing_aggregators}"
  )
  if errors:
@@ -1451,7 +1466,7 @@ class EmbarrassinglyParallelStep(Step):
  aggregator_node_name = f"{self.name}_aggregate"
  if len(self.output_slots) > 1:
  raise NotImplementedError(
- "FIXME [MIC-5883] Multiple output slots/files of EmbarrassinglyParallelSteps not yet supported"
+ "FIXME [MIC-5883] Multiple output slots/files of AutoParallelSteps not yet supported"
  )
  output_slot = list(self.output_slots.values())[0]
  aggregator_step = AggregatorStep(
@@ -1464,7 +1479,7 @@ class EmbarrassinglyParallelStep(Step):
  self._update_slot_mappings(splitter_step, aggregator_step)
  # Add the key back to the expanded config
  expanded_config = LayeredConfigTree({self.step.name: step_config})
- # EmbarrassinglyParallelSteps are by definition non-leaf steps
+ # AutoParallelSteps are by definition non-leaf steps
  self._configuration_state = NonLeafConfigurationState(
  self, expanded_config, combined_implementations, input_data_config
  )
@@ -1513,7 +1528,7 @@ class EmbarrassinglyParallelStep(Step):
  # Add the Step -> AggregatorStep edge
  if len(self.step.output_slots) > 1:
  raise NotImplementedError(
- "EmbarrassinglyParallelStep does not support multiple output slots."
+ "AutoParallelStep does not support multiple output slots."
  )
  self.step_graph.add_edge_from_params(
  EdgeParams(
@@ -1562,7 +1577,7 @@ class SplitterStep(StandaloneStep):
  """A :class:`StandaloneStep` that splits an :class:`~easylink.graph_components.InputSlot` for parallel processing.

  A ``SplitterStep`` is intended to be used in conjunction with a corresponding
- :class:`AggregatorStep` and only during construction of an :class:`EmbarrassinglyParallelStep`.
+ :class:`AggregatorStep` and only during construction of an :class:`AutoParallelStep`.

  See :class:`Step` for inherited attributes.

@@ -1613,7 +1628,7 @@ class AggregatorStep(StandaloneStep):
  """A :class:`StandaloneStep` that aggregates :class:`OutputSlots<easylink.graph_components.Outputslot>` after parallel processing.

  An ``AggregatorStep`` is intended to be used in conjunction with a corresponding
- :class:`SplitterStep` and only during construction of an :class:`EmbarrassinglyParallelStep`.
+ :class:`SplitterStep` and only during construction of an :class:`AutoParallelStep`.

  See :class:`Step` for inherited attributes.

@@ -1918,10 +1933,9 @@ class LeafConfigurationState(ConfigurationState):
  """
  step = self._step
  if self.is_combined:
- if step.is_embarrassingly_parallel:
+ if step.is_auto_parallel:
  raise NotImplementedError(
- "Combining implementations with embarrassingly parallel steps "
- "is not supported."
+ "Combining implementations with auto-parallel steps is not supported."
  )
  implementation = PartialImplementation(
  combined_name=self.step_config[COMBINED_IMPLEMENTATION_KEY],
@@ -1935,7 +1949,7 @@ class LeafConfigurationState(ConfigurationState):
  implementation_config=self.implementation_config,
  input_slots=step.input_slots.values(),
  output_slots=step.output_slots.values(),
- is_embarrassingly_parallel=step.is_embarrassingly_parallel,
+ is_auto_parallel=step.is_auto_parallel,
  )
  implementation_graph.add_node_from_implementation(
  step.implementation_node_name,
@@ -1985,7 +1999,7 @@ class LeafConfigurationState(ConfigurationState):
  if mapping.parent_slot == edge.input_slot
  ]
  for mapping in mappings:
- # FIXME [MIC-5771]: Fix ParallelSteps
+ # FIXME [MIC-5771]: Fix CloneableSteps
  if (
  "input_data_file" in self.step_config
  and edge.source_node == "pipeline_graph_input_data"
@@ -2070,8 +2084,8 @@ class NonLeafConfigurationState(ConfigurationState):
  """
  for node in self._step.step_graph.nodes:
  substep = self._step.step_graph.nodes[node]["step"]
- if self._step.is_embarrassingly_parallel:
- substep.is_embarrassingly_parallel = True
+ if self._step.is_auto_parallel:
+ substep.is_auto_parallel = True
  substep.add_nodes_to_implementation_graph(implementation_graph)

  def add_edges_to_implementation_graph(
@@ -2182,15 +2196,32 @@ class NonLeafConfigurationState(ConfigurationState):

  This method recursively traverses the ``StepGraph`` and sets the configuration
  state for each ``Step`` until reaching all leaf nodes.
+
+ Notes
+ -----
+ If a ``Step`` name is missing from the ``step_config``, we know that it
+ must have a default implementation because we already validated that one
+ exists during :meth:`HierarchicalStep._validate_step_graph`. In that case,
+ we manually instantiate and use a ``step_config`` with the default implementation.
  """
  for sub_node in self._step.step_graph.nodes:
  sub_step = self._step.step_graph.nodes[sub_node]["step"]
- # IOSteps, SplitterSteps, and AggregatorSteps never appear explicitly in the configuration
- step_config = (
- self.step_config
- if isinstance(sub_step, (IOStep, SplitterStep, AggregatorStep))
- else self.step_config[sub_step.name]
- )
+ try:
+ step_config = (
+ self.step_config
+ if isinstance(sub_step, StandaloneStep)
+ else self.step_config[sub_step.name]
+ )
+ except KeyError:
+ # We know that any missing keys must have a default implementation
+ # (because we have already checked that it exists during validation)
+ step_config = LayeredConfigTree(
+ {
+ "implementation": {
+ "name": sub_step.default_implementation,
+ }
+ }
+ )
  sub_step.set_configuration_state(
  step_config, self.combined_implementations, self.input_data_config
  )
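Putting the pieces together, a hedged illustration (step names borrowed from the testing schema; the spec is shown as the equivalent Python mapping): a step omitted from the pipeline specification now validates and falls back to its declared default.

```python
pipeline_spec = {
    "step_1": {"implementation": {"name": "step_1_python_pandas"}},
    # "step_2" omitted entirely: validation passes because the schema's step_2
    # declares default_implementation="step_2_python_pandas", and configuration
    # synthesizes {"implementation": {"name": "step_2_python_pandas"}} for it.
    "step_3": {"implementation": {"name": "step_3_python_pandas"}},
}
```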
easylink/utilities/aggregator_utils.py CHANGED
@@ -4,8 +4,8 @@ Data Aggregating Utilities
  ==========================

  This module contains utility functions for aggregating datasets. One primary use
- case for this is combine the results of running sections of the pipeline in an
- embarrassingly parallel manner.
+ case for this is to combine the results of sections that were automatically run
+ in parallel.

  Note that it is critical that all data aggregating utility functions are defined
  in this module; easylink will not be able to find them otherwise.
easylink/utilities/splitter_utils.py CHANGED
@@ -4,7 +4,7 @@ Data Splitting Utilities
  ========================

  This module contains utility functions for splitting datasets into smaller datasets.
- One primary use case for this is to run sections of the pipeline in an embarrassingly
+ One primary use case for this is to run sections of the pipeline in an auto-
  parallel manner.

  Note that it is critical that all data splitting utility functions are defined
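A hedged sketch of the paired utilities these two modules describe (shapes assumed; not the library's actual implementations): a splitter chunks a dataset so its pieces can be processed in parallel, and an aggregator such as `concatenate_datasets` recombines the processed chunks.

```python
import pandas as pd

def split_into_chunks(df: pd.DataFrame, num_chunks: int) -> list[pd.DataFrame]:
    # splitter: divide rows into roughly equal chunks for auto-parallel runs
    return [df.iloc[i::num_chunks] for i in range(num_chunks)]

def concatenate_datasets(dfs: list[pd.DataFrame]) -> pd.DataFrame:
    # aggregator: recombine processed chunks into a single dataset
    return pd.concat(dfs, ignore_index=True)
```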
easylink-{0.1.20 → 0.1.22}.dist-info/METADATA RENAMED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: easylink
- Version: 0.1.20
+ Version: 0.1.22
  Summary: Research repository for the EasyLink ER ecosystem project.
  Home-page: https://github.com/ihmeuw/easylink
  Author: The EasyLink developers
easylink-{0.1.20 → 0.1.22}.dist-info/RECORD RENAMED
@@ -1,24 +1,22 @@
  easylink/__about__.py,sha256=2-oxCfu9t9yUJouLDwqYRZ0eii8kN25SxRzsawjWjho,440
  easylink/__init__.py,sha256=gGMcIVfiVnHtlDw5mZwhevcDb2wt-kuP6F64gnkFack,159
- easylink/_version.py,sha256=8XalsVoLEfXslFvdtUEmkNOuYShzOzYOcFbgmOz1oSk,23
+ easylink/_version.py,sha256=zmP2TRnzKPjZJ1eiBcT-cRInsji6FW-OVD3FafQFCc4,23
  easylink/cli.py,sha256=zQO4lOVoZ3eVgPVWT2sCF4zNoKgiDJP9ReTh2Myr9jc,10307
- easylink/configuration.py,sha256=hgmG5SIbYqnHDHfk44Gr3QX7C3yTaEVW6GuKeMqvu6c,12689
+ easylink/configuration.py,sha256=TPd3WbqUcJMJDPJuHeo5ZebvZPQrRyfm8-laad2sOFk,12681
  easylink/graph_components.py,sha256=zZDZXg5smReHO3ryQC4pao24wyKXzWDe6jS3C6fM2ak,13892
- easylink/implementation.py,sha256=H46WjW9O3csaVAU7qLto3aOu1bSfVOBS0ZySBBX05o0,14544
+ easylink/implementation.py,sha256=lSF37g-aQYgdLI0lgFaDrBkp23te9O9B1V-CmmRtB-0,14514
  easylink/implementation_metadata.yaml,sha256=GoU_aWjVryG8-xjUHkC2nCUeznmYD0BwfJYnNrpZ8P4,10670
- easylink/pipeline.py,sha256=LC0mwboLfe84Mbju9manJjN00Kup4jauiugLlgGCz6I,17884
- easylink/pipeline_graph.py,sha256=9ysX4wAkA-WkUoo15jSLAErncybE4tJwznVx7N_kwIA,23922
- easylink/pipeline_schema.py,sha256=FieJBa3rKgaCIB9QDuQEfWJ9joNBUUp6iHT6xmns-Vk,6886
- easylink/rule.py,sha256=NusEUtBxx18L7UCcgDi3KKooFxSUgyS4eisVM5aPqFE,16770
+ easylink/pipeline.py,sha256=NJyMInbOCjJ_5kRzzuK5AcupvC7ecd5qLOC-f1Gy3Ss,17701
+ easylink/pipeline_graph.py,sha256=jtjS7_2IVa189U8cL621600erC_0pa6BKPRRO8VqwiU,23760
+ easylink/pipeline_schema.py,sha256=sj6YQqMFUS35RRFWdlrhPlud68d3Rb5y7iKfAEZy1qs,6877
+ easylink/rule.py,sha256=QJPmrvQUZPnqGFD9UmMK8imdJ7VODzGlUOSnpJhb9AU,16677
  easylink/runner.py,sha256=Z9GKgiUAWtp0dW1cyAp86MGthIDeABJtHDXUtzv0-kE,6714
- easylink/step.py,sha256=NGy1KNqM4eXP7kP0kdfcfyGc4K_ExSCSidCdW3h0Qg8,89902
- easylink/devtools/implementation_creator.py,sha256=RkwnI1T0aEquRPgGjPOGtJo_87tjoKvDAElRcf6Vqqk,19140
- easylink/images/spark_cluster/Dockerfile,sha256=3PHotbR4jdjVYRHOJ0VQW55b5Qd4tQ1pLLQMrTKWVA0,576
- easylink/images/spark_cluster/README.md,sha256=KdgSttZRplNNWqHn4K1GTsTIab3dTOSG4V99QPLxSp8,569
- easylink/pipeline_schema_constants/__init__.py,sha256=xYymSjTeH3prvQL_rgGFVrriohANFtW_cy0vDwlF3ds,1355
- easylink/pipeline_schema_constants/development.py,sha256=XxcYYZDZM4IADp3eFPQCchD6-OtMp99GiyZBfSswzFo,12640
- easylink/pipeline_schema_constants/main.py,sha256=9IxAjgQej7AaV-zYZEFhG8U-v_rYBFaPuNS3Y3m4Sho,22929
- easylink/pipeline_schema_constants/testing.py,sha256=UDmVVjI1SiDktMbJ2CrSb7amHSYNwhgqNkXhl4lYxQw,20459
+ easylink/step.py,sha256=zQAoz4HlSVvgS7iMlfmCrXluOtPQxbSgPZOeyZwjdpo,91085
+ easylink/devtools/implementation_creator.py,sha256=gZZpfpiOOh912nNjR_d5wR0mBO5c09jWMS0FSYRcE1o,19120
+ easylink/pipeline_schema_constants/__init__.py,sha256=njUL2v67WFC3dW_FjOXWRLSeOhInox_ZINLEji0_7V8,1523
+ easylink/pipeline_schema_constants/development.py,sha256=0x6lWqBmP1K9AebEmeZ4veSnLBcdQcZXzbV6lCU11bc,12622
+ easylink/pipeline_schema_constants/main.py,sha256=kcAhdbK_BhS79LUMhKkn-Uttl2vA-MHVX4M1StTloJQ,22934
+ easylink/pipeline_schema_constants/testing.py,sha256=ZFD19CpcidZPVUYBvh8LAa5sZEERT2yfoFa-3xmskFs,24595
  easylink/steps/cascading/exclude_clustered.def,sha256=GfoDqO2Vtsh7VI8SwGaJtv_KtKjs-UmBcivqQ7OPkjk,503
  easylink/steps/cascading/exclude_clustered.py,sha256=NSA6GZBzGa7e6CH4tacCGfr0Y9sUM29g9Nf8NquHB44,2612
  easylink/steps/cascading/exclude_none.def,sha256=iFUhUMocxtkA0NErkjVrBxY0MUdS3DIPNsbCpTJRP0k,488
@@ -76,16 +74,16 @@ easylink/steps/splink/splink_evaluating_pairs.py,sha256=JR2qVgb14cNZKozDyOrN11nr
  easylink/steps/splink/splink_links_to_clusters.def,sha256=RurvOYyGjNs9tx64DTXwI-GSgHD4T7SzDfhAH18pTEM,524
  easylink/steps/splink/splink_links_to_clusters.py,sha256=z5ymdYl9ytp1e5MA6vn8wpGRFWVuhh23LqGq8NJJxZQ,1936
  easylink/utilities/__init__.py,sha256=0U33kbv4hoMfFQ_lh5hLwifxRPzOgkLkjKLYxmaK10g,196
- easylink/utilities/aggregator_utils.py,sha256=pqBog6kEX4MXBBMjQtHFlE5gEMqRWb5VFl64u0Lr__g,972
+ easylink/utilities/aggregator_utils.py,sha256=_DAHRAf9janbDsuj_jnAn5Dzz2s4R5Ni3YeytDpN9UE,954
  easylink/utilities/data_utils.py,sha256=XPRjq3qW_fN0xQ23Jms_xBzpTHbRwqZWDP1AW0nYkP0,6926
  easylink/utilities/general_utils.py,sha256=El1W0nn4P27sRBGotNQb-9du-Gbhk9ggSuu4vmGDfwo,4591
  easylink/utilities/paths.py,sha256=9inDKMPorAaWh5vhjc3j1Tj_aXVKhLdodiJO9H1nNes,947
  easylink/utilities/spark.smk,sha256=kGtpem7LfQc71tMh5WAYaqKnHQKFvcdhPQSdumOP70k,5799
- easylink/utilities/splitter_utils.py,sha256=UOz4hjkEPqaAz0RrDkDYYej79lLSaq0VVVSH_tF1z0o,3838
+ easylink/utilities/splitter_utils.py,sha256=KXiVUYJ9TGxCQmrwos18pB1sxG_0Ay67qoDJT6vem2o,3828
  easylink/utilities/validation_utils.py,sha256=DBJB2TLVklgYw1WaaPse9vqtwPLMGmZNYM2cbCZsoHI,18417
- easylink-0.1.20.dist-info/licenses/LICENSE,sha256=z6NBo3hY2Pbh2Wjm9IcHLyQoG-kd816uASh8PbwhTcQ,1530
- easylink-0.1.20.dist-info/METADATA,sha256=aGNai6P-z5BQcQ0XYFTBr9JmuZAFTpZJYouFRlTJCzk,3565
- easylink-0.1.20.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
- easylink-0.1.20.dist-info/entry_points.txt,sha256=OGMZDFltg3yMboT7XjJt3joiPhRfV_7jnREVtrAIQNU,51
- easylink-0.1.20.dist-info/top_level.txt,sha256=oHcOpcF_jDMWFiJRzfGQvuskENGDjSPC_Agu9Z_Xvik,9
- easylink-0.1.20.dist-info/RECORD,,
+ easylink-0.1.22.dist-info/licenses/LICENSE,sha256=z6NBo3hY2Pbh2Wjm9IcHLyQoG-kd816uASh8PbwhTcQ,1530
+ easylink-0.1.22.dist-info/METADATA,sha256=hei9KKa0HUgy1Z4aU-nPEAs8KF2_TEe7J0-_esdCG40,3565
+ easylink-0.1.22.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ easylink-0.1.22.dist-info/entry_points.txt,sha256=OGMZDFltg3yMboT7XjJt3joiPhRfV_7jnREVtrAIQNU,51
+ easylink-0.1.22.dist-info/top_level.txt,sha256=oHcOpcF_jDMWFiJRzfGQvuskENGDjSPC_Agu9Z_Xvik,9
+ easylink-0.1.22.dist-info/RECORD,,
easylink/images/spark_cluster/Dockerfile DELETED
@@ -1,16 +0,0 @@
- # Stage 1: Start with the miniconda3 base image
- FROM continuumio/miniconda3 as conda-base
-
- # Create a new conda environment
- SHELL ["/bin/bash", "--login", "-c"]
- RUN conda init bash \
- && . ~/.bashrc \
- && conda create -n spark_cluster python=3.10
-
- # Stage 2: Start with the Apache Spark base image
- FROM apache/spark@sha256:a1dd2487a97fb5e35c5a5b409e830b501a92919029c62f9a559b13c4f5c50f63 as spark-base
-
- COPY --from=conda-base /opt/conda /opt/conda
-
- # Set PATH for conda environment and conda itself
- ENV PATH=/opt/conda/envs/spark_cluster/bin:/opt/conda/condabin:${PATH}
easylink/images/spark_cluster/README.md DELETED
@@ -1,15 +0,0 @@
- # spark_cluster container
- NOTE: Spinning up a spark cluster using `easylink` currently requires building an image from this directory.
-
- This is done by running the following commands from this directory:
-
- ```
- # build the image
- $ sudo docker build -t easylink:sparkbuilder .
- # save as compressed tarball
- $ sudo docker save easylink:sparkbuilder | gzip > spark_cluster.tar.gz
- # remove the image
- $ sudo docker rmi easylink:sparkbuilder
- # convert the image from the docker image
- $ singularity build --force spark_cluster.sif docker-archive://$(pwd)/spark_cluster.tar.gz
- ```