easylink-0.1.21-py3-none-any.whl → easylink-0.1.22-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
easylink/_version.py CHANGED
@@ -1 +1 @@
- __version__ = "0.1.21"
+ __version__ = "0.1.22"
easylink/configuration.py CHANGED
@@ -184,7 +184,9 @@ class Config(LayeredConfigTree):
  #################

  def _get_schema(self, schema_name: str = "main") -> PipelineSchema:
- """Returns the first :class:`~easylink.pipeline_schema.PipelineSchema` that validates the requested pipeline.
+ """Gets the requested :class:`~easylink.pipeline_schema.PipelineSchema`.
+
+ The schema is only returned if it validates the pipeline configuration.

  Parameters
  ----------
@@ -205,11 +207,10 @@ class Config(LayeredConfigTree):
  Notes
  -----
  This acts as the pipeline configuration file's validation method since
- we can only find a matching ``PipelineSchema`` if that file is valid.
+ we can only validate the ``PipelineSchema`` if that file is valid.

  """
  errors = defaultdict(dict)
- # Try each schema until one is validated
  schema = PipelineSchema.get_schema(schema_name)
  logs = schema.validate_step(self.pipeline, self.input_data)
  if logs:
easylink/pipeline_schema.py CHANGED
@@ -159,10 +159,10 @@ class PipelineSchema(HierarchicalStep):
  )

  @classmethod
- def get_schema(cls, name: str = "main") -> list["PipelineSchema"]:
- """Gets all allowable ``PipelineSchemas``.
+ def get_schema(cls, name: str = "main") -> "PipelineSchema":
+ """Gets the requested ``PipelineSchema``.

- These ``PipelineSchemas`` represent the fully supported pipelines and are
+ This ``PipelineSchema`` represents the fully supported pipelines and is
  used to validate the user-requested pipeline.

  Parameters
easylink/pipeline_schema_constants/__init__.py CHANGED
@@ -27,4 +27,5 @@ SCHEMA_PARAMS = {
  "auto_parallel_cloneable_step": testing.SCHEMA_PARAMS_AUTO_PARALLEL_CLONEABLE_STEP,
  "auto_parallel_loop_step": testing.SCHEMA_PARAMS_AUTO_PARALLEL_LOOP_STEP,
  "auto_parallel_hierarchical_step": testing.SCHEMA_PARAMS_AUTO_PARALLEL_HIERARCHICAL_STEP,
+ "default_implementations": testing.SCHEMA_PARAMS_DEFAULT_IMPLEMENTATIONS,
  }
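The new `"default_implementations"` key registers the test schema defined in `testing.py` below. A minimal usage sketch, assuming `PipelineSchema.get_schema` resolves schema names against `SCHEMA_PARAMS` (only the call signature is visible in this diff):

```python
# Illustrative sketch only; the name-to-params lookup is an assumption, not shown in this diff.
from easylink.pipeline_schema import PipelineSchema

# As of 0.1.22, get_schema returns a single PipelineSchema rather than a list.
schema = PipelineSchema.get_schema("default_implementations")
```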
easylink/pipeline_schema_constants/testing.py CHANGED
@@ -640,3 +640,126 @@ EDGES_OUTPUT_DIR = [
  ),
  ]
  SCHEMA_PARAMS_OUTPUT_DIR = (NODES_OUTPUT_DIR, EDGES_OUTPUT_DIR)
+
+
+ NODES_DEFAULT_IMPLEMENTATIONS = [
+ InputStep(),
+ HierarchicalStep(
+ step_name="step_1",
+ input_slots=[
+ InputSlot(
+ name="step_1_main_input",
+ env_var="DUMMY_CONTAINER_MAIN_INPUT_FILE_PATHS",
+ validator=validate_input_file_dummy,
+ ),
+ ],
+ output_slots=[OutputSlot("step_1_main_output")],
+ nodes=[
+ Step(
+ step_name="step_1a",
+ input_slots=[
+ InputSlot(
+ name="step_1a_main_input",
+ env_var="DUMMY_CONTAINER_MAIN_INPUT_FILE_PATHS",
+ validator=validate_input_file_dummy,
+ ),
+ ],
+ output_slots=[OutputSlot("step_1a_main_output")],
+ default_implementation="step_1a_python_pandas",
+ ),
+ Step(
+ step_name="step_1b",
+ input_slots=[
+ InputSlot(
+ name="step_1b_main_input",
+ env_var="DUMMY_CONTAINER_MAIN_INPUT_FILE_PATHS",
+ validator=validate_input_file_dummy,
+ ),
+ ],
+ output_slots=[OutputSlot("step_1b_main_output")],
+ default_implementation="step_1b_python_pandas",
+ ),
+ ],
+ edges=[
+ EdgeParams(
+ source_node="step_1a",
+ target_node="step_1b",
+ output_slot="step_1a_main_output",
+ input_slot="step_1b_main_input",
+ ),
+ ],
+ input_slot_mappings=[
+ InputSlotMapping(
+ parent_slot="step_1_main_input",
+ child_node="step_1a",
+ child_slot="step_1a_main_input",
+ ),
+ ],
+ output_slot_mappings=[
+ OutputSlotMapping(
+ parent_slot="step_1_main_output",
+ child_node="step_1b",
+ child_slot="step_1b_main_output",
+ ),
+ ],
+ default_implementation="step_1_python_pandas",
+ ),
+ Step(
+ step_name="step_2",
+ input_slots=[
+ InputSlot(
+ name="step_2_main_input",
+ env_var="DUMMY_CONTAINER_MAIN_INPUT_FILE_PATHS",
+ validator=validate_input_file_dummy,
+ )
+ ],
+ output_slots=[OutputSlot("step_2_main_output")],
+ default_implementation="step_2_python_pandas",
+ ),
+ LoopStep(
+ template_step=Step(
+ step_name="step_3",
+ input_slots=[
+ InputSlot(
+ name="step_3_main_input",
+ env_var="DUMMY_CONTAINER_MAIN_INPUT_FILE_PATHS",
+ validator=validate_input_file_dummy,
+ )
+ ],
+ output_slots=[OutputSlot("step_3_main_output")],
+ ),
+ self_edges=[
+ EdgeParams(
+ source_node="step_3",
+ target_node="step_3",
+ output_slot="step_3_main_output",
+ input_slot="step_3_main_input",
+ ),
+ ],
+ default_implementation="step_3_python_pandas",
+ ),
+ CloneableStep(
+ template_step=Step(
+ step_name="step_4",
+ input_slots=[
+ InputSlot(
+ name="step_4_main_input",
+ env_var="DUMMY_CONTAINER_MAIN_INPUT_FILE_PATHS",
+ validator=validate_input_file_dummy,
+ ),
+ ],
+ output_slots=[
+ OutputSlot(
+ name="step_4_main_output",
+ ),
+ ],
+ ),
+ default_implementation="step_4_python_pandas",
+ ),
+ OutputStep(
+ input_slots=[
+ InputSlot(name="result", env_var=None, validator=validate_input_file_dummy)
+ ],
+ ),
+ ]
+ SCHEMA_PARAMS_DEFAULT_IMPLEMENTATIONS = (NODES_DEFAULT_IMPLEMENTATIONS, EDGES_TWO_STEPS)
easylink/step.py CHANGED
@@ -92,6 +92,7 @@ class Step:
  input_slot_mappings: Iterable[InputSlotMapping] = (),
  output_slot_mappings: Iterable[OutputSlotMapping] = (),
  is_auto_parallel: bool = False,
+ default_implementation: str | None = None,
  ) -> None:
  if not step_name and not name:
  raise ValueError("All Steps must contain a step_name, name, or both.")
@@ -127,6 +128,9 @@ class Step:
  ``OutputSlotMappings`` of this ``Step``."""
  self.is_auto_parallel = is_auto_parallel
  """Whether or not this ``Step`` is to be automatically run in parallel."""
+ self.default_implementation = default_implementation
+ """The default implementation to use for this ``Step`` if the ``Step`` is
+ not explicitly configured in the pipeline specification."""
  self.parent_step = None
  """This ``Step's`` parent ``Step``, if applicable."""
  self._configuration_state = None
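To illustrate the new attribute, a hedged sketch of a pipeline specification that leaves a step unconfigured; the `steps`/`implementation` layout is an assumption inferred from how the configuration code later in this diff indexes `step_config`, not a verbatim EasyLink spec:

```python
# Hypothetical pipeline specification, expressed as the equivalent Python dict.
# "step_2" is omitted entirely; with default_implementation="step_2_python_pandas"
# set on its Step (as in the testing.py schema above), validation still passes
# and that implementation is used.
pipeline_spec = {
    "steps": {
        "step_1": {"implementation": {"name": "step_1_python_pandas"}},
        # "step_2" intentionally absent -- falls back to its default implementation
    }
}
```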
@@ -580,6 +584,7 @@ class HierarchicalStep(Step):
  input_slot_mappings=(),
  output_slot_mappings=(),
  directly_implemented=True,
+ default_implementation: str | None = None,
  ):
  super().__init__(
  step_name,
@@ -588,6 +593,7 @@ class HierarchicalStep(Step):
  output_slots,
  input_slot_mappings,
  output_slot_mappings,
+ default_implementation=default_implementation,
  )
  self.nodes = nodes
  """All sub-nodes (i.e. sub-``Steps``) that make up this ``HierarchicalStep``."""
@@ -722,13 +728,19 @@ class HierarchicalStep(Step):
  step = self.step_graph.nodes[node]["step"]
  if isinstance(step, IOStep):
  continue
+ if step.name not in step_config:
+ default_implementation = self.step_graph.nodes[step.name][
+ "step"
+ ].default_implementation
+ step_errors = (
+ {f"step {step.name}": ["The step is not configured."]}
+ if not default_implementation
+ else {}
+ )
  else:
- if step.name not in step_config:
- step_errors = {f"step {step.name}": ["The step is not configured."]}
- else:
- step_errors = step.validate_step(
- step_config[step.name], combined_implementations, input_data_config
- )
+ step_errors = step.validate_step(
+ step_config[step.name], combined_implementations, input_data_config
+ )
  if step_errors:
  errors.update(step_errors)
  extra_steps = set(step_config.keys()) - set(self.step_graph.nodes)
@@ -830,12 +842,14 @@ class TemplatedStep(Step, ABC):
  def __init__(
  self,
  template_step: Step,
+ default_implementation: str | None = None,
  ) -> None:
  super().__init__(
  template_step.step_name,
  template_step.name,
  template_step.input_slots.values(),
  template_step.output_slots.values(),
+ default_implementation=default_implementation,
  )
  self.step_graph = None
  """The :class:`~easylink.graph_components.StepGraph` i.e. the directed acyclic
@@ -1110,8 +1124,9 @@ class LoopStep(TemplatedStep):
  self,
  template_step: Step | None = None,
  self_edges: Iterable[EdgeParams] = (),
+ default_implementation: str | None = None,
  ) -> None:
- super().__init__(template_step)
+ super().__init__(template_step, default_implementation)
  self.self_edges = self_edges
  """:class:`~easylink.graph_components.EdgeParams` that represent self-edges,
  i.e. edges that connect the output of one loop to the input of the next."""
@@ -2181,15 +2196,32 @@ class NonLeafConfigurationState(ConfigurationState):

  This method recursively traverses the ``StepGraph`` and sets the configuration
  state for each ``Step`` until reaching all leaf nodes.
+
+ Notes
+ -----
+ If a ``Step`` name is missing from the ``step_config``, we know that it
+ must have a default implementation because we already validated that one
+ exists during :meth:`HierarchicalStep._validate_step_graph`. In that case,
+ we manually instantiate and use a ``step_config`` with the default implementation.
  """
  for sub_node in self._step.step_graph.nodes:
  sub_step = self._step.step_graph.nodes[sub_node]["step"]
- # IOSteps, SplitterSteps, and AggregatorSteps never appear explicitly in the configuration
- step_config = (
- self.step_config
- if isinstance(sub_step, (IOStep, SplitterStep, AggregatorStep))
- else self.step_config[sub_step.name]
- )
+ try:
+ step_config = (
+ self.step_config
+ if isinstance(sub_step, StandaloneStep)
+ else self.step_config[sub_step.name]
+ )
+ except KeyError:
+ # We know that any missing keys must have a default implementation
+ # (because we have already checked that it exists during validation)
+ step_config = LayeredConfigTree(
+ {
+ "implementation": {
+ "name": sub_step.default_implementation,
+ }
+ }
+ )
  sub_step.set_configuration_state(
  step_config, self.combined_implementations, self.input_data_config
  )
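Putting the two `step.py` changes together: validation now tolerates a step that is missing from the specification when a default implementation exists, and the configuration-state traversal in `NonLeafConfigurationState` shown above then synthesizes the configuration it would otherwise have read from the spec. A minimal sketch of that fallback, with the `LayeredConfigTree` import path assumed:

```python
# Sketch only; the import path for LayeredConfigTree is an assumption.
from layered_config_tree import LayeredConfigTree

# When a sub-step with a default implementation is absent from the pipeline
# specification, the traversal builds the equivalent of:
step_config = LayeredConfigTree(
    {"implementation": {"name": "step_2_python_pandas"}}  # i.e. sub_step.default_implementation
)
```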
easylink-0.1.21.dist-info/METADATA → easylink-0.1.22.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: easylink
- Version: 0.1.21
+ Version: 0.1.22
  Summary: Research repository for the EasyLink ER ecosystem project.
  Home-page: https://github.com/ihmeuw/easylink
  Author: The EasyLink developers
easylink-0.1.21.dist-info/RECORD → easylink-0.1.22.dist-info/RECORD CHANGED
@@ -1,24 +1,22 @@
  easylink/__about__.py,sha256=2-oxCfu9t9yUJouLDwqYRZ0eii8kN25SxRzsawjWjho,440
  easylink/__init__.py,sha256=gGMcIVfiVnHtlDw5mZwhevcDb2wt-kuP6F64gnkFack,159
- easylink/_version.py,sha256=qEmNtjnOwhDYQ0cHPPtUkUaghzD2xl0thJEznl4giYw,23
+ easylink/_version.py,sha256=zmP2TRnzKPjZJ1eiBcT-cRInsji6FW-OVD3FafQFCc4,23
  easylink/cli.py,sha256=zQO4lOVoZ3eVgPVWT2sCF4zNoKgiDJP9ReTh2Myr9jc,10307
- easylink/configuration.py,sha256=hgmG5SIbYqnHDHfk44Gr3QX7C3yTaEVW6GuKeMqvu6c,12689
+ easylink/configuration.py,sha256=TPd3WbqUcJMJDPJuHeo5ZebvZPQrRyfm8-laad2sOFk,12681
  easylink/graph_components.py,sha256=zZDZXg5smReHO3ryQC4pao24wyKXzWDe6jS3C6fM2ak,13892
  easylink/implementation.py,sha256=lSF37g-aQYgdLI0lgFaDrBkp23te9O9B1V-CmmRtB-0,14514
  easylink/implementation_metadata.yaml,sha256=GoU_aWjVryG8-xjUHkC2nCUeznmYD0BwfJYnNrpZ8P4,10670
  easylink/pipeline.py,sha256=NJyMInbOCjJ_5kRzzuK5AcupvC7ecd5qLOC-f1Gy3Ss,17701
  easylink/pipeline_graph.py,sha256=jtjS7_2IVa189U8cL621600erC_0pa6BKPRRO8VqwiU,23760
- easylink/pipeline_schema.py,sha256=FieJBa3rKgaCIB9QDuQEfWJ9joNBUUp6iHT6xmns-Vk,6886
+ easylink/pipeline_schema.py,sha256=sj6YQqMFUS35RRFWdlrhPlud68d3Rb5y7iKfAEZy1qs,6877
  easylink/rule.py,sha256=QJPmrvQUZPnqGFD9UmMK8imdJ7VODzGlUOSnpJhb9AU,16677
  easylink/runner.py,sha256=Z9GKgiUAWtp0dW1cyAp86MGthIDeABJtHDXUtzv0-kE,6714
- easylink/step.py,sha256=SqOxinHyRaLCEnB_y5dvhGMaRLyphQDCpVsQ3160c9U,89588
+ easylink/step.py,sha256=zQAoz4HlSVvgS7iMlfmCrXluOtPQxbSgPZOeyZwjdpo,91085
  easylink/devtools/implementation_creator.py,sha256=gZZpfpiOOh912nNjR_d5wR0mBO5c09jWMS0FSYRcE1o,19120
- easylink/images/spark_cluster/Dockerfile,sha256=3PHotbR4jdjVYRHOJ0VQW55b5Qd4tQ1pLLQMrTKWVA0,576
- easylink/images/spark_cluster/README.md,sha256=KdgSttZRplNNWqHn4K1GTsTIab3dTOSG4V99QPLxSp8,569
- easylink/pipeline_schema_constants/__init__.py,sha256=SMNXz49DSwx05PHMKUsunJsgMOqsBJaAHA1fmIOJsUU,1445
+ easylink/pipeline_schema_constants/__init__.py,sha256=njUL2v67WFC3dW_FjOXWRLSeOhInox_ZINLEji0_7V8,1523
  easylink/pipeline_schema_constants/development.py,sha256=0x6lWqBmP1K9AebEmeZ4veSnLBcdQcZXzbV6lCU11bc,12622
  easylink/pipeline_schema_constants/main.py,sha256=kcAhdbK_BhS79LUMhKkn-Uttl2vA-MHVX4M1StTloJQ,22934
- easylink/pipeline_schema_constants/testing.py,sha256=G7szRMyY48dL8kUHWq2MeMaV2G0F-AdAPsQxFzdUnFI,20567
+ easylink/pipeline_schema_constants/testing.py,sha256=ZFD19CpcidZPVUYBvh8LAa5sZEERT2yfoFa-3xmskFs,24595
  easylink/steps/cascading/exclude_clustered.def,sha256=GfoDqO2Vtsh7VI8SwGaJtv_KtKjs-UmBcivqQ7OPkjk,503
  easylink/steps/cascading/exclude_clustered.py,sha256=NSA6GZBzGa7e6CH4tacCGfr0Y9sUM29g9Nf8NquHB44,2612
  easylink/steps/cascading/exclude_none.def,sha256=iFUhUMocxtkA0NErkjVrBxY0MUdS3DIPNsbCpTJRP0k,488
@@ -83,9 +81,9 @@ easylink/utilities/paths.py,sha256=9inDKMPorAaWh5vhjc3j1Tj_aXVKhLdodiJO9H1nNes,9
  easylink/utilities/spark.smk,sha256=kGtpem7LfQc71tMh5WAYaqKnHQKFvcdhPQSdumOP70k,5799
  easylink/utilities/splitter_utils.py,sha256=KXiVUYJ9TGxCQmrwos18pB1sxG_0Ay67qoDJT6vem2o,3828
  easylink/utilities/validation_utils.py,sha256=DBJB2TLVklgYw1WaaPse9vqtwPLMGmZNYM2cbCZsoHI,18417
- easylink-0.1.21.dist-info/licenses/LICENSE,sha256=z6NBo3hY2Pbh2Wjm9IcHLyQoG-kd816uASh8PbwhTcQ,1530
- easylink-0.1.21.dist-info/METADATA,sha256=wdHGbqg2d4yte9ep9mO_GAr2EbUmEAVHHjPg6LsvMLE,3565
- easylink-0.1.21.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
- easylink-0.1.21.dist-info/entry_points.txt,sha256=OGMZDFltg3yMboT7XjJt3joiPhRfV_7jnREVtrAIQNU,51
- easylink-0.1.21.dist-info/top_level.txt,sha256=oHcOpcF_jDMWFiJRzfGQvuskENGDjSPC_Agu9Z_Xvik,9
- easylink-0.1.21.dist-info/RECORD,,
+ easylink-0.1.22.dist-info/licenses/LICENSE,sha256=z6NBo3hY2Pbh2Wjm9IcHLyQoG-kd816uASh8PbwhTcQ,1530
+ easylink-0.1.22.dist-info/METADATA,sha256=hei9KKa0HUgy1Z4aU-nPEAs8KF2_TEe7J0-_esdCG40,3565
+ easylink-0.1.22.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ easylink-0.1.22.dist-info/entry_points.txt,sha256=OGMZDFltg3yMboT7XjJt3joiPhRfV_7jnREVtrAIQNU,51
+ easylink-0.1.22.dist-info/top_level.txt,sha256=oHcOpcF_jDMWFiJRzfGQvuskENGDjSPC_Agu9Z_Xvik,9
+ easylink-0.1.22.dist-info/RECORD,,
easylink/images/spark_cluster/Dockerfile DELETED
@@ -1,16 +0,0 @@
- # Stage 1: Start with the miniconda3 base image
- FROM continuumio/miniconda3 as conda-base
-
- # Create a new conda environment
- SHELL ["/bin/bash", "--login", "-c"]
- RUN conda init bash \
- && . ~/.bashrc \
- && conda create -n spark_cluster python=3.10
-
- # Stage 2: Start with the Apache Spark base image
- FROM apache/spark@sha256:a1dd2487a97fb5e35c5a5b409e830b501a92919029c62f9a559b13c4f5c50f63 as spark-base
-
- COPY --from=conda-base /opt/conda /opt/conda
-
- # Set PATH for conda environment and conda itself
- ENV PATH=/opt/conda/envs/spark_cluster/bin:/opt/conda/condabin:${PATH}
easylink/images/spark_cluster/README.md DELETED
@@ -1,15 +0,0 @@
- # spark_cluster container
- NOTE: Spinning up a spark cluster using `easylink` currently requires building an image from this directory.
-
- This is done by running the following commands from this directory:
-
- ```
- # build the image
- $ sudo docker build -t easylink:sparkbuilder .
- # save as compressed tarball
- $ sudo docker save easylink:sparkbuilder | gzip > spark_cluster.tar.gz
- # remove the image
- $ sudo docker rmi easylink:sparkbuilder
- # convert the image from the docker image
- $ singularity build --force spark_cluster.sif docker-archive://$(pwd)/spark_cluster.tar.gz
- ```