easylink 0.1.21__py3-none-any.whl → 0.1.23__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. easylink/_version.py +1 -1
  2. easylink/configuration.py +4 -3
  3. easylink/implementation_metadata.yaml +53 -46
  4. easylink/pipeline_schema.py +3 -3
  5. easylink/pipeline_schema_constants/__init__.py +1 -0
  6. easylink/pipeline_schema_constants/testing.py +124 -1
  7. easylink/rule.py +5 -5
  8. easylink/step.py +46 -14
  9. easylink/steps/cascading/exclude_clustered.py +2 -2
  10. easylink/steps/cascading/exclude_none.py +2 -2
  11. easylink/steps/cascading/update_clusters_by_connected_components.py +2 -2
  12. easylink/steps/default/default_clusters_to_links.py +2 -2
  13. easylink/steps/default/default_determining_exclusions.py +2 -2
  14. easylink/steps/default/default_removing_records.py +2 -2
  15. easylink/steps/default/default_schema_alignment.py +3 -2
  16. easylink/steps/default/default_updating_clusters.py +2 -2
  17. easylink/steps/dev/README.md +1 -1
  18. easylink/steps/dev/python_pandas/dummy_step.py +4 -4
  19. easylink/steps/dev/python_pandas/python_pandas.def +2 -13
  20. easylink/steps/dev/python_pyspark/dummy_step.py +5 -7
  21. easylink/steps/dev/python_pyspark/python_pyspark.def +2 -12
  22. easylink/steps/dev/r/dummy_step.R +2 -2
  23. easylink/steps/dev/r/r-image.def +2 -12
  24. easylink/steps/example/middle_name_to_initial.def +22 -0
  25. easylink/steps/example/middle_name_to_initial.py +60 -0
  26. easylink/steps/fastLink/fastLink_evaluating_pairs.R +4 -4
  27. easylink/steps/fastLink/fastLink_links_to_clusters.R +2 -2
  28. easylink/steps/output_dir/dummy_step_1_for_output_dir_example.py +1 -1
  29. easylink/steps/output_dir/dummy_step_2_for_output_dir_example.py +2 -2
  30. easylink/steps/rl-dummy/canonicalizing_and_downstream_analysis/dummy_canonicalizing_and_downstream_analysis.py +2 -2
  31. easylink/steps/rl-dummy/pre-processing/dummy_pre-processing.py +2 -2
  32. easylink/steps/splink/splink_blocking_and_filtering.def +1 -1
  33. easylink/steps/splink/splink_blocking_and_filtering.py +32 -6
  34. easylink/steps/splink/splink_evaluating_pairs.py +14 -4
  35. easylink/steps/splink/splink_links_to_clusters.py +1 -1
  36. {easylink-0.1.21.dist-info → easylink-0.1.23.dist-info}/METADATA +1 -1
  37. {easylink-0.1.21.dist-info → easylink-0.1.23.dist-info}/RECORD +41 -41
  38. easylink/images/spark_cluster/Dockerfile +0 -16
  39. easylink/images/spark_cluster/README.md +0 -15
  40. {easylink-0.1.21.dist-info → easylink-0.1.23.dist-info}/WHEEL +0 -0
  41. {easylink-0.1.21.dist-info → easylink-0.1.23.dist-info}/entry_points.txt +0 -0
  42. {easylink-0.1.21.dist-info → easylink-0.1.23.dist-info}/licenses/LICENSE +0 -0
  43. {easylink-0.1.21.dist-info → easylink-0.1.23.dist-info}/top_level.txt +0 -0
easylink/_version.py CHANGED
@@ -1 +1 @@
- __version__ = "0.1.21"
+ __version__ = "0.1.23"
easylink/configuration.py CHANGED
@@ -184,7 +184,9 @@ class Config(LayeredConfigTree):
      #################

      def _get_schema(self, schema_name: str = "main") -> PipelineSchema:
-         """Returns the first :class:`~easylink.pipeline_schema.PipelineSchema` that validates the requested pipeline.
+         """Gets the requested :class:`~easylink.pipeline_schema.PipelineSchema`.
+
+         The schema is only returned if it validates the pipeline configuration.

          Parameters
          ----------
@@ -205,11 +207,10 @@ class Config(LayeredConfigTree):
          Notes
          -----
          This acts as the pipeline configuration file's validation method since
-         we can only find a matching ``PipelineSchema`` if that file is valid.
+         we can only validate the ``PipelineSchema`` if that file is valid.

          """
          errors = defaultdict(dict)
-         # Try each schema until one is validated
          schema = PipelineSchema.get_schema(schema_name)
          logs = schema.validate_step(self.pipeline, self.input_data)
          if logs:
easylink/implementation_metadata.yaml CHANGED
@@ -2,8 +2,8 @@ step_1_python_pandas:
    steps:
      - step_1
    image_name: python_pandas.sif
-   zenodo_record_id: 15611084
-   md5_checksum: 7cc7cb37195c635684903b6777cf1cdf
+   zenodo_record_id: 15733426
+   md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
    script_cmd: python /dummy_step.py
    outputs:
      step_1_main_output: result.parquet
@@ -11,8 +11,8 @@ step_1a_python_pandas:
    steps:
      - step_1a
    image_name: python_pandas.sif
-   zenodo_record_id: 15611084
-   md5_checksum: 7cc7cb37195c635684903b6777cf1cdf
+   zenodo_record_id: 15733426
+   md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
    script_cmd: python /dummy_step.py
    env:
      INPUT_ENV_VARS: DUMMY_CONTAINER_MAIN_INPUT_FILE_PATHS,DUMMY_CONTAINER_SECONDARY_INPUT_FILE_PATHS
@@ -22,8 +22,8 @@ step_1b_python_pandas:
    steps:
      - step_1b
    image_name: python_pandas.sif
-   zenodo_record_id: 15611084
-   md5_checksum: 7cc7cb37195c635684903b6777cf1cdf
+   zenodo_record_id: 15733426
+   md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
    script_cmd: python /dummy_step.py
    env:
      INPUT_ENV_VARS: DUMMY_CONTAINER_MAIN_INPUT_FILE_PATHS,DUMMY_CONTAINER_SECONDARY_INPUT_FILE_PATHS
@@ -33,8 +33,8 @@ step_2_python_pandas:
    steps:
      - step_2
    image_name: python_pandas.sif
-   zenodo_record_id: 15611084
-   md5_checksum: 7cc7cb37195c635684903b6777cf1cdf
+   zenodo_record_id: 15733426
+   md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
    script_cmd: python /dummy_step.py
    outputs:
      step_2_main_output: result.parquet
@@ -42,8 +42,8 @@ step_3_python_pandas:
    steps:
      - step_3
    image_name: python_pandas.sif
-   zenodo_record_id: 15611084
-   md5_checksum: 7cc7cb37195c635684903b6777cf1cdf
+   zenodo_record_id: 15733426
+   md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
    script_cmd: python /dummy_step.py
    outputs:
      step_3_main_output: result.parquet
@@ -51,8 +51,8 @@ step_4_python_pandas:
    steps:
      - step_4
    image_name: python_pandas.sif
-   zenodo_record_id: 15611084
-   md5_checksum: 7cc7cb37195c635684903b6777cf1cdf
+   zenodo_record_id: 15733426
+   md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
    script_cmd: python /dummy_step.py
    env:
      INPUT_ENV_VARS: DUMMY_CONTAINER_MAIN_INPUT_FILE_PATHS,DUMMY_CONTAINER_SECONDARY_INPUT_FILE_PATHS
@@ -62,8 +62,8 @@ step_5_python_pandas:
    steps:
      - step_5
    image_name: python_pandas.sif
-   zenodo_record_id: 15611084
-   md5_checksum: 7cc7cb37195c635684903b6777cf1cdf
+   zenodo_record_id: 15733426
+   md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
    script_cmd: python /dummy_step.py
    env:
      INPUT_ENV_VARS: DUMMY_CONTAINER_MAIN_INPUT_FILE_PATHS,DUMMY_CONTAINER_SECONDARY_INPUT_FILE_PATHS
@@ -73,8 +73,8 @@ step_6_python_pandas:
    steps:
      - step_6
    image_name: python_pandas.sif
-   zenodo_record_id: 15611084
-   md5_checksum: 7cc7cb37195c635684903b6777cf1cdf
+   zenodo_record_id: 15733426
+   md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
    script_cmd: python /dummy_step.py
    env:
      INPUT_ENV_VARS: DUMMY_CONTAINER_MAIN_INPUT_FILE_PATHS,DUMMY_CONTAINER_SECONDARY_INPUT_FILE_PATHS
@@ -84,8 +84,8 @@ step_4a_python_pandas:
    steps:
      - step_4a
    image_name: python_pandas.sif
-   zenodo_record_id: 15611084
-   md5_checksum: 7cc7cb37195c635684903b6777cf1cdf
+   zenodo_record_id: 15733426
+   md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
    script_cmd: python /dummy_step.py
    env:
      INPUT_ENV_VARS: DUMMY_CONTAINER_MAIN_INPUT_FILE_PATHS,DUMMY_CONTAINER_SECONDARY_INPUT_FILE_PATHS
@@ -95,8 +95,8 @@ step_4b_python_pandas:
    steps:
      - step_4b
    image_name: python_pandas.sif
-   zenodo_record_id: 15611084
-   md5_checksum: 7cc7cb37195c635684903b6777cf1cdf
+   zenodo_record_id: 15733426
+   md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
    script_cmd: python /dummy_step.py
    env:
      INPUT_ENV_VARS: DUMMY_CONTAINER_MAIN_INPUT_FILE_PATHS,DUMMY_CONTAINER_SECONDARY_INPUT_FILE_PATHS
@@ -106,8 +106,8 @@ step_4b_r:
    steps:
      - step_4b
    image_name: r-image.sif
-   zenodo_record_id: 15611084
-   md5_checksum: 9410af1317aabc332604cbec33b59d42
+   zenodo_record_id: 15733426
+   md5_checksum: a4a03b836694a2b81a1bd2852736ccc5
    script_cmd: Rscript /dummy_step.R
    env:
      INPUT_ENV_VARS: DUMMY_CONTAINER_MAIN_INPUT_FILE_PATHS,DUMMY_CONTAINER_SECONDARY_INPUT_FILE_PATHS
@@ -117,8 +117,8 @@ step_1_python_pyspark:
    steps:
      - step_1
    image_name: python_pyspark.sif
-   zenodo_record_id: 15611084
-   md5_checksum: 6fb2a2119630138f4db82356b8d78b87
+   zenodo_record_id: 15733426
+   md5_checksum: c948577ab0607411dd4b640622d9ec3a
    script_cmd: python3 /code/dummy_step.py
    outputs:
      step_1_main_output: result.parquet
@@ -127,8 +127,8 @@ step_2_python_pyspark:
    steps:
      - step_2
    image_name: python_pyspark.sif
-   zenodo_record_id: 15611084
-   md5_checksum: 6fb2a2119630138f4db82356b8d78b87
+   zenodo_record_id: 15733426
+   md5_checksum: c948577ab0607411dd4b640622d9ec3a
    script_cmd: python3 /code/dummy_step.py
    outputs:
      step_2_main_output: result.parquet
@@ -137,8 +137,8 @@ step_3_python_pyspark:
    steps:
      - step_3
    image_name: python_pyspark.sif
-   zenodo_record_id: 15611084
-   md5_checksum: 6fb2a2119630138f4db82356b8d78b87
+   zenodo_record_id: 15733426
+   md5_checksum: c948577ab0607411dd4b640622d9ec3a
    script_cmd: python3 /code/dummy_step.py
    outputs:
      step_3_main_output: result.parquet
@@ -147,8 +147,8 @@ step_4_python_pyspark:
    steps:
      - step_4
    image_name: python_pyspark.sif
-   zenodo_record_id: 15611084
-   md5_checksum: 6fb2a2119630138f4db82356b8d78b87
+   zenodo_record_id: 15733426
+   md5_checksum: c948577ab0607411dd4b640622d9ec3a
    script_cmd: python3 /code/dummy_step.py
    env:
      INPUT_ENV_VARS: DUMMY_CONTAINER_MAIN_INPUT_FILE_PATHS,DUMMY_CONTAINER_SECONDARY_INPUT_FILE_PATHS
@@ -158,8 +158,8 @@ step_1_r:
    steps:
      - step_1
    image_name: r-image.sif
-   zenodo_record_id: 15611084
-   md5_checksum: 9410af1317aabc332604cbec33b59d42
+   zenodo_record_id: 15733426
+   md5_checksum: a4a03b836694a2b81a1bd2852736ccc5
    script_cmd: Rscript /dummy_step.R
    outputs:
      step_1_main_output: result.parquet
@@ -168,8 +168,8 @@ step_2_r:
    steps:
      - step_2
    image_name: r-image.sif
-   zenodo_record_id: 15611084
-   md5_checksum: 9410af1317aabc332604cbec33b59d42
+   zenodo_record_id: 15733426
+   md5_checksum: a4a03b836694a2b81a1bd2852736ccc5
    script_cmd: Rscript /dummy_step.R
    outputs:
      step_2_main_output: result.parquet
@@ -178,8 +178,8 @@ step_3_r:
    steps:
      - step_3
    image_name: r-image.sif
-   zenodo_record_id: 15611084
-   md5_checksum: 9410af1317aabc332604cbec33b59d42
+   zenodo_record_id: 15733426
+   md5_checksum: a4a03b836694a2b81a1bd2852736ccc5
    script_cmd: Rscript /dummy_step.R
    outputs:
      step_3_main_output: result.parquet
@@ -188,8 +188,8 @@ step_4_r:
    steps:
      - step_4
    image_name: r-image.sif
-   zenodo_record_id: 15611084
-   md5_checksum: 9410af1317aabc332604cbec33b59d42
+   zenodo_record_id: 15733426
+   md5_checksum: a4a03b836694a2b81a1bd2852736ccc5
    script_cmd: Rscript /dummy_step.R
    env:
      INPUT_ENV_VARS: DUMMY_CONTAINER_MAIN_INPUT_FILE_PATHS,DUMMY_CONTAINER_SECONDARY_INPUT_FILE_PATHS
@@ -201,8 +201,8 @@ step_1_and_step_2_combined_python_pandas:
      - step_1
      - step_2
    image_name: python_pandas.sif
-   zenodo_record_id: 15611084
-   md5_checksum: 7cc7cb37195c635684903b6777cf1cdf
+   zenodo_record_id: 15733426
+   md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
    script_cmd: python /dummy_step.py
    outputs:
      step_2_main_output: result.parquet
@@ -211,8 +211,8 @@ step_1_and_step_2_parallel_python_pandas:
      - step_1
      - step_2
    image_name: python_pandas.sif
-   zenodo_record_id: 15611084
-   md5_checksum: 7cc7cb37195c635684903b6777cf1cdf
+   zenodo_record_id: 15733426
+   md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
    script_cmd: python /dummy_step.py
    env:
      INPUT_ENV_VARS: STEP_1_DUMMY_CONTAINER_MAIN_INPUT_FILE_PATHS,STEP_2_DUMMY_CONTAINER_MAIN_INPUT_FILE_PATHS
@@ -223,8 +223,8 @@ step_3_and_step_4_combined_python_pandas:
      - step_3
      - step_4
    image_name: python_pandas.sif
-   zenodo_record_id: 15611084
-   md5_checksum: 7cc7cb37195c635684903b6777cf1cdf
+   zenodo_record_id: 15733426
+   md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
    script_cmd: python /dummy_step.py
    outputs:
      step_4_main_output: result.parquet
@@ -233,8 +233,8 @@ step_1a_and_step_1b_combined_python_pandas:
      - step_1a
      - step_1b
    image_name: python_pandas.sif
-   zenodo_record_id: 15611084
-   md5_checksum: 7cc7cb37195c635684903b6777cf1cdf
+   zenodo_record_id: 15733426
+   md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
    script_cmd: python /dummy_step.py
    outputs:
      step_1_main_output: result.parquet
@@ -362,3 +362,10 @@ update_clusters_by_connected_components:
    script_cmd: python /update_clusters_by_connected_components.py
    outputs:
      clusters: result.parquet
+ middle_name_to_initial:
+   steps:
+     - pre-processing
+   image_name: main/middle_name_to_initial.sif
+   script_cmd: python /middle_name_to_initial.py
+   outputs:
+     dataset: dataset
easylink/pipeline_schema.py CHANGED
@@ -159,10 +159,10 @@ class PipelineSchema(HierarchicalStep):
      )

      @classmethod
-     def get_schema(cls, name: str = "main") -> list["PipelineSchema"]:
-         """Gets all allowable ``PipelineSchemas``.
+     def get_schema(cls, name: str = "main") -> "PipelineSchema":
+         """Gets the requested ``PipelineSchema``.

-         These ``PipelineSchemas`` represent the fully supported pipelines and are
+         This ``PipelineSchema`` represents the fully supported pipelines and is
          used to validate the user-requested pipeline.

          Parameters
easylink/pipeline_schema_constants/__init__.py CHANGED
@@ -27,4 +27,5 @@ SCHEMA_PARAMS = {
      "auto_parallel_cloneable_step": testing.SCHEMA_PARAMS_AUTO_PARALLEL_CLONEABLE_STEP,
      "auto_parallel_loop_step": testing.SCHEMA_PARAMS_AUTO_PARALLEL_LOOP_STEP,
      "auto_parallel_hierarchical_step": testing.SCHEMA_PARAMS_AUTO_PARALLEL_HIERARCHICAL_STEP,
+     "default_implementations": testing.SCHEMA_PARAMS_DEFAULT_IMPLEMENTATIONS,
  }
easylink/pipeline_schema_constants/testing.py CHANGED
@@ -607,7 +607,7 @@ NODES_OUTPUT_DIR = [
      input_slots=[
          InputSlot(
              name="step_2_main_input",
-             env_var="DUMMY_CONTAINER_MAIN_INPUT_DIR_PATH",
+             env_var="MAIN_INPUT_DIR_PATH",
              validator=validate_dir,
          )
      ],
@@ -640,3 +640,126 @@ EDGES_OUTPUT_DIR = [
      ),
  ]
  SCHEMA_PARAMS_OUTPUT_DIR = (NODES_OUTPUT_DIR, EDGES_OUTPUT_DIR)
+
+
+ NODES_DEFAULT_IMPLEMENTATIONS = [
+     InputStep(),
+     HierarchicalStep(
+         step_name="step_1",
+         input_slots=[
+             InputSlot(
+                 name="step_1_main_input",
+                 env_var="DUMMY_CONTAINER_MAIN_INPUT_FILE_PATHS",
+                 validator=validate_input_file_dummy,
+             ),
+         ],
+         output_slots=[OutputSlot("step_1_main_output")],
+         nodes=[
+             Step(
+                 step_name="step_1a",
+                 input_slots=[
+                     InputSlot(
+                         name="step_1a_main_input",
+                         env_var="DUMMY_CONTAINER_MAIN_INPUT_FILE_PATHS",
+                         validator=validate_input_file_dummy,
+                     ),
+                 ],
+                 output_slots=[OutputSlot("step_1a_main_output")],
+                 default_implementation="step_1a_python_pandas",
+             ),
+             Step(
+                 step_name="step_1b",
+                 input_slots=[
+                     InputSlot(
+                         name="step_1b_main_input",
+                         env_var="DUMMY_CONTAINER_MAIN_INPUT_FILE_PATHS",
+                         validator=validate_input_file_dummy,
+                     ),
+                 ],
+                 output_slots=[OutputSlot("step_1b_main_output")],
+                 default_implementation="step_1b_python_pandas",
+             ),
+         ],
+         edges=[
+             EdgeParams(
+                 source_node="step_1a",
+                 target_node="step_1b",
+                 output_slot="step_1a_main_output",
+                 input_slot="step_1b_main_input",
+             ),
+         ],
+         input_slot_mappings=[
+             InputSlotMapping(
+                 parent_slot="step_1_main_input",
+                 child_node="step_1a",
+                 child_slot="step_1a_main_input",
+             ),
+         ],
+         output_slot_mappings=[
+             OutputSlotMapping(
+                 parent_slot="step_1_main_output",
+                 child_node="step_1b",
+                 child_slot="step_1b_main_output",
+             ),
+         ],
+         default_implementation="step_1_python_pandas",
+     ),
+     Step(
+         step_name="step_2",
+         input_slots=[
+             InputSlot(
+                 name="step_2_main_input",
+                 env_var="DUMMY_CONTAINER_MAIN_INPUT_FILE_PATHS",
+                 validator=validate_input_file_dummy,
+             )
+         ],
+         output_slots=[OutputSlot("step_2_main_output")],
+         default_implementation="step_2_python_pandas",
+     ),
+     LoopStep(
+         template_step=Step(
+             step_name="step_3",
+             input_slots=[
+                 InputSlot(
+                     name="step_3_main_input",
+                     env_var="DUMMY_CONTAINER_MAIN_INPUT_FILE_PATHS",
+                     validator=validate_input_file_dummy,
+                 )
+             ],
+             output_slots=[OutputSlot("step_3_main_output")],
+         ),
+         self_edges=[
+             EdgeParams(
+                 source_node="step_3",
+                 target_node="step_3",
+                 output_slot="step_3_main_output",
+                 input_slot="step_3_main_input",
+             ),
+         ],
+         default_implementation="step_3_python_pandas",
+     ),
+     CloneableStep(
+         template_step=Step(
+             step_name="step_4",
+             input_slots=[
+                 InputSlot(
+                     name="step_4_main_input",
+                     env_var="DUMMY_CONTAINER_MAIN_INPUT_FILE_PATHS",
+                     validator=validate_input_file_dummy,
+                 ),
+             ],
+             output_slots=[
+                 OutputSlot(
+                     name="step_4_main_output",
+                 ),
+             ],
+         ),
+         default_implementation="step_4_python_pandas",
+     ),
+     OutputStep(
+         input_slots=[
+             InputSlot(name="result", env_var=None, validator=validate_input_file_dummy)
+         ],
+     ),
+ ]
+ SCHEMA_PARAMS_DEFAULT_IMPLEMENTATIONS = (NODES_DEFAULT_IMPLEMENTATIONS, EDGES_TWO_STEPS)
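This new registration, combined with the `get_schema` signature change in pipeline_schema.py above, suggests usage along these lines. This is a minimal hypothetical sketch, not code from the diff; it assumes the `SCHEMA_PARAMS` key passes straight through as `get_schema`'s `name` argument:

    # Hypothetical usage sketch (assumption: SCHEMA_PARAMS keys are valid
    # `name` arguments to PipelineSchema.get_schema).
    from easylink.pipeline_schema import PipelineSchema

    # Per the updated signature, this returns a single PipelineSchema
    # rather than a list of all allowable schemas.
    schema = PipelineSchema.get_schema("default_implementations")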
easylink/rule.py CHANGED
@@ -182,15 +182,15 @@ rule:
      # TODO [MIC-5787]: handle multiple wildcards, e.g.
      # output_paths = ",".join(self.output)
      # wildcards_subdir = "/".join([f"{{wildcards.{wc}}}" for wc in self.wildcards])
-     # and then in shell cmd: export DUMMY_CONTAINER_OUTPUT_PATHS={output_paths}/{wildcards_subdir}
+     # and then in shell cmd: export OUTPUT_PATHS={output_paths}/{wildcards_subdir}

      # snakemake shell commands require wildcards to be prefaced with 'wildcards.'
      output_files = ",".join(self.output).replace("{chunk}", "{wildcards.chunk}")
      shell_cmd = f"""
  shell:
      '''
-     export DUMMY_CONTAINER_OUTPUT_PATHS={output_files}
-     export DUMMY_CONTAINER_DIAGNOSTICS_DIRECTORY={self.diagnostics_dir}"""
+     export OUTPUT_PATHS={output_files}
+     export DIAGNOSTICS_DIRECTORY={self.diagnostics_dir}"""
      for input_slot_attrs in self.input_slots.values():
          # snakemake shell commands require wildcards to be prefaced with 'wildcards.'
          input_files = ",".join(input_slot_attrs["filepaths"]).replace(
@@ -200,8 +200,8 @@ rule:
              export {input_slot_attrs["env_var"]}={input_files}"""
      if self.requires_spark:
          shell_cmd += f"""
-             read -r DUMMY_CONTAINER_SPARK_MASTER_URL < {{input.master_url}}
-             export DUMMY_CONTAINER_SPARK_MASTER_URL"""
+             read -r SPARK_MASTER_URL < {{input.master_url}}
+             export SPARK_MASTER_URL"""
      for var_name, var_value in self.envvars.items():
          shell_cmd += f"""
              export {var_name}={var_value}"""
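The net effect of these renames is that the generated Snakemake shell block now exports un-prefixed variable names. A step script running inside a container would read them roughly as follows (a sketch mirroring the step-script hunks further down, not a verbatim excerpt):

    import os

    # Renamed from DUMMY_CONTAINER_OUTPUT_PATHS
    results_filepath = os.environ["OUTPUT_PATHS"]
    # Renamed from DUMMY_CONTAINER_DIAGNOSTICS_DIRECTORY
    diagnostics_dir = os.environ["DIAGNOSTICS_DIRECTORY"]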
easylink/step.py CHANGED
@@ -92,6 +92,7 @@ class Step:
      input_slot_mappings: Iterable[InputSlotMapping] = (),
      output_slot_mappings: Iterable[OutputSlotMapping] = (),
      is_auto_parallel: bool = False,
+     default_implementation: str | None = None,
  ) -> None:
      if not step_name and not name:
          raise ValueError("All Steps must contain a step_name, name, or both.")
@@ -127,6 +128,9 @@ class Step:
      ``OutputSlotMappings`` of this ``Step``."""
      self.is_auto_parallel = is_auto_parallel
      """Whether or not this ``Step`` is to be automatically run in parallel."""
+     self.default_implementation = default_implementation
+     """The default implementation to use for this ``Step`` if the ``Step`` is
+     not explicitly configured in the pipeline specification."""
      self.parent_step = None
      """This ``Step's`` parent ``Step``, if applicable."""
      self._configuration_state = None
@@ -580,6 +584,7 @@ class HierarchicalStep(Step):
      input_slot_mappings=(),
      output_slot_mappings=(),
      directly_implemented=True,
+     default_implementation: str | None = None,
  ):
      super().__init__(
          step_name,
@@ -588,6 +593,7 @@ class HierarchicalStep(Step):
          output_slots,
          input_slot_mappings,
          output_slot_mappings,
+         default_implementation=default_implementation,
      )
      self.nodes = nodes
      """All sub-nodes (i.e. sub-``Steps``) that make up this ``HierarchicalStep``."""
@@ -722,13 +728,19 @@ class HierarchicalStep(Step):
      step = self.step_graph.nodes[node]["step"]
      if isinstance(step, IOStep):
          continue
+     if step.name not in step_config:
+         default_implementation = self.step_graph.nodes[step.name][
+             "step"
+         ].default_implementation
+         step_errors = (
+             {f"step {step.name}": ["The step is not configured."]}
+             if not default_implementation
+             else {}
+         )
      else:
-         if step.name not in step_config:
-             step_errors = {f"step {step.name}": ["The step is not configured."]}
-         else:
-             step_errors = step.validate_step(
-                 step_config[step.name], combined_implementations, input_data_config
-             )
+         step_errors = step.validate_step(
+             step_config[step.name], combined_implementations, input_data_config
+         )
      if step_errors:
          errors.update(step_errors)
      extra_steps = set(step_config.keys()) - set(self.step_graph.nodes)
@@ -830,12 +842,14 @@ class TemplatedStep(Step, ABC):
  def __init__(
      self,
      template_step: Step,
+     default_implementation: str | None = None,
  ) -> None:
      super().__init__(
          template_step.step_name,
          template_step.name,
          template_step.input_slots.values(),
          template_step.output_slots.values(),
+         default_implementation=default_implementation,
      )
      self.step_graph = None
      """The :class:`~easylink.graph_components.StepGraph` i.e. the directed acyclic
@@ -1110,8 +1124,9 @@ class LoopStep(TemplatedStep):
      self,
      template_step: Step | None = None,
      self_edges: Iterable[EdgeParams] = (),
+     default_implementation: str | None = None,
  ) -> None:
-     super().__init__(template_step)
+     super().__init__(template_step, default_implementation)
      self.self_edges = self_edges
      """:class:`~easylink.graph_components.EdgeParams` that represent self-edges,
      i.e. edges that connect the output of one loop to the input of the next."""
@@ -1119,7 +1134,7 @@ class LoopStep(TemplatedStep):
  @property
  def config_key(self):
      """The pipeline specification key required for a ``LoopStep``."""
-     return "iterate"
+     return "iterations"

  @property
  def node_prefix(self):
@@ -2181,15 +2196,32 @@ class NonLeafConfigurationState(ConfigurationState):

      This method recursively traverses the ``StepGraph`` and sets the configuration
      state for each ``Step`` until reaching all leaf nodes.
+
+     Notes
+     -----
+     If a ``Step`` name is missing from the ``step_config``, we know that it
+     must have a default implementation because we already validated that one
+     exists during :meth:`HierarchicalStep._validate_step_graph`. In that case,
+     we manually instantiate and use a ``step_config`` with the default implementation.
      """
      for sub_node in self._step.step_graph.nodes:
          sub_step = self._step.step_graph.nodes[sub_node]["step"]
-         # IOSteps, SplitterSteps, and AggregatorSteps never appear explicitly in the configuration
-         step_config = (
-             self.step_config
-             if isinstance(sub_step, (IOStep, SplitterStep, AggregatorStep))
-             else self.step_config[sub_step.name]
-         )
+         try:
+             step_config = (
+                 self.step_config
+                 if isinstance(sub_step, StandaloneStep)
+                 else self.step_config[sub_step.name]
+             )
+         except KeyError:
+             # We know that any missing keys must have a default implementation
+             # (because we have already checked that it exists during validation)
+             step_config = LayeredConfigTree(
+                 {
+                     "implementation": {
+                         "name": sub_step.default_implementation,
+                     }
+                 }
+             )
          sub_step.set_configuration_state(
              step_config, self.combined_implementations, self.input_data_config
          )
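Taken together, these step.py changes let a `Step` constructed with a `default_implementation` be omitted from the pipeline specification entirely: validation passes, and configuration synthesizes the implementation block. A minimal sketch, reusing constructor arguments shown in the testing.py hunk above (the `...` placeholder stands for elided slots, not a real argument):

    # Sketch: a Step that may be left out of the user's pipeline specification.
    step = Step(
        step_name="step_2",
        input_slots=[...],  # elided for brevity
        output_slots=[OutputSlot("step_2_main_output")],
        default_implementation="step_2_python_pandas",
    )
    # If "step_2" is absent from the spec, NonLeafConfigurationState now
    # falls back to LayeredConfigTree(
    #     {"implementation": {"name": step.default_implementation}}
    # ) instead of raising a KeyError.

Note also that `LoopStep.config_key` changed from "iterate" to "iterations", so loop configuration in pipeline specifications now uses the `iterations` key.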
easylink/steps/cascading/exclude_clustered.py CHANGED
@@ -69,8 +69,8 @@ clustered_record_ids = set(dataset_df["Record ID"].unique()) & set(

  IDS_TO_REMOVE = pd.DataFrame({"Record ID": list(clustered_record_ids)})

- # DUMMY_CONTAINER_OUTPUT_PATHS is a single path to a file (results.parquet)
- results_filepath = os.environ["DUMMY_CONTAINER_OUTPUT_PATHS"]
+ # OUTPUT_PATHS is a single path to a file (results.parquet)
+ results_filepath = os.environ["OUTPUT_PATHS"]

  logging.info(f"Writing output for dataset from input {dataset_path} to {results_filepath}")
  IDS_TO_REMOVE.to_parquet(results_filepath)
easylink/steps/cascading/exclude_none.py CHANGED
@@ -69,8 +69,8 @@ clusters_df = load_file(clusters_filepath)

  IDS_TO_REMOVE = pd.DataFrame(columns=["Record ID"])

- # DUMMY_CONTAINER_OUTPUT_PATHS is a single path to a file (results.parquet)
- results_filepath = os.environ["DUMMY_CONTAINER_OUTPUT_PATHS"]
+ # OUTPUT_PATHS is a single path to a file (results.parquet)
+ results_filepath = os.environ["OUTPUT_PATHS"]

  logging.info(f"Writing output for dataset from input {dataset_path} to {results_filepath}")
  IDS_TO_REMOVE.to_parquet(results_filepath)
easylink/steps/cascading/update_clusters_by_connected_components.py CHANGED
@@ -50,8 +50,8 @@ if len(known_clusters_filepaths) == 0:
  known_clusters_filepath = known_clusters_filepaths[0]
  known_clusters_df = load_file(known_clusters_filepath)

- # DUMMY_CONTAINER_OUTPUT_PATHS is a path to a single file (clusters.parquet)
- results_filepath = os.environ["DUMMY_CONTAINER_OUTPUT_PATHS"]
+ # OUTPUT_PATHS is a path to a single file (clusters.parquet)
+ results_filepath = os.environ["OUTPUT_PATHS"]
  Path(results_filepath).parent.mkdir(exist_ok=True, parents=True)

  new_clusters_df = load_file(new_clusters_filepath)
easylink/steps/default/default_clusters_to_links.py CHANGED
@@ -80,8 +80,8 @@ if len(clusters_filepaths) == 0:

  clusters_filepath = clusters_filepaths[0]

- # DUMMY_CONTAINER_OUTPUT_PATHS is a path to a single file (results.parquet)
- results_filepath = os.environ["DUMMY_CONTAINER_OUTPUT_PATHS"]
+ # OUTPUT_PATHS is a path to a single file (results.parquet)
+ results_filepath = os.environ["OUTPUT_PATHS"]

  clusters_df = load_file(clusters_filepath)
  links_df = clusters_to_links(clusters_df)
easylink/steps/default/default_determining_exclusions.py CHANGED
@@ -74,8 +74,8 @@ if len(clusters_df) > 0:

  IDS_TO_REMOVE = pd.DataFrame(columns=["Record ID"])

- # DUMMY_CONTAINER_OUTPUT_PATHS is a single path to a file (results.parquet)
- results_filepath = os.environ["DUMMY_CONTAINER_OUTPUT_PATHS"]
+ # OUTPUT_PATHS is a single path to a file (results.parquet)
+ results_filepath = os.environ["OUTPUT_PATHS"]

  logging.info(f"Writing output for dataset from input {dataset_path} to {results_filepath}")
  IDS_TO_REMOVE.to_parquet(results_filepath)