easylink 0.1.21__py3-none-any.whl → 0.1.23__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- easylink/_version.py +1 -1
- easylink/configuration.py +4 -3
- easylink/implementation_metadata.yaml +53 -46
- easylink/pipeline_schema.py +3 -3
- easylink/pipeline_schema_constants/__init__.py +1 -0
- easylink/pipeline_schema_constants/testing.py +124 -1
- easylink/rule.py +5 -5
- easylink/step.py +46 -14
- easylink/steps/cascading/exclude_clustered.py +2 -2
- easylink/steps/cascading/exclude_none.py +2 -2
- easylink/steps/cascading/update_clusters_by_connected_components.py +2 -2
- easylink/steps/default/default_clusters_to_links.py +2 -2
- easylink/steps/default/default_determining_exclusions.py +2 -2
- easylink/steps/default/default_removing_records.py +2 -2
- easylink/steps/default/default_schema_alignment.py +3 -2
- easylink/steps/default/default_updating_clusters.py +2 -2
- easylink/steps/dev/README.md +1 -1
- easylink/steps/dev/python_pandas/dummy_step.py +4 -4
- easylink/steps/dev/python_pandas/python_pandas.def +2 -13
- easylink/steps/dev/python_pyspark/dummy_step.py +5 -7
- easylink/steps/dev/python_pyspark/python_pyspark.def +2 -12
- easylink/steps/dev/r/dummy_step.R +2 -2
- easylink/steps/dev/r/r-image.def +2 -12
- easylink/steps/example/middle_name_to_initial.def +22 -0
- easylink/steps/example/middle_name_to_initial.py +60 -0
- easylink/steps/fastLink/fastLink_evaluating_pairs.R +4 -4
- easylink/steps/fastLink/fastLink_links_to_clusters.R +2 -2
- easylink/steps/output_dir/dummy_step_1_for_output_dir_example.py +1 -1
- easylink/steps/output_dir/dummy_step_2_for_output_dir_example.py +2 -2
- easylink/steps/rl-dummy/canonicalizing_and_downstream_analysis/dummy_canonicalizing_and_downstream_analysis.py +2 -2
- easylink/steps/rl-dummy/pre-processing/dummy_pre-processing.py +2 -2
- easylink/steps/splink/splink_blocking_and_filtering.def +1 -1
- easylink/steps/splink/splink_blocking_and_filtering.py +32 -6
- easylink/steps/splink/splink_evaluating_pairs.py +14 -4
- easylink/steps/splink/splink_links_to_clusters.py +1 -1
- {easylink-0.1.21.dist-info → easylink-0.1.23.dist-info}/METADATA +1 -1
- {easylink-0.1.21.dist-info → easylink-0.1.23.dist-info}/RECORD +41 -41
- easylink/images/spark_cluster/Dockerfile +0 -16
- easylink/images/spark_cluster/README.md +0 -15
- {easylink-0.1.21.dist-info → easylink-0.1.23.dist-info}/WHEEL +0 -0
- {easylink-0.1.21.dist-info → easylink-0.1.23.dist-info}/entry_points.txt +0 -0
- {easylink-0.1.21.dist-info → easylink-0.1.23.dist-info}/licenses/LICENSE +0 -0
- {easylink-0.1.21.dist-info → easylink-0.1.23.dist-info}/top_level.txt +0 -0
easylink/_version.py
CHANGED
@@ -1 +1 @@
|
|
1
|
-
__version__ = "0.1.21"
|
1
|
+
__version__ = "0.1.23"
|
easylink/configuration.py
CHANGED
@@ -184,7 +184,9 @@ class Config(LayeredConfigTree):
|
|
184
184
|
#################
|
185
185
|
|
186
186
|
def _get_schema(self, schema_name: str = "main") -> PipelineSchema:
|
187
|
-
"""
|
187
|
+
"""Gets the requested :class:`~easylink.pipeline_schema.PipelineSchema`.
|
188
|
+
|
189
|
+
The schema is only returned if it validates the pipeline configuration.
|
188
190
|
|
189
191
|
Parameters
|
190
192
|
----------
|
@@ -205,11 +207,10 @@ class Config(LayeredConfigTree):
|
|
205
207
|
Notes
|
206
208
|
-----
|
207
209
|
This acts as the pipeline configuration file's validation method since
|
208
|
-
we can only
|
210
|
+
we can only validate the ``PipelineSchema`` if that file is valid.
|
209
211
|
|
210
212
|
"""
|
211
213
|
errors = defaultdict(dict)
|
212
|
-
# Try each schema until one is validated
|
213
214
|
schema = PipelineSchema.get_schema(schema_name)
|
214
215
|
logs = schema.validate_step(self.pipeline, self.input_data)
|
215
216
|
if logs:
|
easylink/implementation_metadata.yaml
CHANGED
@@ -2,8 +2,8 @@ step_1_python_pandas:
|
|
2
2
|
steps:
|
3
3
|
- step_1
|
4
4
|
image_name: python_pandas.sif
|
5
|
-
zenodo_record_id:
|
6
|
-
md5_checksum:
|
5
|
+
zenodo_record_id: 15733426
|
6
|
+
md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
|
7
7
|
script_cmd: python /dummy_step.py
|
8
8
|
outputs:
|
9
9
|
step_1_main_output: result.parquet
|
@@ -11,8 +11,8 @@ step_1a_python_pandas:
|
|
11
11
|
steps:
|
12
12
|
- step_1a
|
13
13
|
image_name: python_pandas.sif
|
14
|
-
zenodo_record_id:
|
15
|
-
md5_checksum:
|
14
|
+
zenodo_record_id: 15733426
|
15
|
+
md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
|
16
16
|
script_cmd: python /dummy_step.py
|
17
17
|
env:
|
18
18
|
INPUT_ENV_VARS: DUMMY_CONTAINER_MAIN_INPUT_FILE_PATHS,DUMMY_CONTAINER_SECONDARY_INPUT_FILE_PATHS
|
@@ -22,8 +22,8 @@ step_1b_python_pandas:
|
|
22
22
|
steps:
|
23
23
|
- step_1b
|
24
24
|
image_name: python_pandas.sif
|
25
|
-
zenodo_record_id:
|
26
|
-
md5_checksum:
|
25
|
+
zenodo_record_id: 15733426
|
26
|
+
md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
|
27
27
|
script_cmd: python /dummy_step.py
|
28
28
|
env:
|
29
29
|
INPUT_ENV_VARS: DUMMY_CONTAINER_MAIN_INPUT_FILE_PATHS,DUMMY_CONTAINER_SECONDARY_INPUT_FILE_PATHS
|
@@ -33,8 +33,8 @@ step_2_python_pandas:
|
|
33
33
|
steps:
|
34
34
|
- step_2
|
35
35
|
image_name: python_pandas.sif
|
36
|
-
zenodo_record_id:
|
37
|
-
md5_checksum:
|
36
|
+
zenodo_record_id: 15733426
|
37
|
+
md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
|
38
38
|
script_cmd: python /dummy_step.py
|
39
39
|
outputs:
|
40
40
|
step_2_main_output: result.parquet
|
@@ -42,8 +42,8 @@ step_3_python_pandas:
|
|
42
42
|
steps:
|
43
43
|
- step_3
|
44
44
|
image_name: python_pandas.sif
|
45
|
-
zenodo_record_id:
|
46
|
-
md5_checksum:
|
45
|
+
zenodo_record_id: 15733426
|
46
|
+
md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
|
47
47
|
script_cmd: python /dummy_step.py
|
48
48
|
outputs:
|
49
49
|
step_3_main_output: result.parquet
|
@@ -51,8 +51,8 @@ step_4_python_pandas:
|
|
51
51
|
steps:
|
52
52
|
- step_4
|
53
53
|
image_name: python_pandas.sif
|
54
|
-
zenodo_record_id:
|
55
|
-
md5_checksum:
|
54
|
+
zenodo_record_id: 15733426
|
55
|
+
md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
|
56
56
|
script_cmd: python /dummy_step.py
|
57
57
|
env:
|
58
58
|
INPUT_ENV_VARS: DUMMY_CONTAINER_MAIN_INPUT_FILE_PATHS,DUMMY_CONTAINER_SECONDARY_INPUT_FILE_PATHS
|
@@ -62,8 +62,8 @@ step_5_python_pandas:
|
|
62
62
|
steps:
|
63
63
|
- step_5
|
64
64
|
image_name: python_pandas.sif
|
65
|
-
zenodo_record_id:
|
66
|
-
md5_checksum:
|
65
|
+
zenodo_record_id: 15733426
|
66
|
+
md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
|
67
67
|
script_cmd: python /dummy_step.py
|
68
68
|
env:
|
69
69
|
INPUT_ENV_VARS: DUMMY_CONTAINER_MAIN_INPUT_FILE_PATHS,DUMMY_CONTAINER_SECONDARY_INPUT_FILE_PATHS
|
@@ -73,8 +73,8 @@ step_6_python_pandas:
|
|
73
73
|
steps:
|
74
74
|
- step_6
|
75
75
|
image_name: python_pandas.sif
|
76
|
-
zenodo_record_id:
|
77
|
-
md5_checksum:
|
76
|
+
zenodo_record_id: 15733426
|
77
|
+
md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
|
78
78
|
script_cmd: python /dummy_step.py
|
79
79
|
env:
|
80
80
|
INPUT_ENV_VARS: DUMMY_CONTAINER_MAIN_INPUT_FILE_PATHS,DUMMY_CONTAINER_SECONDARY_INPUT_FILE_PATHS
|
@@ -84,8 +84,8 @@ step_4a_python_pandas:
|
|
84
84
|
steps:
|
85
85
|
- step_4a
|
86
86
|
image_name: python_pandas.sif
|
87
|
-
zenodo_record_id:
|
88
|
-
md5_checksum:
|
87
|
+
zenodo_record_id: 15733426
|
88
|
+
md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
|
89
89
|
script_cmd: python /dummy_step.py
|
90
90
|
env:
|
91
91
|
INPUT_ENV_VARS: DUMMY_CONTAINER_MAIN_INPUT_FILE_PATHS,DUMMY_CONTAINER_SECONDARY_INPUT_FILE_PATHS
|
@@ -95,8 +95,8 @@ step_4b_python_pandas:
|
|
95
95
|
steps:
|
96
96
|
- step_4b
|
97
97
|
image_name: python_pandas.sif
|
98
|
-
zenodo_record_id:
|
99
|
-
md5_checksum:
|
98
|
+
zenodo_record_id: 15733426
|
99
|
+
md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
|
100
100
|
script_cmd: python /dummy_step.py
|
101
101
|
env:
|
102
102
|
INPUT_ENV_VARS: DUMMY_CONTAINER_MAIN_INPUT_FILE_PATHS,DUMMY_CONTAINER_SECONDARY_INPUT_FILE_PATHS
|
@@ -106,8 +106,8 @@ step_4b_r:
|
|
106
106
|
steps:
|
107
107
|
- step_4b
|
108
108
|
image_name: r-image.sif
|
109
|
-
zenodo_record_id:
|
110
|
-
md5_checksum:
|
109
|
+
zenodo_record_id: 15733426
|
110
|
+
md5_checksum: a4a03b836694a2b81a1bd2852736ccc5
|
111
111
|
script_cmd: Rscript /dummy_step.R
|
112
112
|
env:
|
113
113
|
INPUT_ENV_VARS: DUMMY_CONTAINER_MAIN_INPUT_FILE_PATHS,DUMMY_CONTAINER_SECONDARY_INPUT_FILE_PATHS
|
@@ -117,8 +117,8 @@ step_1_python_pyspark:
|
|
117
117
|
steps:
|
118
118
|
- step_1
|
119
119
|
image_name: python_pyspark.sif
|
120
|
-
zenodo_record_id:
|
121
|
-
md5_checksum:
|
120
|
+
zenodo_record_id: 15733426
|
121
|
+
md5_checksum: c948577ab0607411dd4b640622d9ec3a
|
122
122
|
script_cmd: python3 /code/dummy_step.py
|
123
123
|
outputs:
|
124
124
|
step_1_main_output: result.parquet
|
@@ -127,8 +127,8 @@ step_2_python_pyspark:
|
|
127
127
|
steps:
|
128
128
|
- step_2
|
129
129
|
image_name: python_pyspark.sif
|
130
|
-
zenodo_record_id:
|
131
|
-
md5_checksum:
|
130
|
+
zenodo_record_id: 15733426
|
131
|
+
md5_checksum: c948577ab0607411dd4b640622d9ec3a
|
132
132
|
script_cmd: python3 /code/dummy_step.py
|
133
133
|
outputs:
|
134
134
|
step_2_main_output: result.parquet
|
@@ -137,8 +137,8 @@ step_3_python_pyspark:
|
|
137
137
|
steps:
|
138
138
|
- step_3
|
139
139
|
image_name: python_pyspark.sif
|
140
|
-
zenodo_record_id:
|
141
|
-
md5_checksum:
|
140
|
+
zenodo_record_id: 15733426
|
141
|
+
md5_checksum: c948577ab0607411dd4b640622d9ec3a
|
142
142
|
script_cmd: python3 /code/dummy_step.py
|
143
143
|
outputs:
|
144
144
|
step_3_main_output: result.parquet
|
@@ -147,8 +147,8 @@ step_4_python_pyspark:
|
|
147
147
|
steps:
|
148
148
|
- step_4
|
149
149
|
image_name: python_pyspark.sif
|
150
|
-
zenodo_record_id:
|
151
|
-
md5_checksum:
|
150
|
+
zenodo_record_id: 15733426
|
151
|
+
md5_checksum: c948577ab0607411dd4b640622d9ec3a
|
152
152
|
script_cmd: python3 /code/dummy_step.py
|
153
153
|
env:
|
154
154
|
INPUT_ENV_VARS: DUMMY_CONTAINER_MAIN_INPUT_FILE_PATHS,DUMMY_CONTAINER_SECONDARY_INPUT_FILE_PATHS
|
@@ -158,8 +158,8 @@ step_1_r:
|
|
158
158
|
steps:
|
159
159
|
- step_1
|
160
160
|
image_name: r-image.sif
|
161
|
-
zenodo_record_id:
|
162
|
-
md5_checksum:
|
161
|
+
zenodo_record_id: 15733426
|
162
|
+
md5_checksum: a4a03b836694a2b81a1bd2852736ccc5
|
163
163
|
script_cmd: Rscript /dummy_step.R
|
164
164
|
outputs:
|
165
165
|
step_1_main_output: result.parquet
|
@@ -168,8 +168,8 @@ step_2_r:
|
|
168
168
|
steps:
|
169
169
|
- step_2
|
170
170
|
image_name: r-image.sif
|
171
|
-
zenodo_record_id:
|
172
|
-
md5_checksum:
|
171
|
+
zenodo_record_id: 15733426
|
172
|
+
md5_checksum: a4a03b836694a2b81a1bd2852736ccc5
|
173
173
|
script_cmd: Rscript /dummy_step.R
|
174
174
|
outputs:
|
175
175
|
step_2_main_output: result.parquet
|
@@ -178,8 +178,8 @@ step_3_r:
|
|
178
178
|
steps:
|
179
179
|
- step_3
|
180
180
|
image_name: r-image.sif
|
181
|
-
zenodo_record_id:
|
182
|
-
md5_checksum:
|
181
|
+
zenodo_record_id: 15733426
|
182
|
+
md5_checksum: a4a03b836694a2b81a1bd2852736ccc5
|
183
183
|
script_cmd: Rscript /dummy_step.R
|
184
184
|
outputs:
|
185
185
|
step_3_main_output: result.parquet
|
@@ -188,8 +188,8 @@ step_4_r:
|
|
188
188
|
steps:
|
189
189
|
- step_4
|
190
190
|
image_name: r-image.sif
|
191
|
-
zenodo_record_id:
|
192
|
-
md5_checksum:
|
191
|
+
zenodo_record_id: 15733426
|
192
|
+
md5_checksum: a4a03b836694a2b81a1bd2852736ccc5
|
193
193
|
script_cmd: Rscript /dummy_step.R
|
194
194
|
env:
|
195
195
|
INPUT_ENV_VARS: DUMMY_CONTAINER_MAIN_INPUT_FILE_PATHS,DUMMY_CONTAINER_SECONDARY_INPUT_FILE_PATHS
|
@@ -201,8 +201,8 @@ step_1_and_step_2_combined_python_pandas:
|
|
201
201
|
- step_1
|
202
202
|
- step_2
|
203
203
|
image_name: python_pandas.sif
|
204
|
-
zenodo_record_id:
|
205
|
-
md5_checksum:
|
204
|
+
zenodo_record_id: 15733426
|
205
|
+
md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
|
206
206
|
script_cmd: python /dummy_step.py
|
207
207
|
outputs:
|
208
208
|
step_2_main_output: result.parquet
|
@@ -211,8 +211,8 @@ step_1_and_step_2_parallel_python_pandas:
|
|
211
211
|
- step_1
|
212
212
|
- step_2
|
213
213
|
image_name: python_pandas.sif
|
214
|
-
zenodo_record_id:
|
215
|
-
md5_checksum:
|
214
|
+
zenodo_record_id: 15733426
|
215
|
+
md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
|
216
216
|
script_cmd: python /dummy_step.py
|
217
217
|
env:
|
218
218
|
INPUT_ENV_VARS: STEP_1_DUMMY_CONTAINER_MAIN_INPUT_FILE_PATHS,STEP_2_DUMMY_CONTAINER_MAIN_INPUT_FILE_PATHS
|
@@ -223,8 +223,8 @@ step_3_and_step_4_combined_python_pandas:
|
|
223
223
|
- step_3
|
224
224
|
- step_4
|
225
225
|
image_name: python_pandas.sif
|
226
|
-
zenodo_record_id:
|
227
|
-
md5_checksum:
|
226
|
+
zenodo_record_id: 15733426
|
227
|
+
md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
|
228
228
|
script_cmd: python /dummy_step.py
|
229
229
|
outputs:
|
230
230
|
step_4_main_output: result.parquet
|
@@ -233,8 +233,8 @@ step_1a_and_step_1b_combined_python_pandas:
|
|
233
233
|
- step_1a
|
234
234
|
- step_1b
|
235
235
|
image_name: python_pandas.sif
|
236
|
-
zenodo_record_id:
|
237
|
-
md5_checksum:
|
236
|
+
zenodo_record_id: 15733426
|
237
|
+
md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
|
238
238
|
script_cmd: python /dummy_step.py
|
239
239
|
outputs:
|
240
240
|
step_1_main_output: result.parquet
|
@@ -362,3 +362,10 @@ update_clusters_by_connected_components:
|
|
362
362
|
script_cmd: python /update_clusters_by_connected_components.py
|
363
363
|
outputs:
|
364
364
|
clusters: result.parquet
|
365
|
+
middle_name_to_initial:
|
366
|
+
steps:
|
367
|
+
- pre-processing
|
368
|
+
image_name: main/middle_name_to_initial.sif
|
369
|
+
script_cmd: python /middle_name_to_initial.py
|
370
|
+
outputs:
|
371
|
+
dataset: dataset
|
easylink/pipeline_schema.py
CHANGED
@@ -159,10 +159,10 @@ class PipelineSchema(HierarchicalStep):
|
|
159
159
|
)
|
160
160
|
|
161
161
|
@classmethod
|
162
|
-
def get_schema(cls, name: str = "main") ->
|
163
|
-
"""Gets
|
162
|
+
def get_schema(cls, name: str = "main") -> "PipelineSchema":
|
163
|
+
"""Gets the requested ``PipelineSchema``.
|
164
164
|
|
165
|
-
|
165
|
+
This ``PipelineSchema`` represents the fully supported pipelines and is
|
166
166
|
used to validate the user-requested pipeline.
|
167
167
|
|
168
168
|
Parameters
|
easylink/pipeline_schema_constants/__init__.py
CHANGED
@@ -27,4 +27,5 @@ SCHEMA_PARAMS = {
|
|
27
27
|
"auto_parallel_cloneable_step": testing.SCHEMA_PARAMS_AUTO_PARALLEL_CLONEABLE_STEP,
|
28
28
|
"auto_parallel_loop_step": testing.SCHEMA_PARAMS_AUTO_PARALLEL_LOOP_STEP,
|
29
29
|
"auto_parallel_hierarchical_step": testing.SCHEMA_PARAMS_AUTO_PARALLEL_HIERARCHICAL_STEP,
|
30
|
+
"default_implementations": testing.SCHEMA_PARAMS_DEFAULT_IMPLEMENTATIONS,
|
30
31
|
}
|
easylink/pipeline_schema_constants/testing.py
CHANGED
@@ -607,7 +607,7 @@ NODES_OUTPUT_DIR = [
|
|
607
607
|
input_slots=[
|
608
608
|
InputSlot(
|
609
609
|
name="step_2_main_input",
|
610
|
-
env_var="
|
610
|
+
env_var="MAIN_INPUT_DIR_PATH",
|
611
611
|
validator=validate_dir,
|
612
612
|
)
|
613
613
|
],
|
@@ -640,3 +640,126 @@ EDGES_OUTPUT_DIR = [
|
|
640
640
|
),
|
641
641
|
]
|
642
642
|
SCHEMA_PARAMS_OUTPUT_DIR = (NODES_OUTPUT_DIR, EDGES_OUTPUT_DIR)
|
643
|
+
|
644
|
+
|
645
|
+
NODES_DEFAULT_IMPLEMENTATIONS = [
|
646
|
+
InputStep(),
|
647
|
+
HierarchicalStep(
|
648
|
+
step_name="step_1",
|
649
|
+
input_slots=[
|
650
|
+
InputSlot(
|
651
|
+
name="step_1_main_input",
|
652
|
+
env_var="DUMMY_CONTAINER_MAIN_INPUT_FILE_PATHS",
|
653
|
+
validator=validate_input_file_dummy,
|
654
|
+
),
|
655
|
+
],
|
656
|
+
output_slots=[OutputSlot("step_1_main_output")],
|
657
|
+
nodes=[
|
658
|
+
Step(
|
659
|
+
step_name="step_1a",
|
660
|
+
input_slots=[
|
661
|
+
InputSlot(
|
662
|
+
name="step_1a_main_input",
|
663
|
+
env_var="DUMMY_CONTAINER_MAIN_INPUT_FILE_PATHS",
|
664
|
+
validator=validate_input_file_dummy,
|
665
|
+
),
|
666
|
+
],
|
667
|
+
output_slots=[OutputSlot("step_1a_main_output")],
|
668
|
+
default_implementation="step_1a_python_pandas",
|
669
|
+
),
|
670
|
+
Step(
|
671
|
+
step_name="step_1b",
|
672
|
+
input_slots=[
|
673
|
+
InputSlot(
|
674
|
+
name="step_1b_main_input",
|
675
|
+
env_var="DUMMY_CONTAINER_MAIN_INPUT_FILE_PATHS",
|
676
|
+
validator=validate_input_file_dummy,
|
677
|
+
),
|
678
|
+
],
|
679
|
+
output_slots=[OutputSlot("step_1b_main_output")],
|
680
|
+
default_implementation="step_1b_python_pandas",
|
681
|
+
),
|
682
|
+
],
|
683
|
+
edges=[
|
684
|
+
EdgeParams(
|
685
|
+
source_node="step_1a",
|
686
|
+
target_node="step_1b",
|
687
|
+
output_slot="step_1a_main_output",
|
688
|
+
input_slot="step_1b_main_input",
|
689
|
+
),
|
690
|
+
],
|
691
|
+
input_slot_mappings=[
|
692
|
+
InputSlotMapping(
|
693
|
+
parent_slot="step_1_main_input",
|
694
|
+
child_node="step_1a",
|
695
|
+
child_slot="step_1a_main_input",
|
696
|
+
),
|
697
|
+
],
|
698
|
+
output_slot_mappings=[
|
699
|
+
OutputSlotMapping(
|
700
|
+
parent_slot="step_1_main_output",
|
701
|
+
child_node="step_1b",
|
702
|
+
child_slot="step_1b_main_output",
|
703
|
+
),
|
704
|
+
],
|
705
|
+
default_implementation="step_1_python_pandas",
|
706
|
+
),
|
707
|
+
Step(
|
708
|
+
step_name="step_2",
|
709
|
+
input_slots=[
|
710
|
+
InputSlot(
|
711
|
+
name="step_2_main_input",
|
712
|
+
env_var="DUMMY_CONTAINER_MAIN_INPUT_FILE_PATHS",
|
713
|
+
validator=validate_input_file_dummy,
|
714
|
+
)
|
715
|
+
],
|
716
|
+
output_slots=[OutputSlot("step_2_main_output")],
|
717
|
+
default_implementation="step_2_python_pandas",
|
718
|
+
),
|
719
|
+
LoopStep(
|
720
|
+
template_step=Step(
|
721
|
+
step_name="step_3",
|
722
|
+
input_slots=[
|
723
|
+
InputSlot(
|
724
|
+
name="step_3_main_input",
|
725
|
+
env_var="DUMMY_CONTAINER_MAIN_INPUT_FILE_PATHS",
|
726
|
+
validator=validate_input_file_dummy,
|
727
|
+
)
|
728
|
+
],
|
729
|
+
output_slots=[OutputSlot("step_3_main_output")],
|
730
|
+
),
|
731
|
+
self_edges=[
|
732
|
+
EdgeParams(
|
733
|
+
source_node="step_3",
|
734
|
+
target_node="step_3",
|
735
|
+
output_slot="step_3_main_output",
|
736
|
+
input_slot="step_3_main_input",
|
737
|
+
),
|
738
|
+
],
|
739
|
+
default_implementation="step_3_python_pandas",
|
740
|
+
),
|
741
|
+
CloneableStep(
|
742
|
+
template_step=Step(
|
743
|
+
step_name="step_4",
|
744
|
+
input_slots=[
|
745
|
+
InputSlot(
|
746
|
+
name="step_4_main_input",
|
747
|
+
env_var="DUMMY_CONTAINER_MAIN_INPUT_FILE_PATHS",
|
748
|
+
validator=validate_input_file_dummy,
|
749
|
+
),
|
750
|
+
],
|
751
|
+
output_slots=[
|
752
|
+
OutputSlot(
|
753
|
+
name="step_4_main_output",
|
754
|
+
),
|
755
|
+
],
|
756
|
+
),
|
757
|
+
default_implementation="step_4_python_pandas",
|
758
|
+
),
|
759
|
+
OutputStep(
|
760
|
+
input_slots=[
|
761
|
+
InputSlot(name="result", env_var=None, validator=validate_input_file_dummy)
|
762
|
+
],
|
763
|
+
),
|
764
|
+
]
|
765
|
+
SCHEMA_PARAMS_DEFAULT_IMPLEMENTATIONS = (NODES_DEFAULT_IMPLEMENTATIONS, EDGES_TWO_STEPS)
|
easylink/rule.py
CHANGED
@@ -182,15 +182,15 @@ rule:
|
|
182
182
|
# TODO [MIC-5787]: handle multiple wildcards, e.g.
|
183
183
|
# output_paths = ",".join(self.output)
|
184
184
|
# wildcards_subdir = "/".join([f"{{wildcards.{wc}}}" for wc in self.wildcards])
|
185
|
-
# and then in shell cmd: export
|
185
|
+
# and then in shell cmd: export OUTPUT_PATHS={output_paths}/{wildcards_subdir}
|
186
186
|
|
187
187
|
# snakemake shell commands require wildcards to be prefaced with 'wildcards.'
|
188
188
|
output_files = ",".join(self.output).replace("{chunk}", "{wildcards.chunk}")
|
189
189
|
shell_cmd = f"""
|
190
190
|
shell:
|
191
191
|
'''
|
192
|
-
export
|
193
|
-
export
|
192
|
+
export OUTPUT_PATHS={output_files}
|
193
|
+
export DIAGNOSTICS_DIRECTORY={self.diagnostics_dir}"""
|
194
194
|
for input_slot_attrs in self.input_slots.values():
|
195
195
|
# snakemake shell commands require wildcards to be prefaced with 'wildcards.'
|
196
196
|
input_files = ",".join(input_slot_attrs["filepaths"]).replace(
|
@@ -200,8 +200,8 @@ rule:
|
|
200
200
|
export {input_slot_attrs["env_var"]}={input_files}"""
|
201
201
|
if self.requires_spark:
|
202
202
|
shell_cmd += f"""
|
203
|
-
read -r
|
204
|
-
export
|
203
|
+
read -r SPARK_MASTER_URL < {{input.master_url}}
|
204
|
+
export SPARK_MASTER_URL"""
|
205
205
|
for var_name, var_value in self.envvars.items():
|
206
206
|
shell_cmd += f"""
|
207
207
|
export {var_name}={var_value}"""
|
easylink/step.py
CHANGED
@@ -92,6 +92,7 @@ class Step:
|
|
92
92
|
input_slot_mappings: Iterable[InputSlotMapping] = (),
|
93
93
|
output_slot_mappings: Iterable[OutputSlotMapping] = (),
|
94
94
|
is_auto_parallel: bool = False,
|
95
|
+
default_implementation: str | None = None,
|
95
96
|
) -> None:
|
96
97
|
if not step_name and not name:
|
97
98
|
raise ValueError("All Steps must contain a step_name, name, or both.")
|
@@ -127,6 +128,9 @@ class Step:
|
|
127
128
|
``OutputSlotMappings`` of this ``Step``."""
|
128
129
|
self.is_auto_parallel = is_auto_parallel
|
129
130
|
"""Whether or not this ``Step`` is to be automatically run in parallel."""
|
131
|
+
self.default_implementation = default_implementation
|
132
|
+
"""The default implementation to use for this ``Step`` if the ``Step`` is
|
133
|
+
not explicitly configured in the pipeline specification."""
|
130
134
|
self.parent_step = None
|
131
135
|
"""This ``Step's`` parent ``Step``, if applicable."""
|
132
136
|
self._configuration_state = None
|
@@ -580,6 +584,7 @@ class HierarchicalStep(Step):
|
|
580
584
|
input_slot_mappings=(),
|
581
585
|
output_slot_mappings=(),
|
582
586
|
directly_implemented=True,
|
587
|
+
default_implementation: str | None = None,
|
583
588
|
):
|
584
589
|
super().__init__(
|
585
590
|
step_name,
|
@@ -588,6 +593,7 @@ class HierarchicalStep(Step):
|
|
588
593
|
output_slots,
|
589
594
|
input_slot_mappings,
|
590
595
|
output_slot_mappings,
|
596
|
+
default_implementation=default_implementation,
|
591
597
|
)
|
592
598
|
self.nodes = nodes
|
593
599
|
"""All sub-nodes (i.e. sub-``Steps``) that make up this ``HierarchicalStep``."""
|
@@ -722,13 +728,19 @@ class HierarchicalStep(Step):
|
|
722
728
|
step = self.step_graph.nodes[node]["step"]
|
723
729
|
if isinstance(step, IOStep):
|
724
730
|
continue
|
731
|
+
if step.name not in step_config:
|
732
|
+
default_implementation = self.step_graph.nodes[step.name][
|
733
|
+
"step"
|
734
|
+
].default_implementation
|
735
|
+
step_errors = (
|
736
|
+
{f"step {step.name}": ["The step is not configured."]}
|
737
|
+
if not default_implementation
|
738
|
+
else {}
|
739
|
+
)
|
725
740
|
else:
|
726
|
-
|
727
|
-
|
728
|
-
|
729
|
-
step_errors = step.validate_step(
|
730
|
-
step_config[step.name], combined_implementations, input_data_config
|
731
|
-
)
|
741
|
+
step_errors = step.validate_step(
|
742
|
+
step_config[step.name], combined_implementations, input_data_config
|
743
|
+
)
|
732
744
|
if step_errors:
|
733
745
|
errors.update(step_errors)
|
734
746
|
extra_steps = set(step_config.keys()) - set(self.step_graph.nodes)
|
@@ -830,12 +842,14 @@ class TemplatedStep(Step, ABC):
|
|
830
842
|
def __init__(
|
831
843
|
self,
|
832
844
|
template_step: Step,
|
845
|
+
default_implementation: str | None = None,
|
833
846
|
) -> None:
|
834
847
|
super().__init__(
|
835
848
|
template_step.step_name,
|
836
849
|
template_step.name,
|
837
850
|
template_step.input_slots.values(),
|
838
851
|
template_step.output_slots.values(),
|
852
|
+
default_implementation=default_implementation,
|
839
853
|
)
|
840
854
|
self.step_graph = None
|
841
855
|
"""The :class:`~easylink.graph_components.StepGraph` i.e. the directed acyclic
|
@@ -1110,8 +1124,9 @@ class LoopStep(TemplatedStep):
|
|
1110
1124
|
self,
|
1111
1125
|
template_step: Step | None = None,
|
1112
1126
|
self_edges: Iterable[EdgeParams] = (),
|
1127
|
+
default_implementation: str | None = None,
|
1113
1128
|
) -> None:
|
1114
|
-
super().__init__(template_step)
|
1129
|
+
super().__init__(template_step, default_implementation)
|
1115
1130
|
self.self_edges = self_edges
|
1116
1131
|
""":class:`~easylink.graph_components.EdgeParams` that represent self-edges,
|
1117
1132
|
i.e. edges that connect the output of one loop to the input of the next."""
|
@@ -1119,7 +1134,7 @@ class LoopStep(TemplatedStep):
|
|
1119
1134
|
@property
|
1120
1135
|
def config_key(self):
|
1121
1136
|
"""The pipeline specification key required for a ``LoopStep``."""
|
1122
|
-
return "
|
1137
|
+
return "iterations"
|
1123
1138
|
|
1124
1139
|
@property
|
1125
1140
|
def node_prefix(self):
|
@@ -2181,15 +2196,32 @@ class NonLeafConfigurationState(ConfigurationState):
|
|
2181
2196
|
|
2182
2197
|
This method recursively traverses the ``StepGraph`` and sets the configuration
|
2183
2198
|
state for each ``Step`` until reaching all leaf nodes.
|
2199
|
+
|
2200
|
+
Notes
|
2201
|
+
-----
|
2202
|
+
If a ``Step`` name is missing from the ``step_config``, we know that it
|
2203
|
+
must have a default implementation because we already validated that one
|
2204
|
+
exists during :meth:`HierarchicalStep._validate_step_graph`. In that case,
|
2205
|
+
we manually instantiate and use a ``step_config`` with the default implementation.
|
2184
2206
|
"""
|
2185
2207
|
for sub_node in self._step.step_graph.nodes:
|
2186
2208
|
sub_step = self._step.step_graph.nodes[sub_node]["step"]
|
2187
|
-
|
2188
|
-
|
2189
|
-
|
2190
|
-
|
2191
|
-
|
2192
|
-
|
2209
|
+
try:
|
2210
|
+
step_config = (
|
2211
|
+
self.step_config
|
2212
|
+
if isinstance(sub_step, StandaloneStep)
|
2213
|
+
else self.step_config[sub_step.name]
|
2214
|
+
)
|
2215
|
+
except KeyError:
|
2216
|
+
# We know that any missing keys must have a default implementation
|
2217
|
+
# (because we have already checked that it exists during validation)
|
2218
|
+
step_config = LayeredConfigTree(
|
2219
|
+
{
|
2220
|
+
"implementation": {
|
2221
|
+
"name": sub_step.default_implementation,
|
2222
|
+
}
|
2223
|
+
}
|
2224
|
+
)
|
2193
2225
|
sub_step.set_configuration_state(
|
2194
2226
|
step_config, self.combined_implementations, self.input_data_config
|
2195
2227
|
)
|
easylink/steps/cascading/exclude_clustered.py
CHANGED
@@ -69,8 +69,8 @@ clustered_record_ids = set(dataset_df["Record ID"].unique()) & set(
|
|
69
69
|
|
70
70
|
IDS_TO_REMOVE = pd.DataFrame({"Record ID": list(clustered_record_ids)})
|
71
71
|
|
72
|
-
#
|
73
|
-
results_filepath = os.environ["
|
72
|
+
# OUTPUT_PATHS is a single path to a file (results.parquet)
|
73
|
+
results_filepath = os.environ["OUTPUT_PATHS"]
|
74
74
|
|
75
75
|
logging.info(f"Writing output for dataset from input {dataset_path} to {results_filepath}")
|
76
76
|
IDS_TO_REMOVE.to_parquet(results_filepath)
|
easylink/steps/cascading/exclude_none.py
CHANGED
@@ -69,8 +69,8 @@ clusters_df = load_file(clusters_filepath)
|
|
69
69
|
|
70
70
|
IDS_TO_REMOVE = pd.DataFrame(columns=["Record ID"])
|
71
71
|
|
72
|
-
#
|
73
|
-
results_filepath = os.environ["
|
72
|
+
# OUTPUT_PATHS is a single path to a file (results.parquet)
|
73
|
+
results_filepath = os.environ["OUTPUT_PATHS"]
|
74
74
|
|
75
75
|
logging.info(f"Writing output for dataset from input {dataset_path} to {results_filepath}")
|
76
76
|
IDS_TO_REMOVE.to_parquet(results_filepath)
|
easylink/steps/cascading/update_clusters_by_connected_components.py
CHANGED
@@ -50,8 +50,8 @@ if len(known_clusters_filepaths) == 0:
|
|
50
50
|
known_clusters_filepath = known_clusters_filepaths[0]
|
51
51
|
known_clusters_df = load_file(known_clusters_filepath)
|
52
52
|
|
53
|
-
#
|
54
|
-
results_filepath = os.environ["
|
53
|
+
# OUTPUT_PATHS is a path to a single file (clusters.parquet)
|
54
|
+
results_filepath = os.environ["OUTPUT_PATHS"]
|
55
55
|
Path(results_filepath).parent.mkdir(exist_ok=True, parents=True)
|
56
56
|
|
57
57
|
new_clusters_df = load_file(new_clusters_filepath)
|
easylink/steps/default/default_clusters_to_links.py
CHANGED
@@ -80,8 +80,8 @@ if len(clusters_filepaths) == 0:
|
|
80
80
|
|
81
81
|
clusters_filepath = clusters_filepaths[0]
|
82
82
|
|
83
|
-
#
|
84
|
-
results_filepath = os.environ["
|
83
|
+
# OUTPUT_PATHS is a path to a single file (results.parquet)
|
84
|
+
results_filepath = os.environ["OUTPUT_PATHS"]
|
85
85
|
|
86
86
|
clusters_df = load_file(clusters_filepath)
|
87
87
|
links_df = clusters_to_links(clusters_df)
|
easylink/steps/default/default_determining_exclusions.py
CHANGED
@@ -74,8 +74,8 @@ if len(clusters_df) > 0:
|
|
74
74
|
|
75
75
|
IDS_TO_REMOVE = pd.DataFrame(columns=["Record ID"])
|
76
76
|
|
77
|
-
#
|
78
|
-
results_filepath = os.environ["
|
77
|
+
# OUTPUT_PATHS is a single path to a file (results.parquet)
|
78
|
+
results_filepath = os.environ["OUTPUT_PATHS"]
|
79
79
|
|
80
80
|
logging.info(f"Writing output for dataset from input {dataset_path} to {results_filepath}")
|
81
81
|
IDS_TO_REMOVE.to_parquet(results_filepath)
|