easylink 0.1.22__py3-none-any.whl → 0.1.23__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- easylink/_version.py +1 -1
- easylink/implementation_metadata.yaml +53 -46
- easylink/pipeline_schema_constants/testing.py +1 -1
- easylink/rule.py +5 -5
- easylink/step.py +1 -1
- easylink/steps/cascading/exclude_clustered.py +2 -2
- easylink/steps/cascading/exclude_none.py +2 -2
- easylink/steps/cascading/update_clusters_by_connected_components.py +2 -2
- easylink/steps/default/default_clusters_to_links.py +2 -2
- easylink/steps/default/default_determining_exclusions.py +2 -2
- easylink/steps/default/default_removing_records.py +2 -2
- easylink/steps/default/default_schema_alignment.py +3 -2
- easylink/steps/default/default_updating_clusters.py +2 -2
- easylink/steps/dev/README.md +1 -1
- easylink/steps/dev/python_pandas/dummy_step.py +4 -4
- easylink/steps/dev/python_pandas/python_pandas.def +2 -13
- easylink/steps/dev/python_pyspark/dummy_step.py +5 -7
- easylink/steps/dev/python_pyspark/python_pyspark.def +2 -12
- easylink/steps/dev/r/dummy_step.R +2 -2
- easylink/steps/dev/r/r-image.def +2 -12
- easylink/steps/example/middle_name_to_initial.def +22 -0
- easylink/steps/example/middle_name_to_initial.py +60 -0
- easylink/steps/fastLink/fastLink_evaluating_pairs.R +4 -4
- easylink/steps/fastLink/fastLink_links_to_clusters.R +2 -2
- easylink/steps/output_dir/dummy_step_1_for_output_dir_example.py +1 -1
- easylink/steps/output_dir/dummy_step_2_for_output_dir_example.py +2 -2
- easylink/steps/rl-dummy/canonicalizing_and_downstream_analysis/dummy_canonicalizing_and_downstream_analysis.py +2 -2
- easylink/steps/rl-dummy/pre-processing/dummy_pre-processing.py +2 -2
- easylink/steps/splink/splink_blocking_and_filtering.def +1 -1
- easylink/steps/splink/splink_blocking_and_filtering.py +32 -6
- easylink/steps/splink/splink_evaluating_pairs.py +14 -4
- easylink/steps/splink/splink_links_to_clusters.py +1 -1
- {easylink-0.1.22.dist-info → easylink-0.1.23.dist-info}/METADATA +1 -1
- {easylink-0.1.22.dist-info → easylink-0.1.23.dist-info}/RECORD +38 -36
- {easylink-0.1.22.dist-info → easylink-0.1.23.dist-info}/WHEEL +0 -0
- {easylink-0.1.22.dist-info → easylink-0.1.23.dist-info}/entry_points.txt +0 -0
- {easylink-0.1.22.dist-info → easylink-0.1.23.dist-info}/licenses/LICENSE +0 -0
- {easylink-0.1.22.dist-info → easylink-0.1.23.dist-info}/top_level.txt +0 -0
easylink/_version.py
CHANGED
@@ -1 +1 @@
-__version__ = "0.1.22"
+__version__ = "0.1.23"
easylink/implementation_metadata.yaml
CHANGED
@@ -2,8 +2,8 @@ step_1_python_pandas:
   steps:
     - step_1
   image_name: python_pandas.sif
-  zenodo_record_id:
-  md5_checksum:
+  zenodo_record_id: 15733426
+  md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
   script_cmd: python /dummy_step.py
   outputs:
     step_1_main_output: result.parquet
@@ -11,8 +11,8 @@ step_1a_python_pandas:
   steps:
     - step_1a
   image_name: python_pandas.sif
-  zenodo_record_id:
-  md5_checksum:
+  zenodo_record_id: 15733426
+  md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
   script_cmd: python /dummy_step.py
   env:
     INPUT_ENV_VARS: DUMMY_CONTAINER_MAIN_INPUT_FILE_PATHS,DUMMY_CONTAINER_SECONDARY_INPUT_FILE_PATHS
@@ -22,8 +22,8 @@ step_1b_python_pandas:
   steps:
     - step_1b
   image_name: python_pandas.sif
-  zenodo_record_id:
-  md5_checksum:
+  zenodo_record_id: 15733426
+  md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
   script_cmd: python /dummy_step.py
   env:
     INPUT_ENV_VARS: DUMMY_CONTAINER_MAIN_INPUT_FILE_PATHS,DUMMY_CONTAINER_SECONDARY_INPUT_FILE_PATHS
@@ -33,8 +33,8 @@ step_2_python_pandas:
   steps:
     - step_2
   image_name: python_pandas.sif
-  zenodo_record_id:
-  md5_checksum:
+  zenodo_record_id: 15733426
+  md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
   script_cmd: python /dummy_step.py
   outputs:
     step_2_main_output: result.parquet
@@ -42,8 +42,8 @@ step_3_python_pandas:
   steps:
     - step_3
   image_name: python_pandas.sif
-  zenodo_record_id:
-  md5_checksum:
+  zenodo_record_id: 15733426
+  md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
   script_cmd: python /dummy_step.py
   outputs:
     step_3_main_output: result.parquet
@@ -51,8 +51,8 @@ step_4_python_pandas:
   steps:
     - step_4
   image_name: python_pandas.sif
-  zenodo_record_id:
-  md5_checksum:
+  zenodo_record_id: 15733426
+  md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
   script_cmd: python /dummy_step.py
   env:
     INPUT_ENV_VARS: DUMMY_CONTAINER_MAIN_INPUT_FILE_PATHS,DUMMY_CONTAINER_SECONDARY_INPUT_FILE_PATHS
@@ -62,8 +62,8 @@ step_5_python_pandas:
   steps:
     - step_5
   image_name: python_pandas.sif
-  zenodo_record_id:
-  md5_checksum:
+  zenodo_record_id: 15733426
+  md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
   script_cmd: python /dummy_step.py
   env:
     INPUT_ENV_VARS: DUMMY_CONTAINER_MAIN_INPUT_FILE_PATHS,DUMMY_CONTAINER_SECONDARY_INPUT_FILE_PATHS
@@ -73,8 +73,8 @@ step_6_python_pandas:
   steps:
     - step_6
   image_name: python_pandas.sif
-  zenodo_record_id:
-  md5_checksum:
+  zenodo_record_id: 15733426
+  md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
   script_cmd: python /dummy_step.py
   env:
     INPUT_ENV_VARS: DUMMY_CONTAINER_MAIN_INPUT_FILE_PATHS,DUMMY_CONTAINER_SECONDARY_INPUT_FILE_PATHS
@@ -84,8 +84,8 @@ step_4a_python_pandas:
   steps:
     - step_4a
   image_name: python_pandas.sif
-  zenodo_record_id:
-  md5_checksum:
+  zenodo_record_id: 15733426
+  md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
   script_cmd: python /dummy_step.py
   env:
     INPUT_ENV_VARS: DUMMY_CONTAINER_MAIN_INPUT_FILE_PATHS,DUMMY_CONTAINER_SECONDARY_INPUT_FILE_PATHS
@@ -95,8 +95,8 @@ step_4b_python_pandas:
   steps:
     - step_4b
   image_name: python_pandas.sif
-  zenodo_record_id:
-  md5_checksum:
+  zenodo_record_id: 15733426
+  md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
   script_cmd: python /dummy_step.py
   env:
     INPUT_ENV_VARS: DUMMY_CONTAINER_MAIN_INPUT_FILE_PATHS,DUMMY_CONTAINER_SECONDARY_INPUT_FILE_PATHS
@@ -106,8 +106,8 @@ step_4b_r:
   steps:
     - step_4b
   image_name: r-image.sif
-  zenodo_record_id:
-  md5_checksum:
+  zenodo_record_id: 15733426
+  md5_checksum: a4a03b836694a2b81a1bd2852736ccc5
   script_cmd: Rscript /dummy_step.R
   env:
     INPUT_ENV_VARS: DUMMY_CONTAINER_MAIN_INPUT_FILE_PATHS,DUMMY_CONTAINER_SECONDARY_INPUT_FILE_PATHS
@@ -117,8 +117,8 @@ step_1_python_pyspark:
   steps:
     - step_1
   image_name: python_pyspark.sif
-  zenodo_record_id:
-  md5_checksum:
+  zenodo_record_id: 15733426
+  md5_checksum: c948577ab0607411dd4b640622d9ec3a
   script_cmd: python3 /code/dummy_step.py
   outputs:
     step_1_main_output: result.parquet
@@ -127,8 +127,8 @@ step_2_python_pyspark:
   steps:
     - step_2
   image_name: python_pyspark.sif
-  zenodo_record_id:
-  md5_checksum:
+  zenodo_record_id: 15733426
+  md5_checksum: c948577ab0607411dd4b640622d9ec3a
   script_cmd: python3 /code/dummy_step.py
   outputs:
     step_2_main_output: result.parquet
@@ -137,8 +137,8 @@ step_3_python_pyspark:
   steps:
     - step_3
   image_name: python_pyspark.sif
-  zenodo_record_id:
-  md5_checksum:
+  zenodo_record_id: 15733426
+  md5_checksum: c948577ab0607411dd4b640622d9ec3a
   script_cmd: python3 /code/dummy_step.py
   outputs:
     step_3_main_output: result.parquet
@@ -147,8 +147,8 @@ step_4_python_pyspark:
   steps:
     - step_4
   image_name: python_pyspark.sif
-  zenodo_record_id:
-  md5_checksum:
+  zenodo_record_id: 15733426
+  md5_checksum: c948577ab0607411dd4b640622d9ec3a
   script_cmd: python3 /code/dummy_step.py
   env:
     INPUT_ENV_VARS: DUMMY_CONTAINER_MAIN_INPUT_FILE_PATHS,DUMMY_CONTAINER_SECONDARY_INPUT_FILE_PATHS
@@ -158,8 +158,8 @@ step_1_r:
   steps:
     - step_1
   image_name: r-image.sif
-  zenodo_record_id:
-  md5_checksum:
+  zenodo_record_id: 15733426
+  md5_checksum: a4a03b836694a2b81a1bd2852736ccc5
   script_cmd: Rscript /dummy_step.R
   outputs:
     step_1_main_output: result.parquet
@@ -168,8 +168,8 @@ step_2_r:
   steps:
     - step_2
   image_name: r-image.sif
-  zenodo_record_id:
-  md5_checksum:
+  zenodo_record_id: 15733426
+  md5_checksum: a4a03b836694a2b81a1bd2852736ccc5
   script_cmd: Rscript /dummy_step.R
   outputs:
     step_2_main_output: result.parquet
@@ -178,8 +178,8 @@ step_3_r:
   steps:
     - step_3
   image_name: r-image.sif
-  zenodo_record_id:
-  md5_checksum:
+  zenodo_record_id: 15733426
+  md5_checksum: a4a03b836694a2b81a1bd2852736ccc5
   script_cmd: Rscript /dummy_step.R
   outputs:
     step_3_main_output: result.parquet
@@ -188,8 +188,8 @@ step_4_r:
   steps:
     - step_4
   image_name: r-image.sif
-  zenodo_record_id:
-  md5_checksum:
+  zenodo_record_id: 15733426
+  md5_checksum: a4a03b836694a2b81a1bd2852736ccc5
   script_cmd: Rscript /dummy_step.R
   env:
     INPUT_ENV_VARS: DUMMY_CONTAINER_MAIN_INPUT_FILE_PATHS,DUMMY_CONTAINER_SECONDARY_INPUT_FILE_PATHS
@@ -201,8 +201,8 @@ step_1_and_step_2_combined_python_pandas:
     - step_1
     - step_2
   image_name: python_pandas.sif
-  zenodo_record_id:
-  md5_checksum:
+  zenodo_record_id: 15733426
+  md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
   script_cmd: python /dummy_step.py
   outputs:
     step_2_main_output: result.parquet
@@ -211,8 +211,8 @@ step_1_and_step_2_parallel_python_pandas:
     - step_1
     - step_2
   image_name: python_pandas.sif
-  zenodo_record_id:
-  md5_checksum:
+  zenodo_record_id: 15733426
+  md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
   script_cmd: python /dummy_step.py
   env:
     INPUT_ENV_VARS: STEP_1_DUMMY_CONTAINER_MAIN_INPUT_FILE_PATHS,STEP_2_DUMMY_CONTAINER_MAIN_INPUT_FILE_PATHS
@@ -223,8 +223,8 @@ step_3_and_step_4_combined_python_pandas:
     - step_3
     - step_4
   image_name: python_pandas.sif
-  zenodo_record_id:
-  md5_checksum:
+  zenodo_record_id: 15733426
+  md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
   script_cmd: python /dummy_step.py
   outputs:
     step_4_main_output: result.parquet
@@ -233,8 +233,8 @@ step_1a_and_step_1b_combined_python_pandas:
     - step_1a
     - step_1b
   image_name: python_pandas.sif
-  zenodo_record_id:
-  md5_checksum:
+  zenodo_record_id: 15733426
+  md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
   script_cmd: python /dummy_step.py
   outputs:
     step_1_main_output: result.parquet
@@ -362,3 +362,10 @@ update_clusters_by_connected_components:
   script_cmd: python /update_clusters_by_connected_components.py
   outputs:
     clusters: result.parquet
+middle_name_to_initial:
+  steps:
+    - pre-processing
+  image_name: main/middle_name_to_initial.sif
+  script_cmd: python /middle_name_to_initial.py
+  outputs:
+    dataset: dataset
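Every dummy implementation entry now pins its container image to Zenodo record 15733426 together with an md5 checksum (one digest per image: pandas, pyspark, and R). As a minimal sketch of how a consumer of this metadata could verify a downloaded .sif against the recorded digest (the verify_image() helper below is hypothetical, not EasyLink's actual download path):

import hashlib

import yaml


def verify_image(metadata_path: str, implementation: str, image_path: str) -> bool:
    # Look up the expected md5 recorded for this implementation.
    with open(metadata_path) as f:
        expected = yaml.safe_load(f)[implementation]["md5_checksum"]
    # Hash the image in chunks so large .sif files never sit in memory whole.
    digest = hashlib.md5()
    with open(image_path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):
            digest.update(chunk)
    return digest.hexdigest() == expected


verify_image("implementation_metadata.yaml", "step_1_python_pandas", "python_pandas.sif")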
easylink/rule.py
CHANGED
@@ -182,15 +182,15 @@ rule:
         # TODO [MIC-5787]: handle multiple wildcards, e.g.
         # output_paths = ",".join(self.output)
         # wildcards_subdir = "/".join([f"{{wildcards.{wc}}}" for wc in self.wildcards])
-        # and then in shell cmd: export
+        # and then in shell cmd: export OUTPUT_PATHS={output_paths}/{wildcards_subdir}

         # snakemake shell commands require wildcards to be prefaced with 'wildcards.'
         output_files = ",".join(self.output).replace("{chunk}", "{wildcards.chunk}")
         shell_cmd = f"""
    shell:
        '''
-        export
-        export
+        export OUTPUT_PATHS={output_files}
+        export DIAGNOSTICS_DIRECTORY={self.diagnostics_dir}"""
         for input_slot_attrs in self.input_slots.values():
            # snakemake shell commands require wildcards to be prefaced with 'wildcards.'
            input_files = ",".join(input_slot_attrs["filepaths"]).replace(
@@ -200,8 +200,8 @@ rule:
            export {input_slot_attrs["env_var"]}={input_files}"""
         if self.requires_spark:
            shell_cmd += f"""
-           read -r
-           export
+           read -r SPARK_MASTER_URL < {{input.master_url}}
+           export SPARK_MASTER_URL"""
         for var_name, var_value in self.envvars.items():
            shell_cmd += f"""
            export {var_name}={var_value}"""
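The net effect of the rule.py changes is that generated Snakemake rules now export generic OUTPUT_PATHS, DIAGNOSTICS_DIRECTORY, and SPARK_MASTER_URL variables. A minimal sketch of the string-assembly pattern above, with hard-coded local stand-ins for the Rule attributes (self.output, self.diagnostics_dir, self.envvars):

# Stand-ins for the Rule attributes used in the real implementation.
output_files = ",".join(["results/result.parquet"])
diagnostics_dir = "diagnostics/step_1"
envvars = {"BLOCKING_RULES": "l.dob = r.dob"}

shell_cmd = f"""
shell:
    '''
    export OUTPUT_PATHS={output_files}
    export DIAGNOSTICS_DIRECTORY={diagnostics_dir}"""
# Each implementation-specific environment variable gets its own export line.
for var_name, var_value in envvars.items():
    shell_cmd += f"""
    export {var_name}={var_value}"""
print(shell_cmd)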
easylink/step.py
CHANGED
easylink/steps/cascading/exclude_clustered.py
CHANGED
@@ -69,8 +69,8 @@ clustered_record_ids = set(dataset_df["Record ID"].unique()) & set(

 IDS_TO_REMOVE = pd.DataFrame({"Record ID": list(clustered_record_ids)})

-#
-results_filepath = os.environ["
+# OUTPUT_PATHS is a single path to a file (results.parquet)
+results_filepath = os.environ["OUTPUT_PATHS"]

 logging.info(f"Writing output for dataset from input {dataset_path} to {results_filepath}")
 IDS_TO_REMOVE.to_parquet(results_filepath)
easylink/steps/cascading/exclude_none.py
CHANGED
@@ -69,8 +69,8 @@ clusters_df = load_file(clusters_filepath)

 IDS_TO_REMOVE = pd.DataFrame(columns=["Record ID"])

-#
-results_filepath = os.environ["
+# OUTPUT_PATHS is a single path to a file (results.parquet)
+results_filepath = os.environ["OUTPUT_PATHS"]

 logging.info(f"Writing output for dataset from input {dataset_path} to {results_filepath}")
 IDS_TO_REMOVE.to_parquet(results_filepath)
easylink/steps/cascading/update_clusters_by_connected_components.py
CHANGED
@@ -50,8 +50,8 @@ if len(known_clusters_filepaths) == 0:
 known_clusters_filepath = known_clusters_filepaths[0]
 known_clusters_df = load_file(known_clusters_filepath)

-#
-results_filepath = os.environ["
+# OUTPUT_PATHS is a path to a single file (clusters.parquet)
+results_filepath = os.environ["OUTPUT_PATHS"]
 Path(results_filepath).parent.mkdir(exist_ok=True, parents=True)

 new_clusters_df = load_file(new_clusters_filepath)
easylink/steps/default/default_clusters_to_links.py
CHANGED
@@ -80,8 +80,8 @@ if len(clusters_filepaths) == 0:

 clusters_filepath = clusters_filepaths[0]

-#
-results_filepath = os.environ["
+# OUTPUT_PATHS is a path to a single file (results.parquet)
+results_filepath = os.environ["OUTPUT_PATHS"]

 clusters_df = load_file(clusters_filepath)
 links_df = clusters_to_links(clusters_df)
easylink/steps/default/default_determining_exclusions.py
CHANGED
@@ -74,8 +74,8 @@ if len(clusters_df) > 0:

 IDS_TO_REMOVE = pd.DataFrame(columns=["Record ID"])

-#
-results_filepath = os.environ["
+# OUTPUT_PATHS is a single path to a file (results.parquet)
+results_filepath = os.environ["OUTPUT_PATHS"]

 logging.info(f"Writing output for dataset from input {dataset_path} to {results_filepath}")
 IDS_TO_REMOVE.to_parquet(results_filepath)
easylink/steps/default/default_removing_records.py
CHANGED
@@ -45,8 +45,8 @@ if dataset_path is None:

 # IDS_TO_REMOVE_FILE_PATH is a single filepath (Cloneable section)
 ids_filepath = os.environ["IDS_TO_REMOVE_FILE_PATH"]
-#
-results_dir = Path(os.environ["
+# OUTPUT_PATHS is a single path to a directory ('dataset')
+results_dir = Path(os.environ["OUTPUT_PATHS"])
 results_dir.mkdir(exist_ok=True, parents=True)

 dataset = load_file(dataset_path)
easylink/steps/default/default_schema_alignment.py
CHANGED
@@ -42,11 +42,12 @@ records = pd.concat(
     ignore_index=True,
     sort=False,
 )
+# TODO: check both datasets contain all the columns

 records = records.rename(columns={"Record ID": "Input Record ID"})

-#
-output_path = os.environ["
+# OUTPUT_PATHS is a single filepath
+output_path = os.environ["OUTPUT_PATHS"]
 Path(output_path).parent.mkdir(exist_ok=True, parents=True)

 logging.info(f"Writing output to {output_path}")
easylink/steps/default/default_updating_clusters.py
CHANGED
@@ -54,8 +54,8 @@ if len(known_clusters_df) > 0:
         "Default implementation of updating_clusters passed a non-empty set of known clusters"
     )

-#
-results_filepath = os.environ["
+# OUTPUT_PATHS is a path to a single file (clusters.parquet)
+results_filepath = os.environ["OUTPUT_PATHS"]
 Path(results_filepath).parent.mkdir(exist_ok=True, parents=True)

 clusters_df = load_file(new_clusters_filepath)
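All eight step scripts above converge on the same output contract: the orchestrator exports OUTPUT_PATHS (a single file or directory, depending on the step), and the script creates any missing parent directory before writing Parquet. A consolidated sketch of that shared pattern (toy DataFrame, not any one step's real payload):

import os
from pathlib import Path

import pandas as pd

# OUTPUT_PATHS is exported by the generated Snakemake rule (see rule.py above).
results_filepath = os.environ["OUTPUT_PATHS"]
Path(results_filepath).parent.mkdir(exist_ok=True, parents=True)
pd.DataFrame({"Record ID": []}).to_parquet(results_filepath)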
easylink/steps/dev/README.md
CHANGED
@@ -46,7 +46,7 @@ is `DUMMY_CONTAINER_MAIN_INPUT_FILE_PATHS`, but you can also specify *what* the
 You can (optionally) provide another input file at `/extra_implementation_specific_input_data/input*` (Parquet or CSV) or a different path passed as `DUMMY_CONTAINER_EXTRA_IMPLEMENTATION_SPECIFIC_INPUT_FILE_PATH`.
 This is meant to represent an input that is specific to a given implementation.

-Output is written to `/results/result.<ext>` or a different comma-separated list of paths passed as `
+Output is written to `/results/result.<ext>` or a different comma-separated list of paths passed as `OUTPUT_PATHS`.
 If `DUMMY_CONTAINER_OUTPUT_FILE_TYPE` is `csv` it will be in CSV format, otherwise it will be Parquet.

 The environment variable `DUMMY_CONTAINER_BROKEN` makes the container return data that does not meet the specification.
easylink/steps/dev/python_pandas/dummy_step.py
CHANGED
@@ -101,9 +101,9 @@ else:
     df.drop(columns=columns_to_drop, inplace=True)

 output_file_format = os.getenv("DUMMY_CONTAINER_OUTPUT_FILE_FORMAT", "parquet")
-output_file_paths = os.getenv(
-    "
-)
+output_file_paths = os.getenv("OUTPUT_PATHS", f"/results/result.{output_file_format}").split(
+    ","
+)

 diagnostics["num_output_files"] = len(output_file_paths)
 diagnostics["output_file_paths"] = output_file_paths
@@ -117,7 +117,7 @@ for output_file_path in output_file_paths:
     else:
         raise ValueError()

-diagnostics_dir = os.getenv("
+diagnostics_dir = os.getenv("DIAGNOSTICS_DIRECTORY", "/diagnostics")
 try:
     with open(f"{diagnostics_dir}/diagnostics.yaml", "w") as f:
         yaml.dump(diagnostics, f, default_flow_style=False)
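Unlike the single-file steps, the dummy container allows OUTPUT_PATHS to carry several comma-separated destinations, and both variables fall back to in-container defaults when unset. A small sketch of the resolution logic the dummy step now uses:

import os

output_file_format = os.getenv("DUMMY_CONTAINER_OUTPUT_FILE_FORMAT", "parquet")
# One or many outputs: unset -> ["/results/result.parquet"]; set to
# "/tmp/a.parquet,/tmp/b.parquet" -> two paths.
output_file_paths = os.getenv(
    "OUTPUT_PATHS", f"/results/result.{output_file_format}"
).split(",")
diagnostics_dir = os.getenv("DIAGNOSTICS_DIRECTORY", "/diagnostics")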
easylink/steps/dev/python_pandas/python_pandas.def
CHANGED
@@ -1,3 +1,4 @@
+
 Bootstrap: docker
 From: python@sha256:1c26c25390307b64e8ff73e7edf34b4fbeac59d41da41c08da28dc316a721899

@@ -16,18 +17,6 @@ From: python@sha256:1c26c25390307b64e8ff73e7edf34b4fbeac59d41da41c08da28dc316a72

 %environment
     export LC_ALL=C
-    export PYTHONPATH=/app:$PYTHONPATH

 %runscript
-    python /dummy_step.py
-
-%labels
-    Author Patrick Nast
-    Version v1.0
-    Description Python Pandas Implementation
-
-%startscript
-    # These directories should be bound when running the container
-    mkdir -p /results
-    mkdir -p /diagnostics
-    mkdir -p /input_data
+    python /dummy_step.py '$@'
easylink/steps/dev/python_pyspark/dummy_step.py
CHANGED
@@ -17,9 +17,7 @@ logging.basicConfig(
 pyspark_log = logging.getLogger("pyspark")
 pyspark_log.setLevel(logging.WARNING)

-spark = SparkSession.builder.master(
-    os.getenv("DUMMY_CONTAINER_SPARK_MASTER_URL")
-).getOrCreate()
+spark = SparkSession.builder.master(os.getenv("SPARK_MASTER_URL")).getOrCreate()


 def load_file(file_path, file_format=None):
@@ -115,9 +113,9 @@ else:
     df = df.drop(*columns_to_drop)

 output_file_format = os.getenv("DUMMY_CONTAINER_OUTPUT_FILE_FORMAT", "parquet")
-output_file_paths = os.getenv(
-    "
-)
+output_file_paths = os.getenv("OUTPUT_PATHS", f"/results/result.{output_file_format}").split(
+    ","
+)

 diagnostics["num_output_files"] = len(output_file_paths)
 diagnostics["output_file_paths"] = output_file_paths
@@ -132,7 +130,7 @@ for output_file_path in output_file_paths:
     else:
         raise ValueError()

-diagnostics_dir = os.getenv("
+diagnostics_dir = os.getenv("DIAGNOSTICS_DIRECTORY", "/diagnostics")
 try:
     with open(f"{diagnostics_dir}/diagnostics.yaml", "w") as f:
         yaml.dump(diagnostics, f, default_flow_style=False)
easylink/steps/dev/python_pyspark/python_pyspark.def
CHANGED
@@ -1,3 +1,4 @@
+
 Bootstrap: docker
 From: apache/spark-py@sha256:489f904a77f21134df4840de5f8bd9f110925e7b439ca6a04b7c033813edfebc

@@ -22,15 +23,4 @@ From: apache/spark-py@sha256:489f904a77f21134df4840de5f8bd9f110925e7b439ca6a04b7

 %runscript
     cd /workdir
-    python3 /code/dummy_step.py
-
-%labels
-    Author Patrick Nast
-    Version v1.0
-    Description Python Pyspark Implementation
-
-%startscript
-    # These directories should be bound when running the container
-    mkdir -p /results
-    mkdir -p /diagnostics
-    mkdir -p /input_data
+    python3 /code/dummy_step.py '$@'
easylink/steps/dev/r/dummy_step.R
CHANGED
@@ -106,7 +106,7 @@ if (broken) {
 }

 output_file_format <- Sys.getenv("DUMMY_CONTAINER_OUTPUT_FILE_FORMAT", "parquet")
-output_file_paths <- strsplit(Sys.getenv("
+output_file_paths <- strsplit(Sys.getenv("OUTPUT_PATHS", paste0("/results/result.", output_file_format)), ",")[[1]]

 diagnostics$num_output_files <- length(output_file_paths)
 diagnostics$output_file_paths <- output_file_paths
@@ -123,7 +123,7 @@ for (output_file_path in output_file_paths) {
   }
 }

-diagnostics_dir <- Sys.getenv("
+diagnostics_dir <- Sys.getenv("DIAGNOSTICS_DIRECTORY", "/diagnostics")
 if (dir.exists(diagnostics_dir) && file.access(diagnostics_dir, mode = 2) == 0) {
   write_yaml(diagnostics, file.path(diagnostics_dir, 'diagnostics.yaml'))
 }
easylink/steps/dev/r/r-image.def
CHANGED
@@ -1,3 +1,4 @@
+
 Bootstrap: docker
 From: rocker/tidyverse@sha256:6a7c913590e758b5fe2ad9921ccc5df7c7160e5de1db5f353630fe8e0ee2f876

@@ -15,15 +16,4 @@ From: rocker/tidyverse@sha256:6a7c913590e758b5fe2ad9921ccc5df7c7160e5de1db5f3536
     export LC_ALL=C

 %runscript
-    Rscript /dummy_step.R
-
-%labels
-    Author Patrick Nast
-    Version v1.0
-    Description R Implementation
-
-%startscript
-    # These directories should be bound when running the container
-    mkdir -p /results
-    mkdir -p /diagnostics
-    mkdir -p /input_data
+    Rscript /dummy_step.R '$@'
easylink/steps/example/middle_name_to_initial.def
ADDED
@@ -0,0 +1,22 @@
+
+Bootstrap: docker
+From: python@sha256:1c26c25390307b64e8ff73e7edf34b4fbeac59d41da41c08da28dc316a721899
+
+%files
+    ./middle_name_to_initial.py /middle_name_to_initial.py
+
+%post
+    # Create directories
+    mkdir -p /input_data
+    mkdir -p /extra_implementation_specific_input_data
+    mkdir -p /results
+    mkdir -p /diagnostics
+
+    # Install Python packages with specific versions
+    pip install pandas==2.1.2 pyarrow pyyaml
+
+%environment
+    export LC_ALL=C
+
+%runscript
+    python /middle_name_to_initial.py '$@'
easylink/steps/example/middle_name_to_initial.py
ADDED
@@ -0,0 +1,60 @@
+# STEP_NAME: pre-processing
+# REQUIREMENTS: pandas==2.1.2 pyarrow pyyaml
+
+import logging
+import os
+from pathlib import Path
+
+import pandas as pd
+
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s %(message)s",
+    handlers=[logging.StreamHandler()],
+)
+
+
+def load_file(file_path, file_format=None):
+    logging.info(f"Loading file {file_path} with format {file_format}")
+    if file_format is None:
+        file_format = file_path.split(".")[-1]
+    if file_format == "parquet":
+        return pd.read_parquet(file_path)
+    raise ValueError(f"Unknown file format {file_format}")
+
+
+# LOAD INPUTS and SAVE OUTPUTS
+
+# DATASET_DIR_PATHS is list of directories, each containing one file
+dataset_paths = os.environ["DATASET_DIR_PATHS"].split(",")
+logging.info(f"{dataset_paths=}")
+
+# for workaround, choose path based on INPUT_DATASET configuration
+splitter_choice = os.environ["INPUT_DATASET"]
+logging.info(f"splitter_choice={splitter_choice}")
+dataset_path = None
+for path in dataset_paths:
+    path = Path(path)
+    # NOTE: We iterate the dir here, but it should only have one non-hidden
+    # file in it. We don't validate that here as it is checked in the validator.
+    for path_to_check in path.iterdir():
+        if path_to_check.stem == splitter_choice:
+            dataset_path = str(path_to_check)
+            break
+
+if dataset_path is None:
+    raise ValueError(f"No dataset matching {splitter_choice} found")
+
+# OUTPUT_PATHS is a single path to a directory ('dataset')
+results_dir = Path(os.environ["OUTPUT_PATHS"])
+results_dir.mkdir(exist_ok=True, parents=True)
+
+output_path = results_dir / Path(dataset_path).name
+
+dataset = load_file(dataset_path)
+
+# add middle initial column from middle name
+dataset["middle_initial"] = dataset["middle_name"].str[0]
+
+logging.info(f"Writing output for dataset from input {dataset_path} to {output_path}")
+dataset.to_parquet(output_path)
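The new example step leans on pandas' vectorized string indexing: .str[0] takes the first character of each middle name and propagates missing values instead of raising. A tiny illustration with hypothetical data:

import pandas as pd

df = pd.DataFrame({"middle_name": ["Marie", None, "Lee"]})
df["middle_initial"] = df["middle_name"].str[0]
print(df["middle_initial"].tolist())  # ['M', nan, 'L'] -- missing stays missing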
easylink/steps/fastLink/fastLink_evaluating_pairs.R
CHANGED
@@ -10,8 +10,8 @@ library(stringr)
 # Check required environment variables
 required_env_vars <- c(
   "BLOCKS_DIR_PATH",
-  "
-  "
+  "DIAGNOSTICS_DIRECTORY",
+  "OUTPUT_PATHS",
   "COMPARISONS",
   "THRESHOLD_MATCH_PROBABILITY"
 )
@@ -24,8 +24,8 @@ if (length(missing_vars) > 0) {
 }

 blocks_dir <- Sys.getenv("BLOCKS_DIR_PATH")
-diagnostics_dir <- Sys.getenv("
-output_path <- Sys.getenv("
+diagnostics_dir <- Sys.getenv("DIAGNOSTICS_DIRECTORY")
+output_path <- Sys.getenv("OUTPUT_PATHS")
 comparisons <- strsplit(Sys.getenv("COMPARISONS"), ",")[[1]]

 all_predictions <- list()
easylink/steps/fastLink/fastLink_links_to_clusters.R
CHANGED
@@ -12,7 +12,7 @@ library(stringr)
 # Check required environment variables
 required_env_vars <- c(
   "LINKS_FILE_PATH",
-  "
+  "OUTPUT_PATHS",
   "THRESHOLD_MATCH_PROBABILITY"
 )
 missing_vars <- required_env_vars[!nzchar(Sys.getenv(required_env_vars))]
@@ -24,7 +24,7 @@ if (length(missing_vars) > 0) {
 }

 links_file_path <- Sys.getenv("LINKS_FILE_PATH")
-output_path <- Sys.getenv("
+output_path <- Sys.getenv("OUTPUT_PATHS")

 if (!file.exists(links_file_path)) {
   stop(sprintf("File not found: %s", links_file_path))
easylink/steps/output_dir/dummy_step_1_for_output_dir_example.py
CHANGED
@@ -11,7 +11,7 @@ data = pd.read_parquet(os.environ["STEP_1_MAIN_INPUT_FILE_PATHS"])

 print(data)

-dir_path = Path(os.environ["
+dir_path = Path(os.environ["OUTPUT_PATHS"])
 dir_path.mkdir(parents=True, exist_ok=True)

 for i in range(3):
easylink/steps/output_dir/dummy_step_2_for_output_dir_example.py
CHANGED
@@ -8,7 +8,7 @@ from pathlib import Path

 import pandas as pd

-dir_path = Path(os.environ["
+dir_path = Path(os.environ["MAIN_INPUT_DIR_PATH"])
 saved = False

 for i, f in enumerate([f for f in dir_path.iterdir() if f.is_file()]):
@@ -16,7 +16,7 @@ for i, f in enumerate([f for f in dir_path.iterdir() if f.is_file()]):
         continue

     if not saved:
-        shutil.copy(f, os.environ["
+        shutil.copy(f, os.environ["OUTPUT_PATHS"])
         saved = True

     print(pd.read_parquet(f))
easylink/steps/rl-dummy/canonicalizing_and_downstream_analysis/dummy_canonicalizing_and_downstream_analysis.py
CHANGED
@@ -33,8 +33,8 @@ def load_file(file_path, file_format=None):

 # CLUSTERS_FILE_PATH is a path to a single file
 clusters_path = os.environ["CLUSTERS_FILE_PATH"]
-#
-results_filepath = os.environ["
+# OUTPUT_PATHS is a path to a single file (results.parquet)
+results_filepath = os.environ["OUTPUT_PATHS"]

 clusters_df = load_file(clusters_path)

easylink/steps/rl-dummy/pre-processing/dummy_pre-processing.py
CHANGED
@@ -45,8 +45,8 @@ for path in dataset_paths:
 if dataset_path is None:
     raise ValueError(f"No dataset matching {splitter_choice} found")

-#
-results_dir = Path(os.environ["
+# OUTPUT_PATHS is a single path to a directory ('dataset')
+results_dir = Path(os.environ["OUTPUT_PATHS"])
 results_dir.mkdir(exist_ok=True, parents=True)

 output_path = results_dir / Path(dataset_path).name
easylink/steps/splink/splink_blocking_and_filtering.def
CHANGED
@@ -13,7 +13,7 @@ From: python@sha256:1c26c25390307b64e8ff73e7edf34b4fbeac59d41da41c08da28dc316a72
     mkdir -p /diagnostics

     # Install Python packages with specific versions
-    pip install pandas pyarrow splink==4.0.7
+    pip install pandas pyarrow splink==4.0.7 vl-convert-python

 %environment
     export LC_ALL=C
easylink/steps/splink/splink_blocking_and_filtering.py
CHANGED
@@ -1,5 +1,5 @@
 # STEP_NAME: blocking_and_filtering
-# REQUIREMENTS: pandas pyarrow splink==4.0.7
+# REQUIREMENTS: pandas pyarrow splink==4.0.7 vl-convert-python

 import os

@@ -7,22 +7,27 @@ import pandas as pd

 records = pd.read_parquet(os.environ["RECORDS_FILE_PATH"])

-#
-results_dir = os.environ["
+# OUTPUT_PATHS is a single path to a directory ('dataset')
+results_dir = os.environ["OUTPUT_PATHS"]

 import splink

 blocking_rules = os.environ["BLOCKING_RULES"].split(",")

-
+link_only = os.getenv("LINK_ONLY", "false").lower() in ("true", "yes", "1")
+
+from splink import DuckDBAPI, Linker, SettingsCreator

 # Create the Splink linker in dedupe mode
 settings = SettingsCreator(
-    link_type="link_and_dedupe",
+    link_type="link_only" if link_only else "link_and_dedupe",
     blocking_rules_to_generate_predictions=blocking_rules,
     comparisons=[],
 )
 from splink import DuckDBAPI
+from splink.blocking_analysis import (
+    cumulative_comparisons_to_be_scored_from_blocking_rules_chart,
+)

 grouped = records.rename(columns={"Input Record ID": "unique_id"}).groupby(
     "Input Record Dataset"
@@ -52,6 +57,7 @@ blocking_input_tablename_r = "__splink__df_concat_with_tf"

 link_type = linker._settings_obj._link_type

+
 # If exploded blocking rules exist, we need to materialise
 # the tables of ID pairs
 from splink.internals.blocking import materialise_exploded_id_tables
@@ -98,7 +104,12 @@ blocked_pairs[["Left Record ID", "Right Record ID"]] = blocked_pairs[
 wrong_order_dataset = (
     blocked_pairs["Left Record Dataset"] > blocked_pairs["Right Record Dataset"]
 )
-id_cols = [
+id_cols = [
+    "Left Record Dataset",
+    "Left Record ID",
+    "Right Record Dataset",
+    "Right Record ID",
+]
 switched_id_cols = [
     "Right Record Dataset",
     "Right Record ID",
@@ -128,3 +139,18 @@ output_path.mkdir(exist_ok=True, parents=True)

 records.to_parquet(output_path / "records.parquet", index=False)
 blocked_pairs.to_parquet(output_path / "pairs.parquet", index=False)
+
+records["unique_id"] = (
+    str(records["Input Record Dataset"]) + "_" + str(records["Input Record ID"])
+)
+db_api = DuckDBAPI()
+diagnostics_dir = Path(os.environ["DIAGNOSTICS_DIRECTORY"])
+chart_path = diagnostics_dir / f"blocking_cumulative_comparisons_chart_block_0.png"
+cumulative_comparisons_to_be_scored_from_blocking_rules_chart(
+    table_or_tables=records,
+    blocking_rules=blocking_rules,
+    db_api=db_api,
+    link_type=link_type,
+    unique_id_column_name="unique_id",
+    source_dataset_column_name="Input Record Dataset",
+).save(chart_path)
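The added diagnostics block renders splink's cumulative-comparisons blocking chart to PNG, which is why vl-convert-python joins the container requirements above. A standalone sketch of the same chart call on toy data (assuming splink 4.x; dedupe_only link type is used here for brevity, whereas the step passes its own link_type):

import pandas as pd
from splink import DuckDBAPI
from splink.blocking_analysis import (
    cumulative_comparisons_to_be_scored_from_blocking_rules_chart,
)

# Toy records; the step above feeds its full `records` frame instead.
df = pd.DataFrame(
    {"unique_id": [1, 2, 3, 4],
     "dob": ["1990-01-01", "1990-01-01", "1985-05-05", "1985-05-05"]}
)
chart = cumulative_comparisons_to_be_scored_from_blocking_rules_chart(
    table_or_tables=df,
    blocking_rules=["l.dob = r.dob"],
    db_api=DuckDBAPI(),
    link_type="dedupe_only",
    unique_id_column_name="unique_id",
)
chart.save("blocking_chart.png")  # PNG export requires vl-convert-python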
easylink/steps/splink/splink_evaluating_pairs.py
CHANGED
@@ -10,9 +10,10 @@ import splink.comparison_library as cl
 from splink import Linker, SettingsCreator

 blocks_dir = Path(os.environ["BLOCKS_DIR_PATH"])
-diagnostics_dir = Path(os.environ["
-output_path = Path(os.environ["
+diagnostics_dir = Path(os.environ["DIAGNOSTICS_DIRECTORY"])
+output_path = Path(os.environ["OUTPUT_PATHS"])
 Path(output_path).parent.mkdir(exist_ok=True, parents=True)
+link_only = os.getenv("LINK_ONLY", "false").lower() in ("true", "yes", "1")

 all_predictions = []

@@ -30,17 +31,20 @@ for block_dir in blocks_dir.iterdir():
             comparisons.append(cl.NameComparison(column))
         elif method == "dob":
             comparisons.append(cl.DateOfBirthComparison(column))
+        elif method == "levenshtein":
+            comparisons.append(cl.LevenshteinAtThresholds(column))
         else:
             raise ValueError(f"Unknown comparison method {method}")

     # Create the Splink linker in dedupe mode
     settings = SettingsCreator(
-        link_type="link_and_dedupe",
+        link_type="link_only" if link_only else "link_and_dedupe",
         blocking_rules_to_generate_predictions=[],
         comparisons=comparisons,
         probability_two_random_records_match=float(
             os.environ["PROBABILITY_TWO_RANDOM_RECORDS_MATCH"]
         ),
+        retain_intermediate_calculation_columns=True,
     )

     grouped = (
@@ -59,7 +63,7 @@ for block_dir in blocks_dir.iterdir():
         input_table_aliases=[name for name, _ in grouped],
     )

-    linker.training.estimate_u_using_random_sampling(max_pairs=5e6)
+    linker.training.estimate_u_using_random_sampling(max_pairs=5e6, seed=1234)

     blocking_rules_for_training = os.environ["BLOCKING_RULES_FOR_TRAINING"].split(",")

@@ -143,6 +147,12 @@ for block_dir in blocks_dir.iterdir():

     all_predictions.append(predictions.as_pandas_dataframe())

+    comparisons_path = diagnostics_dir / f"comparisons_chart_{block_dir}.html"
+    comparisons_path.parent.mkdir(exist_ok=True, parents=True)
+    linker.visualisations.comparison_viewer_dashboard(
+        predictions, comparisons_path, overwrite=True
+    )
+
 all_predictions = pd.concat(all_predictions, ignore_index=True)[
     [
         "source_dataset_l",
{easylink-0.1.22.dist-info → easylink-0.1.23.dist-info}/RECORD
CHANGED
@@ -1,39 +1,39 @@
 easylink/__about__.py,sha256=2-oxCfu9t9yUJouLDwqYRZ0eii8kN25SxRzsawjWjho,440
 easylink/__init__.py,sha256=gGMcIVfiVnHtlDw5mZwhevcDb2wt-kuP6F64gnkFack,159
-easylink/_version.py,sha256=
+easylink/_version.py,sha256=0byemO6n6WCv41u9vBG2AIsOkVbxLvok7puvwy8EhfU,23
 easylink/cli.py,sha256=zQO4lOVoZ3eVgPVWT2sCF4zNoKgiDJP9ReTh2Myr9jc,10307
 easylink/configuration.py,sha256=TPd3WbqUcJMJDPJuHeo5ZebvZPQrRyfm8-laad2sOFk,12681
 easylink/graph_components.py,sha256=zZDZXg5smReHO3ryQC4pao24wyKXzWDe6jS3C6fM2ak,13892
 easylink/implementation.py,sha256=lSF37g-aQYgdLI0lgFaDrBkp23te9O9B1V-CmmRtB-0,14514
-easylink/implementation_metadata.yaml,sha256=
+easylink/implementation_metadata.yaml,sha256=pKu_H9fLnTsS8E4wCnYRitumW1-zs7mfE3z66BAyO30,10848
 easylink/pipeline.py,sha256=NJyMInbOCjJ_5kRzzuK5AcupvC7ecd5qLOC-f1Gy3Ss,17701
 easylink/pipeline_graph.py,sha256=jtjS7_2IVa189U8cL621600erC_0pa6BKPRRO8VqwiU,23760
 easylink/pipeline_schema.py,sha256=sj6YQqMFUS35RRFWdlrhPlud68d3Rb5y7iKfAEZy1qs,6877
-easylink/rule.py,sha256=
+easylink/rule.py,sha256=MM7WyW56J7zT2FVjHlFtjuz62PfdSBGTD3MNcpLfEZM,16598
 easylink/runner.py,sha256=Z9GKgiUAWtp0dW1cyAp86MGthIDeABJtHDXUtzv0-kE,6714
-easylink/step.py,sha256
+easylink/step.py,sha256=-vdFhPvwAZ3d69LMQGmSIVdcMG8E8ZtSvTE0UWif7zs,91088
 easylink/devtools/implementation_creator.py,sha256=gZZpfpiOOh912nNjR_d5wR0mBO5c09jWMS0FSYRcE1o,19120
 easylink/pipeline_schema_constants/__init__.py,sha256=njUL2v67WFC3dW_FjOXWRLSeOhInox_ZINLEji0_7V8,1523
 easylink/pipeline_schema_constants/development.py,sha256=0x6lWqBmP1K9AebEmeZ4veSnLBcdQcZXzbV6lCU11bc,12622
 easylink/pipeline_schema_constants/main.py,sha256=kcAhdbK_BhS79LUMhKkn-Uttl2vA-MHVX4M1StTloJQ,22934
-easylink/pipeline_schema_constants/testing.py,sha256=
+easylink/pipeline_schema_constants/testing.py,sha256=ti08DeUuF-eWrGKMj4BMyOFFJnEYooDaWX0DGiferbk,24579
 easylink/steps/cascading/exclude_clustered.def,sha256=GfoDqO2Vtsh7VI8SwGaJtv_KtKjs-UmBcivqQ7OPkjk,503
-easylink/steps/cascading/exclude_clustered.py,sha256=
+easylink/steps/cascading/exclude_clustered.py,sha256=Bpsyf9vAZ431Fh96RVzHkF7fy77NQjo1Cl6bHCIy69c,2580
 easylink/steps/cascading/exclude_none.def,sha256=iFUhUMocxtkA0NErkjVrBxY0MUdS3DIPNsbCpTJRP0k,488
-easylink/steps/cascading/exclude_none.py,sha256=
+easylink/steps/cascading/exclude_none.py,sha256=5DK5bNG4TneMwUKE49Kmz7VDnKBNZWjOERkuSJU3BmA,2475
 easylink/steps/cascading/update_clusters_by_connected_components.def,sha256=sAAAWOod8EuAnotR1cayaGAvs7x6xoMVlwmLso_a9Cc,578
-easylink/steps/cascading/update_clusters_by_connected_components.py,sha256=
+easylink/steps/cascading/update_clusters_by_connected_components.py,sha256=sFZXMGXl17jcGt8Fu5hgQz1KW5bFvPYdCoQGZ9Erc0I,3629
 easylink/steps/default/default_clusters_to_links.def,sha256=9PjUygLvsoYMUZDznceuuv55t8fPs473P57J_RMl3U0,527
-easylink/steps/default/default_clusters_to_links.py,sha256=
+easylink/steps/default/default_clusters_to_links.py,sha256=Ckm53d3W-ETNlTvQJPOkpHmSqCmxSWknMPQrEAIoTBY,2816
 easylink/steps/default/default_determining_exclusions.def,sha256=zZUEHDdrpLxzx3gTm-dki2ge5ivCCg4ziIwTErqCII0,542
-easylink/steps/default/default_determining_exclusions.py,sha256=
+easylink/steps/default/default_determining_exclusions.py,sha256=4diLfuqYm_Koj7gwifjwe_7mLZ6xb6RQiEdk-RRtB94,2629
 easylink/steps/default/default_removing_records.def,sha256=QqacmOu6myxFSULHRKeKsVD8l73KDm4VEkPkPlovwqs,524
-easylink/steps/default/default_removing_records.py,sha256=
+easylink/steps/default/default_removing_records.py,sha256=P4mmX2D4mhSoWd_S5CaNT4hlHOMAeZiMhCScWQiR_fQ,1906
 easylink/steps/default/default_schema_alignment.def,sha256=hFHJkos0Fhe4LvpjLOCd6klIaIqOKqECDDSTVu3G03Y,524
-easylink/steps/default/default_schema_alignment.py,sha256=
+easylink/steps/default/default_schema_alignment.py,sha256=oT5jbYQ3C3ocLgqqOnvH0SIJ6NeTtPBWWmCqr_frnAI,1479
 easylink/steps/default/default_updating_clusters.def,sha256=vDzSkTknDfeiXeHREpw4BkUxFcTWamxr81c3rZ7_enY,527
-easylink/steps/default/default_updating_clusters.py,sha256=
-easylink/steps/dev/README.md,sha256=
+easylink/steps/default/default_updating_clusters.py,sha256=uwblSM9w4PoqPgWA_5cwmnU7ARSruickWxTfr2EZCM0,2107
+easylink/steps/dev/README.md,sha256=lEHtM48SkFTV1FL-B5bbrEGjSVa_mb690Ed08nnheww,4533
 easylink/steps/dev/build-containers-local.sh,sha256=Wy3pfcyt7I-BNvHcr7ZXDe0g5Ihd00BIPqt9YuRbLeA,259
 easylink/steps/dev/build-containers-remote.sh,sha256=Hy-kaaXf-ta6n8SzOz_ahByjMY5T7J71MvzXRXDvQw8,271
 easylink/steps/dev/test.py,sha256=4iudKBD6CFz2CxbjSBUkc8LCWlMl-Nmw_rB35ZN6TrQ,6835
@@ -43,36 +43,38 @@ easylink/steps/dev/input_data/input_file_1.parquet,sha256=Km8jRyfGNdq0MFdz_-bewl
 easylink/steps/dev/input_data/input_file_2.csv,sha256=YqKLZDC4d-aYN8Dh9OB6iQWWUKmvueu5CszckH1AApU,100016
 easylink/steps/dev/input_data/input_file_2.parquet,sha256=Vpo0sUqQ78qlWLRk8p303Nh89BVcK4uvXJljRGHmsWk,60392
 easylink/steps/dev/python_pandas/README.md,sha256=c_FbtkKKOTjt2R_LfHUo5lBga1qHiYkxLdQeewRr45g,977
-easylink/steps/dev/python_pandas/dummy_step.py,sha256=
-easylink/steps/dev/python_pandas/python_pandas.def,sha256=
+easylink/steps/dev/python_pandas/dummy_step.py,sha256=ASZIxk_d46zNz0xUtZ37OuTJUpzfi98EwQduZXDiwK0,4330
+easylink/steps/dev/python_pandas/python_pandas.def,sha256=umAUJL3RQZsCLIvFvsaif_-kCGfWzsnIH8d6Zwx_qYQ,482
 easylink/steps/dev/python_pyspark/README.md,sha256=di29SAfcdTTpar7gdoJRLqKrL8DEfNeayYUyaywdhUg,1563
-easylink/steps/dev/python_pyspark/dummy_step.py,sha256=
-easylink/steps/dev/python_pyspark/python_pyspark.def,sha256=
+easylink/steps/dev/python_pyspark/dummy_step.py,sha256=6BJi-L6tlXGc6GJnGTI06AnzuRYh3KYBgU09tvId954,4949
+easylink/steps/dev/python_pyspark/python_pyspark.def,sha256=e-OXax5t96OEgAJdsKDqjGXPuCk12GpPYbwIdrUpBOU,666
 easylink/steps/dev/r/README.md,sha256=dPjZdDTqcJsZCiwhddzlOj1ob0P7YocZUNFrLIGM1-0,1201
-easylink/steps/dev/r/dummy_step.R,sha256=
-easylink/steps/dev/r/r-image.def,sha256=
-easylink/steps/
+easylink/steps/dev/r/dummy_step.R,sha256=4eFZgmKaagydjYOVnrOB3W2vjHHcGDhJ8LXsWDrDNyI,4943
+easylink/steps/dev/r/r-image.def,sha256=gBNCfMb_HtgrGPhTt8qEws5tWVfjQIS_GCIHRGAHG9c,391
+easylink/steps/example/middle_name_to_initial.def,sha256=UmD3FCuK8CMD0gQRUqg1BFnGq5Mucu7x8eU19jq7pZ0,518
+easylink/steps/example/middle_name_to_initial.py,sha256=1Q7xaXIxkIvPN6jW98WKTvVhWB9qMC23mRIoO7NYRa8,1901
+easylink/steps/fastLink/fastLink_evaluating_pairs.R,sha256=ucbHibtoYJ4-GDg1mWv-dtv0r_1XomhdT-KC3Zkat2E,4539
 easylink/steps/fastLink/fastLink_evaluating_pairs.def,sha256=5rDi-cmWhyuFEsiGFPpTxtySMqq5TpgJG-y8g_MtEvA,509
-easylink/steps/fastLink/fastLink_links_to_clusters.R,sha256=
+easylink/steps/fastLink/fastLink_links_to_clusters.R,sha256=iM6bi27bHNJRxfOIBOCvZcFUuKT3VauOWbM6d0Ws5dk,4055
 easylink/steps/fastLink/fastLink_links_to_clusters.def,sha256=1xYjOMsHtSS2-AI4EC2r6kL8ZX5F2JhmvESefEKeJVY,512
 easylink/steps/output_dir/dummy_step_1_for_output_dir_example.def,sha256=CkQVG-uDRQ9spAavdkZbhx2GD_fRsKZGELPrr8yltsc,550
-easylink/steps/output_dir/dummy_step_1_for_output_dir_example.py,sha256=
+easylink/steps/output_dir/dummy_step_1_for_output_dir_example.py,sha256=sBmF-wMgTpcqeM9gVWbHZkcy_w0OzfDYo98-3P2WMaM,412
 easylink/steps/output_dir/dummy_step_2_for_output_dir_example.def,sha256=9gShg1EDJEHZcz7Z5VfZ1A4Gpm9XQes8ezn6rAZDgDM,550
-easylink/steps/output_dir/dummy_step_2_for_output_dir_example.py,sha256=
+easylink/steps/output_dir/dummy_step_2_for_output_dir_example.py,sha256=ExFljptolMiidU7LiOfQtH13ChbDUGIF3r5qM5paKsA,489
 easylink/steps/rl-dummy/canonicalizing_and_downstream_analysis/dummy_canonicalizing_and_downstream_analysis.def,sha256=YOWtJZxoe-kHFeEyrgGcVGfdqcbD_Fg17A9shOaK-yc,584
-easylink/steps/rl-dummy/canonicalizing_and_downstream_analysis/dummy_canonicalizing_and_downstream_analysis.py,sha256=
+easylink/steps/rl-dummy/canonicalizing_and_downstream_analysis/dummy_canonicalizing_and_downstream_analysis.py,sha256=zctz6LktX1BsVbeSR2gWuNCzRU7WkmWDGg68wlzZw0Q,1170
 easylink/steps/rl-dummy/input_data/create_input_files.ipynb,sha256=uXvJ8zTTPg0077HgA7MhQ_9jD-aeISFLeMeEBbSnOu8,54498
 easylink/steps/rl-dummy/input_data/input_file_1.parquet,sha256=GQ_7v7ucwdJn-9mTgKVcvqkJ5gTkwb0B7y38mfOYbic,15200
 easylink/steps/rl-dummy/input_data/input_file_2.parquet,sha256=Y4eseBm0HmFroksQr_VApgozRL8h8u7nQO6x_Utyns8,14902
 easylink/steps/rl-dummy/input_data/known_clusters.parquet,sha256=Ysodu65toHZN4AgjVJsm0ueUxPIZAJjbtRm9SVM08JE,2598
 easylink/steps/rl-dummy/pre-processing/dummy_pre-processing.def,sha256=HeUSv2QvMOQzsyVktYR1xYoEqwiNpDo-p7IRcGSMspE,512
-easylink/steps/rl-dummy/pre-processing/dummy_pre-processing.py,sha256=
-easylink/steps/splink/splink_blocking_and_filtering.def,sha256=
-easylink/steps/splink/splink_blocking_and_filtering.py,sha256=
+easylink/steps/rl-dummy/pre-processing/dummy_pre-processing.py,sha256=I6kqG4e_H2yFW5MpsMXdpoY_NjHcBvVVAHWv89LUgXE,1852
+easylink/steps/splink/splink_blocking_and_filtering.def,sha256=umWzxJhsfdi8g3TD-r2mKpjC-FPAMDk6-IERiWigdQc,557
+easylink/steps/splink/splink_blocking_and_filtering.py,sha256=FO8YJ2_KgCLpQoq5xsM339bTSN1DhCXCL8XT1pb5STY,5259
 easylink/steps/splink/splink_evaluating_pairs.def,sha256=DN3Ohy9qJOAyK58v164neP23HDVYpedMqzCu4eQh4Hg,521
-easylink/steps/splink/splink_evaluating_pairs.py,sha256=
+easylink/steps/splink/splink_evaluating_pairs.py,sha256=m-j1QMRSvPCiSoWVSV1kzzzsK1c_xG8nqYKMd3cj7kM,6195
 easylink/steps/splink/splink_links_to_clusters.def,sha256=RurvOYyGjNs9tx64DTXwI-GSgHD4T7SzDfhAH18pTEM,524
-easylink/steps/splink/splink_links_to_clusters.py,sha256=
+easylink/steps/splink/splink_links_to_clusters.py,sha256=5Sw8yi0dVLuRB-trN2kXmxbHBR0VJBxYee6u4_usg2Y,1920
 easylink/utilities/__init__.py,sha256=0U33kbv4hoMfFQ_lh5hLwifxRPzOgkLkjKLYxmaK10g,196
 easylink/utilities/aggregator_utils.py,sha256=_DAHRAf9janbDsuj_jnAn5Dzz2s4R5Ni3YeytDpN9UE,954
 easylink/utilities/data_utils.py,sha256=XPRjq3qW_fN0xQ23Jms_xBzpTHbRwqZWDP1AW0nYkP0,6926
@@ -81,9 +83,9 @@ easylink/utilities/paths.py,sha256=9inDKMPorAaWh5vhjc3j1Tj_aXVKhLdodiJO9H1nNes,9
 easylink/utilities/spark.smk,sha256=kGtpem7LfQc71tMh5WAYaqKnHQKFvcdhPQSdumOP70k,5799
 easylink/utilities/splitter_utils.py,sha256=KXiVUYJ9TGxCQmrwos18pB1sxG_0Ay67qoDJT6vem2o,3828
 easylink/utilities/validation_utils.py,sha256=DBJB2TLVklgYw1WaaPse9vqtwPLMGmZNYM2cbCZsoHI,18417
-easylink-0.1.
-easylink-0.1.
-easylink-0.1.
-easylink-0.1.
-easylink-0.1.
-easylink-0.1.
+easylink-0.1.23.dist-info/licenses/LICENSE,sha256=z6NBo3hY2Pbh2Wjm9IcHLyQoG-kd816uASh8PbwhTcQ,1530
+easylink-0.1.23.dist-info/METADATA,sha256=u-oRyBse4M0AsFkMjTuy0JCpul-BwHJ1JaD9fIALrHU,3565
+easylink-0.1.23.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+easylink-0.1.23.dist-info/entry_points.txt,sha256=OGMZDFltg3yMboT7XjJt3joiPhRfV_7jnREVtrAIQNU,51
+easylink-0.1.23.dist-info/top_level.txt,sha256=oHcOpcF_jDMWFiJRzfGQvuskENGDjSPC_Agu9Z_Xvik,9
+easylink-0.1.23.dist-info/RECORD,,