easylink 0.1.24__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- easylink/_version.py +1 -1
- easylink/cli.py +19 -10
- easylink/implementation_metadata.yaml +94 -44
- easylink/runner.py +174 -1
- easylink/steps/cascading/accept_all_pairs.def +22 -0
- easylink/steps/cascading/accept_all_pairs.py +26 -0
- easylink/steps/cascading/exclude_clustered.py +11 -2
- easylink/steps/cascading/exclude_none.py +1 -1
- easylink/steps/cascading/one_to_many_links_to_clusters.def +22 -0
- easylink/steps/cascading/one_to_many_links_to_clusters.py +109 -0
- easylink/steps/cascading/update_clusters_by_connected_components.py +17 -4
- easylink/steps/default/default_determining_exclusions.py +1 -1
- easylink/steps/default/default_removing_records.py +1 -1
- easylink/steps/splink/splink_blocking_and_filtering.py +12 -6
- easylink/steps/splink/splink_evaluating_pairs.py +2 -1
- easylink/steps/splink/splink_links_to_clusters.py +2 -0
- easylink/utilities/general_utils.py +18 -8
- easylink/utilities/validation_utils.py +6 -6
- {easylink-0.1.24.dist-info → easylink-0.2.0.dist-info}/METADATA +27 -11
- {easylink-0.1.24.dist-info → easylink-0.2.0.dist-info}/RECORD +24 -20
- {easylink-0.1.24.dist-info → easylink-0.2.0.dist-info}/WHEEL +0 -0
- {easylink-0.1.24.dist-info → easylink-0.2.0.dist-info}/entry_points.txt +0 -0
- {easylink-0.1.24.dist-info → easylink-0.2.0.dist-info}/licenses/LICENSE +0 -0
- {easylink-0.1.24.dist-info → easylink-0.2.0.dist-info}/top_level.txt +0 -0
easylink/_version.py
CHANGED
@@ -1 +1 @@
-__version__ = "0.1.24"
+__version__ = "0.2.0"
easylink/cli.py
CHANGED
@@ -201,16 +201,25 @@ def run(
     main = handle_exceptions(
         func=runner.main, exceptions_logger=logger, with_debugger=with_debugger
     )
-
-
-
-
-
-
-
-
-
-
+    try:
+        main(
+            command="run",
+            pipeline_specification=pipeline_specification,
+            input_data=input_data,
+            computing_environment=computing_environment,
+            results_dir=results_dir,
+            images_dir=images,
+            schema_name=schema,
+        )
+    except SystemExit as e:
+        # Snakemake uses SystemExit with exit code 0 for success, non-zero for failure
+        if e.code == 0:
+            logger.info("\033[32m*** FINISHED ***\033[0m")  # Green
+        else:
+            logger.error(
+                f"\033[31mERROR: Pipeline failed with exit code {e.code}\033[0m"
+            )  # Red
+            raise
 
 
 @easylink.command()
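For context on the new run() behavior: Snakemake signals both success and failure by raising SystemExit, so the CLI now inspects the exit code instead of treating every exit as an error. A minimal standalone sketch of that convention (the helper name is illustrative, not from the package)::

    import sys

    def run_and_report(pipeline):
        # Treat SystemExit(0) as success and anything else as a failure,
        # mirroring the try/except added to cli.py above.
        try:
            pipeline()
        except SystemExit as e:
            if e.code == 0:
                print("*** FINISHED ***")
            else:
                print(f"ERROR: Pipeline failed with exit code {e.code}")
                raise

    run_and_report(lambda: sys.exit(0))  # prints: *** FINISHED ***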
easylink/implementation_metadata.yaml
CHANGED
@@ -2,7 +2,7 @@ step_1_python_pandas:
   steps:
     - step_1
   image_name: python_pandas.sif
-  zenodo_record_id:
+  zenodo_record_id: 15778354
   md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
   script_cmd: python /dummy_step.py
   outputs:
@@ -11,7 +11,7 @@ step_1a_python_pandas:
   steps:
     - step_1a
   image_name: python_pandas.sif
-  zenodo_record_id:
+  zenodo_record_id: 15778354
   md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
   script_cmd: python /dummy_step.py
   env:
@@ -22,7 +22,7 @@ step_1b_python_pandas:
   steps:
     - step_1b
   image_name: python_pandas.sif
-  zenodo_record_id:
+  zenodo_record_id: 15778354
   md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
   script_cmd: python /dummy_step.py
   env:
@@ -33,7 +33,7 @@ step_2_python_pandas:
   steps:
     - step_2
   image_name: python_pandas.sif
-  zenodo_record_id:
+  zenodo_record_id: 15778354
   md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
   script_cmd: python /dummy_step.py
   outputs:
@@ -42,7 +42,7 @@ step_3_python_pandas:
   steps:
     - step_3
   image_name: python_pandas.sif
-  zenodo_record_id:
+  zenodo_record_id: 15778354
   md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
   script_cmd: python /dummy_step.py
   outputs:
@@ -51,7 +51,7 @@ step_4_python_pandas:
   steps:
     - step_4
   image_name: python_pandas.sif
-  zenodo_record_id:
+  zenodo_record_id: 15778354
   md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
   script_cmd: python /dummy_step.py
   env:
@@ -62,7 +62,7 @@ step_5_python_pandas:
   steps:
     - step_5
   image_name: python_pandas.sif
-  zenodo_record_id:
+  zenodo_record_id: 15778354
   md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
   script_cmd: python /dummy_step.py
   env:
@@ -73,7 +73,7 @@ step_6_python_pandas:
   steps:
     - step_6
   image_name: python_pandas.sif
-  zenodo_record_id:
+  zenodo_record_id: 15778354
   md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
   script_cmd: python /dummy_step.py
   env:
@@ -84,7 +84,7 @@ step_4a_python_pandas:
   steps:
     - step_4a
   image_name: python_pandas.sif
-  zenodo_record_id:
+  zenodo_record_id: 15778354
   md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
   script_cmd: python /dummy_step.py
   env:
@@ -95,7 +95,7 @@ step_4b_python_pandas:
   steps:
     - step_4b
   image_name: python_pandas.sif
-  zenodo_record_id:
+  zenodo_record_id: 15778354
   md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
   script_cmd: python /dummy_step.py
   env:
@@ -106,7 +106,7 @@ step_4b_r:
   steps:
     - step_4b
   image_name: r-image.sif
-  zenodo_record_id:
+  zenodo_record_id: 15778354
   md5_checksum: a4a03b836694a2b81a1bd2852736ccc5
   script_cmd: Rscript /dummy_step.R
   env:
@@ -117,7 +117,7 @@ step_1_python_pyspark:
   steps:
     - step_1
   image_name: python_pyspark.sif
-  zenodo_record_id:
+  zenodo_record_id: 15778354
   md5_checksum: c948577ab0607411dd4b640622d9ec3a
   script_cmd: python3 /code/dummy_step.py
   outputs:
@@ -127,7 +127,7 @@ step_2_python_pyspark:
   steps:
     - step_2
   image_name: python_pyspark.sif
-  zenodo_record_id:
+  zenodo_record_id: 15778354
   md5_checksum: c948577ab0607411dd4b640622d9ec3a
   script_cmd: python3 /code/dummy_step.py
   outputs:
@@ -137,7 +137,7 @@ step_3_python_pyspark:
   steps:
     - step_3
   image_name: python_pyspark.sif
-  zenodo_record_id:
+  zenodo_record_id: 15778354
   md5_checksum: c948577ab0607411dd4b640622d9ec3a
   script_cmd: python3 /code/dummy_step.py
   outputs:
@@ -147,7 +147,7 @@ step_4_python_pyspark:
   steps:
     - step_4
   image_name: python_pyspark.sif
-  zenodo_record_id:
+  zenodo_record_id: 15778354
   md5_checksum: c948577ab0607411dd4b640622d9ec3a
   script_cmd: python3 /code/dummy_step.py
   env:
@@ -158,7 +158,7 @@ step_1_r:
   steps:
     - step_1
   image_name: r-image.sif
-  zenodo_record_id:
+  zenodo_record_id: 15778354
   md5_checksum: a4a03b836694a2b81a1bd2852736ccc5
   script_cmd: Rscript /dummy_step.R
   outputs:
@@ -168,7 +168,7 @@ step_2_r:
   steps:
     - step_2
   image_name: r-image.sif
-  zenodo_record_id:
+  zenodo_record_id: 15778354
   md5_checksum: a4a03b836694a2b81a1bd2852736ccc5
   script_cmd: Rscript /dummy_step.R
   outputs:
@@ -178,7 +178,7 @@ step_3_r:
   steps:
     - step_3
   image_name: r-image.sif
-  zenodo_record_id:
+  zenodo_record_id: 15778354
   md5_checksum: a4a03b836694a2b81a1bd2852736ccc5
   script_cmd: Rscript /dummy_step.R
   outputs:
@@ -188,7 +188,7 @@ step_4_r:
   steps:
     - step_4
   image_name: r-image.sif
-  zenodo_record_id:
+  zenodo_record_id: 15778354
   md5_checksum: a4a03b836694a2b81a1bd2852736ccc5
   script_cmd: Rscript /dummy_step.R
   env:
@@ -201,7 +201,7 @@ step_1_and_step_2_combined_python_pandas:
     - step_1
     - step_2
   image_name: python_pandas.sif
-  zenodo_record_id:
+  zenodo_record_id: 15778354
   md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
   script_cmd: python /dummy_step.py
   outputs:
@@ -211,7 +211,7 @@ step_1_and_step_2_parallel_python_pandas:
     - step_1
     - step_2
   image_name: python_pandas.sif
-  zenodo_record_id:
+  zenodo_record_id: 15778354
   md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
   script_cmd: python /dummy_step.py
   env:
@@ -223,7 +223,7 @@ step_3_and_step_4_combined_python_pandas:
     - step_3
     - step_4
   image_name: python_pandas.sif
-  zenodo_record_id:
+  zenodo_record_id: 15778354
   md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
   script_cmd: python /dummy_step.py
   outputs:
@@ -233,7 +233,7 @@ step_1a_and_step_1b_combined_python_pandas:
     - step_1a
     - step_1b
   image_name: python_pandas.sif
-  zenodo_record_id:
+  zenodo_record_id: 15778354
   md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
   script_cmd: python /dummy_step.py
   outputs:
@@ -241,131 +241,181 @@ step_1a_and_step_1b_combined_python_pandas:
 dummy_step_1_for_output_dir_example:
   steps:
     - step_1_for_output_dir_example
-  image_name:
+  image_name: dummy_step_1_for_output_dir_example.sif
   script_cmd: python /dummy_step_1_for_output_dir_example.py
   outputs:
     step_1_main_output_directory: output_dir/
 dummy_step_1_for_output_dir_example_default:
   steps:
     - step_1_for_output_dir_example
-  image_name:
+  image_name: dummy_step_1_for_output_dir_example.sif
   script_cmd: python /dummy_step_1_for_output_dir_example.py
 dummy_step_2_for_output_dir_example:
   steps:
     - step_2_for_output_dir_example
-  image_name:
+  image_name: dummy_step_2_for_output_dir_example.sif
   script_cmd: python /dummy_step_2_for_output_dir_example.py
   outputs:
     step_2_main_output: result.parquet
 default_removing_records:
   steps:
     - removing_records
-  image_name:
+  image_name: default_removing_records.sif
+  zenodo_record_id: 15778354
+  md5_checksum: 05123136e756bfa57f1d7d5a3315f2f6
   script_cmd: python /default_removing_records.py
   outputs:
     dataset: dataset
 default_clusters_to_links:
   steps:
     - clusters_to_links
-  image_name:
+  image_name: default_clusters_to_links.sif
+  zenodo_record_id: 15778354
+  md5_checksum: 0d00d1272bd8193f60727791097aa065
   script_cmd: python /default_clusters_to_links.py
   outputs:
     known_links: result.parquet
 default_determining_exclusions:
   steps:
     - determining_exclusions
-  image_name:
+  image_name: default_determining_exclusions.sif
+  zenodo_record_id: 15778354
+  md5_checksum: f4e9f740d8dd7599bfbb2b9eb54ced38
   script_cmd: python /default_determining_exclusions.py
   outputs:
     ids_to_remove: result.parquet
 default_updating_clusters:
   steps:
     - updating_clusters
-  image_name:
+  image_name: default_updating_clusters.sif
+  zenodo_record_id: 15778354
+  md5_checksum: cc6bd29e099c2523347fa04545aa35c9
   script_cmd: python /default_updating_clusters.py
   outputs:
     clusters: clusters.parquet
-dummy_canonicalizing_and_downstream_analysis
+# NOTE: This was made from dummy_canonicalizing_and_downstream_analysis.py,
+# if rebuilding change the name of that file to save_clusters.py
+save_clusters:
   steps:
     - canonicalizing_and_downstream_analysis
-  image_name:
+  image_name: save_clusters.sif
+  zenodo_record_id: 15778354
+  md5_checksum: 384ab2be668cbadc45160a674f621022
   script_cmd: python /dummy_canonicalizing_and_downstream_analysis.py
   outputs:
     analysis_output: result.parquet
-dummy_pre-processing
+# NOTE: This was made from dummy_pre-processing.py,
+# if rebuilding change the name of that file to no_pre-processing.py
+no_pre-processing:
   steps:
     - pre-processing
-  image_name:
+  image_name: no_pre-processing.sif
+  zenodo_record_id: 15778354
+  md5_checksum: 9a9c080cf145078152501cf96bf61f27
   script_cmd: python /dummy_pre-processing.py
   outputs:
     dataset: dataset
 default_schema_alignment:
   steps:
     - schema_alignment
-  image_name:
+  image_name: default_schema_alignment.sif
+  zenodo_record_id: 15778354
+  md5_checksum: 3166587f9cfec478b999a17074d628f7
   script_cmd: python /default_schema_alignment.py
   outputs:
     records: result.parquet
 splink_blocking_and_filtering:
   steps:
     - blocking_and_filtering
-  image_name:
+  image_name: splink_blocking_and_filtering.sif
+  zenodo_record_id: 15778354
+  md5_checksum: 3f8777c5751d7550762be078d87e7db2
   script_cmd: python /splink_blocking_and_filtering.py
   outputs:
     blocks: blocks
 splink_evaluating_pairs:
   steps:
     - evaluating_pairs
-  image_name:
+  image_name: splink_evaluating_pairs.sif
+  zenodo_record_id: 15778354
+  md5_checksum: b57f4bd16b7a3aa5099569078ea4c064
   script_cmd: python /splink_evaluating_pairs.py
   outputs:
     links: result.parquet
 splink_links_to_clusters:
   steps:
     - links_to_clusters
-  image_name:
+  image_name: splink_links_to_clusters.sif
+  zenodo_record_id: 15778354
+  md5_checksum: 81a71aa2ce6544953f3edb88d4ee6ec1
   script_cmd: python /splink_links_to_clusters.py
   outputs:
     clusters: result.parquet
 fastLink_evaluating_pairs:
   steps:
     - evaluating_pairs
-  image_name:
+  image_name: fastLink_evaluating_pairs.sif
   script_cmd: Rscript /fastLink_evaluating_pairs.R
   outputs:
     links: result.parquet
 fastLink_links_to_clusters:
   steps:
     - links_to_clusters
-  image_name:
+  image_name: fastLink_links_to_clusters.sif
   script_cmd: Rscript /fastLink_links_to_clusters.R
   outputs:
     clusters: result.parquet
 exclude_clustered:
   steps:
     - determining_exclusions
-  image_name:
+  image_name: exclude_clustered.sif
+  zenodo_record_id: 15778354
+  md5_checksum: db51f68ea24d114ed2b83a1382b6e6b6
   script_cmd: python /exclude_clustered.py
   outputs:
     ids_to_remove: result.parquet
 exclude_none:
   steps:
     - determining_exclusions
-  image_name:
+  image_name: exclude_none.sif
+  zenodo_record_id: 15778354
+  md5_checksum: af12b6dde2aace9dab08d352368b16a1
   script_cmd: python /exclude_none.py
   outputs:
     ids_to_remove: result.parquet
 update_clusters_by_connected_components:
   steps:
     - updating_clusters
-  image_name:
+  image_name: update_clusters_by_connected_components.sif
+  zenodo_record_id: 15778354
+  md5_checksum: 806b0fe86a3306d74391678ed951b054
   script_cmd: python /update_clusters_by_connected_components.py
   outputs:
     clusters: result.parquet
 middle_name_to_initial:
   steps:
     - pre-processing
-  image_name:
+  image_name: middle_name_to_initial.sif
+  zenodo_record_id: 15778354
+  md5_checksum: 89db9c3318300cda9d538cde08c3c323
   script_cmd: python /middle_name_to_initial.py
   outputs:
     dataset: dataset
+one_to_many_links_to_clusters:
+  steps:
+    - links_to_clusters
+  image_name: one_to_many_links_to_clusters.sif
+  zenodo_record_id: 15778354
+  md5_checksum: 0bf6f0b8663b4c2f99995a2213dc541a
+  script_cmd: python /one_to_many_links_to_clusters.py
+  outputs:
+    clusters: result.parquet
+accept_all_pairs:
+  steps:
+    - evaluating_pairs
+  image_name: accept_all_pairs.sif
+  zenodo_record_id: 15778354
+  md5_checksum: c71c88d159c3d7343ebc39cd37224bd9
+  script_cmd: python /accept_all_pairs.py
+  outputs:
+    links: result.parquet
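Each implementation entry now records a zenodo_record_id and md5_checksum alongside its image_name. A hypothetical helper (not part of the package) sketching how such a checksum could be checked against a downloaded .sif image::

    import hashlib

    def md5sum(path: str) -> str:
        # Stream the file in 1 MiB chunks so large container images are not read into memory at once.
        digest = hashlib.md5()
        with open(path, "rb") as f:
            for chunk in iter(lambda: f.read(1 << 20), b""):
                digest.update(chunk)
        return digest.hexdigest()

    # Compare the result against the recorded md5_checksum,
    # e.g. 9177b8e168fcc9cae91bf61265f2185c for python_pandas.sif.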
easylink/runner.py
CHANGED
@@ -11,6 +11,9 @@ be called from the ``easylink.cli`` module.
 import os
 import socket
 import subprocess
+import threading
+import time
+from contextlib import redirect_stderr, redirect_stdout
 from pathlib import Path
 
 from graphviz import Source
@@ -123,7 +126,177 @@ def main(
     argv.extend(environment_args)
     logger.info(f"Running Snakemake")
     logger.debug(f"Snakemake arguments: {argv}")
-
+
+    # Run snakemake
+    if debug:
+        snake_main(argv)
+    else:
+        _run_snakemake_with_filtered_output(argv, Path(results_dir))
+
+
+def _run_snakemake_with_filtered_output(argv: list[str], results_dir: Path) -> None:
+    """Runs Snakemake with simplified log filtering.
+
+    Parameters
+    ----------
+    argv
+        Snakemake command line arguments.
+    results_dir
+        Directory to save the full Snakemake log.
+    """
+    snakemake_log_file = results_dir / "pipeline.log"
+
+    # Create a filtering output handler that processes lines in real-time
+    class FilteringOutput:
+        """Handles real-time filtering and logging of Snakemake output.
+
+        This class writes all snakemake output to a log file and selectively logs
+        filtered lines to the logger for user visibility.
+
+        Parameters
+        ----------
+        log_file_path
+            The path to the log file where all output will be written.
+
+        """
+
+        def __init__(self, log_file_path: Path):
+            self.log_file = open(log_file_path, "w")
+            self.buffer = ""
+            self.last_output_time = time.time()
+            self.heartbeat_timer = None
+            self.dots_printed = False  # Track if we've printed progress dots
+            self._start_heartbeat()
+
+        def _start_heartbeat(self):
+            """Start a timer that prints progress dots during long-running containers."""
+
+            def heartbeat():
+                current_time = time.time()
+                if current_time - self.last_output_time > 30:  # 30 seconds since last output
+                    # Print a dot to show progress - use original stdout if available
+                    if hasattr(self, "original_stdout") and self.original_stdout:
+                        self.original_stdout.write(".")
+                        self.original_stdout.flush()
+                        self.dots_printed = True  # Mark that we've printed dots
+                    self.last_output_time = current_time
+                # Schedule next heartbeat
+                self.heartbeat_timer = threading.Timer(30.0, heartbeat)
+                self.heartbeat_timer.daemon = True
+                self.heartbeat_timer.start()
+
+            # Start first heartbeat after 30 seconds
+            self.heartbeat_timer = threading.Timer(30.0, heartbeat)
+            self.heartbeat_timer.daemon = True
+            self.heartbeat_timer.start()
+
+        def write(self, text: str) -> int:
+            # Write to log file
+            self.log_file.write(text)
+            self.log_file.flush()
+
+            # Process and log filtered output
+            self.buffer += text
+            while "\n" in self.buffer:
+                line, self.buffer = self.buffer.split("\n", 1)
+                if line.strip():
+                    filtered_line = _filter_snakemake_output(line.strip())
+                    if filtered_line:
+                        # Add newline after dots if we've printed any
+                        if (
+                            self.dots_printed
+                            and hasattr(self, "original_stdout")
+                            and self.original_stdout
+                        ):
+                            self.original_stdout.write("\n")
+                            self.original_stdout.flush()
+                            self.dots_printed = False  # Reset the flag
+                        logger.info(filtered_line)
+                        self.last_output_time = time.time()  # Reset heartbeat timer
+
+            return len(text)
+
+        def flush(self):
+            self.log_file.flush()
+
+        def close(self):
+            # Stop heartbeat timer
+            if self.heartbeat_timer:
+                self.heartbeat_timer.cancel()
+
+            # Process and log any remaining buffer content
+            if self.buffer.strip():
+                filtered_line = _filter_snakemake_output(self.buffer.strip())
+                if filtered_line:
+                    # Add newline after dots if we've printed any
+                    if (
+                        self.dots_printed
+                        and hasattr(self, "original_stdout")
+                        and self.original_stdout
+                    ):
+                        self.original_stdout.write("\n")
+                        self.original_stdout.flush()
+                        self.dots_printed = False
+                    logger.info(filtered_line)
+            self.log_file.close()
+
+        def __enter__(self):
+            return self
+
+        def __exit__(self, exc_type, exc_val, exc_tb):
+            self.close()
+
+    # Create the filtering output handler and ensure the log file is always closed
+    # Save original stdout for progress dots before redirection
+    import sys
+
+    original_stdout = sys.stdout
+
+    with FilteringOutput(snakemake_log_file) as filtering_output:
+        # Pass original stdout to filtering output for progress dots
+        filtering_output.original_stdout = original_stdout
+        try:
+            # Redirect both stdout and stderr to our filtering handler
+            with redirect_stdout(filtering_output), redirect_stderr(filtering_output):
+                snake_main(argv)
+        except SystemExit:
+            # Snakemake uses SystemExit for both success and failure
+            logger.info(
+                f"Pipeline finished running - full log saved to: {snakemake_log_file}"
+            )
+            # Always re-raise to allow test frameworks to detect completion
+            raise
+
+
+def _filter_snakemake_output(line: str) -> str:
+    """Filter for Snakemake output.
+
+    Parameters
+    ----------
+    line
+        A single line of Snakemake output.
+
+    Returns
+    -------
+    The filtered line for display.
+    """
+    # Skip empty lines
+    if not line.strip():
+        return ""
+
+    if line.startswith("localrule "):
+        # Show localrule names (without the "localrule" prefix)
+        # Extract rule name (remove "localrule " prefix and colon at the end)
+        filtered_line = line.replace("localrule ", "").rstrip(":")
+    elif line.startswith("Job ") and ":" in line:
+        # Show Job messages
+        # Extract everything after "Job ##: "
+        parts = line.split(":", 1)
+        filtered_line = parts[1].strip() if len(parts) > 1 else ""
+    else:
+        # Suppress everything else
+        filtered_line = ""
+    return filtered_line
 
 
 def _get_singularity_args(config: Config) -> str:
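The effect of the new log filtering is easiest to see on sample lines. This self-contained snippet re-implements the same rules as _filter_snakemake_output, purely for illustration::

    def filter_line(line: str) -> str:
        # Keep rule names and "Job N: ..." messages; suppress everything else.
        if line.startswith("localrule "):
            return line.replace("localrule ", "").rstrip(":")
        if line.startswith("Job ") and ":" in line:
            return line.split(":", 1)[1].strip()
        return ""

    assert filter_line("localrule splink_evaluating_pairs:") == "splink_evaluating_pairs"
    assert filter_line("Job 7: some job message") == "some job message"
    assert filter_line("Building DAG of jobs...") == ""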
easylink/steps/cascading/accept_all_pairs.def
ADDED
@@ -0,0 +1,22 @@
+
+Bootstrap: docker
+From: python@sha256:1c26c25390307b64e8ff73e7edf34b4fbeac59d41da41c08da28dc316a721899
+
+%files
+    ./accept_all_pairs.py /accept_all_pairs.py
+
+%post
+    # Create directories
+    mkdir -p /input_data
+    mkdir -p /extra_implementation_specific_input_data
+    mkdir -p /results
+    mkdir -p /diagnostics
+
+    # Install Python packages with specific versions
+    pip install pandas pyarrow
+
+%environment
+    export LC_ALL=C
+
+%runscript
+    python /accept_all_pairs.py '$@'
easylink/steps/cascading/accept_all_pairs.py
ADDED
@@ -0,0 +1,26 @@
+# STEP_NAME: evaluating_pairs
+# REQUIREMENTS: pandas pyarrow
+
+import os
+from pathlib import Path
+
+import pandas as pd
+
+blocks_dir = Path(os.environ["BLOCKS_DIR_PATH"])
+diagnostics_dir = Path(os.environ["DIAGNOSTICS_DIRECTORY"])
+output_path = Path(os.environ["OUTPUT_PATHS"])
+Path(output_path).parent.mkdir(exist_ok=True, parents=True)
+
+all_predictions = []
+
+for block_dir in blocks_dir.iterdir():
+    if str(block_dir.stem).startswith("."):
+        continue
+
+    pairs = pd.read_parquet(block_dir / "pairs.parquet")
+
+    all_predictions.append(pairs.assign(Probability=1.0))
+
+all_predictions = pd.concat(all_predictions, ignore_index=True)
+print(all_predictions)
+all_predictions.to_parquet(output_path)
easylink/steps/cascading/exclude_clustered.py
CHANGED
@@ -62,12 +62,21 @@ clusters_filepath = clusters_filepaths[0]
 
 # Exclude records that have been clustered
 clusters_df = load_file(clusters_filepath)
+# NOTE: We defined "clustered" for these purposes as clustered *with* anything else.
+# Simply putting a record into its own cluster does not indicate to us that it has
+# been sufficiently clustered to ignore.
+cluster_sizes = clusters_df.groupby("Cluster ID").size()
+clusters_df["size"] = cluster_sizes.loc[clusters_df["Cluster ID"]].values
+clusters_df = clusters_df[clusters_df["size"] > 1]
+
 dataset_df = load_file(dataset_path)
 clustered_record_ids = set(dataset_df["Record ID"].unique()) & set(
-    clusters_df["Input Record
+    clusters_df[clusters_df["Input Record Dataset"] == splitter_choice][
+        "Input Record ID"
+    ].unique()
 )
 
-IDS_TO_REMOVE = pd.DataFrame({"Record ID": list(clustered_record_ids)})
+IDS_TO_REMOVE = pd.DataFrame({"Input Record ID": list(clustered_record_ids)})
 
 # OUTPUT_PATHS is a single path to a file (results.parquet)
 results_filepath = os.environ["OUTPUT_PATHS"]
easylink/steps/cascading/exclude_none.py
CHANGED
@@ -67,7 +67,7 @@ clusters_df = load_file(clusters_filepath)
 
 # SAVE OUTPUTS
 
-IDS_TO_REMOVE = pd.DataFrame(columns=["Record ID"])
+IDS_TO_REMOVE = pd.DataFrame(columns=["Input Record ID"])
 
 # OUTPUT_PATHS is a single path to a file (results.parquet)
 results_filepath = os.environ["OUTPUT_PATHS"]
easylink/steps/cascading/one_to_many_links_to_clusters.def
ADDED
@@ -0,0 +1,22 @@
+
+Bootstrap: docker
+From: python@sha256:1c26c25390307b64e8ff73e7edf34b4fbeac59d41da41c08da28dc316a721899
+
+%files
+    ./one_to_many_links_to_clusters.py /one_to_many_links_to_clusters.py
+
+%post
+    # Create directories
+    mkdir -p /input_data
+    mkdir -p /extra_implementation_specific_input_data
+    mkdir -p /results
+    mkdir -p /diagnostics
+
+    # Install Python packages with specific versions
+    pip install pandas pyarrow networkx
+
+%environment
+    export LC_ALL=C
+
+%runscript
+    python /one_to_many_links_to_clusters.py '$@'
|
@@ -0,0 +1,109 @@
|
|
1
|
+
# STEP_NAME: links_to_clusters
|
2
|
+
# REQUIREMENTS: pandas pyarrow networkx
|
3
|
+
|
4
|
+
import os
|
5
|
+
from pathlib import Path
|
6
|
+
|
7
|
+
import networkx as nx
|
8
|
+
import pandas as pd
|
9
|
+
|
10
|
+
links = pd.read_parquet(os.environ["LINKS_FILE_PATH"])
|
11
|
+
output_path = Path(os.environ["OUTPUT_PATHS"])
|
12
|
+
|
13
|
+
no_duplicates_dataset = os.environ["NO_DUPLICATES_DATASET"]
|
14
|
+
break_ties_method = os.getenv("BREAK_TIES_METHOD", "drop")
|
15
|
+
|
16
|
+
left_no_duplicates_dataset = links["Left Record Dataset"] == no_duplicates_dataset
|
17
|
+
right_no_duplicates_dataset = links["Right Record Dataset"] == no_duplicates_dataset
|
18
|
+
|
19
|
+
if (left_no_duplicates_dataset & right_no_duplicates_dataset).any():
|
20
|
+
raise ValueError(
|
21
|
+
f"Provided links include links within the no_duplicates_dataset ({no_duplicates_dataset})"
|
22
|
+
)
|
23
|
+
|
24
|
+
if not (left_no_duplicates_dataset | right_no_duplicates_dataset).all():
|
25
|
+
raise ValueError(
|
26
|
+
f"Provided links include links that don't involve the no_duplicates_dataset ({no_duplicates_dataset})"
|
27
|
+
)
|
28
|
+
|
29
|
+
# Get the no-duplicates dataset all on the right
|
30
|
+
id_cols = [
|
31
|
+
"Left Record Dataset",
|
32
|
+
"Left Record ID",
|
33
|
+
"Right Record Dataset",
|
34
|
+
"Right Record ID",
|
35
|
+
]
|
36
|
+
switched_id_cols = [
|
37
|
+
"Right Record Dataset",
|
38
|
+
"Right Record ID",
|
39
|
+
"Left Record Dataset",
|
40
|
+
"Left Record ID",
|
41
|
+
]
|
42
|
+
links.loc[left_no_duplicates_dataset, id_cols] = links.loc[
|
43
|
+
left_no_duplicates_dataset, switched_id_cols
|
44
|
+
].to_numpy()
|
45
|
+
links[["Left Record ID", "Right Record ID"]] = links[
|
46
|
+
["Left Record ID", "Right Record ID"]
|
47
|
+
].astype(int)
|
48
|
+
|
49
|
+
links["Left Record Key"] = (
|
50
|
+
links["Left Record Dataset"] + "-__-" + links["Left Record ID"].astype(int).astype(str)
|
51
|
+
)
|
52
|
+
links["Right Record Key"] = (
|
53
|
+
links["Right Record Dataset"] + "-__-" + links["Right Record ID"].astype(int).astype(str)
|
54
|
+
)
|
55
|
+
|
56
|
+
links_to_accept = (
|
57
|
+
links[links["Probability"] >= float(os.environ["THRESHOLD_MATCH_PROBABILITY"])]
|
58
|
+
# Pre-emptively break probability ties by right record key for the highest_id method
|
59
|
+
.sort_values(["Probability", "Right Record Key"], ascending=False)
|
60
|
+
# No duplicates in the *right* means only one link per *left* record
|
61
|
+
.groupby(["Left Record Key"]).first()
|
62
|
+
)
|
63
|
+
|
64
|
+
if break_ties_method == "drop":
|
65
|
+
num_tied = (
|
66
|
+
links_to_accept.merge(links, on=["Left Record Key", "Probability"])
|
67
|
+
.groupby(["Left Record Key"])
|
68
|
+
.size()
|
69
|
+
)
|
70
|
+
print("Ties:")
|
71
|
+
print(num_tied)
|
72
|
+
print(num_tied.describe())
|
73
|
+
links_to_accept = links_to_accept[num_tied == 1]
|
74
|
+
elif break_ties_method == "highest_id":
|
75
|
+
# Done above pre-emptively
|
76
|
+
pass
|
77
|
+
else:
|
78
|
+
raise ValueError(f"Unknown break_ties_method {break_ties_method}")
|
79
|
+
|
80
|
+
# NOTE: We only include nodes involved in an accepted link in our cluster.
|
81
|
+
# If a node isn't involved in an accepted link, that could just represent
|
82
|
+
# that we haven't evaluated the right pairs involving it, not confidence that
|
83
|
+
# it is a singleton.
|
84
|
+
G = nx.from_pandas_edgelist(
|
85
|
+
links_to_accept.reset_index()[["Left Record Key", "Right Record Key"]].rename(
|
86
|
+
columns={"Left Record Key": "source", "Right Record Key": "target"}
|
87
|
+
)
|
88
|
+
)
|
89
|
+
|
90
|
+
# Compute connected components
|
91
|
+
components = list(nx.connected_components(G))
|
92
|
+
|
93
|
+
# Assign new cluster IDs
|
94
|
+
merged_data = []
|
95
|
+
for cluster_id, records in enumerate(components, start=1):
|
96
|
+
for record_key in records:
|
97
|
+
merged_data.append((record_key, cluster_id))
|
98
|
+
|
99
|
+
# Build the final DataFrame
|
100
|
+
merged_df = pd.DataFrame(merged_data, columns=["Input Record Key", "Cluster ID"])
|
101
|
+
|
102
|
+
merged_df[["Input Record Dataset", "Input Record ID"]] = (
|
103
|
+
merged_df["Input Record Key"].str.split("-__-", n=1, expand=True)
|
104
|
+
if not merged_df.empty
|
105
|
+
else pd.DataFrame(columns=["Input Record Dataset", "Input Record ID"])
|
106
|
+
)
|
107
|
+
merged_df["Input Record ID"] = merged_df["Input Record ID"].astype(int)
|
108
|
+
|
109
|
+
merged_df[["Input Record Dataset", "Input Record ID", "Cluster ID"]].to_parquet(output_path)
|
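The final clustering step in the new script is a plain connected-components pass over the accepted links. A small self-contained illustration (toy record keys, not package data)::

    import networkx as nx
    import pandas as pd

    # Accepted links become edges; each connected component becomes one cluster,
    # so several left records may share a single right (no-duplicates) record.
    edges = pd.DataFrame(
        {
            "source": ["census-__-1", "census-__-2"],
            "target": ["reference-__-10", "reference-__-10"],
        }
    )
    G = nx.from_pandas_edgelist(edges)
    print([sorted(c) for c in nx.connected_components(G)])
    # [['census-__-1', 'census-__-2', 'reference-__-10']]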
easylink/steps/cascading/update_clusters_by_connected_components.py
CHANGED
@@ -59,7 +59,18 @@ new_clusters_df = load_file(new_clusters_filepath)
 
 def merge_clusters(known_clusters_df, new_clusters_df):
     # Combine both dataframes
-    combined_df = pd.concat(
+    combined_df = pd.concat(
+        [
+            # Ensure cluster names are unique
+            known_clusters_df.assign(
+                **{"Cluster ID": lambda df: "known__" + df["Cluster ID"].astype(str)}
+            ),
+            new_clusters_df.assign(
+                **{"Cluster ID": lambda df: "new__" + df["Cluster ID"].astype(str)}
+            ),
+        ],
+        ignore_index=True,
+    )
     combined_df["Input Record Key"] = (
         combined_df["Input Record Dataset"]
         + "-__-"
@@ -92,9 +103,11 @@ def merge_clusters(known_clusters_df, new_clusters_df):
     # Build the final DataFrame
     merged_df = pd.DataFrame(merged_data, columns=["Input Record Key", "Cluster ID"])
 
-    merged_df[["Input Record Dataset", "Input Record ID"]] =
-        "Input Record Key"
-
+    merged_df[["Input Record Dataset", "Input Record ID"]] = (
+        merged_df["Input Record Key"].str.split("-__-", n=1, expand=True)
+        if not merged_df.empty
+        else pd.DataFrame(columns=["Input Record Dataset", "Input Record ID"])
+    )
 
     merged_df["Input Record ID"] = merged_df["Input Record ID"].astype(int)
 
easylink/steps/default/default_determining_exclusions.py
CHANGED
@@ -72,7 +72,7 @@ if len(clusters_df) > 0:
 
 # SAVE OUTPUTS
 
-IDS_TO_REMOVE = pd.DataFrame(columns=["Record ID"])
+IDS_TO_REMOVE = pd.DataFrame(columns=["Input Record ID"])
 
 # OUTPUT_PATHS is a single path to a file (results.parquet)
 results_filepath = os.environ["OUTPUT_PATHS"]
easylink/steps/default/default_removing_records.py
CHANGED
@@ -52,7 +52,7 @@ results_dir.mkdir(exist_ok=True, parents=True)
 dataset = load_file(dataset_path)
 ids_to_remove = load_file(ids_filepath)
 
-dataset = dataset[~dataset["Record ID"].isin(ids_to_remove)]
+dataset = dataset[~dataset["Record ID"].isin(ids_to_remove["Input Record ID"])]
 
 output_path = results_dir / Path(dataset_path).name
 logging.info(f"Writing output for dataset from input {dataset_path} to {output_path}")
easylink/steps/splink/splink_blocking_and_filtering.py
CHANGED
@@ -90,12 +90,18 @@ blocked_pairs = (
     .drop(columns=["match_key"])
 )
 
-blocked_pairs[["Left Record Dataset", "Left Record ID"]] =
-    "join_key_l"
-
-
-
-
+blocked_pairs[["Left Record Dataset", "Left Record ID"]] = (
+    blocked_pairs.pop("join_key_l").str.split("-__-", n=1, expand=True)
+    if not blocked_pairs.empty
+    else pd.DataFrame(columns=["Left Record Dataset", "Left Record ID"])
+)
+
+blocked_pairs[["Right Record Dataset", "Right Record ID"]] = (
+    blocked_pairs.pop("join_key_r").str.split("-__-", n=1, expand=True)
+    if not blocked_pairs.empty
+    else pd.DataFrame(columns=["Right Record Dataset", "Right Record ID"])
+)
+
 blocked_pairs[["Left Record ID", "Right Record ID"]] = blocked_pairs[
     ["Left Record ID", "Right Record ID"]
 ].astype(int)
easylink/steps/splink/splink_evaluating_pairs.py
CHANGED
@@ -35,6 +35,7 @@ for block_dir in blocks_dir.iterdir():
             comparisons.append(cl.LevenshteinAtThresholds(column))
         else:
             raise ValueError(f"Unknown comparison method {method}")
+    # TODO: check both datasets contain all the columns
 
     # Create the Splink linker in dedupe mode
     settings = SettingsCreator(
@@ -135,7 +136,7 @@ for block_dir in blocks_dir.iterdir():
 
     sqls = predict_from_comparison_vectors_sqls_using_settings(
         linker._settings_obj,
-        float(os.
+        float(os.getenv("THRESHOLD_MATCH_PROBABILITY", 0)),
         threshold_match_weight=None,
         sql_infinity_expression=linker._infinity_expression,
     )
easylink/steps/splink/splink_links_to_clusters.py
CHANGED
@@ -53,6 +53,8 @@ cc = (
 # Split "Record Key" back into "Input Record Dataset" and "Input Record ID"
 cc[["Input Record Dataset", "Input Record ID"]] = (
     cc["Record Key"].astype(str).str.split("-__-", n=1, expand=True)
+    if not cc.empty
+    else pd.DataFrame(columns=["Input Record Dataset", "Input Record ID"])
 )
 cc = cc.drop(columns=["Record Key"])
 cc["Input Record ID"] = cc["Input Record ID"].astype(int)
easylink/utilities/general_utils.py
CHANGED
@@ -97,24 +97,34 @@ def _add_logging_sink(
         Whether the logs should be converted to JSON before they're dumped
         to the logging sink.
     """
-
-
-
-
+
+    def format_message(record):
+        elapsed_seconds = int(record["elapsed"].total_seconds())
+        hours = elapsed_seconds // 3600
+        minutes = (elapsed_seconds % 3600) // 60
+        seconds = elapsed_seconds % 60
+        elapsed_str = f"{hours:02d}:{minutes:02d}:{seconds:02d}"
+        time_str = record["time"].strftime("%Y-%m-%d %H:%M:%S")
+
+        if colorize:
+            return f"\033[32m{time_str}\033[0m | \033[32m{elapsed_str}\033[0m | {record['message']}\n"
+        else:
+            return f"{time_str} | {elapsed_str} | {record['message']}\n"
+
     if verbose == 0:
         logger.add(
             sink,
-            colorize=
+            colorize=False,  # We handle colors in format_message
             level="INFO",
-            format=
+            format=format_message,
             serialize=serialize,
         )
     elif verbose >= 1:
         logger.add(
             sink,
-            colorize=
+            colorize=False,  # We handle colors in format_message
             level="DEBUG",
-            format=
+            format=format_message,
             serialize=serialize,
         )
 
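The elapsed-time formatting added in format_message is plain integer arithmetic; for example, 3725 elapsed seconds renders as 01:02:05::

    elapsed_seconds = 3725
    hours = elapsed_seconds // 3600           # 1
    minutes = (elapsed_seconds % 3600) // 60  # 2
    seconds = elapsed_seconds % 60            # 5
    print(f"{hours:02d}:{minutes:02d}:{seconds:02d}")  # 01:02:05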
easylink/utilities/validation_utils.py
CHANGED
@@ -341,8 +341,8 @@ def _validate_pairs(df: pd.DataFrame, filepath: str) -> None:
 def validate_ids_to_remove(filepath: str) -> None:
     """Validates a file containing IDs to remove.
 
-    - The file must contain a single column: "Record ID".
-    - "Record ID" must have unique values.
+    - The file must contain a single column: "Input Record ID".
+    - "Input Record ID" must have unique values.
 
     Parameters
     ----------
@@ -352,13 +352,13 @@ def validate_ids_to_remove(filepath: str) -> None:
     Raises
     ------
     LookupError
-        If the file is missing the "Record ID" column.
+        If the file is missing the "Input Record ID" column.
     ValueError
-        If the "Record ID" column is not unique.
+        If the "Input Record ID" column is not unique.
     """
-    _validate_required_columns(filepath, {"Record ID"})
+    _validate_required_columns(filepath, {"Input Record ID"})
     df = _read_file(filepath)
-    _validate_unique_column(df, "Record ID", filepath)
+    _validate_unique_column(df, "Input Record ID", filepath)
 
 
 def validate_records(filepath: str) -> None:
{easylink-0.1.24.dist-info → easylink-0.2.0.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: easylink
-Version: 0.1.24
+Version: 0.2.0
 Summary: Research repository for the EasyLink ER ecosystem project.
 Home-page: https://github.com/ihmeuw/easylink
 Author: The EasyLink developers
@@ -78,34 +78,50 @@ Installation
 
 .. _installation:
 
+**NOTE: This package requires AMD64 CPU architecture - it is not compatible with
+Apple's ARM64 architecture (e.g. M1 and newer Macs).**
+
 There are a few things to install in order to use this package:
 
--
+- Set up Linux.
+
+  Singularity (and thus EasyLink) requires Linux to run. If you are not already
+  using Linux, you will need to set up a virtual machine; refer to the
+  `Singularity documentation for installing on Windows or Mac <https://docs.sylabs.io/guides/4.1/admin-guide/installation.html#installation-on-windows-or-mac>`_.
 
-
-
-
-  ``singularity --version``. For an existing installation, your
+- Install Singularity.
+
+  First check if you already have Singularity installed by running the command
+  ``singularity --version``. For an existing installation, your Singularity version
   number is printed.
 
+  If Singularity is not yet installed, you will need to install it;
+  refer to the `Singularity docs for installing on Linux <https://docs.sylabs.io/guides/4.1/admin-guide/installation.html#installation-on-linux>`_.
+
+  Note that this requires administrator privileges; you may need to request installation
+  from your system admin if you are working in a shared computing environment.
+
 - Install conda.
 
   We recommend `miniforge <https://github.com/conda-forge/miniforge>`_. You can
   check if you already have conda installed by running the command ``conda --version``.
   For an existing installation, a version will be displayed.
 
--
+- Create a conda environment with python and graphviz installed.
+
+  ::
+
+    $ conda create --name easylink -c conda-forge python=3.12 graphviz 'gcc<14' -y
+    $ conda activate easylink
+
+- Install easylink in the environment.
 
 Option 1 - Install from PyPI with pip::
 
-    $ conda create --name easylink -c conda-forge python=3.12 graphviz 'gcc<14' -y
-    $ conda activate easylink
     $ pip install easylink
 
 Option 2 - Build from source with pip::
 
-    $ conda create --name easylink -c conda-forge python=3.12 graphviz 'gcc<14' -y
-    $ conda activate easylink
     $ pip install git+https://github.com/ihmeuw/easylink.git
 
 .. _end_installation:
{easylink-0.1.24.dist-info → easylink-0.2.0.dist-info}/RECORD
CHANGED
@@ -1,34 +1,38 @@
 easylink/__about__.py,sha256=2-oxCfu9t9yUJouLDwqYRZ0eii8kN25SxRzsawjWjho,440
 easylink/__init__.py,sha256=gGMcIVfiVnHtlDw5mZwhevcDb2wt-kuP6F64gnkFack,159
-easylink/_version.py,sha256=
-easylink/cli.py,sha256=
+easylink/_version.py,sha256=Zn1KFblwuFHiDRdRAiRnDBRkbPttWh44jKa5zG2ov0E,22
+easylink/cli.py,sha256=3Xoqclhn7mEHzuqyuVUjt67-V3Fqu0_Jr3B3lCdIuAg,10704
 easylink/configuration.py,sha256=TPd3WbqUcJMJDPJuHeo5ZebvZPQrRyfm8-laad2sOFk,12681
 easylink/graph_components.py,sha256=zZDZXg5smReHO3ryQC4pao24wyKXzWDe6jS3C6fM2ak,13892
 easylink/implementation.py,sha256=lSF37g-aQYgdLI0lgFaDrBkp23te9O9B1V-CmmRtB-0,14514
-easylink/implementation_metadata.yaml,sha256=
+easylink/implementation_metadata.yaml,sha256=ahuSVk5Ur1V0F8EsLZO5apkNC2bWv2RsytNaiWGo9Yk,12562
 easylink/pipeline.py,sha256=NJyMInbOCjJ_5kRzzuK5AcupvC7ecd5qLOC-f1Gy3Ss,17701
 easylink/pipeline_graph.py,sha256=jtjS7_2IVa189U8cL621600erC_0pa6BKPRRO8VqwiU,23760
 easylink/pipeline_schema.py,sha256=sj6YQqMFUS35RRFWdlrhPlud68d3Rb5y7iKfAEZy1qs,6877
 easylink/rule.py,sha256=n8r4NL7MiNzppl39MRbEMdHEpn_e_XS7LfrsJD6KNfA,16629
-easylink/runner.py,sha256=
+easylink/runner.py,sha256=h39MbWHgTs-VwkPxk76186si76e8UTf1hySqepqUSS8,13155
 easylink/step.py,sha256=-vdFhPvwAZ3d69LMQGmSIVdcMG8E8ZtSvTE0UWif7zs,91088
 easylink/devtools/implementation_creator.py,sha256=gZZpfpiOOh912nNjR_d5wR0mBO5c09jWMS0FSYRcE1o,19120
 easylink/pipeline_schema_constants/__init__.py,sha256=njUL2v67WFC3dW_FjOXWRLSeOhInox_ZINLEji0_7V8,1523
 easylink/pipeline_schema_constants/development.py,sha256=0x6lWqBmP1K9AebEmeZ4veSnLBcdQcZXzbV6lCU11bc,12622
 easylink/pipeline_schema_constants/main.py,sha256=kcAhdbK_BhS79LUMhKkn-Uttl2vA-MHVX4M1StTloJQ,22934
 easylink/pipeline_schema_constants/testing.py,sha256=ti08DeUuF-eWrGKMj4BMyOFFJnEYooDaWX0DGiferbk,24579
+easylink/steps/cascading/accept_all_pairs.def,sha256=kwZMF3H0mqCBcO1Y2parJXFBLp4e9bLQoVIYU7zZ8xY,486
+easylink/steps/cascading/accept_all_pairs.py,sha256=eF_rmqcZtL3vI1u-TJejOcKX2Qou-AbaLI7qAAGjoGI,703
 easylink/steps/cascading/exclude_clustered.def,sha256=GfoDqO2Vtsh7VI8SwGaJtv_KtKjs-UmBcivqQ7OPkjk,503
-easylink/steps/cascading/exclude_clustered.py,sha256=
+easylink/steps/cascading/exclude_clustered.py,sha256=T60deNb91_ZFg5K190G-Q7BC5EYrEdLPhFEK7Togv0Y,3048
 easylink/steps/cascading/exclude_none.def,sha256=iFUhUMocxtkA0NErkjVrBxY0MUdS3DIPNsbCpTJRP0k,488
-easylink/steps/cascading/exclude_none.py,sha256=
+easylink/steps/cascading/exclude_none.py,sha256=DesKAO-UcPqKKtUS92OHU25YDXMJLiBEcGLk69UYWDk,2481
+easylink/steps/cascading/one_to_many_links_to_clusters.def,sha256=BVFusUydsV3hY1en16OVr3TPqzwst-cEVBwvb8dtpqA,534
+easylink/steps/cascading/one_to_many_links_to_clusters.py,sha256=7QSJxW3mmR3LIjWBzzgi3vcOsmoYOsiSJn6iYGppHLA,3789
 easylink/steps/cascading/update_clusters_by_connected_components.def,sha256=sAAAWOod8EuAnotR1cayaGAvs7x6xoMVlwmLso_a9Cc,578
-easylink/steps/cascading/update_clusters_by_connected_components.py,sha256=
+easylink/steps/cascading/update_clusters_by_connected_components.py,sha256=jhpMgewztCXrRxBw2FnH2HjIybpp7GcHe4kjTMgQOyg,4059
 easylink/steps/default/default_clusters_to_links.def,sha256=9PjUygLvsoYMUZDznceuuv55t8fPs473P57J_RMl3U0,527
 easylink/steps/default/default_clusters_to_links.py,sha256=Ckm53d3W-ETNlTvQJPOkpHmSqCmxSWknMPQrEAIoTBY,2816
 easylink/steps/default/default_determining_exclusions.def,sha256=zZUEHDdrpLxzx3gTm-dki2ge5ivCCg4ziIwTErqCII0,542
-easylink/steps/default/default_determining_exclusions.py,sha256=
+easylink/steps/default/default_determining_exclusions.py,sha256=RpYHFAral4uYevgiOsYqUHYgsEIejV5NhYdQ3q7VeU0,2635
 easylink/steps/default/default_removing_records.def,sha256=QqacmOu6myxFSULHRKeKsVD8l73KDm4VEkPkPlovwqs,524
-easylink/steps/default/default_removing_records.py,sha256=
+easylink/steps/default/default_removing_records.py,sha256=I_xGdWftlwP7H8HdxfblSG2YFVqA986KOECVwMCn4fk,1925
 easylink/steps/default/default_schema_alignment.def,sha256=hFHJkos0Fhe4LvpjLOCd6klIaIqOKqECDDSTVu3G03Y,524
 easylink/steps/default/default_schema_alignment.py,sha256=oT5jbYQ3C3ocLgqqOnvH0SIJ6NeTtPBWWmCqr_frnAI,1479
 easylink/steps/default/default_updating_clusters.def,sha256=vDzSkTknDfeiXeHREpw4BkUxFcTWamxr81c3rZ7_enY,527
@@ -70,22 +74,22 @@ easylink/steps/rl-dummy/input_data/known_clusters.parquet,sha256=Ysodu65toHZN4Ag
 easylink/steps/rl-dummy/pre-processing/dummy_pre-processing.def,sha256=HeUSv2QvMOQzsyVktYR1xYoEqwiNpDo-p7IRcGSMspE,512
 easylink/steps/rl-dummy/pre-processing/dummy_pre-processing.py,sha256=I6kqG4e_H2yFW5MpsMXdpoY_NjHcBvVVAHWv89LUgXE,1852
 easylink/steps/splink/splink_blocking_and_filtering.def,sha256=umWzxJhsfdi8g3TD-r2mKpjC-FPAMDk6-IERiWigdQc,557
-easylink/steps/splink/splink_blocking_and_filtering.py,sha256=
+easylink/steps/splink/splink_blocking_and_filtering.py,sha256=3WMBmNEECB9Kxu4D6PAesZzBrhHTdpFEgvnGPsV4bww,5475
 easylink/steps/splink/splink_evaluating_pairs.def,sha256=DN3Ohy9qJOAyK58v164neP23HDVYpedMqzCu4eQh4Hg,521
-easylink/steps/splink/splink_evaluating_pairs.py,sha256=
+easylink/steps/splink/splink_evaluating_pairs.py,sha256=vWUe3vQo9uGs0Cy8pG5PbolzsJX_cEaPS3Q0PMcBjcg,6253
 easylink/steps/splink/splink_links_to_clusters.def,sha256=RurvOYyGjNs9tx64DTXwI-GSgHD4T7SzDfhAH18pTEM,524
-easylink/steps/splink/splink_links_to_clusters.py,sha256=
+easylink/steps/splink/splink_links_to_clusters.py,sha256=Brpy3ZKSBpBUeOitg1ZgDvuMVwILH0QBkLXRJN8LXno,2015
 easylink/utilities/__init__.py,sha256=0U33kbv4hoMfFQ_lh5hLwifxRPzOgkLkjKLYxmaK10g,196
 easylink/utilities/aggregator_utils.py,sha256=_DAHRAf9janbDsuj_jnAn5Dzz2s4R5Ni3YeytDpN9UE,954
 easylink/utilities/data_utils.py,sha256=XPRjq3qW_fN0xQ23Jms_xBzpTHbRwqZWDP1AW0nYkP0,6926
-easylink/utilities/general_utils.py,sha256=
+easylink/utilities/general_utils.py,sha256=MmuoV4T6PgyEDjbepC_1D3TGrq70Hp-hl-GaAYr5wRU,5033
 easylink/utilities/paths.py,sha256=9inDKMPorAaWh5vhjc3j1Tj_aXVKhLdodiJO9H1nNes,947
 easylink/utilities/spark.smk,sha256=kGtpem7LfQc71tMh5WAYaqKnHQKFvcdhPQSdumOP70k,5799
 easylink/utilities/splitter_utils.py,sha256=KXiVUYJ9TGxCQmrwos18pB1sxG_0Ay67qoDJT6vem2o,3828
-easylink/utilities/validation_utils.py,sha256=
-easylink-0.
-easylink-0.
-easylink-0.
-easylink-0.
-easylink-0.
-easylink-0.
+easylink/utilities/validation_utils.py,sha256=1naksMPStw_xIOqskX6DE99f16Y7eCcVF9I5ZILjMvI,18453
+easylink-0.2.0.dist-info/licenses/LICENSE,sha256=z6NBo3hY2Pbh2Wjm9IcHLyQoG-kd816uASh8PbwhTcQ,1530
+easylink-0.2.0.dist-info/METADATA,sha256=HxtOiOMe9hTRcK6HL6sLTTQNeP9X7hrhiodTpEMUeOA,4218
+easylink-0.2.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+easylink-0.2.0.dist-info/entry_points.txt,sha256=OGMZDFltg3yMboT7XjJt3joiPhRfV_7jnREVtrAIQNU,51
+easylink-0.2.0.dist-info/top_level.txt,sha256=oHcOpcF_jDMWFiJRzfGQvuskENGDjSPC_Agu9Z_Xvik,9
+easylink-0.2.0.dist-info/RECORD,,
{easylink-0.1.24.dist-info → easylink-0.2.0.dist-info}/WHEEL
File without changes
{easylink-0.1.24.dist-info → easylink-0.2.0.dist-info}/entry_points.txt
File without changes
{easylink-0.1.24.dist-info → easylink-0.2.0.dist-info}/licenses/LICENSE
File without changes
{easylink-0.1.24.dist-info → easylink-0.2.0.dist-info}/top_level.txt
File without changes