easylink 0.1.25__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff shows the contents of publicly released versions of the package as published to a supported registry, and is provided for informational purposes only.
- easylink/_version.py +1 -1
- easylink/cli.py +8 -3
- easylink/implementation_metadata.yaml +62 -38
- easylink/runner.py +66 -10
- easylink/steps/cascading/accept_all_pairs.def +22 -0
- easylink/steps/cascading/accept_all_pairs.py +26 -0
- easylink/steps/cascading/exclude_clustered.py +11 -2
- easylink/steps/cascading/exclude_none.py +1 -1
- easylink/steps/cascading/one_to_many_links_to_clusters.def +22 -0
- easylink/steps/cascading/one_to_many_links_to_clusters.py +109 -0
- easylink/steps/cascading/update_clusters_by_connected_components.py +17 -4
- easylink/steps/default/default_determining_exclusions.py +1 -1
- easylink/steps/default/default_removing_records.py +1 -1
- easylink/steps/splink/splink_blocking_and_filtering.py +12 -6
- easylink/steps/splink/splink_links_to_clusters.py +2 -0
- easylink/utilities/validation_utils.py +6 -6
- {easylink-0.1.25.dist-info → easylink-0.2.0.dist-info}/METADATA +1 -1
- {easylink-0.1.25.dist-info → easylink-0.2.0.dist-info}/RECORD +22 -18
- {easylink-0.1.25.dist-info → easylink-0.2.0.dist-info}/WHEEL +0 -0
- {easylink-0.1.25.dist-info → easylink-0.2.0.dist-info}/entry_points.txt +0 -0
- {easylink-0.1.25.dist-info → easylink-0.2.0.dist-info}/licenses/LICENSE +0 -0
- {easylink-0.1.25.dist-info → easylink-0.2.0.dist-info}/top_level.txt +0 -0
easylink/_version.py
CHANGED
@@ -1 +1 @@
-__version__ = "0.1.25"
+__version__ = "0.2.0"
easylink/cli.py
CHANGED
@@ -211,9 +211,14 @@ def run(
             images_dir=images,
             schema_name=schema,
         )
-    except SystemExit:
-        # Snakemake uses SystemExit
-
+    except SystemExit as e:
+        # Snakemake uses SystemExit with exit code 0 for success, non-zero for failure
+        if e.code == 0:
+            logger.info("\033[32m*** FINISHED ***\033[0m")  # Green
+        else:
+            logger.error(
+                f"\033[31mERROR: Pipeline failed with exit code {e.code}\033[0m"
+            )  # Red
         raise
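Note: the new handler distinguishes success from failure by inspecting the exit code Snakemake passes to SystemExit. A minimal standalone sketch of the same pattern (illustrative only, not the package's actual code, which logs through its configured logger):

    import sys

    def run_pipeline() -> None:
        sys.exit(0)  # stand-in for Snakemake, which exits 0 on success

    try:
        run_pipeline()
    except SystemExit as e:
        if e.code == 0:
            print("\033[32m*** FINISHED ***\033[0m")  # green
        else:
            print(f"\033[31mERROR: Pipeline failed with exit code {e.code}\033[0m")  # red
        raise  # re-raise so the process still exits with Snakemake's code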
|
easylink/implementation_metadata.yaml
CHANGED
@@ -2,7 +2,7 @@ step_1_python_pandas:
   steps:
     - step_1
   image_name: python_pandas.sif
-  zenodo_record_id:
+  zenodo_record_id: 15778354
   md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
   script_cmd: python /dummy_step.py
   outputs:
@@ -11,7 +11,7 @@ step_1a_python_pandas:
   steps:
     - step_1a
   image_name: python_pandas.sif
-  zenodo_record_id:
+  zenodo_record_id: 15778354
   md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
   script_cmd: python /dummy_step.py
   env:
@@ -22,7 +22,7 @@ step_1b_python_pandas:
   steps:
     - step_1b
   image_name: python_pandas.sif
-  zenodo_record_id:
+  zenodo_record_id: 15778354
   md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
   script_cmd: python /dummy_step.py
   env:
@@ -33,7 +33,7 @@ step_2_python_pandas:
   steps:
     - step_2
   image_name: python_pandas.sif
-  zenodo_record_id:
+  zenodo_record_id: 15778354
   md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
   script_cmd: python /dummy_step.py
   outputs:
@@ -42,7 +42,7 @@ step_3_python_pandas:
   steps:
     - step_3
   image_name: python_pandas.sif
-  zenodo_record_id:
+  zenodo_record_id: 15778354
   md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
   script_cmd: python /dummy_step.py
   outputs:
@@ -51,7 +51,7 @@ step_4_python_pandas:
   steps:
     - step_4
   image_name: python_pandas.sif
-  zenodo_record_id:
+  zenodo_record_id: 15778354
   md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
   script_cmd: python /dummy_step.py
   env:
@@ -62,7 +62,7 @@ step_5_python_pandas:
   steps:
     - step_5
   image_name: python_pandas.sif
-  zenodo_record_id:
+  zenodo_record_id: 15778354
   md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
   script_cmd: python /dummy_step.py
   env:
@@ -73,7 +73,7 @@ step_6_python_pandas:
   steps:
     - step_6
   image_name: python_pandas.sif
-  zenodo_record_id:
+  zenodo_record_id: 15778354
   md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
   script_cmd: python /dummy_step.py
   env:
@@ -84,7 +84,7 @@ step_4a_python_pandas:
   steps:
     - step_4a
   image_name: python_pandas.sif
-  zenodo_record_id:
+  zenodo_record_id: 15778354
   md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
   script_cmd: python /dummy_step.py
   env:
@@ -95,7 +95,7 @@ step_4b_python_pandas:
   steps:
     - step_4b
   image_name: python_pandas.sif
-  zenodo_record_id:
+  zenodo_record_id: 15778354
   md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
   script_cmd: python /dummy_step.py
   env:
@@ -106,7 +106,7 @@ step_4b_r:
   steps:
     - step_4b
   image_name: r-image.sif
-  zenodo_record_id:
+  zenodo_record_id: 15778354
   md5_checksum: a4a03b836694a2b81a1bd2852736ccc5
   script_cmd: Rscript /dummy_step.R
   env:
@@ -117,7 +117,7 @@ step_1_python_pyspark:
   steps:
     - step_1
   image_name: python_pyspark.sif
-  zenodo_record_id:
+  zenodo_record_id: 15778354
   md5_checksum: c948577ab0607411dd4b640622d9ec3a
   script_cmd: python3 /code/dummy_step.py
   outputs:
@@ -127,7 +127,7 @@ step_2_python_pyspark:
   steps:
     - step_2
   image_name: python_pyspark.sif
-  zenodo_record_id:
+  zenodo_record_id: 15778354
   md5_checksum: c948577ab0607411dd4b640622d9ec3a
   script_cmd: python3 /code/dummy_step.py
   outputs:
@@ -137,7 +137,7 @@ step_3_python_pyspark:
   steps:
     - step_3
   image_name: python_pyspark.sif
-  zenodo_record_id:
+  zenodo_record_id: 15778354
   md5_checksum: c948577ab0607411dd4b640622d9ec3a
   script_cmd: python3 /code/dummy_step.py
   outputs:
@@ -147,7 +147,7 @@ step_4_python_pyspark:
   steps:
     - step_4
   image_name: python_pyspark.sif
-  zenodo_record_id:
+  zenodo_record_id: 15778354
   md5_checksum: c948577ab0607411dd4b640622d9ec3a
   script_cmd: python3 /code/dummy_step.py
   env:
@@ -158,7 +158,7 @@ step_1_r:
   steps:
     - step_1
   image_name: r-image.sif
-  zenodo_record_id:
+  zenodo_record_id: 15778354
   md5_checksum: a4a03b836694a2b81a1bd2852736ccc5
   script_cmd: Rscript /dummy_step.R
   outputs:
@@ -168,7 +168,7 @@ step_2_r:
   steps:
     - step_2
   image_name: r-image.sif
-  zenodo_record_id:
+  zenodo_record_id: 15778354
   md5_checksum: a4a03b836694a2b81a1bd2852736ccc5
   script_cmd: Rscript /dummy_step.R
   outputs:
@@ -178,7 +178,7 @@ step_3_r:
   steps:
     - step_3
   image_name: r-image.sif
-  zenodo_record_id:
+  zenodo_record_id: 15778354
   md5_checksum: a4a03b836694a2b81a1bd2852736ccc5
   script_cmd: Rscript /dummy_step.R
   outputs:
@@ -188,7 +188,7 @@ step_4_r:
   steps:
     - step_4
   image_name: r-image.sif
-  zenodo_record_id:
+  zenodo_record_id: 15778354
   md5_checksum: a4a03b836694a2b81a1bd2852736ccc5
   script_cmd: Rscript /dummy_step.R
   env:
@@ -201,7 +201,7 @@ step_1_and_step_2_combined_python_pandas:
     - step_1
     - step_2
   image_name: python_pandas.sif
-  zenodo_record_id:
+  zenodo_record_id: 15778354
   md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
   script_cmd: python /dummy_step.py
   outputs:
@@ -211,7 +211,7 @@ step_1_and_step_2_parallel_python_pandas:
     - step_1
     - step_2
   image_name: python_pandas.sif
-  zenodo_record_id:
+  zenodo_record_id: 15778354
   md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
   script_cmd: python /dummy_step.py
   env:
@@ -223,7 +223,7 @@ step_3_and_step_4_combined_python_pandas:
     - step_3
     - step_4
   image_name: python_pandas.sif
-  zenodo_record_id:
+  zenodo_record_id: 15778354
   md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
   script_cmd: python /dummy_step.py
   outputs:
@@ -233,7 +233,7 @@ step_1a_and_step_1b_combined_python_pandas:
     - step_1a
     - step_1b
   image_name: python_pandas.sif
-  zenodo_record_id:
+  zenodo_record_id: 15778354
   md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
   script_cmd: python /dummy_step.py
   outputs:
@@ -261,8 +261,8 @@ default_removing_records:
   steps:
     - removing_records
   image_name: default_removing_records.sif
-  zenodo_record_id:
-  md5_checksum:
+  zenodo_record_id: 15778354
+  md5_checksum: 05123136e756bfa57f1d7d5a3315f2f6
   script_cmd: python /default_removing_records.py
   outputs:
     dataset: dataset
@@ -270,7 +270,7 @@ default_clusters_to_links:
   steps:
     - clusters_to_links
   image_name: default_clusters_to_links.sif
-  zenodo_record_id:
+  zenodo_record_id: 15778354
   md5_checksum: 0d00d1272bd8193f60727791097aa065
   script_cmd: python /default_clusters_to_links.py
   outputs:
@@ -279,8 +279,8 @@ default_determining_exclusions:
   steps:
     - determining_exclusions
   image_name: default_determining_exclusions.sif
-  zenodo_record_id:
-  md5_checksum:
+  zenodo_record_id: 15778354
+  md5_checksum: f4e9f740d8dd7599bfbb2b9eb54ced38
   script_cmd: python /default_determining_exclusions.py
   outputs:
     ids_to_remove: result.parquet
@@ -288,7 +288,7 @@ default_updating_clusters:
   steps:
     - updating_clusters
   image_name: default_updating_clusters.sif
-  zenodo_record_id:
+  zenodo_record_id: 15778354
   md5_checksum: cc6bd29e099c2523347fa04545aa35c9
   script_cmd: python /default_updating_clusters.py
   outputs:
@@ -299,7 +299,7 @@ save_clusters:
   steps:
     - canonicalizing_and_downstream_analysis
   image_name: save_clusters.sif
-  zenodo_record_id:
+  zenodo_record_id: 15778354
   md5_checksum: 384ab2be668cbadc45160a674f621022
   script_cmd: python /dummy_canonicalizing_and_downstream_analysis.py
   outputs:
@@ -310,7 +310,7 @@ no_pre-processing:
   steps:
     - pre-processing
   image_name: no_pre-processing.sif
-  zenodo_record_id:
+  zenodo_record_id: 15778354
   md5_checksum: 9a9c080cf145078152501cf96bf61f27
   script_cmd: python /dummy_pre-processing.py
   outputs:
@@ -319,7 +319,7 @@ default_schema_alignment:
   steps:
     - schema_alignment
   image_name: default_schema_alignment.sif
-  zenodo_record_id:
+  zenodo_record_id: 15778354
   md5_checksum: 3166587f9cfec478b999a17074d628f7
   script_cmd: python /default_schema_alignment.py
   outputs:
@@ -328,8 +328,8 @@ splink_blocking_and_filtering:
   steps:
     - blocking_and_filtering
   image_name: splink_blocking_and_filtering.sif
-  zenodo_record_id:
-  md5_checksum:
+  zenodo_record_id: 15778354
+  md5_checksum: 3f8777c5751d7550762be078d87e7db2
   script_cmd: python /splink_blocking_and_filtering.py
   outputs:
     blocks: blocks
@@ -337,7 +337,7 @@ splink_evaluating_pairs:
   steps:
     - evaluating_pairs
   image_name: splink_evaluating_pairs.sif
-  zenodo_record_id:
+  zenodo_record_id: 15778354
   md5_checksum: b57f4bd16b7a3aa5099569078ea4c064
   script_cmd: python /splink_evaluating_pairs.py
   outputs:
@@ -346,8 +346,8 @@ splink_links_to_clusters:
   steps:
     - links_to_clusters
   image_name: splink_links_to_clusters.sif
-  zenodo_record_id:
-  md5_checksum:
+  zenodo_record_id: 15778354
+  md5_checksum: 81a71aa2ce6544953f3edb88d4ee6ec1
   script_cmd: python /splink_links_to_clusters.py
   outputs:
     clusters: result.parquet
@@ -369,6 +369,8 @@ exclude_clustered:
   steps:
     - determining_exclusions
   image_name: exclude_clustered.sif
+  zenodo_record_id: 15778354
+  md5_checksum: db51f68ea24d114ed2b83a1382b6e6b6
   script_cmd: python /exclude_clustered.py
   outputs:
     ids_to_remove: result.parquet
@@ -376,6 +378,8 @@ exclude_none:
   steps:
     - determining_exclusions
   image_name: exclude_none.sif
+  zenodo_record_id: 15778354
+  md5_checksum: af12b6dde2aace9dab08d352368b16a1
   script_cmd: python /exclude_none.py
   outputs:
     ids_to_remove: result.parquet
@@ -383,6 +387,8 @@ update_clusters_by_connected_components:
   steps:
     - updating_clusters
   image_name: update_clusters_by_connected_components.sif
+  zenodo_record_id: 15778354
+  md5_checksum: 806b0fe86a3306d74391678ed951b054
   script_cmd: python /update_clusters_by_connected_components.py
   outputs:
     clusters: result.parquet
@@ -390,8 +396,26 @@ middle_name_to_initial:
   steps:
     - pre-processing
   image_name: middle_name_to_initial.sif
-  zenodo_record_id:
+  zenodo_record_id: 15778354
   md5_checksum: 89db9c3318300cda9d538cde08c3c323
   script_cmd: python /middle_name_to_initial.py
   outputs:
     dataset: dataset
+one_to_many_links_to_clusters:
+  steps:
+    - links_to_clusters
+  image_name: one_to_many_links_to_clusters.sif
+  zenodo_record_id: 15778354
+  md5_checksum: 0bf6f0b8663b4c2f99995a2213dc541a
+  script_cmd: python /one_to_many_links_to_clusters.py
+  outputs:
+    clusters: result.parquet
+accept_all_pairs:
+  steps:
+    - evaluating_pairs
+  image_name: accept_all_pairs.sif
+  zenodo_record_id: 15778354
+  md5_checksum: c71c88d159c3d7343ebc39cd37224bd9
+  script_cmd: python /accept_all_pairs.py
+  outputs:
+    links: result.parquet
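Note: every implementation entry now pins its container image to Zenodo record 15778354 and records an md5_checksum, so a downloaded .sif can be verified before use. A sketch of such a check, making no assumptions about EasyLink's internals (verify_image is an illustrative helper, not part of the package):

    import hashlib
    from pathlib import Path

    def verify_image(image_path: Path, expected_md5: str) -> bool:
        digest = hashlib.md5()
        with open(image_path, "rb") as f:
            for chunk in iter(lambda: f.read(1 << 20), b""):  # hash in 1 MiB chunks
                digest.update(chunk)
        return digest.hexdigest() == expected_md5

    # e.g. verify_image(Path("python_pandas.sif"), "9177b8e168fcc9cae91bf61265f2185c")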
easylink/runner.py
CHANGED
@@ -11,6 +11,8 @@ be called from the ``easylink.cli`` module.
 import os
 import socket
 import subprocess
+import threading
+import time
 from contextlib import redirect_stderr, redirect_stdout
 from pathlib import Path
 
@@ -161,6 +163,32 @@ def _run_snakemake_with_filtered_output(argv: list[str], results_dir: Path) -> N
         def __init__(self, log_file_path: Path):
             self.log_file = open(log_file_path, "w")
             self.buffer = ""
+            self.last_output_time = time.time()
+            self.heartbeat_timer = None
+            self.dots_printed = False  # Track if we've printed progress dots
+            self._start_heartbeat()
+
+        def _start_heartbeat(self):
+            """Start a timer that prints progress dots during long-running containers."""
+
+            def heartbeat():
+                current_time = time.time()
+                if current_time - self.last_output_time > 30:  # 30 seconds since last output
+                    # Print a dot to show progress - use original stdout if available
+                    if hasattr(self, "original_stdout") and self.original_stdout:
+                        self.original_stdout.write(".")
+                        self.original_stdout.flush()
+                        self.dots_printed = True  # Mark that we've printed dots
+                    self.last_output_time = current_time
+                # Schedule next heartbeat
+                self.heartbeat_timer = threading.Timer(30.0, heartbeat)
+                self.heartbeat_timer.daemon = True
+                self.heartbeat_timer.start()
+
+            # Start first heartbeat after 30 seconds
+            self.heartbeat_timer = threading.Timer(30.0, heartbeat)
+            self.heartbeat_timer.daemon = True
+            self.heartbeat_timer.start()
 
         def write(self, text: str) -> int:
             # Write to log file
@@ -172,9 +200,19 @@ def _run_snakemake_with_filtered_output(argv: list[str], results_dir: Path) -> N
             while "\n" in self.buffer:
                 line, self.buffer = self.buffer.split("\n", 1)
                 if line.strip():
-                    filtered_line = _filter_snakemake_output_simple(line.strip())
+                    filtered_line = _filter_snakemake_output(line.strip())
                     if filtered_line:
+                        # Add newline after dots if we've printed any
+                        if (
+                            self.dots_printed
+                            and hasattr(self, "original_stdout")
+                            and self.original_stdout
+                        ):
+                            self.original_stdout.write("\n")
+                            self.original_stdout.flush()
+                            self.dots_printed = False  # Reset the flag
                         logger.info(filtered_line)
+                        self.last_output_time = time.time()  # Reset heartbeat timer
 
             return len(text)
 
@@ -182,10 +220,23 @@ def _run_snakemake_with_filtered_output(argv: list[str], results_dir: Path) -> N
             self.log_file.flush()
 
         def close(self):
+            # Stop heartbeat timer
+            if self.heartbeat_timer:
+                self.heartbeat_timer.cancel()
+
             # Process and log any remaining buffer content
             if self.buffer.strip():
-                filtered_line = _filter_snakemake_output_simple(self.buffer.strip())
+                filtered_line = _filter_snakemake_output(self.buffer.strip())
                 if filtered_line:
+                    # Add newline after dots if we've printed any
+                    if (
+                        self.dots_printed
+                        and hasattr(self, "original_stdout")
+                        and self.original_stdout
+                    ):
+                        self.original_stdout.write("\n")
+                        self.original_stdout.flush()
+                        self.dots_printed = False
                     logger.info(filtered_line)
             self.log_file.close()
 
@@ -196,7 +247,14 @@ def _run_snakemake_with_filtered_output(argv: list[str], results_dir: Path) -> N
             self.close()
 
     # Create the filtering output handler and ensure the log file is always closed
+    # Save original stdout for progress dots before redirection
+    import sys
+
+    original_stdout = sys.stdout
+
     with FilteringOutput(snakemake_log_file) as filtering_output:
+        # Pass original stdout to filtering output for progress dots
+        filtering_output.original_stdout = original_stdout
         try:
             # Redirect both stdout and stderr to our filtering handler
             with redirect_stdout(filtering_output), redirect_stderr(filtering_output):
@@ -210,9 +268,8 @@ def _run_snakemake_with_filtered_output(argv: list[str], results_dir: Path) -> N
             raise
 
 
-def _filter_snakemake_output_simple(line: str) -> str | None:
-    """
-    Simple filter for Snakemake output showing only localrules and Job messages.
+def _filter_snakemake_output(line: str) -> str:
+    """Filter for Snakemake output.
 
     Parameters
     ----------
@@ -221,12 +278,11 @@ def _filter_snakemake_output_simple(line: str) -> str | None:
 
     Returns
     -------
-
-        The filtered line for display, or None to suppress the line.
+        The filtered line for display.
     """
     # Skip empty lines
     if not line.strip():
-        return
+        return ""
 
     if line.startswith("localrule "):
         # Show localrule names (without the "localrule" prefix)
@@ -236,10 +292,10 @@ def _filter_snakemake_output_simple(line: str) -> str | None:
         # Show Job messages
         # Extract everything after "Job ##: "
         parts = line.split(":", 1)
-        filtered_line = parts[1].strip() if len(parts) > 1 else None
+        filtered_line = parts[1].strip() if len(parts) > 1 else ""
     else:
         # Suppress everything else
-        filtered_line = None
+        filtered_line = ""
     return filtered_line
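Note: the heartbeat added above re-arms a daemon threading.Timer every 30 seconds and emits a dot only when nothing has been written in the meantime, so long-running containers still show signs of life. The same pattern in isolation (a sketch, not the FilteringOutput class itself):

    import sys
    import threading
    import time

    class Heartbeat:
        def __init__(self, interval: float = 30.0):
            self.interval = interval
            self.last_output_time = time.time()
            self._arm()

        def _arm(self) -> None:
            self.timer = threading.Timer(self.interval, self._beat)
            self.timer.daemon = True  # never keep the process alive
            self.timer.start()

        def _beat(self) -> None:
            if time.time() - self.last_output_time > self.interval:
                sys.stdout.write(".")  # quiet period: show a dot of progress
                sys.stdout.flush()
                self.last_output_time = time.time()
            self._arm()  # schedule the next check

        def notify_output(self) -> None:
            self.last_output_time = time.time()  # real output postpones the dots

        def stop(self) -> None:
            self.timer.cancel()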
|
easylink/steps/cascading/accept_all_pairs.def
ADDED
@@ -0,0 +1,22 @@
+
+Bootstrap: docker
+From: python@sha256:1c26c25390307b64e8ff73e7edf34b4fbeac59d41da41c08da28dc316a721899
+
+%files
+    ./accept_all_pairs.py /accept_all_pairs.py
+
+%post
+    # Create directories
+    mkdir -p /input_data
+    mkdir -p /extra_implementation_specific_input_data
+    mkdir -p /results
+    mkdir -p /diagnostics
+
+    # Install Python packages with specific versions
+    pip install pandas pyarrow
+
+%environment
+    export LC_ALL=C
+
+%runscript
+    python /accept_all_pairs.py '$@'
easylink/steps/cascading/accept_all_pairs.py
ADDED
@@ -0,0 +1,26 @@
+# STEP_NAME: evaluating_pairs
+# REQUIREMENTS: pandas pyarrow
+
+import os
+from pathlib import Path
+
+import pandas as pd
+
+blocks_dir = Path(os.environ["BLOCKS_DIR_PATH"])
+diagnostics_dir = Path(os.environ["DIAGNOSTICS_DIRECTORY"])
+output_path = Path(os.environ["OUTPUT_PATHS"])
+Path(output_path).parent.mkdir(exist_ok=True, parents=True)
+
+all_predictions = []
+
+for block_dir in blocks_dir.iterdir():
+    if str(block_dir.stem).startswith("."):
+        continue
+
+    pairs = pd.read_parquet(block_dir / "pairs.parquet")
+
+    all_predictions.append(pairs.assign(Probability=1.0))
+
+all_predictions = pd.concat(all_predictions, ignore_index=True)
+print(all_predictions)
+all_predictions.to_parquet(output_path)
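Note: accept_all_pairs reads each block's pairs.parquet under BLOCKS_DIR_PATH and emits every pair with Probability 1.0, useful when blocking alone should determine the links. A local smoke test under assumed paths (the /tmp locations and single-block layout are invented for illustration):

    import os
    from pathlib import Path

    import pandas as pd

    blocks = Path("/tmp/blocks")
    (blocks / "block_0").mkdir(parents=True, exist_ok=True)
    pd.DataFrame(
        {
            "Left Record Dataset": ["a"],
            "Left Record ID": [1],
            "Right Record Dataset": ["b"],
            "Right Record ID": [2],
        }
    ).to_parquet(blocks / "block_0" / "pairs.parquet")

    os.environ["BLOCKS_DIR_PATH"] = str(blocks)
    os.environ["DIAGNOSTICS_DIRECTORY"] = "/tmp/diagnostics"
    os.environ["OUTPUT_PATHS"] = "/tmp/results/result.parquet"
    # Running accept_all_pairs.py now writes the single pair with Probability 1.0.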
easylink/steps/cascading/exclude_clustered.py
CHANGED
@@ -62,12 +62,21 @@ clusters_filepath = clusters_filepaths[0]
 
 # Exclude records that have been clustered
 clusters_df = load_file(clusters_filepath)
+# NOTE: We defined "clustered" for these purposes as clustered *with* anything else.
+# Simply putting a record into its own cluster does not indicate to us that it has
+# been sufficiently clustered to ignore.
+cluster_sizes = clusters_df.groupby("Cluster ID").size()
+clusters_df["size"] = cluster_sizes.loc[clusters_df["Cluster ID"]].values
+clusters_df = clusters_df[clusters_df["size"] > 1]
+
 dataset_df = load_file(dataset_path)
 clustered_record_ids = set(dataset_df["Record ID"].unique()) & set(
-    clusters_df["Input Record ID"].unique()
+    clusters_df[clusters_df["Input Record Dataset"] == splitter_choice][
+        "Input Record ID"
+    ].unique()
 )
 
-IDS_TO_REMOVE = pd.DataFrame({"Record ID": list(clustered_record_ids)})
+IDS_TO_REMOVE = pd.DataFrame({"Input Record ID": list(clustered_record_ids)})
 
 # OUTPUT_PATHS is a single path to a file (results.parquet)
 results_filepath = os.environ["OUTPUT_PATHS"]
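Note: the size filter keeps only records whose cluster has more than one member, since a singleton cluster says nothing about whether a record has been resolved. The same filter can be written with groupby().transform("size"), avoiding the intermediate .loc lookup (a sketch on a stand-in frame, not the step's actual code):

    import pandas as pd

    clusters_df = pd.DataFrame(
        {"Input Record ID": [1, 2, 3], "Cluster ID": ["a", "a", "b"]}
    )
    clusters_df = clusters_df[
        clusters_df.groupby("Cluster ID")["Cluster ID"].transform("size") > 1
    ]
    # Records 1 and 2 remain candidates for exclusion; record 3 sits alone in
    # cluster "b" and is not treated as clustered.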
easylink/steps/cascading/exclude_none.py
CHANGED
@@ -67,7 +67,7 @@ clusters_df = load_file(clusters_filepath)
 
 # SAVE OUTPUTS
 
-IDS_TO_REMOVE = pd.DataFrame(columns=["Record ID"])
+IDS_TO_REMOVE = pd.DataFrame(columns=["Input Record ID"])
 
 # OUTPUT_PATHS is a single path to a file (results.parquet)
 results_filepath = os.environ["OUTPUT_PATHS"]
easylink/steps/cascading/one_to_many_links_to_clusters.def
ADDED
@@ -0,0 +1,22 @@
+
+Bootstrap: docker
+From: python@sha256:1c26c25390307b64e8ff73e7edf34b4fbeac59d41da41c08da28dc316a721899
+
+%files
+    ./one_to_many_links_to_clusters.py /one_to_many_links_to_clusters.py
+
+%post
+    # Create directories
+    mkdir -p /input_data
+    mkdir -p /extra_implementation_specific_input_data
+    mkdir -p /results
+    mkdir -p /diagnostics
+
+    # Install Python packages with specific versions
+    pip install pandas pyarrow networkx
+
+%environment
+    export LC_ALL=C
+
+%runscript
+    python /one_to_many_links_to_clusters.py '$@'
easylink/steps/cascading/one_to_many_links_to_clusters.py
ADDED
@@ -0,0 +1,109 @@
+# STEP_NAME: links_to_clusters
+# REQUIREMENTS: pandas pyarrow networkx
+
+import os
+from pathlib import Path
+
+import networkx as nx
+import pandas as pd
+
+links = pd.read_parquet(os.environ["LINKS_FILE_PATH"])
+output_path = Path(os.environ["OUTPUT_PATHS"])
+
+no_duplicates_dataset = os.environ["NO_DUPLICATES_DATASET"]
+break_ties_method = os.getenv("BREAK_TIES_METHOD", "drop")
+
+left_no_duplicates_dataset = links["Left Record Dataset"] == no_duplicates_dataset
+right_no_duplicates_dataset = links["Right Record Dataset"] == no_duplicates_dataset
+
+if (left_no_duplicates_dataset & right_no_duplicates_dataset).any():
+    raise ValueError(
+        f"Provided links include links within the no_duplicates_dataset ({no_duplicates_dataset})"
+    )
+
+if not (left_no_duplicates_dataset | right_no_duplicates_dataset).all():
+    raise ValueError(
+        f"Provided links include links that don't involve the no_duplicates_dataset ({no_duplicates_dataset})"
+    )
+
+# Get the no-duplicates dataset all on the right
+id_cols = [
+    "Left Record Dataset",
+    "Left Record ID",
+    "Right Record Dataset",
+    "Right Record ID",
+]
+switched_id_cols = [
+    "Right Record Dataset",
+    "Right Record ID",
+    "Left Record Dataset",
+    "Left Record ID",
+]
+links.loc[left_no_duplicates_dataset, id_cols] = links.loc[
+    left_no_duplicates_dataset, switched_id_cols
+].to_numpy()
+links[["Left Record ID", "Right Record ID"]] = links[
+    ["Left Record ID", "Right Record ID"]
+].astype(int)
+
+links["Left Record Key"] = (
+    links["Left Record Dataset"] + "-__-" + links["Left Record ID"].astype(int).astype(str)
+)
+links["Right Record Key"] = (
+    links["Right Record Dataset"] + "-__-" + links["Right Record ID"].astype(int).astype(str)
+)
+
+links_to_accept = (
+    links[links["Probability"] >= float(os.environ["THRESHOLD_MATCH_PROBABILITY"])]
+    # Pre-emptively break probability ties by right record key for the highest_id method
+    .sort_values(["Probability", "Right Record Key"], ascending=False)
+    # No duplicates in the *right* means only one link per *left* record
+    .groupby(["Left Record Key"]).first()
+)
+
+if break_ties_method == "drop":
+    num_tied = (
+        links_to_accept.merge(links, on=["Left Record Key", "Probability"])
+        .groupby(["Left Record Key"])
+        .size()
+    )
+    print("Ties:")
+    print(num_tied)
+    print(num_tied.describe())
+    links_to_accept = links_to_accept[num_tied == 1]
+elif break_ties_method == "highest_id":
+    # Done above pre-emptively
+    pass
+else:
+    raise ValueError(f"Unknown break_ties_method {break_ties_method}")
+
+# NOTE: We only include nodes involved in an accepted link in our cluster.
+# If a node isn't involved in an accepted link, that could just represent
+# that we haven't evaluated the right pairs involving it, not confidence that
+# it is a singleton.
+G = nx.from_pandas_edgelist(
+    links_to_accept.reset_index()[["Left Record Key", "Right Record Key"]].rename(
+        columns={"Left Record Key": "source", "Right Record Key": "target"}
+    )
+)
+
+# Compute connected components
+components = list(nx.connected_components(G))
+
+# Assign new cluster IDs
+merged_data = []
+for cluster_id, records in enumerate(components, start=1):
+    for record_key in records:
+        merged_data.append((record_key, cluster_id))
+
+# Build the final DataFrame
+merged_df = pd.DataFrame(merged_data, columns=["Input Record Key", "Cluster ID"])
+
+merged_df[["Input Record Dataset", "Input Record ID"]] = (
+    merged_df["Input Record Key"].str.split("-__-", n=1, expand=True)
+    if not merged_df.empty
+    else pd.DataFrame(columns=["Input Record Dataset", "Input Record ID"])
+)
+merged_df["Input Record ID"] = merged_df["Input Record ID"].astype(int)
+
+merged_df[["Input Record Dataset", "Input Record ID", "Cluster ID"]].to_parquet(output_path)
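Note: a small worked example of the tie-breaking behavior, assuming NO_DUPLICATES_DATASET=census and THRESHOLD_MATCH_PROBABILITY=0.5 (the data is invented for illustration):

    import pandas as pd

    links = pd.DataFrame(
        {
            "Left Record Dataset": ["survey", "survey", "survey"],
            "Left Record ID": [1, 1, 2],
            "Right Record Dataset": ["census", "census", "census"],
            "Right Record ID": [10, 11, 10],
            "Probability": [0.9, 0.9, 0.8],
        }
    )
    # survey-1 is tied between census-10 and census-11 at probability 0.9:
    #   BREAK_TIES_METHOD=drop        -> survey-1 is dropped; cluster {survey-2, census-10}
    #   BREAK_TIES_METHOD=highest_id  -> survey-1 keeps census-11 (the larger right
    #                                    record key); clusters {survey-1, census-11}
    #                                    and {survey-2, census-10}
    # Each survey record links to at most one census record, while a census record
    # may collect several survey records (hence one-to-many).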
easylink/steps/cascading/update_clusters_by_connected_components.py
CHANGED
@@ -59,7 +59,18 @@ new_clusters_df = load_file(new_clusters_filepath)
 
 def merge_clusters(known_clusters_df, new_clusters_df):
     # Combine both dataframes
-    combined_df = pd.concat([known_clusters_df, new_clusters_df], ignore_index=True)
+    combined_df = pd.concat(
+        [
+            # Ensure cluster names are unique
+            known_clusters_df.assign(
+                **{"Cluster ID": lambda df: "known__" + df["Cluster ID"].astype(str)}
+            ),
+            new_clusters_df.assign(
+                **{"Cluster ID": lambda df: "new__" + df["Cluster ID"].astype(str)}
+            ),
+        ],
+        ignore_index=True,
+    )
     combined_df["Input Record Key"] = (
         combined_df["Input Record Dataset"]
         + "-__-"
@@ -92,9 +103,11 @@ def merge_clusters(known_clusters_df, new_clusters_df):
     # Build the final DataFrame
     merged_df = pd.DataFrame(merged_data, columns=["Input Record Key", "Cluster ID"])
 
-    merged_df[["Input Record Dataset", "Input Record ID"]] = merged_df[
-        "Input Record Key"
-    ].str.split("-__-", n=1, expand=True)
+    merged_df[["Input Record Dataset", "Input Record ID"]] = (
+        merged_df["Input Record Key"].str.split("-__-", n=1, expand=True)
+        if not merged_df.empty
+        else pd.DataFrame(columns=["Input Record Dataset", "Input Record ID"])
+    )
 
     merged_df["Input Record ID"] = merged_df["Input Record ID"].astype(int)
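Note: the emptiness guard here (and in the splink steps below) works around a pandas edge case: splitting an empty Series with expand=True produces a frame without the expected two columns, so the two-column assignment can raise ValueError. A minimal reproduction of the failure mode (behavior may vary slightly across pandas versions):

    import pandas as pd

    df = pd.DataFrame({"Input Record Key": pd.Series([], dtype=str)})
    try:
        df[["Input Record Dataset", "Input Record ID"]] = df[
            "Input Record Key"
        ].str.split("-__-", n=1, expand=True)
    except ValueError as e:
        print(e)  # the split yielded fewer than two columns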
|
easylink/steps/default/default_determining_exclusions.py
CHANGED
@@ -72,7 +72,7 @@ if len(clusters_df) > 0:
 
 # SAVE OUTPUTS
 
-IDS_TO_REMOVE = pd.DataFrame(columns=["Record ID"])
+IDS_TO_REMOVE = pd.DataFrame(columns=["Input Record ID"])
 
 # OUTPUT_PATHS is a single path to a file (results.parquet)
 results_filepath = os.environ["OUTPUT_PATHS"]
easylink/steps/default/default_removing_records.py
CHANGED
@@ -52,7 +52,7 @@ results_dir.mkdir(exist_ok=True, parents=True)
 dataset = load_file(dataset_path)
 ids_to_remove = load_file(ids_filepath)
 
-dataset = dataset[~dataset["Record ID"].isin(ids_to_remove)]
+dataset = dataset[~dataset["Record ID"].isin(ids_to_remove["Input Record ID"])]
 
 output_path = results_dir / Path(dataset_path).name
 logging.info(f"Writing output for dataset from input {dataset_path} to {output_path}")
easylink/steps/splink/splink_blocking_and_filtering.py
CHANGED
@@ -90,12 +90,18 @@ blocked_pairs = (
     .drop(columns=["match_key"])
 )
 
-blocked_pairs[["Left Record Dataset", "Left Record ID"]] = blocked_pairs.pop(
-    "join_key_l"
-).str.split("-__-", n=1, expand=True)
-blocked_pairs[["Right Record Dataset", "Right Record ID"]] = blocked_pairs.pop(
-    "join_key_r"
-).str.split("-__-", n=1, expand=True)
+blocked_pairs[["Left Record Dataset", "Left Record ID"]] = (
+    blocked_pairs.pop("join_key_l").str.split("-__-", n=1, expand=True)
+    if not blocked_pairs.empty
+    else pd.DataFrame(columns=["Left Record Dataset", "Left Record ID"])
+)
+
+blocked_pairs[["Right Record Dataset", "Right Record ID"]] = (
+    blocked_pairs.pop("join_key_r").str.split("-__-", n=1, expand=True)
+    if not blocked_pairs.empty
+    else pd.DataFrame(columns=["Right Record Dataset", "Right Record ID"])
+)
+
 blocked_pairs[["Left Record ID", "Right Record ID"]] = blocked_pairs[
     ["Left Record ID", "Right Record ID"]
 ].astype(int)
easylink/steps/splink/splink_links_to_clusters.py
CHANGED
@@ -53,6 +53,8 @@ cc = (
 # Split "Record Key" back into "Input Record Dataset" and "Input Record ID"
 cc[["Input Record Dataset", "Input Record ID"]] = (
     cc["Record Key"].astype(str).str.split("-__-", n=1, expand=True)
+    if not cc.empty
+    else pd.DataFrame(columns=["Input Record Dataset", "Input Record ID"])
 )
 cc = cc.drop(columns=["Record Key"])
 cc["Input Record ID"] = cc["Input Record ID"].astype(int)
easylink/utilities/validation_utils.py
CHANGED
@@ -341,8 +341,8 @@ def _validate_pairs(df: pd.DataFrame, filepath: str) -> None:
 def validate_ids_to_remove(filepath: str) -> None:
     """Validates a file containing IDs to remove.
 
-    - The file must contain a single column: "Record ID".
-    - "Record ID" must have unique values.
+    - The file must contain a single column: "Input Record ID".
+    - "Input Record ID" must have unique values.
 
     Parameters
     ----------
@@ -352,13 +352,13 @@ def validate_ids_to_remove(filepath: str) -> None:
     Raises
     ------
     LookupError
-        If the file is missing the "Record ID" column.
+        If the file is missing the "Input Record ID" column.
     ValueError
-        If the "Record ID" column is not unique.
+        If the "Input Record ID" column is not unique.
     """
-    _validate_required_columns(filepath, {"Record ID"})
+    _validate_required_columns(filepath, {"Input Record ID"})
     df = _read_file(filepath)
-    _validate_unique_column(df, "Record ID", filepath)
+    _validate_unique_column(df, "Input Record ID", filepath)
 
 
 def validate_records(filepath: str) -> None:
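Note: validate_ids_to_remove now checks for the renamed "Input Record ID" column, matching what the determining-exclusions steps write. The check amounts to a required-column plus uniqueness test; a self-contained sketch of equivalent logic (the package's own helpers, such as _validate_required_columns, are not shown in this diff):

    import pandas as pd

    def validate_ids_to_remove_df(df: pd.DataFrame, filepath: str) -> None:
        if "Input Record ID" not in df.columns:
            raise LookupError(f"{filepath} is missing the 'Input Record ID' column")
        if df["Input Record ID"].duplicated().any():
            raise ValueError(f"'Input Record ID' in {filepath} is not unique")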
{easylink-0.1.25.dist-info → easylink-0.2.0.dist-info}/RECORD
CHANGED
@@ -1,34 +1,38 @@
 easylink/__about__.py,sha256=2-oxCfu9t9yUJouLDwqYRZ0eii8kN25SxRzsawjWjho,440
 easylink/__init__.py,sha256=gGMcIVfiVnHtlDw5mZwhevcDb2wt-kuP6F64gnkFack,159
-easylink/_version.py,sha256=
-easylink/cli.py,sha256=
+easylink/_version.py,sha256=Zn1KFblwuFHiDRdRAiRnDBRkbPttWh44jKa5zG2ov0E,22
+easylink/cli.py,sha256=3Xoqclhn7mEHzuqyuVUjt67-V3Fqu0_Jr3B3lCdIuAg,10704
 easylink/configuration.py,sha256=TPd3WbqUcJMJDPJuHeo5ZebvZPQrRyfm8-laad2sOFk,12681
 easylink/graph_components.py,sha256=zZDZXg5smReHO3ryQC4pao24wyKXzWDe6jS3C6fM2ak,13892
 easylink/implementation.py,sha256=lSF37g-aQYgdLI0lgFaDrBkp23te9O9B1V-CmmRtB-0,14514
-easylink/implementation_metadata.yaml,sha256=
+easylink/implementation_metadata.yaml,sha256=ahuSVk5Ur1V0F8EsLZO5apkNC2bWv2RsytNaiWGo9Yk,12562
 easylink/pipeline.py,sha256=NJyMInbOCjJ_5kRzzuK5AcupvC7ecd5qLOC-f1Gy3Ss,17701
 easylink/pipeline_graph.py,sha256=jtjS7_2IVa189U8cL621600erC_0pa6BKPRRO8VqwiU,23760
 easylink/pipeline_schema.py,sha256=sj6YQqMFUS35RRFWdlrhPlud68d3Rb5y7iKfAEZy1qs,6877
 easylink/rule.py,sha256=n8r4NL7MiNzppl39MRbEMdHEpn_e_XS7LfrsJD6KNfA,16629
-easylink/runner.py,sha256=
+easylink/runner.py,sha256=h39MbWHgTs-VwkPxk76186si76e8UTf1hySqepqUSS8,13155
 easylink/step.py,sha256=-vdFhPvwAZ3d69LMQGmSIVdcMG8E8ZtSvTE0UWif7zs,91088
 easylink/devtools/implementation_creator.py,sha256=gZZpfpiOOh912nNjR_d5wR0mBO5c09jWMS0FSYRcE1o,19120
 easylink/pipeline_schema_constants/__init__.py,sha256=njUL2v67WFC3dW_FjOXWRLSeOhInox_ZINLEji0_7V8,1523
 easylink/pipeline_schema_constants/development.py,sha256=0x6lWqBmP1K9AebEmeZ4veSnLBcdQcZXzbV6lCU11bc,12622
 easylink/pipeline_schema_constants/main.py,sha256=kcAhdbK_BhS79LUMhKkn-Uttl2vA-MHVX4M1StTloJQ,22934
 easylink/pipeline_schema_constants/testing.py,sha256=ti08DeUuF-eWrGKMj4BMyOFFJnEYooDaWX0DGiferbk,24579
+easylink/steps/cascading/accept_all_pairs.def,sha256=kwZMF3H0mqCBcO1Y2parJXFBLp4e9bLQoVIYU7zZ8xY,486
+easylink/steps/cascading/accept_all_pairs.py,sha256=eF_rmqcZtL3vI1u-TJejOcKX2Qou-AbaLI7qAAGjoGI,703
 easylink/steps/cascading/exclude_clustered.def,sha256=GfoDqO2Vtsh7VI8SwGaJtv_KtKjs-UmBcivqQ7OPkjk,503
-easylink/steps/cascading/exclude_clustered.py,sha256=
+easylink/steps/cascading/exclude_clustered.py,sha256=T60deNb91_ZFg5K190G-Q7BC5EYrEdLPhFEK7Togv0Y,3048
 easylink/steps/cascading/exclude_none.def,sha256=iFUhUMocxtkA0NErkjVrBxY0MUdS3DIPNsbCpTJRP0k,488
-easylink/steps/cascading/exclude_none.py,sha256=
+easylink/steps/cascading/exclude_none.py,sha256=DesKAO-UcPqKKtUS92OHU25YDXMJLiBEcGLk69UYWDk,2481
+easylink/steps/cascading/one_to_many_links_to_clusters.def,sha256=BVFusUydsV3hY1en16OVr3TPqzwst-cEVBwvb8dtpqA,534
+easylink/steps/cascading/one_to_many_links_to_clusters.py,sha256=7QSJxW3mmR3LIjWBzzgi3vcOsmoYOsiSJn6iYGppHLA,3789
 easylink/steps/cascading/update_clusters_by_connected_components.def,sha256=sAAAWOod8EuAnotR1cayaGAvs7x6xoMVlwmLso_a9Cc,578
-easylink/steps/cascading/update_clusters_by_connected_components.py,sha256=
+easylink/steps/cascading/update_clusters_by_connected_components.py,sha256=jhpMgewztCXrRxBw2FnH2HjIybpp7GcHe4kjTMgQOyg,4059
 easylink/steps/default/default_clusters_to_links.def,sha256=9PjUygLvsoYMUZDznceuuv55t8fPs473P57J_RMl3U0,527
 easylink/steps/default/default_clusters_to_links.py,sha256=Ckm53d3W-ETNlTvQJPOkpHmSqCmxSWknMPQrEAIoTBY,2816
 easylink/steps/default/default_determining_exclusions.def,sha256=zZUEHDdrpLxzx3gTm-dki2ge5ivCCg4ziIwTErqCII0,542
-easylink/steps/default/default_determining_exclusions.py,sha256=
+easylink/steps/default/default_determining_exclusions.py,sha256=RpYHFAral4uYevgiOsYqUHYgsEIejV5NhYdQ3q7VeU0,2635
 easylink/steps/default/default_removing_records.def,sha256=QqacmOu6myxFSULHRKeKsVD8l73KDm4VEkPkPlovwqs,524
-easylink/steps/default/default_removing_records.py,sha256=
+easylink/steps/default/default_removing_records.py,sha256=I_xGdWftlwP7H8HdxfblSG2YFVqA986KOECVwMCn4fk,1925
 easylink/steps/default/default_schema_alignment.def,sha256=hFHJkos0Fhe4LvpjLOCd6klIaIqOKqECDDSTVu3G03Y,524
 easylink/steps/default/default_schema_alignment.py,sha256=oT5jbYQ3C3ocLgqqOnvH0SIJ6NeTtPBWWmCqr_frnAI,1479
 easylink/steps/default/default_updating_clusters.def,sha256=vDzSkTknDfeiXeHREpw4BkUxFcTWamxr81c3rZ7_enY,527
@@ -70,11 +74,11 @@ easylink/steps/rl-dummy/input_data/known_clusters.parquet,sha256=Ysodu65toHZN4Ag
 easylink/steps/rl-dummy/pre-processing/dummy_pre-processing.def,sha256=HeUSv2QvMOQzsyVktYR1xYoEqwiNpDo-p7IRcGSMspE,512
 easylink/steps/rl-dummy/pre-processing/dummy_pre-processing.py,sha256=I6kqG4e_H2yFW5MpsMXdpoY_NjHcBvVVAHWv89LUgXE,1852
 easylink/steps/splink/splink_blocking_and_filtering.def,sha256=umWzxJhsfdi8g3TD-r2mKpjC-FPAMDk6-IERiWigdQc,557
-easylink/steps/splink/splink_blocking_and_filtering.py,sha256=
+easylink/steps/splink/splink_blocking_and_filtering.py,sha256=3WMBmNEECB9Kxu4D6PAesZzBrhHTdpFEgvnGPsV4bww,5475
 easylink/steps/splink/splink_evaluating_pairs.def,sha256=DN3Ohy9qJOAyK58v164neP23HDVYpedMqzCu4eQh4Hg,521
 easylink/steps/splink/splink_evaluating_pairs.py,sha256=vWUe3vQo9uGs0Cy8pG5PbolzsJX_cEaPS3Q0PMcBjcg,6253
 easylink/steps/splink/splink_links_to_clusters.def,sha256=RurvOYyGjNs9tx64DTXwI-GSgHD4T7SzDfhAH18pTEM,524
-easylink/steps/splink/splink_links_to_clusters.py,sha256=
+easylink/steps/splink/splink_links_to_clusters.py,sha256=Brpy3ZKSBpBUeOitg1ZgDvuMVwILH0QBkLXRJN8LXno,2015
 easylink/utilities/__init__.py,sha256=0U33kbv4hoMfFQ_lh5hLwifxRPzOgkLkjKLYxmaK10g,196
 easylink/utilities/aggregator_utils.py,sha256=_DAHRAf9janbDsuj_jnAn5Dzz2s4R5Ni3YeytDpN9UE,954
 easylink/utilities/data_utils.py,sha256=XPRjq3qW_fN0xQ23Jms_xBzpTHbRwqZWDP1AW0nYkP0,6926
@@ -82,10 +86,10 @@ easylink/utilities/general_utils.py,sha256=MmuoV4T6PgyEDjbepC_1D3TGrq70Hp-hl-GaA
 easylink/utilities/paths.py,sha256=9inDKMPorAaWh5vhjc3j1Tj_aXVKhLdodiJO9H1nNes,947
 easylink/utilities/spark.smk,sha256=kGtpem7LfQc71tMh5WAYaqKnHQKFvcdhPQSdumOP70k,5799
 easylink/utilities/splitter_utils.py,sha256=KXiVUYJ9TGxCQmrwos18pB1sxG_0Ay67qoDJT6vem2o,3828
-easylink/utilities/validation_utils.py,sha256=
-easylink-0.1.25.dist-info/licenses/LICENSE,sha256=z6NBo3hY2Pbh2Wjm9IcHLyQoG-kd816uASh8PbwhTcQ,1530
-easylink-0.1.25.dist-info/METADATA,sha256=
-easylink-0.1.25.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-easylink-0.1.25.dist-info/entry_points.txt,sha256=OGMZDFltg3yMboT7XjJt3joiPhRfV_7jnREVtrAIQNU,51
-easylink-0.1.25.dist-info/top_level.txt,sha256=oHcOpcF_jDMWFiJRzfGQvuskENGDjSPC_Agu9Z_Xvik,9
-easylink-0.1.25.dist-info/RECORD,,
+easylink/utilities/validation_utils.py,sha256=1naksMPStw_xIOqskX6DE99f16Y7eCcVF9I5ZILjMvI,18453
+easylink-0.2.0.dist-info/licenses/LICENSE,sha256=z6NBo3hY2Pbh2Wjm9IcHLyQoG-kd816uASh8PbwhTcQ,1530
+easylink-0.2.0.dist-info/METADATA,sha256=HxtOiOMe9hTRcK6HL6sLTTQNeP9X7hrhiodTpEMUeOA,4218
+easylink-0.2.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+easylink-0.2.0.dist-info/entry_points.txt,sha256=OGMZDFltg3yMboT7XjJt3joiPhRfV_7jnREVtrAIQNU,51
+easylink-0.2.0.dist-info/top_level.txt,sha256=oHcOpcF_jDMWFiJRzfGQvuskENGDjSPC_Agu9Z_Xvik,9
+easylink-0.2.0.dist-info/RECORD,,
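Note: each RECORD row is a three-field CSV entry of path, hash, and size, where the hash field is "sha256=" followed by an unpadded urlsafe-base64 digest (per the wheel spec); the RECORD file itself carries no hash, hence its trailing ",,". A parsing sketch:

    import csv
    from io import StringIO

    row = "easylink/_version.py,sha256=Zn1KFblwuFHiDRdRAiRnDBRkbPttWh44jKa5zG2ov0E,22"
    path, hash_spec, size = next(csv.reader(StringIO(row)))
    algorithm, _, digest = hash_spec.partition("=")
    # algorithm == "sha256"; digest is the file's unpadded urlsafe-base64 hash;
    # size is the file length in bytes.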
{easylink-0.1.25.dist-info → easylink-0.2.0.dist-info}/WHEEL
File without changes
{easylink-0.1.25.dist-info → easylink-0.2.0.dist-info}/entry_points.txt
File without changes
{easylink-0.1.25.dist-info → easylink-0.2.0.dist-info}/licenses/LICENSE
File without changes
{easylink-0.1.25.dist-info → easylink-0.2.0.dist-info}/top_level.txt
File without changes