easylink 0.1.23__py3-none-any.whl → 0.1.25__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- easylink/_version.py +1 -1
- easylink/cli.py +15 -10
- easylink/implementation_metadata.yaml +70 -44
- easylink/rule.py +2 -1
- easylink/runner.py +118 -1
- easylink/steps/splink/splink_evaluating_pairs.py +2 -1
- easylink/utilities/general_utils.py +18 -8
- {easylink-0.1.23.dist-info → easylink-0.1.25.dist-info}/METADATA +27 -11
- {easylink-0.1.23.dist-info → easylink-0.1.25.dist-info}/RECORD +13 -13
- {easylink-0.1.23.dist-info → easylink-0.1.25.dist-info}/WHEEL +0 -0
- {easylink-0.1.23.dist-info → easylink-0.1.25.dist-info}/entry_points.txt +0 -0
- {easylink-0.1.23.dist-info → easylink-0.1.25.dist-info}/licenses/LICENSE +0 -0
- {easylink-0.1.23.dist-info → easylink-0.1.25.dist-info}/top_level.txt +0 -0
easylink/_version.py
CHANGED
@@ -1 +1 @@
|
|
1
|
-
__version__ = "0.1.23"
|
1
|
+
__version__ = "0.1.25"
|
easylink/cli.py
CHANGED
@@ -201,16 +201,20 @@ def run(
|
|
201
201
|
main = handle_exceptions(
|
202
202
|
func=runner.main, exceptions_logger=logger, with_debugger=with_debugger
|
203
203
|
)
|
204
|
-
|
205
|
-
|
206
|
-
|
207
|
-
|
208
|
-
|
209
|
-
|
210
|
-
|
211
|
-
|
212
|
-
|
213
|
-
|
204
|
+
try:
|
205
|
+
main(
|
206
|
+
command="run",
|
207
|
+
pipeline_specification=pipeline_specification,
|
208
|
+
input_data=input_data,
|
209
|
+
computing_environment=computing_environment,
|
210
|
+
results_dir=results_dir,
|
211
|
+
images_dir=images,
|
212
|
+
schema_name=schema,
|
213
|
+
)
|
214
|
+
except SystemExit:
|
215
|
+
# Snakemake uses SystemExit for completion - log success and re-raise
|
216
|
+
logger.info("*** FINISHED ***")
|
217
|
+
raise
|
214
218
|
|
215
219
|
|
216
220
|
@easylink.command()
|
@@ -243,6 +247,7 @@ def generate_dag(
|
|
243
247
|
input_data=input_data,
|
244
248
|
computing_environment=None,
|
245
249
|
results_dir=results_dir,
|
250
|
+
images_dir=None,
|
246
251
|
schema_name=schema,
|
247
252
|
)
|
248
253
|
logger.info("*** DAG saved to result directory ***")
|
@@ -2,7 +2,7 @@ step_1_python_pandas:
|
|
2
2
|
steps:
|
3
3
|
- step_1
|
4
4
|
image_name: python_pandas.sif
|
5
|
-
zenodo_record_id:
|
5
|
+
zenodo_record_id: 15757317
|
6
6
|
md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
|
7
7
|
script_cmd: python /dummy_step.py
|
8
8
|
outputs:
|
@@ -11,7 +11,7 @@ step_1a_python_pandas:
|
|
11
11
|
steps:
|
12
12
|
- step_1a
|
13
13
|
image_name: python_pandas.sif
|
14
|
-
zenodo_record_id:
|
14
|
+
zenodo_record_id: 15757317
|
15
15
|
md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
|
16
16
|
script_cmd: python /dummy_step.py
|
17
17
|
env:
|
@@ -22,7 +22,7 @@ step_1b_python_pandas:
|
|
22
22
|
steps:
|
23
23
|
- step_1b
|
24
24
|
image_name: python_pandas.sif
|
25
|
-
zenodo_record_id:
|
25
|
+
zenodo_record_id: 15757317
|
26
26
|
md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
|
27
27
|
script_cmd: python /dummy_step.py
|
28
28
|
env:
|
@@ -33,7 +33,7 @@ step_2_python_pandas:
|
|
33
33
|
steps:
|
34
34
|
- step_2
|
35
35
|
image_name: python_pandas.sif
|
36
|
-
zenodo_record_id:
|
36
|
+
zenodo_record_id: 15757317
|
37
37
|
md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
|
38
38
|
script_cmd: python /dummy_step.py
|
39
39
|
outputs:
|
@@ -42,7 +42,7 @@ step_3_python_pandas:
|
|
42
42
|
steps:
|
43
43
|
- step_3
|
44
44
|
image_name: python_pandas.sif
|
45
|
-
zenodo_record_id:
|
45
|
+
zenodo_record_id: 15757317
|
46
46
|
md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
|
47
47
|
script_cmd: python /dummy_step.py
|
48
48
|
outputs:
|
@@ -51,7 +51,7 @@ step_4_python_pandas:
|
|
51
51
|
steps:
|
52
52
|
- step_4
|
53
53
|
image_name: python_pandas.sif
|
54
|
-
zenodo_record_id:
|
54
|
+
zenodo_record_id: 15757317
|
55
55
|
md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
|
56
56
|
script_cmd: python /dummy_step.py
|
57
57
|
env:
|
@@ -62,7 +62,7 @@ step_5_python_pandas:
|
|
62
62
|
steps:
|
63
63
|
- step_5
|
64
64
|
image_name: python_pandas.sif
|
65
|
-
zenodo_record_id:
|
65
|
+
zenodo_record_id: 15757317
|
66
66
|
md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
|
67
67
|
script_cmd: python /dummy_step.py
|
68
68
|
env:
|
@@ -73,7 +73,7 @@ step_6_python_pandas:
|
|
73
73
|
steps:
|
74
74
|
- step_6
|
75
75
|
image_name: python_pandas.sif
|
76
|
-
zenodo_record_id:
|
76
|
+
zenodo_record_id: 15757317
|
77
77
|
md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
|
78
78
|
script_cmd: python /dummy_step.py
|
79
79
|
env:
|
@@ -84,7 +84,7 @@ step_4a_python_pandas:
|
|
84
84
|
steps:
|
85
85
|
- step_4a
|
86
86
|
image_name: python_pandas.sif
|
87
|
-
zenodo_record_id:
|
87
|
+
zenodo_record_id: 15757317
|
88
88
|
md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
|
89
89
|
script_cmd: python /dummy_step.py
|
90
90
|
env:
|
@@ -95,7 +95,7 @@ step_4b_python_pandas:
|
|
95
95
|
steps:
|
96
96
|
- step_4b
|
97
97
|
image_name: python_pandas.sif
|
98
|
-
zenodo_record_id:
|
98
|
+
zenodo_record_id: 15757317
|
99
99
|
md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
|
100
100
|
script_cmd: python /dummy_step.py
|
101
101
|
env:
|
@@ -106,7 +106,7 @@ step_4b_r:
|
|
106
106
|
steps:
|
107
107
|
- step_4b
|
108
108
|
image_name: r-image.sif
|
109
|
-
zenodo_record_id:
|
109
|
+
zenodo_record_id: 15757317
|
110
110
|
md5_checksum: a4a03b836694a2b81a1bd2852736ccc5
|
111
111
|
script_cmd: Rscript /dummy_step.R
|
112
112
|
env:
|
@@ -117,7 +117,7 @@ step_1_python_pyspark:
|
|
117
117
|
steps:
|
118
118
|
- step_1
|
119
119
|
image_name: python_pyspark.sif
|
120
|
-
zenodo_record_id:
|
120
|
+
zenodo_record_id: 15757317
|
121
121
|
md5_checksum: c948577ab0607411dd4b640622d9ec3a
|
122
122
|
script_cmd: python3 /code/dummy_step.py
|
123
123
|
outputs:
|
@@ -127,7 +127,7 @@ step_2_python_pyspark:
|
|
127
127
|
steps:
|
128
128
|
- step_2
|
129
129
|
image_name: python_pyspark.sif
|
130
|
-
zenodo_record_id:
|
130
|
+
zenodo_record_id: 15757317
|
131
131
|
md5_checksum: c948577ab0607411dd4b640622d9ec3a
|
132
132
|
script_cmd: python3 /code/dummy_step.py
|
133
133
|
outputs:
|
@@ -137,7 +137,7 @@ step_3_python_pyspark:
|
|
137
137
|
steps:
|
138
138
|
- step_3
|
139
139
|
image_name: python_pyspark.sif
|
140
|
-
zenodo_record_id:
|
140
|
+
zenodo_record_id: 15757317
|
141
141
|
md5_checksum: c948577ab0607411dd4b640622d9ec3a
|
142
142
|
script_cmd: python3 /code/dummy_step.py
|
143
143
|
outputs:
|
@@ -147,7 +147,7 @@ step_4_python_pyspark:
|
|
147
147
|
steps:
|
148
148
|
- step_4
|
149
149
|
image_name: python_pyspark.sif
|
150
|
-
zenodo_record_id:
|
150
|
+
zenodo_record_id: 15757317
|
151
151
|
md5_checksum: c948577ab0607411dd4b640622d9ec3a
|
152
152
|
script_cmd: python3 /code/dummy_step.py
|
153
153
|
env:
|
@@ -158,7 +158,7 @@ step_1_r:
|
|
158
158
|
steps:
|
159
159
|
- step_1
|
160
160
|
image_name: r-image.sif
|
161
|
-
zenodo_record_id:
|
161
|
+
zenodo_record_id: 15757317
|
162
162
|
md5_checksum: a4a03b836694a2b81a1bd2852736ccc5
|
163
163
|
script_cmd: Rscript /dummy_step.R
|
164
164
|
outputs:
|
@@ -168,7 +168,7 @@ step_2_r:
|
|
168
168
|
steps:
|
169
169
|
- step_2
|
170
170
|
image_name: r-image.sif
|
171
|
-
zenodo_record_id:
|
171
|
+
zenodo_record_id: 15757317
|
172
172
|
md5_checksum: a4a03b836694a2b81a1bd2852736ccc5
|
173
173
|
script_cmd: Rscript /dummy_step.R
|
174
174
|
outputs:
|
@@ -178,7 +178,7 @@ step_3_r:
|
|
178
178
|
steps:
|
179
179
|
- step_3
|
180
180
|
image_name: r-image.sif
|
181
|
-
zenodo_record_id:
|
181
|
+
zenodo_record_id: 15757317
|
182
182
|
md5_checksum: a4a03b836694a2b81a1bd2852736ccc5
|
183
183
|
script_cmd: Rscript /dummy_step.R
|
184
184
|
outputs:
|
@@ -188,7 +188,7 @@ step_4_r:
|
|
188
188
|
steps:
|
189
189
|
- step_4
|
190
190
|
image_name: r-image.sif
|
191
|
-
zenodo_record_id:
|
191
|
+
zenodo_record_id: 15757317
|
192
192
|
md5_checksum: a4a03b836694a2b81a1bd2852736ccc5
|
193
193
|
script_cmd: Rscript /dummy_step.R
|
194
194
|
env:
|
@@ -201,7 +201,7 @@ step_1_and_step_2_combined_python_pandas:
|
|
201
201
|
- step_1
|
202
202
|
- step_2
|
203
203
|
image_name: python_pandas.sif
|
204
|
-
zenodo_record_id:
|
204
|
+
zenodo_record_id: 15757317
|
205
205
|
md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
|
206
206
|
script_cmd: python /dummy_step.py
|
207
207
|
outputs:
|
@@ -211,7 +211,7 @@ step_1_and_step_2_parallel_python_pandas:
|
|
211
211
|
- step_1
|
212
212
|
- step_2
|
213
213
|
image_name: python_pandas.sif
|
214
|
-
zenodo_record_id:
|
214
|
+
zenodo_record_id: 15757317
|
215
215
|
md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
|
216
216
|
script_cmd: python /dummy_step.py
|
217
217
|
env:
|
@@ -223,7 +223,7 @@ step_3_and_step_4_combined_python_pandas:
|
|
223
223
|
- step_3
|
224
224
|
- step_4
|
225
225
|
image_name: python_pandas.sif
|
226
|
-
zenodo_record_id:
|
226
|
+
zenodo_record_id: 15757317
|
227
227
|
md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
|
228
228
|
script_cmd: python /dummy_step.py
|
229
229
|
outputs:
|
@@ -233,7 +233,7 @@ step_1a_and_step_1b_combined_python_pandas:
|
|
233
233
|
- step_1a
|
234
234
|
- step_1b
|
235
235
|
image_name: python_pandas.sif
|
236
|
-
zenodo_record_id:
|
236
|
+
zenodo_record_id: 15757317
|
237
237
|
md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
|
238
238
|
script_cmd: python /dummy_step.py
|
239
239
|
outputs:
|
@@ -241,131 +241,157 @@ step_1a_and_step_1b_combined_python_pandas:
|
|
241
241
|
dummy_step_1_for_output_dir_example:
|
242
242
|
steps:
|
243
243
|
- step_1_for_output_dir_example
|
244
|
-
image_name:
|
244
|
+
image_name: dummy_step_1_for_output_dir_example.sif
|
245
245
|
script_cmd: python /dummy_step_1_for_output_dir_example.py
|
246
246
|
outputs:
|
247
247
|
step_1_main_output_directory: output_dir/
|
248
248
|
dummy_step_1_for_output_dir_example_default:
|
249
249
|
steps:
|
250
250
|
- step_1_for_output_dir_example
|
251
|
-
image_name:
|
251
|
+
image_name: dummy_step_1_for_output_dir_example.sif
|
252
252
|
script_cmd: python /dummy_step_1_for_output_dir_example.py
|
253
253
|
dummy_step_2_for_output_dir_example:
|
254
254
|
steps:
|
255
255
|
- step_2_for_output_dir_example
|
256
|
-
image_name:
|
256
|
+
image_name: dummy_step_2_for_output_dir_example.sif
|
257
257
|
script_cmd: python /dummy_step_2_for_output_dir_example.py
|
258
258
|
outputs:
|
259
259
|
step_2_main_output: result.parquet
|
260
260
|
default_removing_records:
|
261
261
|
steps:
|
262
262
|
- removing_records
|
263
|
-
image_name:
|
263
|
+
image_name: default_removing_records.sif
|
264
|
+
zenodo_record_id: 15757317
|
265
|
+
md5_checksum: 85dba6fd73c9f8f504fddb6d5c30f2de
|
264
266
|
script_cmd: python /default_removing_records.py
|
265
267
|
outputs:
|
266
268
|
dataset: dataset
|
267
269
|
default_clusters_to_links:
|
268
270
|
steps:
|
269
271
|
- clusters_to_links
|
270
|
-
image_name:
|
272
|
+
image_name: default_clusters_to_links.sif
|
273
|
+
zenodo_record_id: 15757317
|
274
|
+
md5_checksum: 0d00d1272bd8193f60727791097aa065
|
271
275
|
script_cmd: python /default_clusters_to_links.py
|
272
276
|
outputs:
|
273
277
|
known_links: result.parquet
|
274
278
|
default_determining_exclusions:
|
275
279
|
steps:
|
276
280
|
- determining_exclusions
|
277
|
-
image_name:
|
281
|
+
image_name: default_determining_exclusions.sif
|
282
|
+
zenodo_record_id: 15757317
|
283
|
+
md5_checksum: e61cb32ad45b79ca9a2c36db4e76ef7e
|
278
284
|
script_cmd: python /default_determining_exclusions.py
|
279
285
|
outputs:
|
280
286
|
ids_to_remove: result.parquet
|
281
287
|
default_updating_clusters:
|
282
288
|
steps:
|
283
289
|
- updating_clusters
|
284
|
-
image_name:
|
290
|
+
image_name: default_updating_clusters.sif
|
291
|
+
zenodo_record_id: 15757317
|
292
|
+
md5_checksum: cc6bd29e099c2523347fa04545aa35c9
|
285
293
|
script_cmd: python /default_updating_clusters.py
|
286
294
|
outputs:
|
287
295
|
clusters: clusters.parquet
|
288
|
-
dummy_canonicalizing_and_downstream_analysis
|
296
|
+
# NOTE: This was made from dummy_canonicalizing_and_downstream_analysis.py,
|
297
|
+
# if rebuilding change the name of that file to save_clusters.py
|
298
|
+
save_clusters:
|
289
299
|
steps:
|
290
300
|
- canonicalizing_and_downstream_analysis
|
291
|
-
image_name:
|
301
|
+
image_name: save_clusters.sif
|
302
|
+
zenodo_record_id: 15757317
|
303
|
+
md5_checksum: 384ab2be668cbadc45160a674f621022
|
292
304
|
script_cmd: python /dummy_canonicalizing_and_downstream_analysis.py
|
293
305
|
outputs:
|
294
306
|
analysis_output: result.parquet
|
295
|
-
dummy_pre-processing
|
307
|
+
# NOTE: This was made from dummy_pre-processing.py,
|
308
|
+
# if rebuilding change the name of that file to no_pre-processing.py
|
309
|
+
no_pre-processing:
|
296
310
|
steps:
|
297
311
|
- pre-processing
|
298
|
-
image_name:
|
312
|
+
image_name: no_pre-processing.sif
|
313
|
+
zenodo_record_id: 15757317
|
314
|
+
md5_checksum: 9a9c080cf145078152501cf96bf61f27
|
299
315
|
script_cmd: python /dummy_pre-processing.py
|
300
316
|
outputs:
|
301
317
|
dataset: dataset
|
302
318
|
default_schema_alignment:
|
303
319
|
steps:
|
304
320
|
- schema_alignment
|
305
|
-
image_name:
|
321
|
+
image_name: default_schema_alignment.sif
|
322
|
+
zenodo_record_id: 15757317
|
323
|
+
md5_checksum: 3166587f9cfec478b999a17074d628f7
|
306
324
|
script_cmd: python /default_schema_alignment.py
|
307
325
|
outputs:
|
308
326
|
records: result.parquet
|
309
327
|
splink_blocking_and_filtering:
|
310
328
|
steps:
|
311
329
|
- blocking_and_filtering
|
312
|
-
image_name:
|
330
|
+
image_name: splink_blocking_and_filtering.sif
|
331
|
+
zenodo_record_id: 15757317
|
332
|
+
md5_checksum: 8a365b90295ef6beaad2b7f80a03d768
|
313
333
|
script_cmd: python /splink_blocking_and_filtering.py
|
314
334
|
outputs:
|
315
335
|
blocks: blocks
|
316
336
|
splink_evaluating_pairs:
|
317
337
|
steps:
|
318
338
|
- evaluating_pairs
|
319
|
-
image_name:
|
339
|
+
image_name: splink_evaluating_pairs.sif
|
340
|
+
zenodo_record_id: 15757317
|
341
|
+
md5_checksum: b57f4bd16b7a3aa5099569078ea4c064
|
320
342
|
script_cmd: python /splink_evaluating_pairs.py
|
321
343
|
outputs:
|
322
344
|
links: result.parquet
|
323
345
|
splink_links_to_clusters:
|
324
346
|
steps:
|
325
347
|
- links_to_clusters
|
326
|
-
image_name:
|
348
|
+
image_name: splink_links_to_clusters.sif
|
349
|
+
zenodo_record_id: 15757317
|
350
|
+
md5_checksum: 645937f7bab9c2557b7aacafaf4e4765
|
327
351
|
script_cmd: python /splink_links_to_clusters.py
|
328
352
|
outputs:
|
329
353
|
clusters: result.parquet
|
330
354
|
fastLink_evaluating_pairs:
|
331
355
|
steps:
|
332
356
|
- evaluating_pairs
|
333
|
-
image_name:
|
357
|
+
image_name: fastLink_evaluating_pairs.sif
|
334
358
|
script_cmd: Rscript /fastLink_evaluating_pairs.R
|
335
359
|
outputs:
|
336
360
|
links: result.parquet
|
337
361
|
fastLink_links_to_clusters:
|
338
362
|
steps:
|
339
363
|
- links_to_clusters
|
340
|
-
image_name:
|
364
|
+
image_name: fastLink_links_to_clusters.sif
|
341
365
|
script_cmd: Rscript /fastLink_links_to_clusters.R
|
342
366
|
outputs:
|
343
367
|
clusters: result.parquet
|
344
368
|
exclude_clustered:
|
345
369
|
steps:
|
346
370
|
- determining_exclusions
|
347
|
-
image_name:
|
371
|
+
image_name: exclude_clustered.sif
|
348
372
|
script_cmd: python /exclude_clustered.py
|
349
373
|
outputs:
|
350
374
|
ids_to_remove: result.parquet
|
351
375
|
exclude_none:
|
352
376
|
steps:
|
353
377
|
- determining_exclusions
|
354
|
-
image_name:
|
378
|
+
image_name: exclude_none.sif
|
355
379
|
script_cmd: python /exclude_none.py
|
356
380
|
outputs:
|
357
381
|
ids_to_remove: result.parquet
|
358
382
|
update_clusters_by_connected_components:
|
359
383
|
steps:
|
360
384
|
- updating_clusters
|
361
|
-
image_name:
|
385
|
+
image_name: update_clusters_by_connected_components.sif
|
362
386
|
script_cmd: python /update_clusters_by_connected_components.py
|
363
387
|
outputs:
|
364
388
|
clusters: result.parquet
|
365
389
|
middle_name_to_initial:
|
366
390
|
steps:
|
367
391
|
- pre-processing
|
368
|
-
image_name:
|
392
|
+
image_name: middle_name_to_initial.sif
|
393
|
+
zenodo_record_id: 15757317
|
394
|
+
md5_checksum: 89db9c3318300cda9d538cde08c3c323
|
369
395
|
script_cmd: python /middle_name_to_initial.py
|
370
396
|
outputs:
|
371
397
|
dataset: dataset
|
easylink/rule.py
CHANGED
@@ -14,6 +14,7 @@ dynamically as strings and appended to the Snakefile.
|
|
14
14
|
"""
|
15
15
|
|
16
16
|
import os
|
17
|
+
import shlex
|
17
18
|
from abc import ABC, abstractmethod
|
18
19
|
from collections.abc import Callable
|
19
20
|
from dataclasses import dataclass
|
@@ -204,7 +205,7 @@ rule:
|
|
204
205
|
export SPARK_MASTER_URL"""
|
205
206
|
for var_name, var_value in self.envvars.items():
|
206
207
|
shell_cmd += f"""
|
207
|
-
export {var_name}={var_value}"""
|
208
|
+
export {var_name}={shlex.quote(str(var_value))}"""
|
208
209
|
# Log stdout/stderr to diagnostics directory
|
209
210
|
shell_cmd += f"""
|
210
211
|
{self.script_cmd} > {{log}} 2>&1
|
easylink/runner.py
CHANGED
@@ -11,6 +11,7 @@ be called from the ``easylink.cli`` module.
|
|
11
11
|
import os
|
12
12
|
import socket
|
13
13
|
import subprocess
|
14
|
+
from contextlib import redirect_stderr, redirect_stdout
|
14
15
|
from pathlib import Path
|
15
16
|
|
16
17
|
from graphviz import Source
|
@@ -123,7 +124,123 @@ def main(
|
|
123
124
|
argv.extend(environment_args)
|
124
125
|
logger.info(f"Running Snakemake")
|
125
126
|
logger.debug(f"Snakemake arguments: {argv}")
|
126
|
-
|
127
|
+
|
128
|
+
# Run snakemake
|
129
|
+
if debug:
|
130
|
+
snake_main(argv)
|
131
|
+
else:
|
132
|
+
_run_snakemake_with_filtered_output(argv, Path(results_dir))
|
133
|
+
|
134
|
+
|
135
|
+
def _run_snakemake_with_filtered_output(argv: list[str], results_dir: Path) -> None:
    """Run Snakemake, saving its full output to a file and logging a filtered view.

    Everything written to stdout/stderr while Snakemake runs is captured in
    ``<results_dir>/pipeline.log``. Only the lines accepted by
    ``_filter_snakemake_output_simple`` are forwarded to the logger.

    Parameters
    ----------
    argv
        Snakemake command line arguments.
    results_dir
        Directory to save the full Snakemake log.

    Raises
    ------
    SystemExit
        Re-raised from ``snake_main`` after logging the location of the full log.
    """
    snakemake_log_file = results_dir / "pipeline.log"

    class FilteringOutput:
        """File-like object that tees Snakemake output to a log file and the logger.

        All text is written verbatim to the log file; complete lines are then
        passed through ``_filter_snakemake_output_simple`` and the surviving
        lines are logged at INFO level.

        Parameters
        ----------
        log_file_path
            The path to the log file where all output will be written.
        """

        def __init__(self, log_file_path: Path):
            # Explicit encoding so the log does not depend on the platform
            # locale; unencodable characters are replaced rather than raising
            # in the middle of a pipeline run.
            self.log_file = open(log_file_path, "w", encoding="utf-8", errors="replace")
            # Holds the trailing partial line until its newline arrives.
            self.buffer = ""

        def write(self, text: str) -> int:
            # Persist everything immediately so the log survives a crash.
            self.log_file.write(text)
            self.log_file.flush()

            # Only complete lines are filtered; the remainder stays buffered.
            self.buffer += text
            while "\n" in self.buffer:
                line, self.buffer = self.buffer.split("\n", 1)
                self._log_if_relevant(line)

            return len(text)

        def flush(self) -> None:
            self.log_file.flush()

        def close(self) -> None:
            if self.log_file.closed:
                return
            try:
                # Emit whatever is left when the output did not end in a newline.
                remainder, self.buffer = self.buffer, ""
                self._log_if_relevant(remainder)
            finally:
                # Close the file even if logging the remainder fails.
                self.log_file.close()

        def _log_if_relevant(self, line: str) -> None:
            """Log ``line`` if it is non-blank and passes the Snakemake filter."""
            stripped = line.strip()
            if not stripped:
                return
            filtered_line = _filter_snakemake_output_simple(stripped)
            if filtered_line:
                logger.info(filtered_line)

        def __enter__(self):
            return self

        def __exit__(self, exc_type, exc_val, exc_tb):
            self.close()

    # The context manager guarantees the log file is closed on every exit path.
    with FilteringOutput(snakemake_log_file) as filtering_output:
        try:
            # Redirect both stdout and stderr to our filtering handler
            with redirect_stdout(filtering_output), redirect_stderr(filtering_output):
                snake_main(argv)
        except SystemExit:
            # Snakemake uses SystemExit for both success and failure
            logger.info(
                f"Pipeline finished running - full log saved to: {snakemake_log_file}"
            )
            # Always re-raise to allow test frameworks to detect completion
            raise
|
211
|
+
|
212
|
+
|
213
|
+
def _filter_snakemake_output_simple(line: str) -> str | None:
|
214
|
+
"""
|
215
|
+
Simple filter for Snakemake output showing only localrules and Job messages.
|
216
|
+
|
217
|
+
Parameters
|
218
|
+
----------
|
219
|
+
line
|
220
|
+
A single line of Snakemake output.
|
221
|
+
|
222
|
+
Returns
|
223
|
+
-------
|
224
|
+
str or None
|
225
|
+
The filtered line for display, or None to suppress the line.
|
226
|
+
"""
|
227
|
+
# Skip empty lines
|
228
|
+
if not line.strip():
|
229
|
+
return None
|
230
|
+
|
231
|
+
if line.startswith("localrule "):
|
232
|
+
# Show localrule names (without the "localrule" prefix)
|
233
|
+
# Extract rule name (remove "localrule " prefix and colon at the end)
|
234
|
+
filtered_line = line.replace("localrule ", "").rstrip(":")
|
235
|
+
elif line.startswith("Job ") and ":" in line:
|
236
|
+
# Show Job messages
|
237
|
+
# Extract everything after "Job ##: "
|
238
|
+
parts = line.split(":", 1)
|
239
|
+
filtered_line = parts[1].strip() if len(parts) > 1 else None
|
240
|
+
else:
|
241
|
+
# Suppress everything else
|
242
|
+
filtered_line = None
|
243
|
+
return filtered_line
|
127
244
|
|
128
245
|
|
129
246
|
def _get_singularity_args(config: Config) -> str:
|
@@ -35,6 +35,7 @@ for block_dir in blocks_dir.iterdir():
|
|
35
35
|
comparisons.append(cl.LevenshteinAtThresholds(column))
|
36
36
|
else:
|
37
37
|
raise ValueError(f"Unknown comparison method {method}")
|
38
|
+
# TODO: check both datasets contain all the columns
|
38
39
|
|
39
40
|
# Create the Splink linker in dedupe mode
|
40
41
|
settings = SettingsCreator(
|
@@ -135,7 +136,7 @@ for block_dir in blocks_dir.iterdir():
|
|
135
136
|
|
136
137
|
sqls = predict_from_comparison_vectors_sqls_using_settings(
|
137
138
|
linker._settings_obj,
|
138
|
-
float(os.
|
139
|
+
float(os.getenv("THRESHOLD_MATCH_PROBABILITY", 0)),
|
139
140
|
threshold_match_weight=None,
|
140
141
|
sql_infinity_expression=linker._infinity_expression,
|
141
142
|
)
|
@@ -97,24 +97,34 @@ def _add_logging_sink(
|
|
97
97
|
Whether the logs should be converted to JSON before they're dumped
|
98
98
|
to the logging sink.
|
99
99
|
"""
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
100
|
+
|
101
|
+
def format_message(record):
    """Build the log line template for a record.

    The returned string is a *template*: the logging library formats it
    against the record afterwards. The message is therefore referenced as
    ``{message}`` rather than interpolated here, so that braces or
    angle-bracket tags inside a message are never re-interpreted as format
    fields or color markup.

    Parameters
    ----------
    record
        The log record; ``record["elapsed"]`` is read and
        ``record["extra"]["elapsed_str"]`` is set to the ``HH:MM:SS`` elapsed time.

    Returns
    -------
    str
        The format template, ending in a newline.
    """
    elapsed_seconds = int(record["elapsed"].total_seconds())
    hours, remainder = divmod(elapsed_seconds, 3600)
    minutes, seconds = divmod(remainder, 60)
    # Computed values are passed to the template through ``extra``.
    record["extra"]["elapsed_str"] = f"{hours:02d}:{minutes:02d}:{seconds:02d}"

    # NOTE(review): no "{exception}" field is included, matching the previous
    # output; tracebacks attached to a record are presumably not rendered.
    if colorize:
        # Raw ANSI green around the timestamp and elapsed time.
        return (
            "\033[32m{time:YYYY-MM-DD HH:mm:ss}\033[0m | "
            "\033[32m{extra[elapsed_str]}\033[0m | {message}\n"
        )
    return "{time:YYYY-MM-DD HH:mm:ss} | {extra[elapsed_str]} | {message}\n"
|
113
|
+
|
104
114
|
if verbose == 0:
|
105
115
|
logger.add(
|
106
116
|
sink,
|
107
|
-
colorize=
|
117
|
+
colorize=False, # We handle colors in format_message
|
108
118
|
level="INFO",
|
109
|
-
format=
|
119
|
+
format=format_message,
|
110
120
|
serialize=serialize,
|
111
121
|
)
|
112
122
|
elif verbose >= 1:
|
113
123
|
logger.add(
|
114
124
|
sink,
|
115
|
-
colorize=
|
125
|
+
colorize=False, # We handle colors in format_message
|
116
126
|
level="DEBUG",
|
117
|
-
format=
|
127
|
+
format=format_message,
|
118
128
|
serialize=serialize,
|
119
129
|
)
|
120
130
|
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: easylink
|
3
|
-
Version: 0.1.23
|
3
|
+
Version: 0.1.25
|
4
4
|
Summary: Research repository for the EasyLink ER ecosystem project.
|
5
5
|
Home-page: https://github.com/ihmeuw/easylink
|
6
6
|
Author: The EasyLink developers
|
@@ -78,34 +78,50 @@ Installation
|
|
78
78
|
|
79
79
|
.. _installation:
|
80
80
|
|
81
|
+
**NOTE: This package requires AMD64 CPU architecture - it is not compatible with
|
82
|
+
Apple's ARM64 architecture (e.g. M1 and newer Macs).**
|
83
|
+
|
81
84
|
There are a few things to install in order to use this package:
|
82
85
|
|
83
|
-
-
|
86
|
+
- Set up Linux.
|
87
|
+
|
88
|
+
Singularity (and thus EasyLink) requires Linux to run. If you are not already
|
89
|
+
using Linux, you will need to set up a virtual machine; refer to the
|
90
|
+
`Singularity documentation for installing on Windows or Mac <https://docs.sylabs.io/guides/4.1/admin-guide/installation.html#installation-on-windows-or-mac>`_.
|
84
91
|
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
``singularity --version``. For an existing installation, your
|
92
|
+
- Install Singularity.
|
93
|
+
|
94
|
+
First check if you already have Singularity installed by running the command
|
95
|
+
``singularity --version``. For an existing installation, your Singularity version
|
89
96
|
number is printed.
|
90
97
|
|
98
|
+
If Singularity is not yet installed, you will need to install it;
|
99
|
+
refer to the `Singularity docs for installing on Linux <https://docs.sylabs.io/guides/4.1/admin-guide/installation.html#installation-on-linux>`_.
|
100
|
+
|
101
|
+
Note that this requires administrator privileges; you may need to request installation
|
102
|
+
from your system admin if you are working in a shared computing environment.
|
103
|
+
|
91
104
|
- Install conda.
|
92
105
|
|
93
106
|
We recommend `miniforge <https://github.com/conda-forge/miniforge>`_. You can
|
94
107
|
check if you already have conda installed by running the command ``conda --version``.
|
95
108
|
For an existing installation, a version will be displayed.
|
96
109
|
|
97
|
-
-
|
110
|
+
- Create a conda environment with python and graphviz installed.
|
111
|
+
|
112
|
+
::
|
113
|
+
|
114
|
+
$ conda create --name easylink -c conda-forge python=3.12 graphviz 'gcc<14' -y
|
115
|
+
$ conda activate easylink
|
116
|
+
|
117
|
+
- Install easylink in the environment.
|
98
118
|
|
99
119
|
Option 1 - Install from PyPI with pip::
|
100
120
|
|
101
|
-
$ conda create --name easylink -c conda-forge python=3.12 graphviz 'gcc<14' -y
|
102
|
-
$ conda activate easylink
|
103
121
|
$ pip install easylink
|
104
122
|
|
105
123
|
Option 2 - Build from source with pip::
|
106
124
|
|
107
|
-
$ conda create --name easylink -c conda-forge python=3.12 graphviz 'gcc<14' -y
|
108
|
-
$ conda activate easylink
|
109
125
|
$ pip install git+https://github.com/ihmeuw/easylink.git
|
110
126
|
|
111
127
|
.. _end_installation:
|
@@ -1,16 +1,16 @@
|
|
1
1
|
easylink/__about__.py,sha256=2-oxCfu9t9yUJouLDwqYRZ0eii8kN25SxRzsawjWjho,440
|
2
2
|
easylink/__init__.py,sha256=gGMcIVfiVnHtlDw5mZwhevcDb2wt-kuP6F64gnkFack,159
|
3
|
-
easylink/_version.py,sha256=
|
4
|
-
easylink/cli.py,sha256=
|
3
|
+
easylink/_version.py,sha256=Ej7LsXg-6CASlaEHsZkUoLDpYEfHeFKdIeXMIM0esgA,23
|
4
|
+
easylink/cli.py,sha256=80_EVklOdX78fPqAinTpsfTfWUqMM4ghFaQcVgZG354,10496
|
5
5
|
easylink/configuration.py,sha256=TPd3WbqUcJMJDPJuHeo5ZebvZPQrRyfm8-laad2sOFk,12681
|
6
6
|
easylink/graph_components.py,sha256=zZDZXg5smReHO3ryQC4pao24wyKXzWDe6jS3C6fM2ak,13892
|
7
7
|
easylink/implementation.py,sha256=lSF37g-aQYgdLI0lgFaDrBkp23te9O9B1V-CmmRtB-0,14514
|
8
|
-
easylink/implementation_metadata.yaml,sha256=
|
8
|
+
easylink/implementation_metadata.yaml,sha256=u_E51gVzVzTuM19dMv7-p_0JV-A6j5dfUwJrxtAZDBQ,11805
|
9
9
|
easylink/pipeline.py,sha256=NJyMInbOCjJ_5kRzzuK5AcupvC7ecd5qLOC-f1Gy3Ss,17701
|
10
10
|
easylink/pipeline_graph.py,sha256=jtjS7_2IVa189U8cL621600erC_0pa6BKPRRO8VqwiU,23760
|
11
11
|
easylink/pipeline_schema.py,sha256=sj6YQqMFUS35RRFWdlrhPlud68d3Rb5y7iKfAEZy1qs,6877
|
12
|
-
easylink/rule.py,sha256=
|
13
|
-
easylink/runner.py,sha256=
|
12
|
+
easylink/rule.py,sha256=n8r4NL7MiNzppl39MRbEMdHEpn_e_XS7LfrsJD6KNfA,16629
|
13
|
+
easylink/runner.py,sha256=irMmrUME1B8BFTtQkCr-u-lHBDKaEll7IX_a4Q4AJNc,10576
|
14
14
|
easylink/step.py,sha256=-vdFhPvwAZ3d69LMQGmSIVdcMG8E8ZtSvTE0UWif7zs,91088
|
15
15
|
easylink/devtools/implementation_creator.py,sha256=gZZpfpiOOh912nNjR_d5wR0mBO5c09jWMS0FSYRcE1o,19120
|
16
16
|
easylink/pipeline_schema_constants/__init__.py,sha256=njUL2v67WFC3dW_FjOXWRLSeOhInox_ZINLEji0_7V8,1523
|
@@ -72,20 +72,20 @@ easylink/steps/rl-dummy/pre-processing/dummy_pre-processing.py,sha256=I6kqG4e_H2
|
|
72
72
|
easylink/steps/splink/splink_blocking_and_filtering.def,sha256=umWzxJhsfdi8g3TD-r2mKpjC-FPAMDk6-IERiWigdQc,557
|
73
73
|
easylink/steps/splink/splink_blocking_and_filtering.py,sha256=FO8YJ2_KgCLpQoq5xsM339bTSN1DhCXCL8XT1pb5STY,5259
|
74
74
|
easylink/steps/splink/splink_evaluating_pairs.def,sha256=DN3Ohy9qJOAyK58v164neP23HDVYpedMqzCu4eQh4Hg,521
|
75
|
-
easylink/steps/splink/splink_evaluating_pairs.py,sha256=
|
75
|
+
easylink/steps/splink/splink_evaluating_pairs.py,sha256=vWUe3vQo9uGs0Cy8pG5PbolzsJX_cEaPS3Q0PMcBjcg,6253
|
76
76
|
easylink/steps/splink/splink_links_to_clusters.def,sha256=RurvOYyGjNs9tx64DTXwI-GSgHD4T7SzDfhAH18pTEM,524
|
77
77
|
easylink/steps/splink/splink_links_to_clusters.py,sha256=5Sw8yi0dVLuRB-trN2kXmxbHBR0VJBxYee6u4_usg2Y,1920
|
78
78
|
easylink/utilities/__init__.py,sha256=0U33kbv4hoMfFQ_lh5hLwifxRPzOgkLkjKLYxmaK10g,196
|
79
79
|
easylink/utilities/aggregator_utils.py,sha256=_DAHRAf9janbDsuj_jnAn5Dzz2s4R5Ni3YeytDpN9UE,954
|
80
80
|
easylink/utilities/data_utils.py,sha256=XPRjq3qW_fN0xQ23Jms_xBzpTHbRwqZWDP1AW0nYkP0,6926
|
81
|
-
easylink/utilities/general_utils.py,sha256=
|
81
|
+
easylink/utilities/general_utils.py,sha256=MmuoV4T6PgyEDjbepC_1D3TGrq70Hp-hl-GaAYr5wRU,5033
|
82
82
|
easylink/utilities/paths.py,sha256=9inDKMPorAaWh5vhjc3j1Tj_aXVKhLdodiJO9H1nNes,947
|
83
83
|
easylink/utilities/spark.smk,sha256=kGtpem7LfQc71tMh5WAYaqKnHQKFvcdhPQSdumOP70k,5799
|
84
84
|
easylink/utilities/splitter_utils.py,sha256=KXiVUYJ9TGxCQmrwos18pB1sxG_0Ay67qoDJT6vem2o,3828
|
85
85
|
easylink/utilities/validation_utils.py,sha256=DBJB2TLVklgYw1WaaPse9vqtwPLMGmZNYM2cbCZsoHI,18417
|
86
|
-
easylink-0.1.
|
87
|
-
easylink-0.1.
|
88
|
-
easylink-0.1.
|
89
|
-
easylink-0.1.
|
90
|
-
easylink-0.1.
|
91
|
-
easylink-0.1.
|
86
|
+
easylink-0.1.25.dist-info/licenses/LICENSE,sha256=z6NBo3hY2Pbh2Wjm9IcHLyQoG-kd816uASh8PbwhTcQ,1530
|
87
|
+
easylink-0.1.25.dist-info/METADATA,sha256=agOx4R08pqrpwjWmoSAmhU33gmHZ5QhDt9UHVRbnkHI,4219
|
88
|
+
easylink-0.1.25.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
89
|
+
easylink-0.1.25.dist-info/entry_points.txt,sha256=OGMZDFltg3yMboT7XjJt3joiPhRfV_7jnREVtrAIQNU,51
|
90
|
+
easylink-0.1.25.dist-info/top_level.txt,sha256=oHcOpcF_jDMWFiJRzfGQvuskENGDjSPC_Agu9Z_Xvik,9
|
91
|
+
easylink-0.1.25.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|