easylink 0.1.24__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
easylink/_version.py CHANGED
@@ -1 +1 @@
1
- __version__ = "0.1.24"
1
+ __version__ = "0.2.0"
easylink/cli.py CHANGED
@@ -201,16 +201,25 @@ def run(
201
201
  main = handle_exceptions(
202
202
  func=runner.main, exceptions_logger=logger, with_debugger=with_debugger
203
203
  )
204
- main(
205
- command="run",
206
- pipeline_specification=pipeline_specification,
207
- input_data=input_data,
208
- computing_environment=computing_environment,
209
- results_dir=results_dir,
210
- images_dir=images,
211
- schema_name=schema,
212
- )
213
- logger.info("*** FINISHED ***")
204
+ try:
205
+ main(
206
+ command="run",
207
+ pipeline_specification=pipeline_specification,
208
+ input_data=input_data,
209
+ computing_environment=computing_environment,
210
+ results_dir=results_dir,
211
+ images_dir=images,
212
+ schema_name=schema,
213
+ )
214
+ except SystemExit as e:
215
+ # Snakemake uses SystemExit with exit code 0 for success, non-zero for failure
216
+ if e.code == 0:
217
+ logger.info("\033[32m*** FINISHED ***\033[0m") # Green
218
+ else:
219
+ logger.error(
220
+ f"\033[31mERROR: Pipeline failed with exit code {e.code}\033[0m"
221
+ ) # Red
222
+ raise
214
223
 
215
224
 
216
225
  @easylink.command()
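The run command above now distinguishes success from failure by inspecting the SystemExit that Snakemake raises on completion. A minimal sketch of the pattern, with run_pipeline as a hypothetical stand-in for the wrapped main(...) call:

    # Hypothetical stand-in for the wrapped main(...) call; Snakemake-style
    # completion is signalled by raising SystemExit with an exit code.
    import sys

    def run_pipeline() -> None:
        sys.exit(0)  # 0 on success, non-zero on failure

    try:
        run_pipeline()
    except SystemExit as e:
        if e.code == 0:
            print("*** FINISHED ***")
        else:
            print(f"ERROR: Pipeline failed with exit code {e.code}")
            raise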
easylink/implementation_metadata.yaml CHANGED
@@ -2,7 +2,7 @@ step_1_python_pandas:
2
2
  steps:
3
3
  - step_1
4
4
  image_name: python_pandas.sif
5
- zenodo_record_id: 15733426
5
+ zenodo_record_id: 15778354
6
6
  md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
7
7
  script_cmd: python /dummy_step.py
8
8
  outputs:
@@ -11,7 +11,7 @@ step_1a_python_pandas:
11
11
  steps:
12
12
  - step_1a
13
13
  image_name: python_pandas.sif
14
- zenodo_record_id: 15733426
14
+ zenodo_record_id: 15778354
15
15
  md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
16
16
  script_cmd: python /dummy_step.py
17
17
  env:
@@ -22,7 +22,7 @@ step_1b_python_pandas:
22
22
  steps:
23
23
  - step_1b
24
24
  image_name: python_pandas.sif
25
- zenodo_record_id: 15733426
25
+ zenodo_record_id: 15778354
26
26
  md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
27
27
  script_cmd: python /dummy_step.py
28
28
  env:
@@ -33,7 +33,7 @@ step_2_python_pandas:
33
33
  steps:
34
34
  - step_2
35
35
  image_name: python_pandas.sif
36
- zenodo_record_id: 15733426
36
+ zenodo_record_id: 15778354
37
37
  md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
38
38
  script_cmd: python /dummy_step.py
39
39
  outputs:
@@ -42,7 +42,7 @@ step_3_python_pandas:
42
42
  steps:
43
43
  - step_3
44
44
  image_name: python_pandas.sif
45
- zenodo_record_id: 15733426
45
+ zenodo_record_id: 15778354
46
46
  md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
47
47
  script_cmd: python /dummy_step.py
48
48
  outputs:
@@ -51,7 +51,7 @@ step_4_python_pandas:
51
51
  steps:
52
52
  - step_4
53
53
  image_name: python_pandas.sif
54
- zenodo_record_id: 15733426
54
+ zenodo_record_id: 15778354
55
55
  md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
56
56
  script_cmd: python /dummy_step.py
57
57
  env:
@@ -62,7 +62,7 @@ step_5_python_pandas:
62
62
  steps:
63
63
  - step_5
64
64
  image_name: python_pandas.sif
65
- zenodo_record_id: 15733426
65
+ zenodo_record_id: 15778354
66
66
  md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
67
67
  script_cmd: python /dummy_step.py
68
68
  env:
@@ -73,7 +73,7 @@ step_6_python_pandas:
73
73
  steps:
74
74
  - step_6
75
75
  image_name: python_pandas.sif
76
- zenodo_record_id: 15733426
76
+ zenodo_record_id: 15778354
77
77
  md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
78
78
  script_cmd: python /dummy_step.py
79
79
  env:
@@ -84,7 +84,7 @@ step_4a_python_pandas:
84
84
  steps:
85
85
  - step_4a
86
86
  image_name: python_pandas.sif
87
- zenodo_record_id: 15733426
87
+ zenodo_record_id: 15778354
88
88
  md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
89
89
  script_cmd: python /dummy_step.py
90
90
  env:
@@ -95,7 +95,7 @@ step_4b_python_pandas:
95
95
  steps:
96
96
  - step_4b
97
97
  image_name: python_pandas.sif
98
- zenodo_record_id: 15733426
98
+ zenodo_record_id: 15778354
99
99
  md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
100
100
  script_cmd: python /dummy_step.py
101
101
  env:
@@ -106,7 +106,7 @@ step_4b_r:
106
106
  steps:
107
107
  - step_4b
108
108
  image_name: r-image.sif
109
- zenodo_record_id: 15733426
109
+ zenodo_record_id: 15778354
110
110
  md5_checksum: a4a03b836694a2b81a1bd2852736ccc5
111
111
  script_cmd: Rscript /dummy_step.R
112
112
  env:
@@ -117,7 +117,7 @@ step_1_python_pyspark:
117
117
  steps:
118
118
  - step_1
119
119
  image_name: python_pyspark.sif
120
- zenodo_record_id: 15733426
120
+ zenodo_record_id: 15778354
121
121
  md5_checksum: c948577ab0607411dd4b640622d9ec3a
122
122
  script_cmd: python3 /code/dummy_step.py
123
123
  outputs:
@@ -127,7 +127,7 @@ step_2_python_pyspark:
127
127
  steps:
128
128
  - step_2
129
129
  image_name: python_pyspark.sif
130
- zenodo_record_id: 15733426
130
+ zenodo_record_id: 15778354
131
131
  md5_checksum: c948577ab0607411dd4b640622d9ec3a
132
132
  script_cmd: python3 /code/dummy_step.py
133
133
  outputs:
@@ -137,7 +137,7 @@ step_3_python_pyspark:
137
137
  steps:
138
138
  - step_3
139
139
  image_name: python_pyspark.sif
140
- zenodo_record_id: 15733426
140
+ zenodo_record_id: 15778354
141
141
  md5_checksum: c948577ab0607411dd4b640622d9ec3a
142
142
  script_cmd: python3 /code/dummy_step.py
143
143
  outputs:
@@ -147,7 +147,7 @@ step_4_python_pyspark:
147
147
  steps:
148
148
  - step_4
149
149
  image_name: python_pyspark.sif
150
- zenodo_record_id: 15733426
150
+ zenodo_record_id: 15778354
151
151
  md5_checksum: c948577ab0607411dd4b640622d9ec3a
152
152
  script_cmd: python3 /code/dummy_step.py
153
153
  env:
@@ -158,7 +158,7 @@ step_1_r:
158
158
  steps:
159
159
  - step_1
160
160
  image_name: r-image.sif
161
- zenodo_record_id: 15733426
161
+ zenodo_record_id: 15778354
162
162
  md5_checksum: a4a03b836694a2b81a1bd2852736ccc5
163
163
  script_cmd: Rscript /dummy_step.R
164
164
  outputs:
@@ -168,7 +168,7 @@ step_2_r:
168
168
  steps:
169
169
  - step_2
170
170
  image_name: r-image.sif
171
- zenodo_record_id: 15733426
171
+ zenodo_record_id: 15778354
172
172
  md5_checksum: a4a03b836694a2b81a1bd2852736ccc5
173
173
  script_cmd: Rscript /dummy_step.R
174
174
  outputs:
@@ -178,7 +178,7 @@ step_3_r:
178
178
  steps:
179
179
  - step_3
180
180
  image_name: r-image.sif
181
- zenodo_record_id: 15733426
181
+ zenodo_record_id: 15778354
182
182
  md5_checksum: a4a03b836694a2b81a1bd2852736ccc5
183
183
  script_cmd: Rscript /dummy_step.R
184
184
  outputs:
@@ -188,7 +188,7 @@ step_4_r:
188
188
  steps:
189
189
  - step_4
190
190
  image_name: r-image.sif
191
- zenodo_record_id: 15733426
191
+ zenodo_record_id: 15778354
192
192
  md5_checksum: a4a03b836694a2b81a1bd2852736ccc5
193
193
  script_cmd: Rscript /dummy_step.R
194
194
  env:
@@ -201,7 +201,7 @@ step_1_and_step_2_combined_python_pandas:
201
201
  - step_1
202
202
  - step_2
203
203
  image_name: python_pandas.sif
204
- zenodo_record_id: 15733426
204
+ zenodo_record_id: 15778354
205
205
  md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
206
206
  script_cmd: python /dummy_step.py
207
207
  outputs:
@@ -211,7 +211,7 @@ step_1_and_step_2_parallel_python_pandas:
211
211
  - step_1
212
212
  - step_2
213
213
  image_name: python_pandas.sif
214
- zenodo_record_id: 15733426
214
+ zenodo_record_id: 15778354
215
215
  md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
216
216
  script_cmd: python /dummy_step.py
217
217
  env:
@@ -223,7 +223,7 @@ step_3_and_step_4_combined_python_pandas:
223
223
  - step_3
224
224
  - step_4
225
225
  image_name: python_pandas.sif
226
- zenodo_record_id: 15733426
226
+ zenodo_record_id: 15778354
227
227
  md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
228
228
  script_cmd: python /dummy_step.py
229
229
  outputs:
@@ -233,7 +233,7 @@ step_1a_and_step_1b_combined_python_pandas:
233
233
  - step_1a
234
234
  - step_1b
235
235
  image_name: python_pandas.sif
236
- zenodo_record_id: 15733426
236
+ zenodo_record_id: 15778354
237
237
  md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
238
238
  script_cmd: python /dummy_step.py
239
239
  outputs:
@@ -241,131 +241,181 @@ step_1a_and_step_1b_combined_python_pandas:
241
241
  dummy_step_1_for_output_dir_example:
242
242
  steps:
243
243
  - step_1_for_output_dir_example
244
- image_name: main/dummy_step_1_for_output_dir_example.sif
244
+ image_name: dummy_step_1_for_output_dir_example.sif
245
245
  script_cmd: python /dummy_step_1_for_output_dir_example.py
246
246
  outputs:
247
247
  step_1_main_output_directory: output_dir/
248
248
  dummy_step_1_for_output_dir_example_default:
249
249
  steps:
250
250
  - step_1_for_output_dir_example
251
- image_name: main/dummy_step_1_for_output_dir_example.sif
251
+ image_name: dummy_step_1_for_output_dir_example.sif
252
252
  script_cmd: python /dummy_step_1_for_output_dir_example.py
253
253
  dummy_step_2_for_output_dir_example:
254
254
  steps:
255
255
  - step_2_for_output_dir_example
256
- image_name: main/dummy_step_2_for_output_dir_example.sif
256
+ image_name: dummy_step_2_for_output_dir_example.sif
257
257
  script_cmd: python /dummy_step_2_for_output_dir_example.py
258
258
  outputs:
259
259
  step_2_main_output: result.parquet
260
260
  default_removing_records:
261
261
  steps:
262
262
  - removing_records
263
- image_name: main/default_removing_records.sif
263
+ image_name: default_removing_records.sif
264
+ zenodo_record_id: 15778354
265
+ md5_checksum: 05123136e756bfa57f1d7d5a3315f2f6
264
266
  script_cmd: python /default_removing_records.py
265
267
  outputs:
266
268
  dataset: dataset
267
269
  default_clusters_to_links:
268
270
  steps:
269
271
  - clusters_to_links
270
- image_name: main/default_clusters_to_links.sif
272
+ image_name: default_clusters_to_links.sif
273
+ zenodo_record_id: 15778354
274
+ md5_checksum: 0d00d1272bd8193f60727791097aa065
271
275
  script_cmd: python /default_clusters_to_links.py
272
276
  outputs:
273
277
  known_links: result.parquet
274
278
  default_determining_exclusions:
275
279
  steps:
276
280
  - determining_exclusions
277
- image_name: main/default_determining_exclusions.sif
281
+ image_name: default_determining_exclusions.sif
282
+ zenodo_record_id: 15778354
283
+ md5_checksum: f4e9f740d8dd7599bfbb2b9eb54ced38
278
284
  script_cmd: python /default_determining_exclusions.py
279
285
  outputs:
280
286
  ids_to_remove: result.parquet
281
287
  default_updating_clusters:
282
288
  steps:
283
289
  - updating_clusters
284
- image_name: main/default_updating_clusters.sif
290
+ image_name: default_updating_clusters.sif
291
+ zenodo_record_id: 15778354
292
+ md5_checksum: cc6bd29e099c2523347fa04545aa35c9
285
293
  script_cmd: python /default_updating_clusters.py
286
294
  outputs:
287
295
  clusters: clusters.parquet
288
- dummy_canonicalizing_and_downstream_analysis:
296
+ # NOTE: This was made from dummy_canonicalizing_and_downstream_analysis.py,
297
+ # if rebuilding change the name of that file to save_clusters.py
298
+ save_clusters:
289
299
  steps:
290
300
  - canonicalizing_and_downstream_analysis
291
- image_name: main/dummy_canonicalizing_and_downstream_analysis.sif
301
+ image_name: save_clusters.sif
302
+ zenodo_record_id: 15778354
303
+ md5_checksum: 384ab2be668cbadc45160a674f621022
292
304
  script_cmd: python /dummy_canonicalizing_and_downstream_analysis.py
293
305
  outputs:
294
306
  analysis_output: result.parquet
295
- dummy_pre-processing:
307
+ # NOTE: This was made from dummy_pre-processing.py,
308
+ # if rebuilding change the name of that file to no_pre-processing.py
309
+ no_pre-processing:
296
310
  steps:
297
311
  - pre-processing
298
- image_name: main/dummy_pre-processing.sif
312
+ image_name: no_pre-processing.sif
313
+ zenodo_record_id: 15778354
314
+ md5_checksum: 9a9c080cf145078152501cf96bf61f27
299
315
  script_cmd: python /dummy_pre-processing.py
300
316
  outputs:
301
317
  dataset: dataset
302
318
  default_schema_alignment:
303
319
  steps:
304
320
  - schema_alignment
305
- image_name: main/default_schema_alignment.sif
321
+ image_name: default_schema_alignment.sif
322
+ zenodo_record_id: 15778354
323
+ md5_checksum: 3166587f9cfec478b999a17074d628f7
306
324
  script_cmd: python /default_schema_alignment.py
307
325
  outputs:
308
326
  records: result.parquet
309
327
  splink_blocking_and_filtering:
310
328
  steps:
311
329
  - blocking_and_filtering
312
- image_name: main/splink_blocking_and_filtering.sif
330
+ image_name: splink_blocking_and_filtering.sif
331
+ zenodo_record_id: 15778354
332
+ md5_checksum: 3f8777c5751d7550762be078d87e7db2
313
333
  script_cmd: python /splink_blocking_and_filtering.py
314
334
  outputs:
315
335
  blocks: blocks
316
336
  splink_evaluating_pairs:
317
337
  steps:
318
338
  - evaluating_pairs
319
- image_name: main/splink_evaluating_pairs.sif
339
+ image_name: splink_evaluating_pairs.sif
340
+ zenodo_record_id: 15778354
341
+ md5_checksum: b57f4bd16b7a3aa5099569078ea4c064
320
342
  script_cmd: python /splink_evaluating_pairs.py
321
343
  outputs:
322
344
  links: result.parquet
323
345
  splink_links_to_clusters:
324
346
  steps:
325
347
  - links_to_clusters
326
- image_name: main/splink_links_to_clusters.sif
348
+ image_name: splink_links_to_clusters.sif
349
+ zenodo_record_id: 15778354
350
+ md5_checksum: 81a71aa2ce6544953f3edb88d4ee6ec1
327
351
  script_cmd: python /splink_links_to_clusters.py
328
352
  outputs:
329
353
  clusters: result.parquet
330
354
  fastLink_evaluating_pairs:
331
355
  steps:
332
356
  - evaluating_pairs
333
- image_name: main/fastLink_evaluating_pairs.sif
357
+ image_name: fastLink_evaluating_pairs.sif
334
358
  script_cmd: Rscript /fastLink_evaluating_pairs.R
335
359
  outputs:
336
360
  links: result.parquet
337
361
  fastLink_links_to_clusters:
338
362
  steps:
339
363
  - links_to_clusters
340
- image_name: main/fastLink_links_to_clusters.sif
364
+ image_name: fastLink_links_to_clusters.sif
341
365
  script_cmd: Rscript /fastLink_links_to_clusters.R
342
366
  outputs:
343
367
  clusters: result.parquet
344
368
  exclude_clustered:
345
369
  steps:
346
370
  - determining_exclusions
347
- image_name: main/exclude_clustered.sif
371
+ image_name: exclude_clustered.sif
372
+ zenodo_record_id: 15778354
373
+ md5_checksum: db51f68ea24d114ed2b83a1382b6e6b6
348
374
  script_cmd: python /exclude_clustered.py
349
375
  outputs:
350
376
  ids_to_remove: result.parquet
351
377
  exclude_none:
352
378
  steps:
353
379
  - determining_exclusions
354
- image_name: main/exclude_none.sif
380
+ image_name: exclude_none.sif
381
+ zenodo_record_id: 15778354
382
+ md5_checksum: af12b6dde2aace9dab08d352368b16a1
355
383
  script_cmd: python /exclude_none.py
356
384
  outputs:
357
385
  ids_to_remove: result.parquet
358
386
  update_clusters_by_connected_components:
359
387
  steps:
360
388
  - updating_clusters
361
- image_name: main/update_clusters_by_connected_components.sif
389
+ image_name: update_clusters_by_connected_components.sif
390
+ zenodo_record_id: 15778354
391
+ md5_checksum: 806b0fe86a3306d74391678ed951b054
362
392
  script_cmd: python /update_clusters_by_connected_components.py
363
393
  outputs:
364
394
  clusters: result.parquet
365
395
  middle_name_to_initial:
366
396
  steps:
367
397
  - pre-processing
368
- image_name: main/middle_name_to_initial.sif
398
+ image_name: middle_name_to_initial.sif
399
+ zenodo_record_id: 15778354
400
+ md5_checksum: 89db9c3318300cda9d538cde08c3c323
369
401
  script_cmd: python /middle_name_to_initial.py
370
402
  outputs:
371
403
  dataset: dataset
404
+ one_to_many_links_to_clusters:
405
+ steps:
406
+ - links_to_clusters
407
+ image_name: one_to_many_links_to_clusters.sif
408
+ zenodo_record_id: 15778354
409
+ md5_checksum: 0bf6f0b8663b4c2f99995a2213dc541a
410
+ script_cmd: python /one_to_many_links_to_clusters.py
411
+ outputs:
412
+ clusters: result.parquet
413
+ accept_all_pairs:
414
+ steps:
415
+ - evaluating_pairs
416
+ image_name: accept_all_pairs.sif
417
+ zenodo_record_id: 15778354
418
+ md5_checksum: c71c88d159c3d7343ebc39cd37224bd9
419
+ script_cmd: python /accept_all_pairs.py
420
+ outputs:
421
+ links: result.parquet
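Each implementation entry above now pins both a zenodo_record_id and an md5_checksum for its container image. A hedged sketch of how such a record could be downloaded and verified; the Zenodo URL pattern and the fetch_and_verify helper are assumptions for illustration, not EasyLink's actual downloader:

    # Assumed URL pattern and helper, for illustration only.
    import hashlib
    import urllib.request

    def fetch_and_verify(record_id: int, image_name: str, expected_md5: str) -> str:
        url = f"https://zenodo.org/record/{record_id}/files/{image_name}?download=1"
        local_path, _ = urllib.request.urlretrieve(url, image_name)
        md5 = hashlib.md5()
        with open(local_path, "rb") as f:
            for chunk in iter(lambda: f.read(1 << 20), b""):
                md5.update(chunk)
        if md5.hexdigest() != expected_md5:
            raise ValueError(f"Checksum mismatch for {image_name}")
        return local_path

    # fetch_and_verify(15778354, "python_pandas.sif", "9177b8e168fcc9cae91bf61265f2185c")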
easylink/runner.py CHANGED
@@ -11,6 +11,9 @@ be called from the ``easylink.cli`` module.
11
11
  import os
12
12
  import socket
13
13
  import subprocess
14
+ import threading
15
+ import time
16
+ from contextlib import redirect_stderr, redirect_stdout
14
17
  from pathlib import Path
15
18
 
16
19
  from graphviz import Source
@@ -123,7 +126,177 @@ def main(
123
126
  argv.extend(environment_args)
124
127
  logger.info(f"Running Snakemake")
125
128
  logger.debug(f"Snakemake arguments: {argv}")
126
- snake_main(argv)
129
+
130
+ # Run snakemake
131
+ if debug:
132
+ snake_main(argv)
133
+ else:
134
+ _run_snakemake_with_filtered_output(argv, Path(results_dir))
135
+
136
+
137
+ def _run_snakemake_with_filtered_output(argv: list[str], results_dir: Path) -> None:
138
+ """Runs Snakemake with simplified log filtering.
139
+
140
+ Parameters
141
+ ----------
142
+ argv
143
+ Snakemake command line arguments.
144
+ results_dir
145
+ Directory to save the full Snakemake log.
146
+ """
147
+ snakemake_log_file = results_dir / "pipeline.log"
148
+
149
+ # Create a filtering output handler that processes lines in real-time
150
+ class FilteringOutput:
151
+ """Handles real-time filtering and logging of Snakemake output.
152
+
153
+ This class writes all snakemake output to a log file and selectively logs
154
+ filtered lines to the logger for user visibility.
155
+
156
+ Parameters
157
+ ----------
158
+ log_file_path
159
+ The path to the log file where all output will be written.
160
+
161
+ """
162
+
163
+ def __init__(self, log_file_path: Path):
164
+ self.log_file = open(log_file_path, "w")
165
+ self.buffer = ""
166
+ self.last_output_time = time.time()
167
+ self.heartbeat_timer = None
168
+ self.dots_printed = False # Track if we've printed progress dots
169
+ self._start_heartbeat()
170
+
171
+ def _start_heartbeat(self):
172
+ """Start a timer that prints progress dots during long-running containers."""
173
+
174
+ def heartbeat():
175
+ current_time = time.time()
176
+ if current_time - self.last_output_time > 30: # 30 seconds since last output
177
+ # Print a dot to show progress - use original stdout if available
178
+ if hasattr(self, "original_stdout") and self.original_stdout:
179
+ self.original_stdout.write(".")
180
+ self.original_stdout.flush()
181
+ self.dots_printed = True # Mark that we've printed dots
182
+ self.last_output_time = current_time
183
+ # Schedule next heartbeat
184
+ self.heartbeat_timer = threading.Timer(30.0, heartbeat)
185
+ self.heartbeat_timer.daemon = True
186
+ self.heartbeat_timer.start()
187
+
188
+ # Start first heartbeat after 30 seconds
189
+ self.heartbeat_timer = threading.Timer(30.0, heartbeat)
190
+ self.heartbeat_timer.daemon = True
191
+ self.heartbeat_timer.start()
192
+
193
+ def write(self, text: str) -> int:
194
+ # Write to log file
195
+ self.log_file.write(text)
196
+ self.log_file.flush()
197
+
198
+ # Process and log filtered output
199
+ self.buffer += text
200
+ while "\n" in self.buffer:
201
+ line, self.buffer = self.buffer.split("\n", 1)
202
+ if line.strip():
203
+ filtered_line = _filter_snakemake_output(line.strip())
204
+ if filtered_line:
205
+ # Add newline after dots if we've printed any
206
+ if (
207
+ self.dots_printed
208
+ and hasattr(self, "original_stdout")
209
+ and self.original_stdout
210
+ ):
211
+ self.original_stdout.write("\n")
212
+ self.original_stdout.flush()
213
+ self.dots_printed = False # Reset the flag
214
+ logger.info(filtered_line)
215
+ self.last_output_time = time.time() # Reset heartbeat timer
216
+
217
+ return len(text)
218
+
219
+ def flush(self):
220
+ self.log_file.flush()
221
+
222
+ def close(self):
223
+ # Stop heartbeat timer
224
+ if self.heartbeat_timer:
225
+ self.heartbeat_timer.cancel()
226
+
227
+ # Process and log any remaining buffer content
228
+ if self.buffer.strip():
229
+ filtered_line = _filter_snakemake_output(self.buffer.strip())
230
+ if filtered_line:
231
+ # Add newline after dots if we've printed any
232
+ if (
233
+ self.dots_printed
234
+ and hasattr(self, "original_stdout")
235
+ and self.original_stdout
236
+ ):
237
+ self.original_stdout.write("\n")
238
+ self.original_stdout.flush()
239
+ self.dots_printed = False
240
+ logger.info(filtered_line)
241
+ self.log_file.close()
242
+
243
+ def __enter__(self):
244
+ return self
245
+
246
+ def __exit__(self, exc_type, exc_val, exc_tb):
247
+ self.close()
248
+
249
+ # Create the filtering output handler and ensure the log file is always closed
250
+ # Save original stdout for progress dots before redirection
251
+ import sys
252
+
253
+ original_stdout = sys.stdout
254
+
255
+ with FilteringOutput(snakemake_log_file) as filtering_output:
256
+ # Pass original stdout to filtering output for progress dots
257
+ filtering_output.original_stdout = original_stdout
258
+ try:
259
+ # Redirect both stdout and stderr to our filtering handler
260
+ with redirect_stdout(filtering_output), redirect_stderr(filtering_output):
261
+ snake_main(argv)
262
+ except SystemExit:
263
+ # Snakemake uses SystemExit for both success and failure
264
+ logger.info(
265
+ f"Pipeline finished running - full log saved to: {snakemake_log_file}"
266
+ )
267
+ # Always re-raise to allow test frameworks to detect completion
268
+ raise
269
+
270
+
271
+ def _filter_snakemake_output(line: str) -> str:
272
+ """Filter for Snakemake output.
273
+
274
+ Parameters
275
+ ----------
276
+ line
277
+ A single line of Snakemake output.
278
+
279
+ Returns
280
+ -------
281
+ The filtered line for display.
282
+ """
283
+ # Skip empty lines
284
+ if not line.strip():
285
+ return ""
286
+
287
+ if line.startswith("localrule "):
288
+ # Show localrule names (without the "localrule" prefix)
289
+ # Extract rule name (remove "localrule " prefix and colon at the end)
290
+ filtered_line = line.replace("localrule ", "").rstrip(":")
291
+ elif line.startswith("Job ") and ":" in line:
292
+ # Show Job messages
293
+ # Extract everything after "Job ##: "
294
+ parts = line.split(":", 1)
295
+ filtered_line = parts[1].strip() if len(parts) > 1 else ""
296
+ else:
297
+ # Suppress everything else
298
+ filtered_line = ""
299
+ return filtered_line
127
300
 
128
301
 
129
302
  def _get_singularity_args(config: Config) -> str:
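The new _run_snakemake_with_filtered_output wires a file-like object into redirect_stdout/redirect_stderr so that all Snakemake output lands in pipeline.log while only selected lines are surfaced. A simplified sketch of that core idea; TeeFilter and keep are stand-ins for FilteringOutput and _filter_snakemake_output:

    # Simplified stand-ins for FilteringOutput / _filter_snakemake_output.
    import sys
    from contextlib import redirect_stdout

    class TeeFilter:
        def __init__(self, log_path: str):
            self.log = open(log_path, "w")
            self.buffer = ""

        def write(self, text: str) -> int:
            self.log.write(text)  # keep the full log
            self.buffer += text
            while "\n" in self.buffer:
                line, self.buffer = self.buffer.split("\n", 1)
                kept = keep(line)
                if kept:
                    print(kept, file=sys.stderr)  # surface only the filtered lines
            return len(text)

        def flush(self) -> None:
            self.log.flush()

    def keep(line: str) -> str:
        # Keep rule names, suppress everything else (simplified)
        return line.replace("localrule ", "").rstrip(":") if line.startswith("localrule ") else ""

    with redirect_stdout(TeeFilter("pipeline.log")):
        print("localrule my_rule:")
        print("verbose Snakemake chatter that stays in the log only")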
easylink/steps/cascading/accept_all_pairs.def ADDED
@@ -0,0 +1,22 @@
1
+
2
+ Bootstrap: docker
3
+ From: python@sha256:1c26c25390307b64e8ff73e7edf34b4fbeac59d41da41c08da28dc316a721899
4
+
5
+ %files
6
+ ./accept_all_pairs.py /accept_all_pairs.py
7
+
8
+ %post
9
+ # Create directories
10
+ mkdir -p /input_data
11
+ mkdir -p /extra_implementation_specific_input_data
12
+ mkdir -p /results
13
+ mkdir -p /diagnostics
14
+
15
+ # Install Python packages with specific versions
16
+ pip install pandas pyarrow
17
+
18
+ %environment
19
+ export LC_ALL=C
20
+
21
+ %runscript
22
+ python /accept_all_pairs.py '$@'
easylink/steps/cascading/accept_all_pairs.py ADDED
@@ -0,0 +1,26 @@
1
+ # STEP_NAME: evaluating_pairs
2
+ # REQUIREMENTS: pandas pyarrow
3
+
4
+ import os
5
+ from pathlib import Path
6
+
7
+ import pandas as pd
8
+
9
+ blocks_dir = Path(os.environ["BLOCKS_DIR_PATH"])
10
+ diagnostics_dir = Path(os.environ["DIAGNOSTICS_DIRECTORY"])
11
+ output_path = Path(os.environ["OUTPUT_PATHS"])
12
+ Path(output_path).parent.mkdir(exist_ok=True, parents=True)
13
+
14
+ all_predictions = []
15
+
16
+ for block_dir in blocks_dir.iterdir():
17
+ if str(block_dir.stem).startswith("."):
18
+ continue
19
+
20
+ pairs = pd.read_parquet(block_dir / "pairs.parquet")
21
+
22
+ all_predictions.append(pairs.assign(Probability=1.0))
23
+
24
+ all_predictions = pd.concat(all_predictions, ignore_index=True)
25
+ print(all_predictions)
26
+ all_predictions.to_parquet(output_path)
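The new accept_all_pairs step reads its inputs and outputs entirely from environment variables (BLOCKS_DIR_PATH, DIAGNOSTICS_DIRECTORY, OUTPUT_PATHS) and assigns Probability 1.0 to every blocked pair. A hypothetical smoke test of that contract; the toy block layout and paths are assumptions for illustration:

    # Hypothetical smoke test; paths and toy columns are illustrative only.
    import os
    import subprocess
    from pathlib import Path

    import pandas as pd

    block = Path("blocks/block_0")
    block.mkdir(parents=True, exist_ok=True)
    pd.DataFrame({"Left Record ID": [1, 2], "Right Record ID": [3, 4]}).to_parquet(
        block / "pairs.parquet"
    )

    env = dict(
        os.environ,
        BLOCKS_DIR_PATH="blocks",
        DIAGNOSTICS_DIRECTORY="diagnostics",
        OUTPUT_PATHS="results/result.parquet",
    )
    subprocess.run(["python", "accept_all_pairs.py"], env=env, check=True)
    print(pd.read_parquet("results/result.parquet"))  # every pair has Probability == 1.0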
easylink/steps/cascading/exclude_clustered.py CHANGED
@@ -62,12 +62,21 @@ clusters_filepath = clusters_filepaths[0]
62
62
 
63
63
  # Exclude records that have been clustered
64
64
  clusters_df = load_file(clusters_filepath)
65
+ # NOTE: We defined "clustered" for these purposes as clustered *with* anything else.
66
+ # Simply putting a record into its own cluster does not indicate to us that it has
67
+ # been sufficiently clustered to ignore.
68
+ cluster_sizes = clusters_df.groupby("Cluster ID").size()
69
+ clusters_df["size"] = cluster_sizes.loc[clusters_df["Cluster ID"]].values
70
+ clusters_df = clusters_df[clusters_df["size"] > 1]
71
+
65
72
  dataset_df = load_file(dataset_path)
66
73
  clustered_record_ids = set(dataset_df["Record ID"].unique()) & set(
67
- clusters_df["Input Record ID"].unique()
74
+ clusters_df[clusters_df["Input Record Dataset"] == splitter_choice][
75
+ "Input Record ID"
76
+ ].unique()
68
77
  )
69
78
 
70
- IDS_TO_REMOVE = pd.DataFrame({"Record ID": list(clustered_record_ids)})
79
+ IDS_TO_REMOVE = pd.DataFrame({"Input Record ID": list(clustered_record_ids)})
71
80
 
72
81
  # OUTPUT_PATHS is a single path to a file (results.parquet)
73
82
  results_filepath = os.environ["OUTPUT_PATHS"]
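The exclude_clustered change above only treats a record as "clustered" when it shares a cluster with at least one other record. A toy illustration of the size filter:

    # Toy data: records 1 and 2 share cluster "a"; record 3 is a singleton.
    import pandas as pd

    clusters_df = pd.DataFrame(
        {"Input Record ID": [1, 2, 3], "Cluster ID": ["a", "a", "b"]}
    )
    cluster_sizes = clusters_df.groupby("Cluster ID").size()
    clusters_df["size"] = cluster_sizes.loc[clusters_df["Cluster ID"]].values
    print(clusters_df[clusters_df["size"] > 1])  # only the two records in cluster "a" remain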
easylink/steps/cascading/exclude_none.py CHANGED
@@ -67,7 +67,7 @@ clusters_df = load_file(clusters_filepath)
67
67
 
68
68
  # SAVE OUTPUTS
69
69
 
70
- IDS_TO_REMOVE = pd.DataFrame(columns=["Record ID"])
70
+ IDS_TO_REMOVE = pd.DataFrame(columns=["Input Record ID"])
71
71
 
72
72
  # OUTPUT_PATHS is a single path to a file (results.parquet)
73
73
  results_filepath = os.environ["OUTPUT_PATHS"]
easylink/steps/cascading/one_to_many_links_to_clusters.def ADDED
@@ -0,0 +1,22 @@
1
+
2
+ Bootstrap: docker
3
+ From: python@sha256:1c26c25390307b64e8ff73e7edf34b4fbeac59d41da41c08da28dc316a721899
4
+
5
+ %files
6
+ ./one_to_many_links_to_clusters.py /one_to_many_links_to_clusters.py
7
+
8
+ %post
9
+ # Create directories
10
+ mkdir -p /input_data
11
+ mkdir -p /extra_implementation_specific_input_data
12
+ mkdir -p /results
13
+ mkdir -p /diagnostics
14
+
15
+ # Install Python packages with specific versions
16
+ pip install pandas pyarrow networkx
17
+
18
+ %environment
19
+ export LC_ALL=C
20
+
21
+ %runscript
22
+ python /one_to_many_links_to_clusters.py '$@'
easylink/steps/cascading/one_to_many_links_to_clusters.py ADDED
@@ -0,0 +1,109 @@
1
+ # STEP_NAME: links_to_clusters
2
+ # REQUIREMENTS: pandas pyarrow networkx
3
+
4
+ import os
5
+ from pathlib import Path
6
+
7
+ import networkx as nx
8
+ import pandas as pd
9
+
10
+ links = pd.read_parquet(os.environ["LINKS_FILE_PATH"])
11
+ output_path = Path(os.environ["OUTPUT_PATHS"])
12
+
13
+ no_duplicates_dataset = os.environ["NO_DUPLICATES_DATASET"]
14
+ break_ties_method = os.getenv("BREAK_TIES_METHOD", "drop")
15
+
16
+ left_no_duplicates_dataset = links["Left Record Dataset"] == no_duplicates_dataset
17
+ right_no_duplicates_dataset = links["Right Record Dataset"] == no_duplicates_dataset
18
+
19
+ if (left_no_duplicates_dataset & right_no_duplicates_dataset).any():
20
+ raise ValueError(
21
+ f"Provided links include links within the no_duplicates_dataset ({no_duplicates_dataset})"
22
+ )
23
+
24
+ if not (left_no_duplicates_dataset | right_no_duplicates_dataset).all():
25
+ raise ValueError(
26
+ f"Provided links include links that don't involve the no_duplicates_dataset ({no_duplicates_dataset})"
27
+ )
28
+
29
+ # Get the no-duplicates dataset all on the right
30
+ id_cols = [
31
+ "Left Record Dataset",
32
+ "Left Record ID",
33
+ "Right Record Dataset",
34
+ "Right Record ID",
35
+ ]
36
+ switched_id_cols = [
37
+ "Right Record Dataset",
38
+ "Right Record ID",
39
+ "Left Record Dataset",
40
+ "Left Record ID",
41
+ ]
42
+ links.loc[left_no_duplicates_dataset, id_cols] = links.loc[
43
+ left_no_duplicates_dataset, switched_id_cols
44
+ ].to_numpy()
45
+ links[["Left Record ID", "Right Record ID"]] = links[
46
+ ["Left Record ID", "Right Record ID"]
47
+ ].astype(int)
48
+
49
+ links["Left Record Key"] = (
50
+ links["Left Record Dataset"] + "-__-" + links["Left Record ID"].astype(int).astype(str)
51
+ )
52
+ links["Right Record Key"] = (
53
+ links["Right Record Dataset"] + "-__-" + links["Right Record ID"].astype(int).astype(str)
54
+ )
55
+
56
+ links_to_accept = (
57
+ links[links["Probability"] >= float(os.environ["THRESHOLD_MATCH_PROBABILITY"])]
58
+ # Pre-emptively break probability ties by right record key for the highest_id method
59
+ .sort_values(["Probability", "Right Record Key"], ascending=False)
60
+ # No duplicates in the *right* means only one link per *left* record
61
+ .groupby(["Left Record Key"]).first()
62
+ )
63
+
64
+ if break_ties_method == "drop":
65
+ num_tied = (
66
+ links_to_accept.merge(links, on=["Left Record Key", "Probability"])
67
+ .groupby(["Left Record Key"])
68
+ .size()
69
+ )
70
+ print("Ties:")
71
+ print(num_tied)
72
+ print(num_tied.describe())
73
+ links_to_accept = links_to_accept[num_tied == 1]
74
+ elif break_ties_method == "highest_id":
75
+ # Done above pre-emptively
76
+ pass
77
+ else:
78
+ raise ValueError(f"Unknown break_ties_method {break_ties_method}")
79
+
80
+ # NOTE: We only include nodes involved in an accepted link in our cluster.
81
+ # If a node isn't involved in an accepted link, that could just represent
82
+ # that we haven't evaluated the right pairs involving it, not confidence that
83
+ # it is a singleton.
84
+ G = nx.from_pandas_edgelist(
85
+ links_to_accept.reset_index()[["Left Record Key", "Right Record Key"]].rename(
86
+ columns={"Left Record Key": "source", "Right Record Key": "target"}
87
+ )
88
+ )
89
+
90
+ # Compute connected components
91
+ components = list(nx.connected_components(G))
92
+
93
+ # Assign new cluster IDs
94
+ merged_data = []
95
+ for cluster_id, records in enumerate(components, start=1):
96
+ for record_key in records:
97
+ merged_data.append((record_key, cluster_id))
98
+
99
+ # Build the final DataFrame
100
+ merged_df = pd.DataFrame(merged_data, columns=["Input Record Key", "Cluster ID"])
101
+
102
+ merged_df[["Input Record Dataset", "Input Record ID"]] = (
103
+ merged_df["Input Record Key"].str.split("-__-", n=1, expand=True)
104
+ if not merged_df.empty
105
+ else pd.DataFrame(columns=["Input Record Dataset", "Input Record ID"])
106
+ )
107
+ merged_df["Input Record ID"] = merged_df["Input Record ID"].astype(int)
108
+
109
+ merged_df[["Input Record Dataset", "Input Record ID", "Cluster ID"]].to_parquet(output_path)
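With break_ties_method="highest_id", the script above resolves probability ties by sorting on Right Record Key before taking the first link per left record. A toy illustration:

    # Two equally probable links for the same left record; the higher right key wins.
    import pandas as pd

    links = pd.DataFrame(
        {
            "Left Record Key": ["census-__-1", "census-__-1"],
            "Right Record Key": ["reference-__-7", "reference-__-9"],
            "Probability": [0.9, 0.9],
        }
    )
    best = (
        links.sort_values(["Probability", "Right Record Key"], ascending=False)
        .groupby(["Left Record Key"])
        .first()
    )
    print(best)  # keeps the reference-__-9 link for census record 1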
easylink/steps/cascading/update_clusters_by_connected_components.py CHANGED
@@ -59,7 +59,18 @@ new_clusters_df = load_file(new_clusters_filepath)
59
59
 
60
60
  def merge_clusters(known_clusters_df, new_clusters_df):
61
61
  # Combine both dataframes
62
- combined_df = pd.concat([known_clusters_df, new_clusters_df], ignore_index=True)
62
+ combined_df = pd.concat(
63
+ [
64
+ # Ensure cluster names are unique
65
+ known_clusters_df.assign(
66
+ **{"Cluster ID": lambda df: "known__" + df["Cluster ID"].astype(str)}
67
+ ),
68
+ new_clusters_df.assign(
69
+ **{"Cluster ID": lambda df: "new__" + df["Cluster ID"].astype(str)}
70
+ ),
71
+ ],
72
+ ignore_index=True,
73
+ )
63
74
  combined_df["Input Record Key"] = (
64
75
  combined_df["Input Record Dataset"]
65
76
  + "-__-"
@@ -92,9 +103,11 @@ def merge_clusters(known_clusters_df, new_clusters_df):
92
103
  # Build the final DataFrame
93
104
  merged_df = pd.DataFrame(merged_data, columns=["Input Record Key", "Cluster ID"])
94
105
 
95
- merged_df[["Input Record Dataset", "Input Record ID"]] = merged_df[
96
- "Input Record Key"
97
- ].str.split("-__-", n=1, expand=True)
106
+ merged_df[["Input Record Dataset", "Input Record ID"]] = (
107
+ merged_df["Input Record Key"].str.split("-__-", n=1, expand=True)
108
+ if not merged_df.empty
109
+ else pd.DataFrame(columns=["Input Record Dataset", "Input Record ID"])
110
+ )
98
111
 
99
112
  merged_df["Input Record ID"] = merged_df["Input Record ID"].astype(int)
100
113
 
easylink/steps/default/default_determining_exclusions.py CHANGED
@@ -72,7 +72,7 @@ if len(clusters_df) > 0:
72
72
 
73
73
  # SAVE OUTPUTS
74
74
 
75
- IDS_TO_REMOVE = pd.DataFrame(columns=["Record ID"])
75
+ IDS_TO_REMOVE = pd.DataFrame(columns=["Input Record ID"])
76
76
 
77
77
  # OUTPUT_PATHS is a single path to a file (results.parquet)
78
78
  results_filepath = os.environ["OUTPUT_PATHS"]
easylink/steps/default/default_removing_records.py CHANGED
@@ -52,7 +52,7 @@ results_dir.mkdir(exist_ok=True, parents=True)
52
52
  dataset = load_file(dataset_path)
53
53
  ids_to_remove = load_file(ids_filepath)
54
54
 
55
- dataset = dataset[~dataset["Record ID"].isin(ids_to_remove)]
55
+ dataset = dataset[~dataset["Record ID"].isin(ids_to_remove["Input Record ID"])]
56
56
 
57
57
  output_path = results_dir / Path(dataset_path).name
58
58
  logging.info(f"Writing output for dataset from input {dataset_path} to {output_path}")
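The fix above matters because Series.isin with a DataFrame argument tests membership against the DataFrame's column labels (iterating a DataFrame yields its column names), so the previous comparison could never match a record ID. A toy illustration:

    import pandas as pd

    dataset = pd.DataFrame({"Record ID": [1, 2, 3]})
    ids_to_remove = pd.DataFrame({"Input Record ID": [2]})

    print(dataset["Record ID"].isin(ids_to_remove).any())  # False: compares against column labels
    print(dataset[~dataset["Record ID"].isin(ids_to_remove["Input Record ID"])])  # drops record 2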
easylink/steps/splink/splink_blocking_and_filtering.py CHANGED
@@ -90,12 +90,18 @@ blocked_pairs = (
90
90
  .drop(columns=["match_key"])
91
91
  )
92
92
 
93
- blocked_pairs[["Left Record Dataset", "Left Record ID"]] = blocked_pairs.pop(
94
- "join_key_l"
95
- ).str.split("-__-", n=1, expand=True)
96
- blocked_pairs[["Right Record Dataset", "Right Record ID"]] = blocked_pairs.pop(
97
- "join_key_r"
98
- ).str.split("-__-", n=1, expand=True)
93
+ blocked_pairs[["Left Record Dataset", "Left Record ID"]] = (
94
+ blocked_pairs.pop("join_key_l").str.split("-__-", n=1, expand=True)
95
+ if not blocked_pairs.empty
96
+ else pd.DataFrame(columns=["Left Record Dataset", "Left Record ID"])
97
+ )
98
+
99
+ blocked_pairs[["Right Record Dataset", "Right Record ID"]] = (
100
+ blocked_pairs.pop("join_key_r").str.split("-__-", n=1, expand=True)
101
+ if not blocked_pairs.empty
102
+ else pd.DataFrame(columns=["Right Record Dataset", "Right Record ID"])
103
+ )
104
+
99
105
  blocked_pairs[["Left Record ID", "Right Record ID"]] = blocked_pairs[
100
106
  ["Left Record ID", "Right Record ID"]
101
107
  ].astype(int)
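The guards added above exist because str.split(..., expand=True) on an empty column returns a frame with zero columns, so assigning the result to two named columns raises. A toy illustration of the guarded pattern:

    import pandas as pd

    blocked_pairs = pd.DataFrame({"join_key_l": pd.Series([], dtype=str)})

    split = (
        blocked_pairs.pop("join_key_l").str.split("-__-", n=1, expand=True)
        if not blocked_pairs.empty
        else pd.DataFrame(columns=["Left Record Dataset", "Left Record ID"])
    )
    blocked_pairs[["Left Record Dataset", "Left Record ID"]] = split
    print(blocked_pairs)  # empty frame; the two expected columns are present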
easylink/steps/splink/splink_evaluating_pairs.py CHANGED
@@ -35,6 +35,7 @@ for block_dir in blocks_dir.iterdir():
35
35
  comparisons.append(cl.LevenshteinAtThresholds(column))
36
36
  else:
37
37
  raise ValueError(f"Unknown comparison method {method}")
38
+ # TODO: check both datasets contain all the columns
38
39
 
39
40
  # Create the Splink linker in dedupe mode
40
41
  settings = SettingsCreator(
@@ -135,7 +136,7 @@ for block_dir in blocks_dir.iterdir():
135
136
 
136
137
  sqls = predict_from_comparison_vectors_sqls_using_settings(
137
138
  linker._settings_obj,
138
- float(os.environ["THRESHOLD_MATCH_PROBABILITY"]),
139
+ float(os.getenv("THRESHOLD_MATCH_PROBABILITY", 0)),
139
140
  threshold_match_weight=None,
140
141
  sql_infinity_expression=linker._infinity_expression,
141
142
  )
easylink/steps/splink/splink_links_to_clusters.py CHANGED
@@ -53,6 +53,8 @@ cc = (
53
53
  # Split "Record Key" back into "Input Record Dataset" and "Input Record ID"
54
54
  cc[["Input Record Dataset", "Input Record ID"]] = (
55
55
  cc["Record Key"].astype(str).str.split("-__-", n=1, expand=True)
56
+ if not cc.empty
57
+ else pd.DataFrame(columns=["Input Record Dataset", "Input Record ID"])
56
58
  )
57
59
  cc = cc.drop(columns=["Record Key"])
58
60
  cc["Input Record ID"] = cc["Input Record ID"].astype(int)
easylink/utilities/general_utils.py CHANGED
@@ -97,24 +97,34 @@ def _add_logging_sink(
97
97
  Whether the logs should be converted to JSON before they're dumped
98
98
  to the logging sink.
99
99
  """
100
- message_format = (
101
- "<green>{time:YYYY-MM-DD HH:mm:ss.SSS}</green> | <green>{elapsed}</green> | "
102
- "<cyan>{function}</cyan>:<cyan>{line}</cyan> - <level>{message}</level>"
103
- )
100
+
101
+ def format_message(record):
102
+ elapsed_seconds = int(record["elapsed"].total_seconds())
103
+ hours = elapsed_seconds // 3600
104
+ minutes = (elapsed_seconds % 3600) // 60
105
+ seconds = elapsed_seconds % 60
106
+ elapsed_str = f"{hours:02d}:{minutes:02d}:{seconds:02d}"
107
+ time_str = record["time"].strftime("%Y-%m-%d %H:%M:%S")
108
+
109
+ if colorize:
110
+ return f"\033[32m{time_str}\033[0m | \033[32m{elapsed_str}\033[0m | {record['message']}\n"
111
+ else:
112
+ return f"{time_str} | {elapsed_str} | {record['message']}\n"
113
+
104
114
  if verbose == 0:
105
115
  logger.add(
106
116
  sink,
107
- colorize=colorize,
117
+ colorize=False, # We handle colors in format_message
108
118
  level="INFO",
109
- format=message_format,
119
+ format=format_message,
110
120
  serialize=serialize,
111
121
  )
112
122
  elif verbose >= 1:
113
123
  logger.add(
114
124
  sink,
115
- colorize=colorize,
125
+ colorize=False, # We handle colors in format_message
116
126
  level="DEBUG",
117
- format=message_format,
127
+ format=format_message,
118
128
  serialize=serialize,
119
129
  )
120
130
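The format_message callable above collapses loguru's elapsed timedelta into whole seconds and renders it as HH:MM:SS. The arithmetic in isolation:

    from datetime import timedelta

    def format_elapsed(elapsed: timedelta) -> str:
        total = int(elapsed.total_seconds())
        return f"{total // 3600:02d}:{(total % 3600) // 60:02d}:{total % 60:02d}"

    print(format_elapsed(timedelta(hours=1, minutes=5, seconds=42)))  # 01:05:42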
 
easylink/utilities/validation_utils.py CHANGED
@@ -341,8 +341,8 @@ def _validate_pairs(df: pd.DataFrame, filepath: str) -> None:
341
341
  def validate_ids_to_remove(filepath: str) -> None:
342
342
  """Validates a file containing IDs to remove.
343
343
 
344
- - The file must contain a single column: "Record ID".
345
- - "Record ID" must have unique values.
344
+ - The file must contain a single column: "Input Record ID".
345
+ - "Input Record ID" must have unique values.
346
346
 
347
347
  Parameters
348
348
  ----------
@@ -352,13 +352,13 @@ def validate_ids_to_remove(filepath: str) -> None:
352
352
  Raises
353
353
  ------
354
354
  LookupError
355
- If the file is missing the "Record ID" column.
355
+ If the file is missing the "Input Record ID" column.
356
356
  ValueError
357
- If the "Record ID" column is not unique.
357
+ If the "Input Record ID" column is not unique.
358
358
  """
359
- _validate_required_columns(filepath, {"Record ID"})
359
+ _validate_required_columns(filepath, {"Input Record ID"})
360
360
  df = _read_file(filepath)
361
- _validate_unique_column(df, "Record ID", filepath)
361
+ _validate_unique_column(df, "Input Record ID", filepath)
362
362
 
363
363
 
364
364
  def validate_records(filepath: str) -> None:
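validate_ids_to_remove now requires a unique "Input Record ID" column rather than "Record ID". A hedged sketch of the documented behavior; the real implementation uses the package's _validate_required_columns, _read_file, and _validate_unique_column helpers, and reading with read_parquet here is an assumption:

    import pandas as pd

    def validate_ids_to_remove(filepath: str) -> None:
        df = pd.read_parquet(filepath)  # assumption: the real _read_file may support other formats
        if "Input Record ID" not in df.columns:
            raise LookupError(f"{filepath} is missing the 'Input Record ID' column")
        if df["Input Record ID"].duplicated().any():
            raise ValueError(f"'Input Record ID' values in {filepath} are not unique")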
easylink-0.2.0.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: easylink
3
- Version: 0.1.24
3
+ Version: 0.2.0
4
4
  Summary: Research repository for the EasyLink ER ecosystem project.
5
5
  Home-page: https://github.com/ihmeuw/easylink
6
6
  Author: The EasyLink developers
@@ -78,34 +78,50 @@ Installation
78
78
 
79
79
  .. _installation:
80
80
 
81
+ **NOTE: This package requires AMD64 CPU architecture - it is not compatible with
82
+ Apple's ARM64 architecture (e.g. M1 and newer Macs).**
83
+
81
84
  There are a few things to install in order to use this package:
82
85
 
83
- - Install singularity.
86
+ - Set up Linux.
87
+
88
+ Singularity (and thus EasyLink) requires Linux to run. If you are not already
89
+ using Linux, you will need to set up a virtual machine; refer to the
90
+ `Singularity documentation for installing on Windows or Mac <https://docs.sylabs.io/guides/4.1/admin-guide/installation.html#installation-on-windows-or-mac>`_.
84
91
 
85
- You may need to request it from your system admin.
86
- Refer to https://docs.sylabs.io/guides/4.1/admin-guide/installation.html.
87
- You can check if you already have singularity installed by running the command
88
- ``singularity --version``. For an existing installation, your singularity version
92
+ - Install Singularity.
93
+
94
+ First check if you already have Singularity installed by running the command
95
+ ``singularity --version``. For an existing installation, your Singularity version
89
96
  number is printed.
90
97
 
98
+ If Singularity is not yet installed, you will need to install it;
99
+ refer to the `Singularity docs for installing on Linux <https://docs.sylabs.io/guides/4.1/admin-guide/installation.html#installation-on-linux>`_.
100
+
101
+ Note that this requires administrator privileges; you may need to request installation
102
+ from your system admin if you are working in a shared computing environment.
103
+
91
104
  - Install conda.
92
105
 
93
106
  We recommend `miniforge <https://github.com/conda-forge/miniforge>`_. You can
94
107
  check if you already have conda installed by running the command ``conda --version``.
95
108
  For an existing installation, a version will be displayed.
96
109
 
97
- - Install easylink, python and graphviz in a conda environment.
110
+ - Create a conda environment with python and graphviz installed.
111
+
112
+ ::
113
+
114
+ $ conda create --name easylink -c conda-forge python=3.12 graphviz 'gcc<14' -y
115
+ $ conda activate easylink
116
+
117
+ - Install easylink in the environment.
98
118
 
99
119
  Option 1 - Install from PyPI with pip::
100
120
 
101
- $ conda create --name easylink -c conda-forge python=3.12 graphviz 'gcc<14' -y
102
- $ conda activate easylink
103
121
  $ pip install easylink
104
122
 
105
123
  Option 2 - Build from source with pip::
106
124
 
107
- $ conda create --name easylink -c conda-forge python=3.12 graphviz 'gcc<14' -y
108
- $ conda activate easylink
109
125
  $ pip install git+https://github.com/ihmeuw/easylink.git
110
126
 
111
127
  .. _end_installation:
easylink-0.2.0.dist-info/RECORD CHANGED
@@ -1,34 +1,38 @@
1
1
  easylink/__about__.py,sha256=2-oxCfu9t9yUJouLDwqYRZ0eii8kN25SxRzsawjWjho,440
2
2
  easylink/__init__.py,sha256=gGMcIVfiVnHtlDw5mZwhevcDb2wt-kuP6F64gnkFack,159
3
- easylink/_version.py,sha256=Jq7e1LcKcQSNVg4EOJ-acPyPgs8Os5cYEZWXrQsI7Pg,23
4
- easylink/cli.py,sha256=06nv_3or9SxEynfi8rZTgYP1hm-Y_CeNymf5qn5s6Qo,10332
3
+ easylink/_version.py,sha256=Zn1KFblwuFHiDRdRAiRnDBRkbPttWh44jKa5zG2ov0E,22
4
+ easylink/cli.py,sha256=3Xoqclhn7mEHzuqyuVUjt67-V3Fqu0_Jr3B3lCdIuAg,10704
5
5
  easylink/configuration.py,sha256=TPd3WbqUcJMJDPJuHeo5ZebvZPQrRyfm8-laad2sOFk,12681
6
6
  easylink/graph_components.py,sha256=zZDZXg5smReHO3ryQC4pao24wyKXzWDe6jS3C6fM2ak,13892
7
7
  easylink/implementation.py,sha256=lSF37g-aQYgdLI0lgFaDrBkp23te9O9B1V-CmmRtB-0,14514
8
- easylink/implementation_metadata.yaml,sha256=pKu_H9fLnTsS8E4wCnYRitumW1-zs7mfE3z66BAyO30,10848
8
+ easylink/implementation_metadata.yaml,sha256=ahuSVk5Ur1V0F8EsLZO5apkNC2bWv2RsytNaiWGo9Yk,12562
9
9
  easylink/pipeline.py,sha256=NJyMInbOCjJ_5kRzzuK5AcupvC7ecd5qLOC-f1Gy3Ss,17701
10
10
  easylink/pipeline_graph.py,sha256=jtjS7_2IVa189U8cL621600erC_0pa6BKPRRO8VqwiU,23760
11
11
  easylink/pipeline_schema.py,sha256=sj6YQqMFUS35RRFWdlrhPlud68d3Rb5y7iKfAEZy1qs,6877
12
12
  easylink/rule.py,sha256=n8r4NL7MiNzppl39MRbEMdHEpn_e_XS7LfrsJD6KNfA,16629
13
- easylink/runner.py,sha256=Z9GKgiUAWtp0dW1cyAp86MGthIDeABJtHDXUtzv0-kE,6714
13
+ easylink/runner.py,sha256=h39MbWHgTs-VwkPxk76186si76e8UTf1hySqepqUSS8,13155
14
14
  easylink/step.py,sha256=-vdFhPvwAZ3d69LMQGmSIVdcMG8E8ZtSvTE0UWif7zs,91088
15
15
  easylink/devtools/implementation_creator.py,sha256=gZZpfpiOOh912nNjR_d5wR0mBO5c09jWMS0FSYRcE1o,19120
16
16
  easylink/pipeline_schema_constants/__init__.py,sha256=njUL2v67WFC3dW_FjOXWRLSeOhInox_ZINLEji0_7V8,1523
17
17
  easylink/pipeline_schema_constants/development.py,sha256=0x6lWqBmP1K9AebEmeZ4veSnLBcdQcZXzbV6lCU11bc,12622
18
18
  easylink/pipeline_schema_constants/main.py,sha256=kcAhdbK_BhS79LUMhKkn-Uttl2vA-MHVX4M1StTloJQ,22934
19
19
  easylink/pipeline_schema_constants/testing.py,sha256=ti08DeUuF-eWrGKMj4BMyOFFJnEYooDaWX0DGiferbk,24579
20
+ easylink/steps/cascading/accept_all_pairs.def,sha256=kwZMF3H0mqCBcO1Y2parJXFBLp4e9bLQoVIYU7zZ8xY,486
21
+ easylink/steps/cascading/accept_all_pairs.py,sha256=eF_rmqcZtL3vI1u-TJejOcKX2Qou-AbaLI7qAAGjoGI,703
20
22
  easylink/steps/cascading/exclude_clustered.def,sha256=GfoDqO2Vtsh7VI8SwGaJtv_KtKjs-UmBcivqQ7OPkjk,503
21
- easylink/steps/cascading/exclude_clustered.py,sha256=Bpsyf9vAZ431Fh96RVzHkF7fy77NQjo1Cl6bHCIy69c,2580
23
+ easylink/steps/cascading/exclude_clustered.py,sha256=T60deNb91_ZFg5K190G-Q7BC5EYrEdLPhFEK7Togv0Y,3048
22
24
  easylink/steps/cascading/exclude_none.def,sha256=iFUhUMocxtkA0NErkjVrBxY0MUdS3DIPNsbCpTJRP0k,488
23
- easylink/steps/cascading/exclude_none.py,sha256=5DK5bNG4TneMwUKE49Kmz7VDnKBNZWjOERkuSJU3BmA,2475
25
+ easylink/steps/cascading/exclude_none.py,sha256=DesKAO-UcPqKKtUS92OHU25YDXMJLiBEcGLk69UYWDk,2481
26
+ easylink/steps/cascading/one_to_many_links_to_clusters.def,sha256=BVFusUydsV3hY1en16OVr3TPqzwst-cEVBwvb8dtpqA,534
27
+ easylink/steps/cascading/one_to_many_links_to_clusters.py,sha256=7QSJxW3mmR3LIjWBzzgi3vcOsmoYOsiSJn6iYGppHLA,3789
24
28
  easylink/steps/cascading/update_clusters_by_connected_components.def,sha256=sAAAWOod8EuAnotR1cayaGAvs7x6xoMVlwmLso_a9Cc,578
25
- easylink/steps/cascading/update_clusters_by_connected_components.py,sha256=sFZXMGXl17jcGt8Fu5hgQz1KW5bFvPYdCoQGZ9Erc0I,3629
29
+ easylink/steps/cascading/update_clusters_by_connected_components.py,sha256=jhpMgewztCXrRxBw2FnH2HjIybpp7GcHe4kjTMgQOyg,4059
26
30
  easylink/steps/default/default_clusters_to_links.def,sha256=9PjUygLvsoYMUZDznceuuv55t8fPs473P57J_RMl3U0,527
27
31
  easylink/steps/default/default_clusters_to_links.py,sha256=Ckm53d3W-ETNlTvQJPOkpHmSqCmxSWknMPQrEAIoTBY,2816
28
32
  easylink/steps/default/default_determining_exclusions.def,sha256=zZUEHDdrpLxzx3gTm-dki2ge5ivCCg4ziIwTErqCII0,542
29
- easylink/steps/default/default_determining_exclusions.py,sha256=4diLfuqYm_Koj7gwifjwe_7mLZ6xb6RQiEdk-RRtB94,2629
33
+ easylink/steps/default/default_determining_exclusions.py,sha256=RpYHFAral4uYevgiOsYqUHYgsEIejV5NhYdQ3q7VeU0,2635
30
34
  easylink/steps/default/default_removing_records.def,sha256=QqacmOu6myxFSULHRKeKsVD8l73KDm4VEkPkPlovwqs,524
31
- easylink/steps/default/default_removing_records.py,sha256=P4mmX2D4mhSoWd_S5CaNT4hlHOMAeZiMhCScWQiR_fQ,1906
35
+ easylink/steps/default/default_removing_records.py,sha256=I_xGdWftlwP7H8HdxfblSG2YFVqA986KOECVwMCn4fk,1925
32
36
  easylink/steps/default/default_schema_alignment.def,sha256=hFHJkos0Fhe4LvpjLOCd6klIaIqOKqECDDSTVu3G03Y,524
33
37
  easylink/steps/default/default_schema_alignment.py,sha256=oT5jbYQ3C3ocLgqqOnvH0SIJ6NeTtPBWWmCqr_frnAI,1479
34
38
  easylink/steps/default/default_updating_clusters.def,sha256=vDzSkTknDfeiXeHREpw4BkUxFcTWamxr81c3rZ7_enY,527
@@ -70,22 +74,22 @@ easylink/steps/rl-dummy/input_data/known_clusters.parquet,sha256=Ysodu65toHZN4Ag
70
74
  easylink/steps/rl-dummy/pre-processing/dummy_pre-processing.def,sha256=HeUSv2QvMOQzsyVktYR1xYoEqwiNpDo-p7IRcGSMspE,512
71
75
  easylink/steps/rl-dummy/pre-processing/dummy_pre-processing.py,sha256=I6kqG4e_H2yFW5MpsMXdpoY_NjHcBvVVAHWv89LUgXE,1852
72
76
  easylink/steps/splink/splink_blocking_and_filtering.def,sha256=umWzxJhsfdi8g3TD-r2mKpjC-FPAMDk6-IERiWigdQc,557
73
- easylink/steps/splink/splink_blocking_and_filtering.py,sha256=FO8YJ2_KgCLpQoq5xsM339bTSN1DhCXCL8XT1pb5STY,5259
77
+ easylink/steps/splink/splink_blocking_and_filtering.py,sha256=3WMBmNEECB9Kxu4D6PAesZzBrhHTdpFEgvnGPsV4bww,5475
74
78
  easylink/steps/splink/splink_evaluating_pairs.def,sha256=DN3Ohy9qJOAyK58v164neP23HDVYpedMqzCu4eQh4Hg,521
75
- easylink/steps/splink/splink_evaluating_pairs.py,sha256=m-j1QMRSvPCiSoWVSV1kzzzsK1c_xG8nqYKMd3cj7kM,6195
79
+ easylink/steps/splink/splink_evaluating_pairs.py,sha256=vWUe3vQo9uGs0Cy8pG5PbolzsJX_cEaPS3Q0PMcBjcg,6253
76
80
  easylink/steps/splink/splink_links_to_clusters.def,sha256=RurvOYyGjNs9tx64DTXwI-GSgHD4T7SzDfhAH18pTEM,524
77
- easylink/steps/splink/splink_links_to_clusters.py,sha256=5Sw8yi0dVLuRB-trN2kXmxbHBR0VJBxYee6u4_usg2Y,1920
81
+ easylink/steps/splink/splink_links_to_clusters.py,sha256=Brpy3ZKSBpBUeOitg1ZgDvuMVwILH0QBkLXRJN8LXno,2015
78
82
  easylink/utilities/__init__.py,sha256=0U33kbv4hoMfFQ_lh5hLwifxRPzOgkLkjKLYxmaK10g,196
79
83
  easylink/utilities/aggregator_utils.py,sha256=_DAHRAf9janbDsuj_jnAn5Dzz2s4R5Ni3YeytDpN9UE,954
80
84
  easylink/utilities/data_utils.py,sha256=XPRjq3qW_fN0xQ23Jms_xBzpTHbRwqZWDP1AW0nYkP0,6926
81
- easylink/utilities/general_utils.py,sha256=El1W0nn4P27sRBGotNQb-9du-Gbhk9ggSuu4vmGDfwo,4591
85
+ easylink/utilities/general_utils.py,sha256=MmuoV4T6PgyEDjbepC_1D3TGrq70Hp-hl-GaAYr5wRU,5033
82
86
  easylink/utilities/paths.py,sha256=9inDKMPorAaWh5vhjc3j1Tj_aXVKhLdodiJO9H1nNes,947
83
87
  easylink/utilities/spark.smk,sha256=kGtpem7LfQc71tMh5WAYaqKnHQKFvcdhPQSdumOP70k,5799
84
88
  easylink/utilities/splitter_utils.py,sha256=KXiVUYJ9TGxCQmrwos18pB1sxG_0Ay67qoDJT6vem2o,3828
85
- easylink/utilities/validation_utils.py,sha256=DBJB2TLVklgYw1WaaPse9vqtwPLMGmZNYM2cbCZsoHI,18417
86
- easylink-0.1.24.dist-info/licenses/LICENSE,sha256=z6NBo3hY2Pbh2Wjm9IcHLyQoG-kd816uASh8PbwhTcQ,1530
87
- easylink-0.1.24.dist-info/METADATA,sha256=qXBlEqCrYzAtLJGKC4Lqf_Q7pMn5gJdRxFGYDGUX4pE,3565
88
- easylink-0.1.24.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
89
- easylink-0.1.24.dist-info/entry_points.txt,sha256=OGMZDFltg3yMboT7XjJt3joiPhRfV_7jnREVtrAIQNU,51
90
- easylink-0.1.24.dist-info/top_level.txt,sha256=oHcOpcF_jDMWFiJRzfGQvuskENGDjSPC_Agu9Z_Xvik,9
91
- easylink-0.1.24.dist-info/RECORD,,
89
+ easylink/utilities/validation_utils.py,sha256=1naksMPStw_xIOqskX6DE99f16Y7eCcVF9I5ZILjMvI,18453
90
+ easylink-0.2.0.dist-info/licenses/LICENSE,sha256=z6NBo3hY2Pbh2Wjm9IcHLyQoG-kd816uASh8PbwhTcQ,1530
91
+ easylink-0.2.0.dist-info/METADATA,sha256=HxtOiOMe9hTRcK6HL6sLTTQNeP9X7hrhiodTpEMUeOA,4218
92
+ easylink-0.2.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
93
+ easylink-0.2.0.dist-info/entry_points.txt,sha256=OGMZDFltg3yMboT7XjJt3joiPhRfV_7jnREVtrAIQNU,51
94
+ easylink-0.2.0.dist-info/top_level.txt,sha256=oHcOpcF_jDMWFiJRzfGQvuskENGDjSPC_Agu9Z_Xvik,9
95
+ easylink-0.2.0.dist-info/RECORD,,