easylink 0.1.23__py3-none-any.whl → 0.1.25__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
easylink/_version.py CHANGED
@@ -1 +1 @@
1
- __version__ = "0.1.23"
1
+ __version__ = "0.1.25"
easylink/cli.py CHANGED
@@ -201,16 +201,20 @@ def run(
201
201
  main = handle_exceptions(
202
202
  func=runner.main, exceptions_logger=logger, with_debugger=with_debugger
203
203
  )
204
- main(
205
- command="run",
206
- pipeline_specification=pipeline_specification,
207
- input_data=input_data,
208
- computing_environment=computing_environment,
209
- results_dir=results_dir,
210
- images_dir=images,
211
- schema_name=schema,
212
- )
213
- logger.info("*** FINISHED ***")
204
+ try:
205
+ main(
206
+ command="run",
207
+ pipeline_specification=pipeline_specification,
208
+ input_data=input_data,
209
+ computing_environment=computing_environment,
210
+ results_dir=results_dir,
211
+ images_dir=images,
212
+ schema_name=schema,
213
+ )
214
+ except SystemExit:
215
+ # Snakemake uses SystemExit for completion - log success and re-raise
216
+ logger.info("*** FINISHED ***")
217
+ raise
214
218
 
215
219
 
216
220
  @easylink.command()
@@ -243,6 +247,7 @@ def generate_dag(
243
247
  input_data=input_data,
244
248
  computing_environment=None,
245
249
  results_dir=results_dir,
250
+ images_dir=None,
246
251
  schema_name=schema,
247
252
  )
248
253
  logger.info("*** DAG saved to result directory ***")
@@ -2,7 +2,7 @@ step_1_python_pandas:
2
2
  steps:
3
3
  - step_1
4
4
  image_name: python_pandas.sif
5
- zenodo_record_id: 15733426
5
+ zenodo_record_id: 15757317
6
6
  md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
7
7
  script_cmd: python /dummy_step.py
8
8
  outputs:
@@ -11,7 +11,7 @@ step_1a_python_pandas:
11
11
  steps:
12
12
  - step_1a
13
13
  image_name: python_pandas.sif
14
- zenodo_record_id: 15733426
14
+ zenodo_record_id: 15757317
15
15
  md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
16
16
  script_cmd: python /dummy_step.py
17
17
  env:
@@ -22,7 +22,7 @@ step_1b_python_pandas:
22
22
  steps:
23
23
  - step_1b
24
24
  image_name: python_pandas.sif
25
- zenodo_record_id: 15733426
25
+ zenodo_record_id: 15757317
26
26
  md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
27
27
  script_cmd: python /dummy_step.py
28
28
  env:
@@ -33,7 +33,7 @@ step_2_python_pandas:
33
33
  steps:
34
34
  - step_2
35
35
  image_name: python_pandas.sif
36
- zenodo_record_id: 15733426
36
+ zenodo_record_id: 15757317
37
37
  md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
38
38
  script_cmd: python /dummy_step.py
39
39
  outputs:
@@ -42,7 +42,7 @@ step_3_python_pandas:
42
42
  steps:
43
43
  - step_3
44
44
  image_name: python_pandas.sif
45
- zenodo_record_id: 15733426
45
+ zenodo_record_id: 15757317
46
46
  md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
47
47
  script_cmd: python /dummy_step.py
48
48
  outputs:
@@ -51,7 +51,7 @@ step_4_python_pandas:
51
51
  steps:
52
52
  - step_4
53
53
  image_name: python_pandas.sif
54
- zenodo_record_id: 15733426
54
+ zenodo_record_id: 15757317
55
55
  md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
56
56
  script_cmd: python /dummy_step.py
57
57
  env:
@@ -62,7 +62,7 @@ step_5_python_pandas:
62
62
  steps:
63
63
  - step_5
64
64
  image_name: python_pandas.sif
65
- zenodo_record_id: 15733426
65
+ zenodo_record_id: 15757317
66
66
  md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
67
67
  script_cmd: python /dummy_step.py
68
68
  env:
@@ -73,7 +73,7 @@ step_6_python_pandas:
73
73
  steps:
74
74
  - step_6
75
75
  image_name: python_pandas.sif
76
- zenodo_record_id: 15733426
76
+ zenodo_record_id: 15757317
77
77
  md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
78
78
  script_cmd: python /dummy_step.py
79
79
  env:
@@ -84,7 +84,7 @@ step_4a_python_pandas:
84
84
  steps:
85
85
  - step_4a
86
86
  image_name: python_pandas.sif
87
- zenodo_record_id: 15733426
87
+ zenodo_record_id: 15757317
88
88
  md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
89
89
  script_cmd: python /dummy_step.py
90
90
  env:
@@ -95,7 +95,7 @@ step_4b_python_pandas:
95
95
  steps:
96
96
  - step_4b
97
97
  image_name: python_pandas.sif
98
- zenodo_record_id: 15733426
98
+ zenodo_record_id: 15757317
99
99
  md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
100
100
  script_cmd: python /dummy_step.py
101
101
  env:
@@ -106,7 +106,7 @@ step_4b_r:
106
106
  steps:
107
107
  - step_4b
108
108
  image_name: r-image.sif
109
- zenodo_record_id: 15733426
109
+ zenodo_record_id: 15757317
110
110
  md5_checksum: a4a03b836694a2b81a1bd2852736ccc5
111
111
  script_cmd: Rscript /dummy_step.R
112
112
  env:
@@ -117,7 +117,7 @@ step_1_python_pyspark:
117
117
  steps:
118
118
  - step_1
119
119
  image_name: python_pyspark.sif
120
- zenodo_record_id: 15733426
120
+ zenodo_record_id: 15757317
121
121
  md5_checksum: c948577ab0607411dd4b640622d9ec3a
122
122
  script_cmd: python3 /code/dummy_step.py
123
123
  outputs:
@@ -127,7 +127,7 @@ step_2_python_pyspark:
127
127
  steps:
128
128
  - step_2
129
129
  image_name: python_pyspark.sif
130
- zenodo_record_id: 15733426
130
+ zenodo_record_id: 15757317
131
131
  md5_checksum: c948577ab0607411dd4b640622d9ec3a
132
132
  script_cmd: python3 /code/dummy_step.py
133
133
  outputs:
@@ -137,7 +137,7 @@ step_3_python_pyspark:
137
137
  steps:
138
138
  - step_3
139
139
  image_name: python_pyspark.sif
140
- zenodo_record_id: 15733426
140
+ zenodo_record_id: 15757317
141
141
  md5_checksum: c948577ab0607411dd4b640622d9ec3a
142
142
  script_cmd: python3 /code/dummy_step.py
143
143
  outputs:
@@ -147,7 +147,7 @@ step_4_python_pyspark:
147
147
  steps:
148
148
  - step_4
149
149
  image_name: python_pyspark.sif
150
- zenodo_record_id: 15733426
150
+ zenodo_record_id: 15757317
151
151
  md5_checksum: c948577ab0607411dd4b640622d9ec3a
152
152
  script_cmd: python3 /code/dummy_step.py
153
153
  env:
@@ -158,7 +158,7 @@ step_1_r:
158
158
  steps:
159
159
  - step_1
160
160
  image_name: r-image.sif
161
- zenodo_record_id: 15733426
161
+ zenodo_record_id: 15757317
162
162
  md5_checksum: a4a03b836694a2b81a1bd2852736ccc5
163
163
  script_cmd: Rscript /dummy_step.R
164
164
  outputs:
@@ -168,7 +168,7 @@ step_2_r:
168
168
  steps:
169
169
  - step_2
170
170
  image_name: r-image.sif
171
- zenodo_record_id: 15733426
171
+ zenodo_record_id: 15757317
172
172
  md5_checksum: a4a03b836694a2b81a1bd2852736ccc5
173
173
  script_cmd: Rscript /dummy_step.R
174
174
  outputs:
@@ -178,7 +178,7 @@ step_3_r:
178
178
  steps:
179
179
  - step_3
180
180
  image_name: r-image.sif
181
- zenodo_record_id: 15733426
181
+ zenodo_record_id: 15757317
182
182
  md5_checksum: a4a03b836694a2b81a1bd2852736ccc5
183
183
  script_cmd: Rscript /dummy_step.R
184
184
  outputs:
@@ -188,7 +188,7 @@ step_4_r:
188
188
  steps:
189
189
  - step_4
190
190
  image_name: r-image.sif
191
- zenodo_record_id: 15733426
191
+ zenodo_record_id: 15757317
192
192
  md5_checksum: a4a03b836694a2b81a1bd2852736ccc5
193
193
  script_cmd: Rscript /dummy_step.R
194
194
  env:
@@ -201,7 +201,7 @@ step_1_and_step_2_combined_python_pandas:
201
201
  - step_1
202
202
  - step_2
203
203
  image_name: python_pandas.sif
204
- zenodo_record_id: 15733426
204
+ zenodo_record_id: 15757317
205
205
  md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
206
206
  script_cmd: python /dummy_step.py
207
207
  outputs:
@@ -211,7 +211,7 @@ step_1_and_step_2_parallel_python_pandas:
211
211
  - step_1
212
212
  - step_2
213
213
  image_name: python_pandas.sif
214
- zenodo_record_id: 15733426
214
+ zenodo_record_id: 15757317
215
215
  md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
216
216
  script_cmd: python /dummy_step.py
217
217
  env:
@@ -223,7 +223,7 @@ step_3_and_step_4_combined_python_pandas:
223
223
  - step_3
224
224
  - step_4
225
225
  image_name: python_pandas.sif
226
- zenodo_record_id: 15733426
226
+ zenodo_record_id: 15757317
227
227
  md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
228
228
  script_cmd: python /dummy_step.py
229
229
  outputs:
@@ -233,7 +233,7 @@ step_1a_and_step_1b_combined_python_pandas:
233
233
  - step_1a
234
234
  - step_1b
235
235
  image_name: python_pandas.sif
236
- zenodo_record_id: 15733426
236
+ zenodo_record_id: 15757317
237
237
  md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
238
238
  script_cmd: python /dummy_step.py
239
239
  outputs:
@@ -241,131 +241,157 @@ step_1a_and_step_1b_combined_python_pandas:
241
241
  dummy_step_1_for_output_dir_example:
242
242
  steps:
243
243
  - step_1_for_output_dir_example
244
- image_name: main/dummy_step_1_for_output_dir_example.sif
244
+ image_name: dummy_step_1_for_output_dir_example.sif
245
245
  script_cmd: python /dummy_step_1_for_output_dir_example.py
246
246
  outputs:
247
247
  step_1_main_output_directory: output_dir/
248
248
  dummy_step_1_for_output_dir_example_default:
249
249
  steps:
250
250
  - step_1_for_output_dir_example
251
- image_name: main/dummy_step_1_for_output_dir_example.sif
251
+ image_name: dummy_step_1_for_output_dir_example.sif
252
252
  script_cmd: python /dummy_step_1_for_output_dir_example.py
253
253
  dummy_step_2_for_output_dir_example:
254
254
  steps:
255
255
  - step_2_for_output_dir_example
256
- image_name: main/dummy_step_2_for_output_dir_example.sif
256
+ image_name: dummy_step_2_for_output_dir_example.sif
257
257
  script_cmd: python /dummy_step_2_for_output_dir_example.py
258
258
  outputs:
259
259
  step_2_main_output: result.parquet
260
260
  default_removing_records:
261
261
  steps:
262
262
  - removing_records
263
- image_name: main/default_removing_records.sif
263
+ image_name: default_removing_records.sif
264
+ zenodo_record_id: 15757317
265
+ md5_checksum: 85dba6fd73c9f8f504fddb6d5c30f2de
264
266
  script_cmd: python /default_removing_records.py
265
267
  outputs:
266
268
  dataset: dataset
267
269
  default_clusters_to_links:
268
270
  steps:
269
271
  - clusters_to_links
270
- image_name: main/default_clusters_to_links.sif
272
+ image_name: default_clusters_to_links.sif
273
+ zenodo_record_id: 15757317
274
+ md5_checksum: 0d00d1272bd8193f60727791097aa065
271
275
  script_cmd: python /default_clusters_to_links.py
272
276
  outputs:
273
277
  known_links: result.parquet
274
278
  default_determining_exclusions:
275
279
  steps:
276
280
  - determining_exclusions
277
- image_name: main/default_determining_exclusions.sif
281
+ image_name: default_determining_exclusions.sif
282
+ zenodo_record_id: 15757317
283
+ md5_checksum: e61cb32ad45b79ca9a2c36db4e76ef7e
278
284
  script_cmd: python /default_determining_exclusions.py
279
285
  outputs:
280
286
  ids_to_remove: result.parquet
281
287
  default_updating_clusters:
282
288
  steps:
283
289
  - updating_clusters
284
- image_name: main/default_updating_clusters.sif
290
+ image_name: default_updating_clusters.sif
291
+ zenodo_record_id: 15757317
292
+ md5_checksum: cc6bd29e099c2523347fa04545aa35c9
285
293
  script_cmd: python /default_updating_clusters.py
286
294
  outputs:
287
295
  clusters: clusters.parquet
288
- dummy_canonicalizing_and_downstream_analysis:
296
+ # NOTE: This was made from dummy_canonicalizing_and_downstream_analysis.py,
297
+ # if rebuilding change the name of that file to save_clusters.py
298
+ save_clusters:
289
299
  steps:
290
300
  - canonicalizing_and_downstream_analysis
291
- image_name: main/dummy_canonicalizing_and_downstream_analysis.sif
301
+ image_name: save_clusters.sif
302
+ zenodo_record_id: 15757317
303
+ md5_checksum: 384ab2be668cbadc45160a674f621022
292
304
  script_cmd: python /dummy_canonicalizing_and_downstream_analysis.py
293
305
  outputs:
294
306
  analysis_output: result.parquet
295
- dummy_pre-processing:
307
+ # NOTE: This was made from dummy_pre-processing.py,
308
+ # if rebuilding change the name of that file to no_pre-processing.py
309
+ no_pre-processing:
296
310
  steps:
297
311
  - pre-processing
298
- image_name: main/dummy_pre-processing.sif
312
+ image_name: no_pre-processing.sif
313
+ zenodo_record_id: 15757317
314
+ md5_checksum: 9a9c080cf145078152501cf96bf61f27
299
315
  script_cmd: python /dummy_pre-processing.py
300
316
  outputs:
301
317
  dataset: dataset
302
318
  default_schema_alignment:
303
319
  steps:
304
320
  - schema_alignment
305
- image_name: main/default_schema_alignment.sif
321
+ image_name: default_schema_alignment.sif
322
+ zenodo_record_id: 15757317
323
+ md5_checksum: 3166587f9cfec478b999a17074d628f7
306
324
  script_cmd: python /default_schema_alignment.py
307
325
  outputs:
308
326
  records: result.parquet
309
327
  splink_blocking_and_filtering:
310
328
  steps:
311
329
  - blocking_and_filtering
312
- image_name: main/splink_blocking_and_filtering.sif
330
+ image_name: splink_blocking_and_filtering.sif
331
+ zenodo_record_id: 15757317
332
+ md5_checksum: 8a365b90295ef6beaad2b7f80a03d768
313
333
  script_cmd: python /splink_blocking_and_filtering.py
314
334
  outputs:
315
335
  blocks: blocks
316
336
  splink_evaluating_pairs:
317
337
  steps:
318
338
  - evaluating_pairs
319
- image_name: main/splink_evaluating_pairs.sif
339
+ image_name: splink_evaluating_pairs.sif
340
+ zenodo_record_id: 15757317
341
+ md5_checksum: b57f4bd16b7a3aa5099569078ea4c064
320
342
  script_cmd: python /splink_evaluating_pairs.py
321
343
  outputs:
322
344
  links: result.parquet
323
345
  splink_links_to_clusters:
324
346
  steps:
325
347
  - links_to_clusters
326
- image_name: main/splink_links_to_clusters.sif
348
+ image_name: splink_links_to_clusters.sif
349
+ zenodo_record_id: 15757317
350
+ md5_checksum: 645937f7bab9c2557b7aacafaf4e4765
327
351
  script_cmd: python /splink_links_to_clusters.py
328
352
  outputs:
329
353
  clusters: result.parquet
330
354
  fastLink_evaluating_pairs:
331
355
  steps:
332
356
  - evaluating_pairs
333
- image_name: main/fastLink_evaluating_pairs.sif
357
+ image_name: fastLink_evaluating_pairs.sif
334
358
  script_cmd: Rscript /fastLink_evaluating_pairs.R
335
359
  outputs:
336
360
  links: result.parquet
337
361
  fastLink_links_to_clusters:
338
362
  steps:
339
363
  - links_to_clusters
340
- image_name: main/fastLink_links_to_clusters.sif
364
+ image_name: fastLink_links_to_clusters.sif
341
365
  script_cmd: Rscript /fastLink_links_to_clusters.R
342
366
  outputs:
343
367
  clusters: result.parquet
344
368
  exclude_clustered:
345
369
  steps:
346
370
  - determining_exclusions
347
- image_name: main/exclude_clustered.sif
371
+ image_name: exclude_clustered.sif
348
372
  script_cmd: python /exclude_clustered.py
349
373
  outputs:
350
374
  ids_to_remove: result.parquet
351
375
  exclude_none:
352
376
  steps:
353
377
  - determining_exclusions
354
- image_name: main/exclude_none.sif
378
+ image_name: exclude_none.sif
355
379
  script_cmd: python /exclude_none.py
356
380
  outputs:
357
381
  ids_to_remove: result.parquet
358
382
  update_clusters_by_connected_components:
359
383
  steps:
360
384
  - updating_clusters
361
- image_name: main/update_clusters_by_connected_components.sif
385
+ image_name: update_clusters_by_connected_components.sif
362
386
  script_cmd: python /update_clusters_by_connected_components.py
363
387
  outputs:
364
388
  clusters: result.parquet
365
389
  middle_name_to_initial:
366
390
  steps:
367
391
  - pre-processing
368
- image_name: main/middle_name_to_initial.sif
392
+ image_name: middle_name_to_initial.sif
393
+ zenodo_record_id: 15757317
394
+ md5_checksum: 89db9c3318300cda9d538cde08c3c323
369
395
  script_cmd: python /middle_name_to_initial.py
370
396
  outputs:
371
397
  dataset: dataset
easylink/rule.py CHANGED
@@ -14,6 +14,7 @@ dynamically as strings and appended to the Snakefile.
14
14
  """
15
15
 
16
16
  import os
17
+ import shlex
17
18
  from abc import ABC, abstractmethod
18
19
  from collections.abc import Callable
19
20
  from dataclasses import dataclass
@@ -204,7 +205,7 @@ rule:
204
205
  export SPARK_MASTER_URL"""
205
206
  for var_name, var_value in self.envvars.items():
206
207
  shell_cmd += f"""
207
- export {var_name}={var_value}"""
208
+ export {var_name}={shlex.quote(str(var_value))}"""
208
209
  # Log stdout/stderr to diagnostics directory
209
210
  shell_cmd += f"""
210
211
  {self.script_cmd} > {{log}} 2>&1
easylink/runner.py CHANGED
@@ -11,6 +11,7 @@ be called from the ``easylink.cli`` module.
11
11
  import os
12
12
  import socket
13
13
  import subprocess
14
+ from contextlib import redirect_stderr, redirect_stdout
14
15
  from pathlib import Path
15
16
 
16
17
  from graphviz import Source
@@ -123,7 +124,123 @@ def main(
123
124
  argv.extend(environment_args)
124
125
  logger.info(f"Running Snakemake")
125
126
  logger.debug(f"Snakemake arguments: {argv}")
126
- snake_main(argv)
127
+
128
+ # Run snakemake
129
+ if debug:
130
+ snake_main(argv)
131
+ else:
132
+ _run_snakemake_with_filtered_output(argv, Path(results_dir))
133
+
134
+
135
def _run_snakemake_with_filtered_output(argv: list[str], results_dir: Path) -> None:
    """Run Snakemake, keeping its full output on disk and logging a filtered view.

    Everything Snakemake writes to stdout/stderr is saved verbatim to
    ``pipeline.log`` inside ``results_dir``; only the lines accepted by
    ``_filter_snakemake_output_simple`` are forwarded to the logger.

    Parameters
    ----------
    argv
        Snakemake command line arguments.
    results_dir
        Directory to save the full Snakemake log.
    """
    snakemake_log_file = results_dir / "pipeline.log"

    class FilteringOutput:
        """File-like sink that tees Snakemake output to a log file and the logger.

        Every chunk written is appended to the log file unchanged. Complete
        lines are additionally passed through the output filter and, when the
        filter keeps them, emitted at INFO level.

        Parameters
        ----------
        log_file_path
            The path to the log file where all output will be written.

        """

        def __init__(self, log_file_path: Path):
            self.log_file = open(log_file_path, "w")
            # Holds the trailing partial line until its newline arrives
            self.buffer = ""

        def _log_if_relevant(self, chunk: str) -> None:
            # Blank chunks are never shown; everything else goes through the filter
            stripped = chunk.strip()
            if not stripped:
                return
            message = _filter_snakemake_output_simple(stripped)
            if message:
                logger.info(message)

        def write(self, text: str) -> int:
            # Persist the raw text first so the log file is always complete
            self.log_file.write(text)
            self.log_file.flush()

            # Peel off one complete line at a time; the buffer is updated
            # before each line is handled
            self.buffer += text
            while True:
                completed, newline, remainder = self.buffer.partition("\n")
                if not newline:
                    break
                self.buffer = remainder
                self._log_if_relevant(completed)

            return len(text)

        def flush(self):
            self.log_file.flush()

        def close(self):
            # Handle a final line that was never newline-terminated
            self._log_if_relevant(self.buffer)
            self.log_file.close()

        def __enter__(self):
            return self

        def __exit__(self, exc_type, exc_val, exc_tb):
            self.close()

    # The sink's context manager closes the log file however Snakemake exits
    sink = FilteringOutput(snakemake_log_file)
    with sink:
        try:
            # Route both output streams through the same filtering sink
            with redirect_stdout(sink):
                with redirect_stderr(sink):
                    snake_main(argv)
        except SystemExit:
            # Snakemake uses SystemExit for both success and failure
            logger.info(
                f"Pipeline finished running - full log saved to: {snakemake_log_file}"
            )
            # Re-raise so the caller still sees the exit
            raise
211
+
212
+
213
+ def _filter_snakemake_output_simple(line: str) -> str | None:
214
+ """
215
+ Simple filter for Snakemake output showing only localrules and Job messages.
216
+
217
+ Parameters
218
+ ----------
219
+ line
220
+ A single line of Snakemake output.
221
+
222
+ Returns
223
+ -------
224
+ str or None
225
+ The filtered line for display, or None to suppress the line.
226
+ """
227
+ # Skip empty lines
228
+ if not line.strip():
229
+ return None
230
+
231
+ if line.startswith("localrule "):
232
+ # Show localrule names (without the "localrule" prefix)
233
+ # Extract rule name (remove "localrule " prefix and colon at the end)
234
+ filtered_line = line.replace("localrule ", "").rstrip(":")
235
+ elif line.startswith("Job ") and ":" in line:
236
+ # Show Job messages
237
+ # Extract everything after "Job ##: "
238
+ parts = line.split(":", 1)
239
+ filtered_line = parts[1].strip() if len(parts) > 1 else None
240
+ else:
241
+ # Suppress everything else
242
+ filtered_line = None
243
+ return filtered_line
127
244
 
128
245
 
129
246
  def _get_singularity_args(config: Config) -> str:
@@ -35,6 +35,7 @@ for block_dir in blocks_dir.iterdir():
35
35
  comparisons.append(cl.LevenshteinAtThresholds(column))
36
36
  else:
37
37
  raise ValueError(f"Unknown comparison method {method}")
38
+ # TODO: check both datasets contain all the columns
38
39
 
39
40
  # Create the Splink linker in dedupe mode
40
41
  settings = SettingsCreator(
@@ -135,7 +136,7 @@ for block_dir in blocks_dir.iterdir():
135
136
 
136
137
  sqls = predict_from_comparison_vectors_sqls_using_settings(
137
138
  linker._settings_obj,
138
- float(os.environ["THRESHOLD_MATCH_PROBABILITY"]),
139
+ float(os.getenv("THRESHOLD_MATCH_PROBABILITY", 0)),
139
140
  threshold_match_weight=None,
140
141
  sql_infinity_expression=linker._infinity_expression,
141
142
  )
@@ -97,24 +97,34 @@ def _add_logging_sink(
97
97
  Whether the logs should be converted to JSON before they're dumped
98
98
  to the logging sink.
99
99
  """
100
- message_format = (
101
- "<green>{time:YYYY-MM-DD HH:mm:ss.SSS}</green> | <green>{elapsed}</green> | "
102
- "<cyan>{function}</cyan>:<cyan>{line}</cyan> - <level>{message}</level>"
103
- )
100
+
101
def format_message(record):
    """Return the loguru format template for a single log record.

    The template has the shape ``<time> | <elapsed HH:MM:SS> | {message}``
    followed by a newline. The timestamp and elapsed time are rendered here;
    the message is left as a ``{message}`` placeholder. loguru formats the
    string returned by a callable ``format`` against the record, so
    interpolating the message text directly would make any ``{``/``}`` inside
    a log message be treated as a format field.

    Parameters
    ----------
    record
        The loguru record; ``record["elapsed"]`` and ``record["time"]`` are read.

    Returns
    -------
    str
        The format template, wrapped in ANSI green codes around the time and
        elapsed fields when the enclosing ``colorize`` flag is truthy.
    """
    elapsed_seconds = int(record["elapsed"].total_seconds())
    hours, remainder = divmod(elapsed_seconds, 3600)
    minutes, seconds = divmod(remainder, 60)
    elapsed_str = f"{hours:02d}:{minutes:02d}:{seconds:02d}"
    time_str = record["time"].strftime("%Y-%m-%d %H:%M:%S")

    # NOTE: the template has no exception field, so tracebacks attached to a
    # record are not rendered by this formatter.
    if colorize:
        return f"\033[32m{time_str}\033[0m | \033[32m{elapsed_str}\033[0m | {{message}}\n"
    else:
        return f"{time_str} | {elapsed_str} | {{message}}\n"
113
+
104
114
  if verbose == 0:
105
115
  logger.add(
106
116
  sink,
107
- colorize=colorize,
117
+ colorize=False, # We handle colors in format_message
108
118
  level="INFO",
109
- format=message_format,
119
+ format=format_message,
110
120
  serialize=serialize,
111
121
  )
112
122
  elif verbose >= 1:
113
123
  logger.add(
114
124
  sink,
115
- colorize=colorize,
125
+ colorize=False, # We handle colors in format_message
116
126
  level="DEBUG",
117
- format=message_format,
127
+ format=format_message,
118
128
  serialize=serialize,
119
129
  )
120
130
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: easylink
3
- Version: 0.1.23
3
+ Version: 0.1.25
4
4
  Summary: Research repository for the EasyLink ER ecosystem project.
5
5
  Home-page: https://github.com/ihmeuw/easylink
6
6
  Author: The EasyLink developers
@@ -78,34 +78,50 @@ Installation
78
78
 
79
79
  .. _installation:
80
80
 
81
+ **NOTE: This package requires AMD64 CPU architecture - it is not compatible with
82
+ Apple's ARM64 architecture (e.g. M1 and newer Macs).**
83
+
81
84
  There are a few things to install in order to use this package:
82
85
 
83
- - Install singularity.
86
+ - Set up Linux.
87
+
88
+ Singularity (and thus EasyLink) requires Linux to run. If you are not already
89
+ using Linux, you will need to set up a virtual machine; refer to the
90
+ `Singularity documentation for installing on Windows or Mac <https://docs.sylabs.io/guides/4.1/admin-guide/installation.html#installation-on-windows-or-mac>`_.
84
91
 
85
- You may need to request it from your system admin.
86
- Refer to https://docs.sylabs.io/guides/4.1/admin-guide/installation.html.
87
- You can check if you already have singularity installed by running the command
88
- ``singularity --version``. For an existing installation, your singularity version
92
+ - Install Singularity.
93
+
94
+ First check if you already have Singularity installed by running the command
95
+ ``singularity --version``. For an existing installation, your Singularity version
89
96
  number is printed.
90
97
 
98
+ If Singularity is not yet installed, you will need to install it;
99
+ refer to the `Singularity docs for installing on Linux <https://docs.sylabs.io/guides/4.1/admin-guide/installation.html#installation-on-linux>`_.
100
+
101
+ Note that this requires administrator privileges; you may need to request installation
102
+ from your system admin if you are working in a shared computing environment.
103
+
91
104
  - Install conda.
92
105
 
93
106
  We recommend `miniforge <https://github.com/conda-forge/miniforge>`_. You can
94
107
  check if you already have conda installed by running the command ``conda --version``.
95
108
  For an existing installation, a version will be displayed.
96
109
 
97
- - Install easylink, python and graphviz in a conda environment.
110
+ - Create a conda environment with python and graphviz installed.
111
+
112
+ ::
113
+
114
+ $ conda create --name easylink -c conda-forge python=3.12 graphviz 'gcc<14' -y
115
+ $ conda activate easylink
116
+
117
+ - Install easylink in the environment.
98
118
 
99
119
  Option 1 - Install from PyPI with pip::
100
120
 
101
- $ conda create --name easylink -c conda-forge python=3.12 graphviz 'gcc<14' -y
102
- $ conda activate easylink
103
121
  $ pip install easylink
104
122
 
105
123
  Option 2 - Build from source with pip::
106
124
 
107
- $ conda create --name easylink -c conda-forge python=3.12 graphviz 'gcc<14' -y
108
- $ conda activate easylink
109
125
  $ pip install git+https://github.com/ihmeuw/easylink.git
110
126
 
111
127
  .. _end_installation:
@@ -1,16 +1,16 @@
1
1
  easylink/__about__.py,sha256=2-oxCfu9t9yUJouLDwqYRZ0eii8kN25SxRzsawjWjho,440
2
2
  easylink/__init__.py,sha256=gGMcIVfiVnHtlDw5mZwhevcDb2wt-kuP6F64gnkFack,159
3
- easylink/_version.py,sha256=0byemO6n6WCv41u9vBG2AIsOkVbxLvok7puvwy8EhfU,23
4
- easylink/cli.py,sha256=zQO4lOVoZ3eVgPVWT2sCF4zNoKgiDJP9ReTh2Myr9jc,10307
3
+ easylink/_version.py,sha256=Ej7LsXg-6CASlaEHsZkUoLDpYEfHeFKdIeXMIM0esgA,23
4
+ easylink/cli.py,sha256=80_EVklOdX78fPqAinTpsfTfWUqMM4ghFaQcVgZG354,10496
5
5
  easylink/configuration.py,sha256=TPd3WbqUcJMJDPJuHeo5ZebvZPQrRyfm8-laad2sOFk,12681
6
6
  easylink/graph_components.py,sha256=zZDZXg5smReHO3ryQC4pao24wyKXzWDe6jS3C6fM2ak,13892
7
7
  easylink/implementation.py,sha256=lSF37g-aQYgdLI0lgFaDrBkp23te9O9B1V-CmmRtB-0,14514
8
- easylink/implementation_metadata.yaml,sha256=pKu_H9fLnTsS8E4wCnYRitumW1-zs7mfE3z66BAyO30,10848
8
+ easylink/implementation_metadata.yaml,sha256=u_E51gVzVzTuM19dMv7-p_0JV-A6j5dfUwJrxtAZDBQ,11805
9
9
  easylink/pipeline.py,sha256=NJyMInbOCjJ_5kRzzuK5AcupvC7ecd5qLOC-f1Gy3Ss,17701
10
10
  easylink/pipeline_graph.py,sha256=jtjS7_2IVa189U8cL621600erC_0pa6BKPRRO8VqwiU,23760
11
11
  easylink/pipeline_schema.py,sha256=sj6YQqMFUS35RRFWdlrhPlud68d3Rb5y7iKfAEZy1qs,6877
12
- easylink/rule.py,sha256=MM7WyW56J7zT2FVjHlFtjuz62PfdSBGTD3MNcpLfEZM,16598
13
- easylink/runner.py,sha256=Z9GKgiUAWtp0dW1cyAp86MGthIDeABJtHDXUtzv0-kE,6714
12
+ easylink/rule.py,sha256=n8r4NL7MiNzppl39MRbEMdHEpn_e_XS7LfrsJD6KNfA,16629
13
+ easylink/runner.py,sha256=irMmrUME1B8BFTtQkCr-u-lHBDKaEll7IX_a4Q4AJNc,10576
14
14
  easylink/step.py,sha256=-vdFhPvwAZ3d69LMQGmSIVdcMG8E8ZtSvTE0UWif7zs,91088
15
15
  easylink/devtools/implementation_creator.py,sha256=gZZpfpiOOh912nNjR_d5wR0mBO5c09jWMS0FSYRcE1o,19120
16
16
  easylink/pipeline_schema_constants/__init__.py,sha256=njUL2v67WFC3dW_FjOXWRLSeOhInox_ZINLEji0_7V8,1523
@@ -72,20 +72,20 @@ easylink/steps/rl-dummy/pre-processing/dummy_pre-processing.py,sha256=I6kqG4e_H2
72
72
  easylink/steps/splink/splink_blocking_and_filtering.def,sha256=umWzxJhsfdi8g3TD-r2mKpjC-FPAMDk6-IERiWigdQc,557
73
73
  easylink/steps/splink/splink_blocking_and_filtering.py,sha256=FO8YJ2_KgCLpQoq5xsM339bTSN1DhCXCL8XT1pb5STY,5259
74
74
  easylink/steps/splink/splink_evaluating_pairs.def,sha256=DN3Ohy9qJOAyK58v164neP23HDVYpedMqzCu4eQh4Hg,521
75
- easylink/steps/splink/splink_evaluating_pairs.py,sha256=m-j1QMRSvPCiSoWVSV1kzzzsK1c_xG8nqYKMd3cj7kM,6195
75
+ easylink/steps/splink/splink_evaluating_pairs.py,sha256=vWUe3vQo9uGs0Cy8pG5PbolzsJX_cEaPS3Q0PMcBjcg,6253
76
76
  easylink/steps/splink/splink_links_to_clusters.def,sha256=RurvOYyGjNs9tx64DTXwI-GSgHD4T7SzDfhAH18pTEM,524
77
77
  easylink/steps/splink/splink_links_to_clusters.py,sha256=5Sw8yi0dVLuRB-trN2kXmxbHBR0VJBxYee6u4_usg2Y,1920
78
78
  easylink/utilities/__init__.py,sha256=0U33kbv4hoMfFQ_lh5hLwifxRPzOgkLkjKLYxmaK10g,196
79
79
  easylink/utilities/aggregator_utils.py,sha256=_DAHRAf9janbDsuj_jnAn5Dzz2s4R5Ni3YeytDpN9UE,954
80
80
  easylink/utilities/data_utils.py,sha256=XPRjq3qW_fN0xQ23Jms_xBzpTHbRwqZWDP1AW0nYkP0,6926
81
- easylink/utilities/general_utils.py,sha256=El1W0nn4P27sRBGotNQb-9du-Gbhk9ggSuu4vmGDfwo,4591
81
+ easylink/utilities/general_utils.py,sha256=MmuoV4T6PgyEDjbepC_1D3TGrq70Hp-hl-GaAYr5wRU,5033
82
82
  easylink/utilities/paths.py,sha256=9inDKMPorAaWh5vhjc3j1Tj_aXVKhLdodiJO9H1nNes,947
83
83
  easylink/utilities/spark.smk,sha256=kGtpem7LfQc71tMh5WAYaqKnHQKFvcdhPQSdumOP70k,5799
84
84
  easylink/utilities/splitter_utils.py,sha256=KXiVUYJ9TGxCQmrwos18pB1sxG_0Ay67qoDJT6vem2o,3828
85
85
  easylink/utilities/validation_utils.py,sha256=DBJB2TLVklgYw1WaaPse9vqtwPLMGmZNYM2cbCZsoHI,18417
86
- easylink-0.1.23.dist-info/licenses/LICENSE,sha256=z6NBo3hY2Pbh2Wjm9IcHLyQoG-kd816uASh8PbwhTcQ,1530
87
- easylink-0.1.23.dist-info/METADATA,sha256=u-oRyBse4M0AsFkMjTuy0JCpul-BwHJ1JaD9fIALrHU,3565
88
- easylink-0.1.23.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
89
- easylink-0.1.23.dist-info/entry_points.txt,sha256=OGMZDFltg3yMboT7XjJt3joiPhRfV_7jnREVtrAIQNU,51
90
- easylink-0.1.23.dist-info/top_level.txt,sha256=oHcOpcF_jDMWFiJRzfGQvuskENGDjSPC_Agu9Z_Xvik,9
91
- easylink-0.1.23.dist-info/RECORD,,
86
+ easylink-0.1.25.dist-info/licenses/LICENSE,sha256=z6NBo3hY2Pbh2Wjm9IcHLyQoG-kd816uASh8PbwhTcQ,1530
87
+ easylink-0.1.25.dist-info/METADATA,sha256=agOx4R08pqrpwjWmoSAmhU33gmHZ5QhDt9UHVRbnkHI,4219
88
+ easylink-0.1.25.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
89
+ easylink-0.1.25.dist-info/entry_points.txt,sha256=OGMZDFltg3yMboT7XjJt3joiPhRfV_7jnREVtrAIQNU,51
90
+ easylink-0.1.25.dist-info/top_level.txt,sha256=oHcOpcF_jDMWFiJRzfGQvuskENGDjSPC_Agu9Z_Xvik,9
91
+ easylink-0.1.25.dist-info/RECORD,,