easylink 0.1.25__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
easylink/_version.py CHANGED
@@ -1 +1 @@
-__version__ = "0.1.25"
+__version__ = "0.2.0"
easylink/cli.py CHANGED
@@ -211,9 +211,14 @@ def run(
             images_dir=images,
             schema_name=schema,
         )
-    except SystemExit:
-        # Snakemake uses SystemExit for completion - log success and re-raise
-        logger.info("*** FINISHED ***")
+    except SystemExit as e:
+        # Snakemake uses SystemExit with exit code 0 for success, non-zero for failure
+        if e.code == 0:
+            logger.info("\033[32m*** FINISHED ***\033[0m")  # Green
+        else:
+            logger.error(
+                f"\033[31mERROR: Pipeline failed with exit code {e.code}\033[0m"
+            )  # Red
         raise
 
 
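A note on the change above: `SystemExit.code` carries the process exit status, which is what the new branch inspects. A minimal standalone sketch of the same pattern (hypothetical example, not EasyLink code):

```python
# Minimal sketch of branching on SystemExit.code (hypothetical example).
def run_pipeline() -> None:
    raise SystemExit(0)  # stand-in for a Snakemake run; try SystemExit(1) for failure

try:
    run_pipeline()
except SystemExit as e:
    # code is 0 on success and non-zero on failure; note that sys.exit()
    # with no argument raises SystemExit(None), which also means success
    if e.code == 0:
        print("\033[32m*** FINISHED ***\033[0m")  # green ANSI escape
    else:
        print(f"\033[31mERROR: Pipeline failed with exit code {e.code}\033[0m")  # red
    raise  # re-raise so the process still exits with the original status
```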
easylink/implementation_metadata.yaml CHANGED
@@ -2,7 +2,7 @@ step_1_python_pandas:
   steps:
     - step_1
   image_name: python_pandas.sif
-  zenodo_record_id: 15757317
+  zenodo_record_id: 15778354
   md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
   script_cmd: python /dummy_step.py
   outputs:
@@ -11,7 +11,7 @@ step_1a_python_pandas:
   steps:
     - step_1a
   image_name: python_pandas.sif
-  zenodo_record_id: 15757317
+  zenodo_record_id: 15778354
   md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
   script_cmd: python /dummy_step.py
   env:
@@ -22,7 +22,7 @@ step_1b_python_pandas:
   steps:
     - step_1b
   image_name: python_pandas.sif
-  zenodo_record_id: 15757317
+  zenodo_record_id: 15778354
   md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
   script_cmd: python /dummy_step.py
   env:
@@ -33,7 +33,7 @@ step_2_python_pandas:
   steps:
     - step_2
   image_name: python_pandas.sif
-  zenodo_record_id: 15757317
+  zenodo_record_id: 15778354
   md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
   script_cmd: python /dummy_step.py
   outputs:
@@ -42,7 +42,7 @@ step_3_python_pandas:
   steps:
     - step_3
   image_name: python_pandas.sif
-  zenodo_record_id: 15757317
+  zenodo_record_id: 15778354
   md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
   script_cmd: python /dummy_step.py
   outputs:
@@ -51,7 +51,7 @@ step_4_python_pandas:
   steps:
     - step_4
   image_name: python_pandas.sif
-  zenodo_record_id: 15757317
+  zenodo_record_id: 15778354
   md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
   script_cmd: python /dummy_step.py
   env:
@@ -62,7 +62,7 @@ step_5_python_pandas:
   steps:
     - step_5
   image_name: python_pandas.sif
-  zenodo_record_id: 15757317
+  zenodo_record_id: 15778354
   md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
   script_cmd: python /dummy_step.py
   env:
@@ -73,7 +73,7 @@ step_6_python_pandas:
   steps:
     - step_6
   image_name: python_pandas.sif
-  zenodo_record_id: 15757317
+  zenodo_record_id: 15778354
   md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
   script_cmd: python /dummy_step.py
   env:
@@ -84,7 +84,7 @@ step_4a_python_pandas:
   steps:
     - step_4a
   image_name: python_pandas.sif
-  zenodo_record_id: 15757317
+  zenodo_record_id: 15778354
   md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
   script_cmd: python /dummy_step.py
   env:
@@ -95,7 +95,7 @@ step_4b_python_pandas:
   steps:
     - step_4b
   image_name: python_pandas.sif
-  zenodo_record_id: 15757317
+  zenodo_record_id: 15778354
   md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
   script_cmd: python /dummy_step.py
   env:
@@ -106,7 +106,7 @@ step_4b_r:
   steps:
     - step_4b
   image_name: r-image.sif
-  zenodo_record_id: 15757317
+  zenodo_record_id: 15778354
   md5_checksum: a4a03b836694a2b81a1bd2852736ccc5
   script_cmd: Rscript /dummy_step.R
   env:
@@ -117,7 +117,7 @@ step_1_python_pyspark:
   steps:
     - step_1
   image_name: python_pyspark.sif
-  zenodo_record_id: 15757317
+  zenodo_record_id: 15778354
   md5_checksum: c948577ab0607411dd4b640622d9ec3a
   script_cmd: python3 /code/dummy_step.py
   outputs:
@@ -127,7 +127,7 @@ step_2_python_pyspark:
   steps:
     - step_2
   image_name: python_pyspark.sif
-  zenodo_record_id: 15757317
+  zenodo_record_id: 15778354
   md5_checksum: c948577ab0607411dd4b640622d9ec3a
   script_cmd: python3 /code/dummy_step.py
   outputs:
@@ -137,7 +137,7 @@ step_3_python_pyspark:
   steps:
     - step_3
   image_name: python_pyspark.sif
-  zenodo_record_id: 15757317
+  zenodo_record_id: 15778354
   md5_checksum: c948577ab0607411dd4b640622d9ec3a
   script_cmd: python3 /code/dummy_step.py
   outputs:
@@ -147,7 +147,7 @@ step_4_python_pyspark:
   steps:
     - step_4
   image_name: python_pyspark.sif
-  zenodo_record_id: 15757317
+  zenodo_record_id: 15778354
   md5_checksum: c948577ab0607411dd4b640622d9ec3a
   script_cmd: python3 /code/dummy_step.py
   env:
@@ -158,7 +158,7 @@ step_1_r:
   steps:
     - step_1
   image_name: r-image.sif
-  zenodo_record_id: 15757317
+  zenodo_record_id: 15778354
   md5_checksum: a4a03b836694a2b81a1bd2852736ccc5
   script_cmd: Rscript /dummy_step.R
   outputs:
@@ -168,7 +168,7 @@ step_2_r:
   steps:
     - step_2
   image_name: r-image.sif
-  zenodo_record_id: 15757317
+  zenodo_record_id: 15778354
   md5_checksum: a4a03b836694a2b81a1bd2852736ccc5
   script_cmd: Rscript /dummy_step.R
   outputs:
@@ -178,7 +178,7 @@ step_3_r:
   steps:
     - step_3
   image_name: r-image.sif
-  zenodo_record_id: 15757317
+  zenodo_record_id: 15778354
   md5_checksum: a4a03b836694a2b81a1bd2852736ccc5
   script_cmd: Rscript /dummy_step.R
   outputs:
@@ -188,7 +188,7 @@ step_4_r:
   steps:
     - step_4
   image_name: r-image.sif
-  zenodo_record_id: 15757317
+  zenodo_record_id: 15778354
   md5_checksum: a4a03b836694a2b81a1bd2852736ccc5
   script_cmd: Rscript /dummy_step.R
   env:
@@ -201,7 +201,7 @@ step_1_and_step_2_combined_python_pandas:
     - step_1
     - step_2
   image_name: python_pandas.sif
-  zenodo_record_id: 15757317
+  zenodo_record_id: 15778354
   md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
   script_cmd: python /dummy_step.py
   outputs:
@@ -211,7 +211,7 @@ step_1_and_step_2_parallel_python_pandas:
     - step_1
     - step_2
   image_name: python_pandas.sif
-  zenodo_record_id: 15757317
+  zenodo_record_id: 15778354
   md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
   script_cmd: python /dummy_step.py
   env:
@@ -223,7 +223,7 @@ step_3_and_step_4_combined_python_pandas:
     - step_3
     - step_4
   image_name: python_pandas.sif
-  zenodo_record_id: 15757317
+  zenodo_record_id: 15778354
   md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
   script_cmd: python /dummy_step.py
   outputs:
@@ -233,7 +233,7 @@ step_1a_and_step_1b_combined_python_pandas:
     - step_1a
     - step_1b
   image_name: python_pandas.sif
-  zenodo_record_id: 15757317
+  zenodo_record_id: 15778354
   md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
   script_cmd: python /dummy_step.py
   outputs:
@@ -261,8 +261,8 @@ default_removing_records:
   steps:
     - removing_records
   image_name: default_removing_records.sif
-  zenodo_record_id: 15757317
-  md5_checksum: 85dba6fd73c9f8f504fddb6d5c30f2de
+  zenodo_record_id: 15778354
+  md5_checksum: 05123136e756bfa57f1d7d5a3315f2f6
   script_cmd: python /default_removing_records.py
   outputs:
     dataset: dataset
@@ -270,7 +270,7 @@ default_clusters_to_links:
   steps:
     - clusters_to_links
   image_name: default_clusters_to_links.sif
-  zenodo_record_id: 15757317
+  zenodo_record_id: 15778354
   md5_checksum: 0d00d1272bd8193f60727791097aa065
   script_cmd: python /default_clusters_to_links.py
   outputs:
@@ -279,8 +279,8 @@ default_determining_exclusions:
   steps:
     - determining_exclusions
   image_name: default_determining_exclusions.sif
-  zenodo_record_id: 15757317
-  md5_checksum: e61cb32ad45b79ca9a2c36db4e76ef7e
+  zenodo_record_id: 15778354
+  md5_checksum: f4e9f740d8dd7599bfbb2b9eb54ced38
   script_cmd: python /default_determining_exclusions.py
   outputs:
     ids_to_remove: result.parquet
@@ -288,7 +288,7 @@ default_updating_clusters:
   steps:
     - updating_clusters
   image_name: default_updating_clusters.sif
-  zenodo_record_id: 15757317
+  zenodo_record_id: 15778354
   md5_checksum: cc6bd29e099c2523347fa04545aa35c9
   script_cmd: python /default_updating_clusters.py
   outputs:
@@ -299,7 +299,7 @@ save_clusters:
   steps:
     - canonicalizing_and_downstream_analysis
   image_name: save_clusters.sif
-  zenodo_record_id: 15757317
+  zenodo_record_id: 15778354
   md5_checksum: 384ab2be668cbadc45160a674f621022
   script_cmd: python /dummy_canonicalizing_and_downstream_analysis.py
   outputs:
@@ -310,7 +310,7 @@ no_pre-processing:
   steps:
     - pre-processing
   image_name: no_pre-processing.sif
-  zenodo_record_id: 15757317
+  zenodo_record_id: 15778354
   md5_checksum: 9a9c080cf145078152501cf96bf61f27
   script_cmd: python /dummy_pre-processing.py
   outputs:
@@ -319,7 +319,7 @@ default_schema_alignment:
   steps:
     - schema_alignment
   image_name: default_schema_alignment.sif
-  zenodo_record_id: 15757317
+  zenodo_record_id: 15778354
   md5_checksum: 3166587f9cfec478b999a17074d628f7
   script_cmd: python /default_schema_alignment.py
   outputs:
@@ -328,8 +328,8 @@ splink_blocking_and_filtering:
   steps:
     - blocking_and_filtering
   image_name: splink_blocking_and_filtering.sif
-  zenodo_record_id: 15757317
-  md5_checksum: 8a365b90295ef6beaad2b7f80a03d768
+  zenodo_record_id: 15778354
+  md5_checksum: 3f8777c5751d7550762be078d87e7db2
   script_cmd: python /splink_blocking_and_filtering.py
   outputs:
     blocks: blocks
@@ -337,7 +337,7 @@ splink_evaluating_pairs:
   steps:
     - evaluating_pairs
   image_name: splink_evaluating_pairs.sif
-  zenodo_record_id: 15757317
+  zenodo_record_id: 15778354
   md5_checksum: b57f4bd16b7a3aa5099569078ea4c064
   script_cmd: python /splink_evaluating_pairs.py
   outputs:
@@ -346,8 +346,8 @@ splink_links_to_clusters:
   steps:
     - links_to_clusters
   image_name: splink_links_to_clusters.sif
-  zenodo_record_id: 15757317
-  md5_checksum: 645937f7bab9c2557b7aacafaf4e4765
+  zenodo_record_id: 15778354
+  md5_checksum: 81a71aa2ce6544953f3edb88d4ee6ec1
   script_cmd: python /splink_links_to_clusters.py
   outputs:
     clusters: result.parquet
@@ -369,6 +369,8 @@ exclude_clustered:
   steps:
     - determining_exclusions
   image_name: exclude_clustered.sif
+  zenodo_record_id: 15778354
+  md5_checksum: db51f68ea24d114ed2b83a1382b6e6b6
   script_cmd: python /exclude_clustered.py
   outputs:
     ids_to_remove: result.parquet
@@ -376,6 +378,8 @@ exclude_none:
   steps:
     - determining_exclusions
   image_name: exclude_none.sif
+  zenodo_record_id: 15778354
+  md5_checksum: af12b6dde2aace9dab08d352368b16a1
   script_cmd: python /exclude_none.py
   outputs:
     ids_to_remove: result.parquet
@@ -383,6 +387,8 @@ update_clusters_by_connected_components:
   steps:
     - updating_clusters
   image_name: update_clusters_by_connected_components.sif
+  zenodo_record_id: 15778354
+  md5_checksum: 806b0fe86a3306d74391678ed951b054
   script_cmd: python /update_clusters_by_connected_components.py
   outputs:
     clusters: result.parquet
@@ -390,8 +396,26 @@ middle_name_to_initial:
   steps:
     - pre-processing
   image_name: middle_name_to_initial.sif
-  zenodo_record_id: 15757317
+  zenodo_record_id: 15778354
   md5_checksum: 89db9c3318300cda9d538cde08c3c323
   script_cmd: python /middle_name_to_initial.py
   outputs:
     dataset: dataset
+one_to_many_links_to_clusters:
+  steps:
+    - links_to_clusters
+  image_name: one_to_many_links_to_clusters.sif
+  zenodo_record_id: 15778354
+  md5_checksum: 0bf6f0b8663b4c2f99995a2213dc541a
+  script_cmd: python /one_to_many_links_to_clusters.py
+  outputs:
+    clusters: result.parquet
+accept_all_pairs:
+  steps:
+    - evaluating_pairs
+  image_name: accept_all_pairs.sif
+  zenodo_record_id: 15778354
+  md5_checksum: c71c88d159c3d7343ebc39cd37224bd9
+  script_cmd: python /accept_all_pairs.py
+  outputs:
+    links: result.parquet
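Every entry above moves from Zenodo record 15757317 to 15778354, and each image stays pinned by an md5_checksum. As a hedged illustration of what that checksum enables (this is not EasyLink's actual download code), verifying a local .sif file could look like:

```python
# Hypothetical sketch: verify a downloaded .sif image against the
# md5_checksum recorded in implementation_metadata.yaml.
import hashlib
from pathlib import Path

def md5_matches(path: Path, expected_md5: str) -> bool:
    digest = hashlib.md5()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):  # read in 1 MiB chunks
            digest.update(chunk)
    return digest.hexdigest() == expected_md5

# e.g. md5_matches(Path("python_pandas.sif"), "9177b8e168fcc9cae91bf61265f2185c")
```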
easylink/runner.py CHANGED
@@ -11,6 +11,8 @@ be called from the ``easylink.cli`` module.
 import os
 import socket
 import subprocess
+import threading
+import time
 from contextlib import redirect_stderr, redirect_stdout
 from pathlib import Path
 
@@ -161,6 +163,32 @@ def _run_snakemake_with_filtered_output(argv: list[str], results_dir: Path) -> None:
         def __init__(self, log_file_path: Path):
             self.log_file = open(log_file_path, "w")
             self.buffer = ""
+            self.last_output_time = time.time()
+            self.heartbeat_timer = None
+            self.dots_printed = False  # Track if we've printed progress dots
+            self._start_heartbeat()
+
+        def _start_heartbeat(self):
+            """Start a timer that prints progress dots during long-running containers."""
+
+            def heartbeat():
+                current_time = time.time()
+                if current_time - self.last_output_time > 30:  # 30 seconds since last output
+                    # Print a dot to show progress - use original stdout if available
+                    if hasattr(self, "original_stdout") and self.original_stdout:
+                        self.original_stdout.write(".")
+                        self.original_stdout.flush()
+                        self.dots_printed = True  # Mark that we've printed dots
+                    self.last_output_time = current_time
+                # Schedule next heartbeat
+                self.heartbeat_timer = threading.Timer(30.0, heartbeat)
+                self.heartbeat_timer.daemon = True
+                self.heartbeat_timer.start()
+
+            # Start first heartbeat after 30 seconds
+            self.heartbeat_timer = threading.Timer(30.0, heartbeat)
+            self.heartbeat_timer.daemon = True
+            self.heartbeat_timer.start()
 
         def write(self, text: str) -> int:
             # Write to log file
@@ -172,9 +200,19 @@ def _run_snakemake_with_filtered_output(argv: list[str], results_dir: Path) -> None:
             while "\n" in self.buffer:
                 line, self.buffer = self.buffer.split("\n", 1)
                 if line.strip():
-                    filtered_line = _filter_snakemake_output_simple(line.strip())
+                    filtered_line = _filter_snakemake_output(line.strip())
                     if filtered_line:
+                        # Add newline after dots if we've printed any
+                        if (
+                            self.dots_printed
+                            and hasattr(self, "original_stdout")
+                            and self.original_stdout
+                        ):
+                            self.original_stdout.write("\n")
+                            self.original_stdout.flush()
+                            self.dots_printed = False  # Reset the flag
                         logger.info(filtered_line)
+                        self.last_output_time = time.time()  # Reset heartbeat timer
 
             return len(text)
 
@@ -182,10 +220,23 @@ def _run_snakemake_with_filtered_output(argv: list[str], results_dir: Path) -> None:
             self.log_file.flush()
 
         def close(self):
+            # Stop heartbeat timer
+            if self.heartbeat_timer:
+                self.heartbeat_timer.cancel()
+
             # Process and log any remaining buffer content
             if self.buffer.strip():
-                filtered_line = _filter_snakemake_output_simple(self.buffer.strip())
+                filtered_line = _filter_snakemake_output(self.buffer.strip())
                 if filtered_line:
+                    # Add newline after dots if we've printed any
+                    if (
+                        self.dots_printed
+                        and hasattr(self, "original_stdout")
+                        and self.original_stdout
+                    ):
+                        self.original_stdout.write("\n")
+                        self.original_stdout.flush()
+                        self.dots_printed = False
                     logger.info(filtered_line)
             self.log_file.close()
 
@@ -196,7 +247,14 @@ def _run_snakemake_with_filtered_output(argv: list[str], results_dir: Path) -> None:
             self.close()
 
     # Create the filtering output handler and ensure the log file is always closed
+    # Save original stdout for progress dots before redirection
+    import sys
+
+    original_stdout = sys.stdout
+
     with FilteringOutput(snakemake_log_file) as filtering_output:
+        # Pass original stdout to filtering output for progress dots
+        filtering_output.original_stdout = original_stdout
         try:
             # Redirect both stdout and stderr to our filtering handler
             with redirect_stdout(filtering_output), redirect_stderr(filtering_output):
@@ -210,9 +268,8 @@ def _run_snakemake_with_filtered_output(argv: list[str], results_dir: Path) -> None:
             raise
 
 
-def _filter_snakemake_output_simple(line: str) -> str | None:
-    """
-    Simple filter for Snakemake output showing only localrules and Job messages.
+def _filter_snakemake_output(line: str) -> str:
+    """Filter for Snakemake output.
 
     Parameters
     ----------
@@ -221,12 +278,11 @@ def _filter_snakemake_output_simple(line: str) -> str | None:
 
     Returns
     -------
-    str or None
-        The filtered line for display, or None to suppress the line.
+        The filtered line for display.
     """
     # Skip empty lines
     if not line.strip():
-        return None
+        return ""
 
     if line.startswith("localrule "):
         # Show localrule names (without the "localrule" prefix)
@@ -236,10 +292,10 @@
         # Show Job messages
         # Extract everything after "Job ##: "
         parts = line.split(":", 1)
-        filtered_line = parts[1].strip() if len(parts) > 1 else None
+        filtered_line = parts[1].strip() if len(parts) > 1 else ""
     else:
         # Suppress everything else
-        filtered_line = None
+        filtered_line = ""
     return filtered_line
 
 
easylink/steps/cascading/accept_all_pairs.def ADDED
@@ -0,0 +1,22 @@
+
+Bootstrap: docker
+From: python@sha256:1c26c25390307b64e8ff73e7edf34b4fbeac59d41da41c08da28dc316a721899
+
+%files
+    ./accept_all_pairs.py /accept_all_pairs.py
+
+%post
+    # Create directories
+    mkdir -p /input_data
+    mkdir -p /extra_implementation_specific_input_data
+    mkdir -p /results
+    mkdir -p /diagnostics
+
+    # Install Python packages with specific versions
+    pip install pandas pyarrow
+
+%environment
+    export LC_ALL=C
+
+%runscript
+    python /accept_all_pairs.py '$@'
easylink/steps/cascading/accept_all_pairs.py ADDED
@@ -0,0 +1,26 @@
+# STEP_NAME: evaluating_pairs
+# REQUIREMENTS: pandas pyarrow
+
+import os
+from pathlib import Path
+
+import pandas as pd
+
+blocks_dir = Path(os.environ["BLOCKS_DIR_PATH"])
+diagnostics_dir = Path(os.environ["DIAGNOSTICS_DIRECTORY"])
+output_path = Path(os.environ["OUTPUT_PATHS"])
+Path(output_path).parent.mkdir(exist_ok=True, parents=True)
+
+all_predictions = []
+
+for block_dir in blocks_dir.iterdir():
+    if str(block_dir.stem).startswith("."):
+        continue
+
+    pairs = pd.read_parquet(block_dir / "pairs.parquet")
+
+    all_predictions.append(pairs.assign(Probability=1.0))
+
+all_predictions = pd.concat(all_predictions, ignore_index=True)
+print(all_predictions)
+all_predictions.to_parquet(output_path)
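The new script is driven entirely by environment variables. A hypothetical local invocation outside the container (paths are made up; inside the image, EasyLink sets these variables):

```python
# Hypothetical local run of accept_all_pairs.py (illustrative paths only).
import os
import subprocess

env = {
    **os.environ,
    "BLOCKS_DIR_PATH": "/tmp/blocks",  # dir of per-block subdirs, each with pairs.parquet
    "DIAGNOSTICS_DIRECTORY": "/tmp/diagnostics",
    "OUTPUT_PATHS": "/tmp/results/result.parquet",
}
subprocess.run(["python", "accept_all_pairs.py"], env=env, check=True)
```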
easylink/steps/cascading/exclude_clustered.py CHANGED
@@ -62,12 +62,21 @@ clusters_filepath = clusters_filepaths[0]
 
 # Exclude records that have been clustered
 clusters_df = load_file(clusters_filepath)
+# NOTE: We defined "clustered" for these purposes as clustered *with* anything else.
+# Simply putting a record into its own cluster does not indicate to us that it has
+# been sufficiently clustered to ignore.
+cluster_sizes = clusters_df.groupby("Cluster ID").size()
+clusters_df["size"] = cluster_sizes.loc[clusters_df["Cluster ID"]].values
+clusters_df = clusters_df[clusters_df["size"] > 1]
+
 dataset_df = load_file(dataset_path)
 clustered_record_ids = set(dataset_df["Record ID"].unique()) & set(
-    clusters_df["Input Record ID"].unique()
+    clusters_df[clusters_df["Input Record Dataset"] == splitter_choice][
+        "Input Record ID"
+    ].unique()
 )
 
-IDS_TO_REMOVE = pd.DataFrame({"Record ID": list(clustered_record_ids)})
+IDS_TO_REMOVE = pd.DataFrame({"Input Record ID": list(clustered_record_ids)})
 
 # OUTPUT_PATHS is a single path to a file (results.parquet)
 results_filepath = os.environ["OUTPUT_PATHS"]
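A toy illustration (made-up data) of the size-filtering step added above: singleton clusters are kept out of the "already clustered" set.

```python
# Records in singleton clusters do not count as "clustered".
import pandas as pd

clusters_df = pd.DataFrame(
    {
        "Input Record Dataset": ["a", "a", "b"],
        "Input Record ID": [1, 2, 3],
        "Cluster ID": [10, 10, 11],  # cluster 11 is a singleton
    }
)
cluster_sizes = clusters_df.groupby("Cluster ID").size()
clusters_df["size"] = cluster_sizes.loc[clusters_df["Cluster ID"]].values
clusters_df = clusters_df[clusters_df["size"] > 1]
print(clusters_df)  # only the two records in cluster 10 remain
```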
easylink/steps/cascading/exclude_none.py CHANGED
@@ -67,7 +67,7 @@ clusters_df = load_file(clusters_filepath)
 
 # SAVE OUTPUTS
 
-IDS_TO_REMOVE = pd.DataFrame(columns=["Record ID"])
+IDS_TO_REMOVE = pd.DataFrame(columns=["Input Record ID"])
 
 # OUTPUT_PATHS is a single path to a file (results.parquet)
 results_filepath = os.environ["OUTPUT_PATHS"]
easylink/steps/cascading/one_to_many_links_to_clusters.def ADDED
@@ -0,0 +1,22 @@
+
+Bootstrap: docker
+From: python@sha256:1c26c25390307b64e8ff73e7edf34b4fbeac59d41da41c08da28dc316a721899
+
+%files
+    ./one_to_many_links_to_clusters.py /one_to_many_links_to_clusters.py
+
+%post
+    # Create directories
+    mkdir -p /input_data
+    mkdir -p /extra_implementation_specific_input_data
+    mkdir -p /results
+    mkdir -p /diagnostics
+
+    # Install Python packages with specific versions
+    pip install pandas pyarrow networkx
+
+%environment
+    export LC_ALL=C
+
+%runscript
+    python /one_to_many_links_to_clusters.py '$@'
easylink/steps/cascading/one_to_many_links_to_clusters.py ADDED
@@ -0,0 +1,109 @@
+# STEP_NAME: links_to_clusters
+# REQUIREMENTS: pandas pyarrow networkx
+
+import os
+from pathlib import Path
+
+import networkx as nx
+import pandas as pd
+
+links = pd.read_parquet(os.environ["LINKS_FILE_PATH"])
+output_path = Path(os.environ["OUTPUT_PATHS"])
+
+no_duplicates_dataset = os.environ["NO_DUPLICATES_DATASET"]
+break_ties_method = os.getenv("BREAK_TIES_METHOD", "drop")
+
+left_no_duplicates_dataset = links["Left Record Dataset"] == no_duplicates_dataset
+right_no_duplicates_dataset = links["Right Record Dataset"] == no_duplicates_dataset
+
+if (left_no_duplicates_dataset & right_no_duplicates_dataset).any():
+    raise ValueError(
+        f"Provided links include links within the no_duplicates_dataset ({no_duplicates_dataset})"
+    )
+
+if not (left_no_duplicates_dataset | right_no_duplicates_dataset).all():
+    raise ValueError(
+        f"Provided links include links that don't involve the no_duplicates_dataset ({no_duplicates_dataset})"
+    )
+
+# Get the no-duplicates dataset all on the right
+id_cols = [
+    "Left Record Dataset",
+    "Left Record ID",
+    "Right Record Dataset",
+    "Right Record ID",
+]
+switched_id_cols = [
+    "Right Record Dataset",
+    "Right Record ID",
+    "Left Record Dataset",
+    "Left Record ID",
+]
+links.loc[left_no_duplicates_dataset, id_cols] = links.loc[
+    left_no_duplicates_dataset, switched_id_cols
+].to_numpy()
+links[["Left Record ID", "Right Record ID"]] = links[
+    ["Left Record ID", "Right Record ID"]
+].astype(int)
+
+links["Left Record Key"] = (
+    links["Left Record Dataset"] + "-__-" + links["Left Record ID"].astype(int).astype(str)
+)
+links["Right Record Key"] = (
+    links["Right Record Dataset"] + "-__-" + links["Right Record ID"].astype(int).astype(str)
+)
+
+links_to_accept = (
+    links[links["Probability"] >= float(os.environ["THRESHOLD_MATCH_PROBABILITY"])]
+    # Pre-emptively break probability ties by right record key for the highest_id method
+    .sort_values(["Probability", "Right Record Key"], ascending=False)
+    # No duplicates in the *right* means only one link per *left* record
+    .groupby(["Left Record Key"]).first()
+)
+
+if break_ties_method == "drop":
+    num_tied = (
+        links_to_accept.merge(links, on=["Left Record Key", "Probability"])
+        .groupby(["Left Record Key"])
+        .size()
+    )
+    print("Ties:")
+    print(num_tied)
+    print(num_tied.describe())
+    links_to_accept = links_to_accept[num_tied == 1]
+elif break_ties_method == "highest_id":
+    # Done above pre-emptively
+    pass
+else:
+    raise ValueError(f"Unknown break_ties_method {break_ties_method}")
+
+# NOTE: We only include nodes involved in an accepted link in our cluster.
+# If a node isn't involved in an accepted link, that could just represent
+# that we haven't evaluated the right pairs involving it, not confidence that
+# it is a singleton.
+G = nx.from_pandas_edgelist(
+    links_to_accept.reset_index()[["Left Record Key", "Right Record Key"]].rename(
+        columns={"Left Record Key": "source", "Right Record Key": "target"}
+    )
+)
+
+# Compute connected components
+components = list(nx.connected_components(G))
+
+# Assign new cluster IDs
+merged_data = []
+for cluster_id, records in enumerate(components, start=1):
+    for record_key in records:
+        merged_data.append((record_key, cluster_id))
+
+# Build the final DataFrame
+merged_df = pd.DataFrame(merged_data, columns=["Input Record Key", "Cluster ID"])
+
+merged_df[["Input Record Dataset", "Input Record ID"]] = (
+    merged_df["Input Record Key"].str.split("-__-", n=1, expand=True)
+    if not merged_df.empty
+    else pd.DataFrame(columns=["Input Record Dataset", "Input Record ID"])
+)
+merged_df["Input Record ID"] = merged_df["Input Record ID"].astype(int)
+
+merged_df[["Input Record Dataset", "Input Record ID", "Cluster ID"]].to_parquet(output_path)
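A toy run (made-up keys) of the tie-breaking logic in this new step: each left record keeps its single best link, and a probability tie is broken toward the higher Right Record Key under the "highest_id" method.

```python
# Toy illustration of sort-then-groupby-first tie-breaking (made-up data).
import pandas as pd

links = pd.DataFrame(
    {
        "Left Record Key": ["census-__-1", "census-__-1", "census-__-2"],
        "Right Record Key": ["ref-__-10", "ref-__-11", "ref-__-10"],
        "Probability": [0.9, 0.9, 0.8],  # left record 1 has a probability tie
    }
)
best = (
    links.sort_values(["Probability", "Right Record Key"], ascending=False)
    .groupby(["Left Record Key"])
    .first()
)
print(best)
# census-__-1 -> ref-__-11 (tie broken toward the higher key)
# census-__-2 -> ref-__-10
# Under BREAK_TIES_METHOD="drop", census-__-1 would instead be discarded,
# since its best probability is shared by two candidate links.
```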
easylink/steps/cascading/update_clusters_by_connected_components.py CHANGED
@@ -59,7 +59,18 @@ new_clusters_df = load_file(new_clusters_filepath)
 
 
 def merge_clusters(known_clusters_df, new_clusters_df):
     # Combine both dataframes
-    combined_df = pd.concat([known_clusters_df, new_clusters_df], ignore_index=True)
+    combined_df = pd.concat(
+        [
+            # Ensure cluster names are unique
+            known_clusters_df.assign(
+                **{"Cluster ID": lambda df: "known__" + df["Cluster ID"].astype(str)}
+            ),
+            new_clusters_df.assign(
+                **{"Cluster ID": lambda df: "new__" + df["Cluster ID"].astype(str)}
+            ),
+        ],
+        ignore_index=True,
+    )
     combined_df["Input Record Key"] = (
         combined_df["Input Record Dataset"]
         + "-__-"
@@ -92,9 +103,11 @@ def merge_clusters(known_clusters_df, new_clusters_df):
     # Build the final DataFrame
     merged_df = pd.DataFrame(merged_data, columns=["Input Record Key", "Cluster ID"])
 
-    merged_df[["Input Record Dataset", "Input Record ID"]] = merged_df[
-        "Input Record Key"
-    ].str.split("-__-", n=1, expand=True)
+    merged_df[["Input Record Dataset", "Input Record ID"]] = (
+        merged_df["Input Record Key"].str.split("-__-", n=1, expand=True)
+        if not merged_df.empty
+        else pd.DataFrame(columns=["Input Record Dataset", "Input Record ID"])
+    )
 
     merged_df["Input Record ID"] = merged_df["Input Record ID"].astype(int)
 
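A toy illustration (made-up data) of the collision the new "known__"/"new__" prefixes prevent: Cluster ID 1 in the known clusters and Cluster ID 1 in the new clusters are unrelated, and concatenating them unprefixed would silently merge the two groups.

```python
# Prefixing cluster IDs before concatenation keeps unrelated clusters distinct.
import pandas as pd

known = pd.DataFrame({"Input Record Key": ["a-__-1"], "Cluster ID": [1]})
new = pd.DataFrame({"Input Record Key": ["b-__-7"], "Cluster ID": [1]})

combined = pd.concat(
    [
        known.assign(**{"Cluster ID": lambda df: "known__" + df["Cluster ID"].astype(str)}),
        new.assign(**{"Cluster ID": lambda df: "new__" + df["Cluster ID"].astype(str)}),
    ],
    ignore_index=True,
)
print(combined["Cluster ID"].tolist())  # ['known__1', 'new__1'] -- no accidental merge
```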
easylink/steps/default/default_determining_exclusions.py CHANGED
@@ -72,7 +72,7 @@ if len(clusters_df) > 0:
 
 # SAVE OUTPUTS
 
-IDS_TO_REMOVE = pd.DataFrame(columns=["Record ID"])
+IDS_TO_REMOVE = pd.DataFrame(columns=["Input Record ID"])
 
 # OUTPUT_PATHS is a single path to a file (results.parquet)
 results_filepath = os.environ["OUTPUT_PATHS"]
easylink/steps/default/default_removing_records.py CHANGED
@@ -52,7 +52,7 @@ results_dir.mkdir(exist_ok=True, parents=True)
 dataset = load_file(dataset_path)
 ids_to_remove = load_file(ids_filepath)
 
-dataset = dataset[~dataset["Record ID"].isin(ids_to_remove)]
+dataset = dataset[~dataset["Record ID"].isin(ids_to_remove["Input Record ID"])]
 
 output_path = results_dir / Path(dataset_path).name
 logging.info(f"Writing output for dataset from input {dataset_path} to {output_path}")
easylink/steps/splink/splink_blocking_and_filtering.py CHANGED
@@ -90,12 +90,18 @@ blocked_pairs = (
     .drop(columns=["match_key"])
 )
 
-blocked_pairs[["Left Record Dataset", "Left Record ID"]] = blocked_pairs.pop(
-    "join_key_l"
-).str.split("-__-", n=1, expand=True)
-blocked_pairs[["Right Record Dataset", "Right Record ID"]] = blocked_pairs.pop(
-    "join_key_r"
-).str.split("-__-", n=1, expand=True)
+blocked_pairs[["Left Record Dataset", "Left Record ID"]] = (
+    blocked_pairs.pop("join_key_l").str.split("-__-", n=1, expand=True)
+    if not blocked_pairs.empty
+    else pd.DataFrame(columns=["Left Record Dataset", "Left Record ID"])
+)
+
+blocked_pairs[["Right Record Dataset", "Right Record ID"]] = (
+    blocked_pairs.pop("join_key_r").str.split("-__-", n=1, expand=True)
+    if not blocked_pairs.empty
+    else pd.DataFrame(columns=["Right Record Dataset", "Right Record ID"])
+)
+
 blocked_pairs[["Left Record ID", "Right Record ID"]] = blocked_pairs[
     ["Left Record ID", "Right Record ID"]
 ].astype(int)
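This guard, like the equivalent ones in the cascading steps, works around the same pandas behavior: `.str.split(..., expand=True)` cannot know how many columns to produce when the input is empty. A toy illustration (made-up data; column count on the empty case may vary by pandas version, but it is never the two columns needed):

```python
# The pitfall the empty-frame guards above avoid.
import pandas as pd

nonempty = pd.Series(["census-__-1"])
print(nonempty.str.split("-__-", n=1, expand=True).shape)  # (1, 2): two columns

empty = pd.Series([], dtype=str)
parts = empty.str.split("-__-", n=1, expand=True)
print(parts.shape)  # fewer than two columns, so a two-column assignment like
                    # df[["Left Record Dataset", "Left Record ID"]] = parts
                    # would raise a length-mismatch error
```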
easylink/steps/splink/splink_links_to_clusters.py CHANGED
@@ -53,6 +53,8 @@ cc = (
 # Split "Record Key" back into "Input Record Dataset" and "Input Record ID"
 cc[["Input Record Dataset", "Input Record ID"]] = (
     cc["Record Key"].astype(str).str.split("-__-", n=1, expand=True)
+    if not cc.empty
+    else pd.DataFrame(columns=["Input Record Dataset", "Input Record ID"])
 )
 cc = cc.drop(columns=["Record Key"])
 cc["Input Record ID"] = cc["Input Record ID"].astype(int)
easylink/utilities/validation_utils.py CHANGED
@@ -341,8 +341,8 @@ def _validate_pairs(df: pd.DataFrame, filepath: str) -> None:
 def validate_ids_to_remove(filepath: str) -> None:
     """Validates a file containing IDs to remove.
 
-    - The file must contain a single column: "Record ID".
-    - "Record ID" must have unique values.
+    - The file must contain a single column: "Input Record ID".
+    - "Input Record ID" must have unique values.
 
     Parameters
     ----------
@@ -352,13 +352,13 @@ def validate_ids_to_remove(filepath: str) -> None:
     Raises
     ------
     LookupError
-        If the file is missing the "Record ID" column.
+        If the file is missing the "Input Record ID" column.
     ValueError
-        If the "Record ID" column is not unique.
+        If the "Input Record ID" column is not unique.
     """
-    _validate_required_columns(filepath, {"Record ID"})
+    _validate_required_columns(filepath, {"Input Record ID"})
     df = _read_file(filepath)
-    _validate_unique_column(df, "Record ID", filepath)
+    _validate_unique_column(df, "Input Record ID", filepath)
 
 
 def validate_records(filepath: str) -> None:
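A toy check mirroring the renamed contract (made-up data, not the validator itself): an ids-to-remove file must now carry a unique "Input Record ID" column, matching the column the exclusion steps write.

```python
# The contract validate_ids_to_remove now enforces, in miniature.
import pandas as pd

ids_to_remove = pd.DataFrame({"Input Record ID": [11, 42, 7]})
assert "Input Record ID" in ids_to_remove.columns  # else LookupError
assert ids_to_remove["Input Record ID"].is_unique  # else ValueError
```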
easylink-0.1.25.dist-info/METADATA → easylink-0.2.0.dist-info/METADATA RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: easylink
-Version: 0.1.25
+Version: 0.2.0
 Summary: Research repository for the EasyLink ER ecosystem project.
 Home-page: https://github.com/ihmeuw/easylink
 Author: The EasyLink developers
easylink-0.1.25.dist-info/RECORD → easylink-0.2.0.dist-info/RECORD RENAMED
@@ -1,34 +1,38 @@
 easylink/__about__.py,sha256=2-oxCfu9t9yUJouLDwqYRZ0eii8kN25SxRzsawjWjho,440
 easylink/__init__.py,sha256=gGMcIVfiVnHtlDw5mZwhevcDb2wt-kuP6F64gnkFack,159
-easylink/_version.py,sha256=Ej7LsXg-6CASlaEHsZkUoLDpYEfHeFKdIeXMIM0esgA,23
-easylink/cli.py,sha256=80_EVklOdX78fPqAinTpsfTfWUqMM4ghFaQcVgZG354,10496
+easylink/_version.py,sha256=Zn1KFblwuFHiDRdRAiRnDBRkbPttWh44jKa5zG2ov0E,22
+easylink/cli.py,sha256=3Xoqclhn7mEHzuqyuVUjt67-V3Fqu0_Jr3B3lCdIuAg,10704
 easylink/configuration.py,sha256=TPd3WbqUcJMJDPJuHeo5ZebvZPQrRyfm8-laad2sOFk,12681
 easylink/graph_components.py,sha256=zZDZXg5smReHO3ryQC4pao24wyKXzWDe6jS3C6fM2ak,13892
 easylink/implementation.py,sha256=lSF37g-aQYgdLI0lgFaDrBkp23te9O9B1V-CmmRtB-0,14514
-easylink/implementation_metadata.yaml,sha256=u_E51gVzVzTuM19dMv7-p_0JV-A6j5dfUwJrxtAZDBQ,11805
+easylink/implementation_metadata.yaml,sha256=ahuSVk5Ur1V0F8EsLZO5apkNC2bWv2RsytNaiWGo9Yk,12562
 easylink/pipeline.py,sha256=NJyMInbOCjJ_5kRzzuK5AcupvC7ecd5qLOC-f1Gy3Ss,17701
 easylink/pipeline_graph.py,sha256=jtjS7_2IVa189U8cL621600erC_0pa6BKPRRO8VqwiU,23760
 easylink/pipeline_schema.py,sha256=sj6YQqMFUS35RRFWdlrhPlud68d3Rb5y7iKfAEZy1qs,6877
 easylink/rule.py,sha256=n8r4NL7MiNzppl39MRbEMdHEpn_e_XS7LfrsJD6KNfA,16629
-easylink/runner.py,sha256=irMmrUME1B8BFTtQkCr-u-lHBDKaEll7IX_a4Q4AJNc,10576
+easylink/runner.py,sha256=h39MbWHgTs-VwkPxk76186si76e8UTf1hySqepqUSS8,13155
 easylink/step.py,sha256=-vdFhPvwAZ3d69LMQGmSIVdcMG8E8ZtSvTE0UWif7zs,91088
 easylink/devtools/implementation_creator.py,sha256=gZZpfpiOOh912nNjR_d5wR0mBO5c09jWMS0FSYRcE1o,19120
 easylink/pipeline_schema_constants/__init__.py,sha256=njUL2v67WFC3dW_FjOXWRLSeOhInox_ZINLEji0_7V8,1523
 easylink/pipeline_schema_constants/development.py,sha256=0x6lWqBmP1K9AebEmeZ4veSnLBcdQcZXzbV6lCU11bc,12622
 easylink/pipeline_schema_constants/main.py,sha256=kcAhdbK_BhS79LUMhKkn-Uttl2vA-MHVX4M1StTloJQ,22934
 easylink/pipeline_schema_constants/testing.py,sha256=ti08DeUuF-eWrGKMj4BMyOFFJnEYooDaWX0DGiferbk,24579
+easylink/steps/cascading/accept_all_pairs.def,sha256=kwZMF3H0mqCBcO1Y2parJXFBLp4e9bLQoVIYU7zZ8xY,486
+easylink/steps/cascading/accept_all_pairs.py,sha256=eF_rmqcZtL3vI1u-TJejOcKX2Qou-AbaLI7qAAGjoGI,703
 easylink/steps/cascading/exclude_clustered.def,sha256=GfoDqO2Vtsh7VI8SwGaJtv_KtKjs-UmBcivqQ7OPkjk,503
-easylink/steps/cascading/exclude_clustered.py,sha256=Bpsyf9vAZ431Fh96RVzHkF7fy77NQjo1Cl6bHCIy69c,2580
+easylink/steps/cascading/exclude_clustered.py,sha256=T60deNb91_ZFg5K190G-Q7BC5EYrEdLPhFEK7Togv0Y,3048
 easylink/steps/cascading/exclude_none.def,sha256=iFUhUMocxtkA0NErkjVrBxY0MUdS3DIPNsbCpTJRP0k,488
-easylink/steps/cascading/exclude_none.py,sha256=5DK5bNG4TneMwUKE49Kmz7VDnKBNZWjOERkuSJU3BmA,2475
+easylink/steps/cascading/exclude_none.py,sha256=DesKAO-UcPqKKtUS92OHU25YDXMJLiBEcGLk69UYWDk,2481
+easylink/steps/cascading/one_to_many_links_to_clusters.def,sha256=BVFusUydsV3hY1en16OVr3TPqzwst-cEVBwvb8dtpqA,534
+easylink/steps/cascading/one_to_many_links_to_clusters.py,sha256=7QSJxW3mmR3LIjWBzzgi3vcOsmoYOsiSJn6iYGppHLA,3789
 easylink/steps/cascading/update_clusters_by_connected_components.def,sha256=sAAAWOod8EuAnotR1cayaGAvs7x6xoMVlwmLso_a9Cc,578
-easylink/steps/cascading/update_clusters_by_connected_components.py,sha256=sFZXMGXl17jcGt8Fu5hgQz1KW5bFvPYdCoQGZ9Erc0I,3629
+easylink/steps/cascading/update_clusters_by_connected_components.py,sha256=jhpMgewztCXrRxBw2FnH2HjIybpp7GcHe4kjTMgQOyg,4059
 easylink/steps/default/default_clusters_to_links.def,sha256=9PjUygLvsoYMUZDznceuuv55t8fPs473P57J_RMl3U0,527
 easylink/steps/default/default_clusters_to_links.py,sha256=Ckm53d3W-ETNlTvQJPOkpHmSqCmxSWknMPQrEAIoTBY,2816
 easylink/steps/default/default_determining_exclusions.def,sha256=zZUEHDdrpLxzx3gTm-dki2ge5ivCCg4ziIwTErqCII0,542
-easylink/steps/default/default_determining_exclusions.py,sha256=4diLfuqYm_Koj7gwifjwe_7mLZ6xb6RQiEdk-RRtB94,2629
+easylink/steps/default/default_determining_exclusions.py,sha256=RpYHFAral4uYevgiOsYqUHYgsEIejV5NhYdQ3q7VeU0,2635
 easylink/steps/default/default_removing_records.def,sha256=QqacmOu6myxFSULHRKeKsVD8l73KDm4VEkPkPlovwqs,524
-easylink/steps/default/default_removing_records.py,sha256=P4mmX2D4mhSoWd_S5CaNT4hlHOMAeZiMhCScWQiR_fQ,1906
+easylink/steps/default/default_removing_records.py,sha256=I_xGdWftlwP7H8HdxfblSG2YFVqA986KOECVwMCn4fk,1925
 easylink/steps/default/default_schema_alignment.def,sha256=hFHJkos0Fhe4LvpjLOCd6klIaIqOKqECDDSTVu3G03Y,524
 easylink/steps/default/default_schema_alignment.py,sha256=oT5jbYQ3C3ocLgqqOnvH0SIJ6NeTtPBWWmCqr_frnAI,1479
 easylink/steps/default/default_updating_clusters.def,sha256=vDzSkTknDfeiXeHREpw4BkUxFcTWamxr81c3rZ7_enY,527
@@ -70,11 +74,11 @@ easylink/steps/rl-dummy/input_data/known_clusters.parquet,sha256=Ysodu65toHZN4Ag
 easylink/steps/rl-dummy/pre-processing/dummy_pre-processing.def,sha256=HeUSv2QvMOQzsyVktYR1xYoEqwiNpDo-p7IRcGSMspE,512
 easylink/steps/rl-dummy/pre-processing/dummy_pre-processing.py,sha256=I6kqG4e_H2yFW5MpsMXdpoY_NjHcBvVVAHWv89LUgXE,1852
 easylink/steps/splink/splink_blocking_and_filtering.def,sha256=umWzxJhsfdi8g3TD-r2mKpjC-FPAMDk6-IERiWigdQc,557
-easylink/steps/splink/splink_blocking_and_filtering.py,sha256=FO8YJ2_KgCLpQoq5xsM339bTSN1DhCXCL8XT1pb5STY,5259
+easylink/steps/splink/splink_blocking_and_filtering.py,sha256=3WMBmNEECB9Kxu4D6PAesZzBrhHTdpFEgvnGPsV4bww,5475
 easylink/steps/splink/splink_evaluating_pairs.def,sha256=DN3Ohy9qJOAyK58v164neP23HDVYpedMqzCu4eQh4Hg,521
 easylink/steps/splink/splink_evaluating_pairs.py,sha256=vWUe3vQo9uGs0Cy8pG5PbolzsJX_cEaPS3Q0PMcBjcg,6253
 easylink/steps/splink/splink_links_to_clusters.def,sha256=RurvOYyGjNs9tx64DTXwI-GSgHD4T7SzDfhAH18pTEM,524
-easylink/steps/splink/splink_links_to_clusters.py,sha256=5Sw8yi0dVLuRB-trN2kXmxbHBR0VJBxYee6u4_usg2Y,1920
+easylink/steps/splink/splink_links_to_clusters.py,sha256=Brpy3ZKSBpBUeOitg1ZgDvuMVwILH0QBkLXRJN8LXno,2015
 easylink/utilities/__init__.py,sha256=0U33kbv4hoMfFQ_lh5hLwifxRPzOgkLkjKLYxmaK10g,196
 easylink/utilities/aggregator_utils.py,sha256=_DAHRAf9janbDsuj_jnAn5Dzz2s4R5Ni3YeytDpN9UE,954
 easylink/utilities/data_utils.py,sha256=XPRjq3qW_fN0xQ23Jms_xBzpTHbRwqZWDP1AW0nYkP0,6926
@@ -82,10 +86,10 @@ easylink/utilities/general_utils.py,sha256=MmuoV4T6PgyEDjbepC_1D3TGrq70Hp-hl-GaA
 easylink/utilities/paths.py,sha256=9inDKMPorAaWh5vhjc3j1Tj_aXVKhLdodiJO9H1nNes,947
 easylink/utilities/spark.smk,sha256=kGtpem7LfQc71tMh5WAYaqKnHQKFvcdhPQSdumOP70k,5799
 easylink/utilities/splitter_utils.py,sha256=KXiVUYJ9TGxCQmrwos18pB1sxG_0Ay67qoDJT6vem2o,3828
-easylink/utilities/validation_utils.py,sha256=DBJB2TLVklgYw1WaaPse9vqtwPLMGmZNYM2cbCZsoHI,18417
-easylink-0.1.25.dist-info/licenses/LICENSE,sha256=z6NBo3hY2Pbh2Wjm9IcHLyQoG-kd816uASh8PbwhTcQ,1530
-easylink-0.1.25.dist-info/METADATA,sha256=agOx4R08pqrpwjWmoSAmhU33gmHZ5QhDt9UHVRbnkHI,4219
-easylink-0.1.25.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-easylink-0.1.25.dist-info/entry_points.txt,sha256=OGMZDFltg3yMboT7XjJt3joiPhRfV_7jnREVtrAIQNU,51
-easylink-0.1.25.dist-info/top_level.txt,sha256=oHcOpcF_jDMWFiJRzfGQvuskENGDjSPC_Agu9Z_Xvik,9
-easylink-0.1.25.dist-info/RECORD,,
+easylink/utilities/validation_utils.py,sha256=1naksMPStw_xIOqskX6DE99f16Y7eCcVF9I5ZILjMvI,18453
+easylink-0.2.0.dist-info/licenses/LICENSE,sha256=z6NBo3hY2Pbh2Wjm9IcHLyQoG-kd816uASh8PbwhTcQ,1530
+easylink-0.2.0.dist-info/METADATA,sha256=HxtOiOMe9hTRcK6HL6sLTTQNeP9X7hrhiodTpEMUeOA,4218
+easylink-0.2.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+easylink-0.2.0.dist-info/entry_points.txt,sha256=OGMZDFltg3yMboT7XjJt3joiPhRfV_7jnREVtrAIQNU,51
+easylink-0.2.0.dist-info/top_level.txt,sha256=oHcOpcF_jDMWFiJRzfGQvuskENGDjSPC_Agu9Z_Xvik,9
+easylink-0.2.0.dist-info/RECORD,,