easylink 0.1.17__py3-none-any.whl → 0.1.19__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56)
  1. easylink/_version.py +1 -1
  2. easylink/cli.py +24 -3
  3. easylink/configuration.py +43 -36
  4. easylink/devtools/implementation_creator.py +71 -22
  5. easylink/implementation.py +88 -11
  6. easylink/implementation_metadata.yaml +177 -29
  7. easylink/pipeline.py +15 -6
  8. easylink/pipeline_schema.py +12 -13
  9. easylink/pipeline_schema_constants/__init__.py +4 -5
  10. easylink/pipeline_schema_constants/main.py +489 -0
  11. easylink/runner.py +11 -7
  12. easylink/step.py +89 -0
  13. easylink/steps/cascading/exclude_clustered.def +22 -0
  14. easylink/steps/cascading/exclude_clustered.py +76 -0
  15. easylink/steps/cascading/exclude_none.def +22 -0
  16. easylink/steps/cascading/exclude_none.py +76 -0
  17. easylink/steps/cascading/update_clusters_by_connected_components.def +22 -0
  18. easylink/steps/cascading/update_clusters_by_connected_components.py +101 -0
  19. easylink/steps/default/default_clusters_to_links.def +22 -0
  20. easylink/steps/default/default_clusters_to_links.py +91 -0
  21. easylink/steps/default/default_determining_exclusions.def +22 -0
  22. easylink/steps/default/default_determining_exclusions.py +81 -0
  23. easylink/steps/default/default_removing_records.def +22 -0
  24. easylink/steps/default/default_removing_records.py +59 -0
  25. easylink/steps/default/default_schema_alignment.def +22 -0
  26. easylink/steps/default/default_schema_alignment.py +53 -0
  27. easylink/steps/default/default_updating_clusters.def +22 -0
  28. easylink/steps/default/default_updating_clusters.py +67 -0
  29. easylink/steps/fastLink/fastLink_evaluating_pairs.R +136 -0
  30. easylink/steps/fastLink/fastLink_evaluating_pairs.def +21 -0
  31. easylink/steps/fastLink/fastLink_links_to_clusters.R +128 -0
  32. easylink/steps/fastLink/fastLink_links_to_clusters.def +21 -0
  33. easylink/steps/rl-dummy/canonicalizing_and_downstream_analysis/dummy_canonicalizing_and_downstream_analysis.def +22 -0
  34. easylink/steps/rl-dummy/canonicalizing_and_downstream_analysis/dummy_canonicalizing_and_downstream_analysis.py +42 -0
  35. easylink/steps/rl-dummy/input_data/create_input_files.ipynb +1433 -0
  36. easylink/steps/rl-dummy/input_data/input_file_1.parquet +0 -0
  37. easylink/steps/rl-dummy/input_data/input_file_2.parquet +0 -0
  38. easylink/steps/rl-dummy/input_data/known_clusters.parquet +0 -0
  39. easylink/steps/rl-dummy/pre-processing/dummy_pre-processing.def +22 -0
  40. easylink/steps/rl-dummy/pre-processing/dummy_pre-processing.py +59 -0
  41. easylink/steps/splink/splink_blocking_and_filtering.def +22 -0
  42. easylink/steps/splink/splink_blocking_and_filtering.py +130 -0
  43. easylink/steps/splink/splink_evaluating_pairs.def +22 -0
  44. easylink/steps/splink/splink_evaluating_pairs.py +164 -0
  45. easylink/steps/splink/splink_links_to_clusters.def +22 -0
  46. easylink/steps/splink/splink_links_to_clusters.py +63 -0
  47. easylink/utilities/data_utils.py +72 -0
  48. easylink/utilities/paths.py +4 -3
  49. easylink/utilities/validation_utils.py +509 -11
  50. {easylink-0.1.17.dist-info → easylink-0.1.19.dist-info}/METADATA +5 -1
  51. easylink-0.1.19.dist-info/RECORD +91 -0
  52. {easylink-0.1.17.dist-info → easylink-0.1.19.dist-info}/WHEEL +1 -1
  53. easylink-0.1.19.dist-info/licenses/LICENSE +28 -0
  54. easylink-0.1.17.dist-info/RECORD +0 -55
  55. {easylink-0.1.17.dist-info → easylink-0.1.19.dist-info}/entry_points.txt +0 -0
  56. {easylink-0.1.17.dist-info → easylink-0.1.19.dist-info}/top_level.txt +0 -0
easylink/utilities/validation_utils.py
@@ -11,40 +11,66 @@ function(s) for processed data being passed out of one pipeline step and into th
  from pathlib import Path

  import pandas as pd
+ from pandas.api.types import is_integer_dtype
  from pyarrow import parquet as pq


- def validate_input_file_dummy(filepath: str) -> None:
-     """Validates an input file to a dummy :class:`~easylink.step.Step`.
+ def _read_file(filepath: str) -> pd.DataFrame:
+     """Reads a file.
+
+     Parameters
+     ----------
+     filepath : str
+         The path to the file to read.
+
+     Returns
+     -------
+         The loaded DataFrame.
+
+     Raises
+     ------
+     NotImplementedError
+         If the file type is not supported.
+     """
+     extension = Path(filepath).suffix
+     if extension == ".parquet":
+         return pd.read_parquet(filepath)
+     elif extension == ".csv":
+         return pd.read_csv(filepath)
+     else:
+         raise NotImplementedError(
+             f"Data file type {extension} is not supported. Convert to Parquet or CSV instead."
+         )

-     This function is intended to be used as the :attr:`~easylink.graph_components.InputSlot.validator`
-     for _all_ input data at every step in the dummy/:mod:`easylink.pipeline_schema_constants.development`
-     pipeline schema. It simply checks for supported file types as well as the presence
-     of required columns.
+
+ def _validate_required_columns(filepath: str, required_columns: set[str]) -> None:
+     """
+     Validates that the file at `filepath` contains all columns in `required_columns`.

      Parameters
      ----------
-     filepath
-         The path to the input data file to be validated.
+     filepath : str
+         The path to the file to validate.
+     required_columns : set[str]
+         The set of required column names.

      Raises
      ------
      NotImplementedError
          If the file type is not supported.
      LookupError
-         If the file is missing required columns.
+         If any required columns are missing.
      """
      extension = Path(filepath).suffix
      if extension == ".parquet":
          output_columns = set(pq.ParquetFile(filepath).schema.names)
      elif extension == ".csv":
-         output_columns = set(pd.read_csv(filepath).columns)
+         output_columns = set(pd.read_csv(filepath, nrows=5).columns)
      else:
          raise NotImplementedError(
              f"Data file type {extension} is not supported. Convert to Parquet or CSV instead"
          )

-     required_columns = {"foo", "bar", "counter"}
      missing_columns = required_columns - output_columns
      if missing_columns:
          raise LookupError(
@@ -52,7 +78,479 @@ def validate_input_file_dummy(filepath: str) -> None:
          )


+ def _validate_unique_column(df: pd.DataFrame, column_name: str, filepath: str) -> None:
+     """Validates that a column in a DataFrame has unique values.
+
+     Parameters
+     ----------
+     df : pandas.DataFrame
+         The DataFrame to validate.
+     column_name : str
+         The name of the column to check.
+     filepath : str
+         The path to the file being validated.
+
+     Raises
+     ------
+     ValueError
+         If the column contains duplicate values.
+     """
+     if not df[column_name].is_unique:
+         raise ValueError(
+             f"Data file {filepath} contains duplicate values in the '{column_name}' column."
+         )
+
+
+ def _validate_unique_column_set(df: pd.DataFrame, columns: set[str], filepath: str) -> None:
+     """
+     Validates that the combination of columns in `columns` is unique in the DataFrame.
+
+     Parameters
+     ----------
+     df : pandas.DataFrame
+         The DataFrame to validate.
+     columns : set[str]
+         The set of column names to check for uniqueness as a group.
+     filepath : str
+         The path to the file being validated.
+
+     Raises
+     ------
+     ValueError
+         If duplicate rows exist for the given columns.
+     """
+     if len(df[list(columns)].drop_duplicates()) < len(df):
+         raise ValueError(
+             f"Data file {filepath} contains duplicate rows with the same values for {columns}."
+         )
+
+
+ def validate_input_file_dummy(filepath: str) -> None:
+     """Validates an input file to a dummy :class:`~easylink.step.Step`.
+
+     The file must contain the columns: "foo", "bar", and "counter".
+
+     Parameters
+     ----------
+     filepath : str
+         The path to the input file.
+
+     Raises
+     ------
+     LookupError
+         If the file is missing required columns.
+     """
+     _validate_required_columns(filepath, required_columns={"foo", "bar", "counter"})
+
+
+ def validate_input_dataset_or_known_clusters(filepath: str) -> None:
+     """
+     Validates a dataset or clusters file based on its filename.
+
+     Parameters
+     ----------
+     filepath : str
+         The path to the input file.
+
+     Raises
+     ------
+     LookupError, ValueError
+         If the file fails validation as a dataset or clusters file.
+     """
+     filepath = Path(filepath)
+     if "clusters" in filepath.stem:
+         validate_clusters(filepath)
+     else:
+         validate_dataset(filepath)
+
+
+ def validate_dataset(filepath: str) -> None:
+     """Validates a dataset file.
+
+     - Must be in a tabular format and contain a "Record ID" column.
+     - The "Record ID" column must have unique integer values.
+
+     Parameters
+     ----------
+     filepath : str
+         The path to the input dataset file.
+
+     Raises
+     ------
+     LookupError
+         If the file is missing the required "Record ID" column.
+     ValueError
+         If the "Record ID" column is not unique or not integer dtype.
+     """
+     _validate_required_columns(filepath, {"Record ID"})
+     df = _read_file(filepath)
+     _validate_unique_column(df, "Record ID", filepath)
+
+     if not is_integer_dtype(df["Record ID"]):
+         raise ValueError(
+             f"Data file {filepath} contains non-integer values in the 'Record ID' column."
+         )
+
+
+ def validate_datasets_directory(filepath: str) -> None:
+     """Validates a directory of input dataset files.
+
+     - Each file in the directory must be in a tabular format and contain a "Record ID" column.
+     - The "Record ID" column must have unique values.
+
+     Parameters
+     ----------
+     filepath : str
+         The path to the directory containing input dataset files.
+
+     Raises
+     ------
+     NotADirectoryError
+         If the provided path is not a directory.
+     LookupError
+         If any file is missing the required "Record ID" column.
+     ValueError
+         If the "Record ID" column is not unique in any file or if a non-file is present.
+     """
+     input_path = Path(filepath)
+     if not input_path.is_dir():
+         raise NotADirectoryError(f"The path {filepath} is not a directory.")
+
+     for file in input_path.iterdir():
+         if not file.is_file():
+             raise ValueError(f"The path {file} is not a file.")
+         validate_dataset(file.name)
+
+
+ def validate_clusters(filepath: str) -> None:
+     """Validates a file containing cluster information.
+
+     - The file must contain three columns: "Input Record Dataset", "Input Record ID", and "Cluster ID".
+     - "Input Record Dataset" and "Input Record ID", considered as a pair, must have unique values.
+
+     Parameters
+     ----------
+     filepath : str
+         The path to the file containing cluster data.
+
+     Raises
+     ------
+     LookupError
+         If the file is missing required columns.
+     ValueError
+         If the ("Input Record Dataset", "Input Record ID") pair is not unique.
+     """
+     _validate_required_columns(
+         filepath, {"Input Record Dataset", "Input Record ID", "Cluster ID"}
+     )
+     df = _read_file(filepath)
+     _validate_unique_column_set(df, {"Input Record Dataset", "Input Record ID"}, filepath)
+
+
+ def validate_links(filepath: str) -> None:
+     """Validates a file containing link information.
+
+     - The file must contain five columns: "Left Record Dataset", "Left Record ID", "Right Record Dataset", "Right Record ID", and "Probability".
+     - "Left Record ID" and "Right Record ID" cannot be equal in a row where "Left Record Dataset" also equals "Right Record Dataset".
+     - Rows must be unique, ignoring the Probability column.
+     - "Left Record Dataset" must be alphabetically before (or equal to) "Right Record Dataset."
+     - "Left Record ID" must be less than "Right Record ID" if "Left Record Dataset" equals "Right Record Dataset".
+     - "Probability" values must be between 0 and 1 (inclusive).
+
+     Parameters
+     ----------
+     filepath : str
+         The path to the file containing link data.
+
+     Raises
+     ------
+     LookupError
+         If the file is missing required columns.
+     ValueError
+         If:
+         - "Left Record ID" equals "Right Record ID" in any row where datasets match.
+         - Duplicate rows exist with the same "Left Record Dataset", "Left Record ID", "Right Record Dataset", and "Right Record ID".
+         - "Left Record Dataset" is not alphabetically before or equal to "Right Record Dataset".
+         - "Left Record ID" is not less than "Right Record ID" when datasets match.
+         - Values in the "Probability" column are not between 0 and 1 (inclusive).
+     """
+     _validate_required_columns(
+         filepath,
+         {
+             "Left Record Dataset",
+             "Left Record ID",
+             "Right Record Dataset",
+             "Right Record ID",
+             "Probability",
+         },
+     )
+     df = _read_file(filepath)
+
+     _validate_pairs(df, filepath)
+
+     if not df["Probability"].between(0, 1).all():
+         raise ValueError(
+             f"Data file {filepath} contains values in the 'Probability' column that are not between 0 and 1 (inclusive)."
+         )
+
+
+ def _validate_pairs(df: pd.DataFrame, filepath: str) -> None:
+     """
+     Validates pairs in a DataFrame for link or pairs files.
+
+     Parameters
+     ----------
+     df : pandas.DataFrame
+         The DataFrame to validate.
+     filepath : str
+         The path to the file being validated.
+
+     Raises
+     ------
+     ValueError
+         If any validation rule for pairs is violated.
+     """
+     if (
+         (df["Left Record Dataset"] == df["Right Record Dataset"])
+         & (df["Left Record ID"] == df["Right Record ID"])
+     ).any():
+         raise ValueError(
+             f"Data file {filepath} contains rows where 'Left Record ID' is equal to 'Right Record ID' and 'Left Record Dataset' is equal to 'Right Record Dataset'."
+         )
+
+     _validate_unique_column_set(
+         df,
+         {"Left Record Dataset", "Left Record ID", "Right Record Dataset", "Right Record ID"},
+         filepath,
+     )
+
+     if not all(df["Left Record Dataset"] <= df["Right Record Dataset"]):
+         raise ValueError(
+             f"Data file {filepath} contains rows where 'Left Record Dataset' is not alphabetically before or equal to 'Right Record Dataset'."
+         )
+
+     if not all(
+         (df["Left Record ID"] < df["Right Record ID"])
+         | (df["Left Record Dataset"] != df["Right Record Dataset"])
+     ):
+         raise ValueError(
+             f"Data file {filepath} contains rows where 'Left Record ID' is not less than 'Right Record ID', though the records are from the same dataset."
+         )
+
+
+ def validate_ids_to_remove(filepath: str) -> None:
+     """Validates a file containing IDs to remove.
+
+     - The file must contain a single column: "Record ID".
+     - "Record ID" must have unique values.
+
+     Parameters
+     ----------
+     filepath : str
+         The path to the file containing IDs to remove.
+
+     Raises
+     ------
+     LookupError
+         If the file is missing the "Record ID" column.
+     ValueError
+         If the "Record ID" column is not unique.
+     """
+     _validate_required_columns(filepath, {"Record ID"})
+     df = _read_file(filepath)
+     _validate_unique_column(df, "Record ID", filepath)
+
+
+ def validate_records(filepath: str) -> None:
+     """Validates a file containing records.
+
+     - A file in a tabular format.
+     - The file may have any number of columns.
+     - Two columns must be called "Input Record Dataset" and "Input Record ID" and they must have unique values as a pair.
+
+     Parameters
+     ----------
+     filepath : str
+         The path to the file containing records.
+
+     Raises
+     ------
+     LookupError
+         If required columns are missing.
+     ValueError
+         If the ("Input Record Dataset", "Input Record ID") pair is not unique.
+     """
+     _validate_required_columns(filepath, {"Input Record Dataset", "Input Record ID"})
+     df = _read_file(filepath)
+     _validate_unique_column_set(df, {"Input Record Dataset", "Input Record ID"}, filepath)
+
+
+ def validate_blocks(filepath: str) -> None:
+     """
+     Validates a directory containing blocks.
+
+     Each block subdirectory must contain exactly two files: a records file and a pairs file, both in tabular format.
+
+     Validation checks include:
+     - The parent directory must exist and be a directory.
+     - Each block subdirectory must contain exactly one records file (filename contains "records") and one pairs file (filename contains "pairs").
+     - The records file must have columns "Input Record Dataset" and "Input Record ID" with unique pairs.
+     - The pairs file must have columns "Left Record Dataset", "Left Record ID", "Right Record Dataset", and "Right Record ID".
+     - All values in ("Left Record Dataset", "Left Record ID") and ("Right Record Dataset", "Right Record ID") must exist in the records file.
+     - No row in the pairs file may have "Left Record Dataset" == "Right Record Dataset" and "Left Record ID" == "Right Record ID".
+     - All rows in the pairs file must be unique with respect to ("Left Record Dataset", "Left Record ID", "Right Record Dataset", "Right Record ID").
+     - "Left Record Dataset" must be alphabetically before or equal to "Right Record Dataset".
+     - "Left Record ID" must be less than "Right Record ID" if datasets match.
+     - No extra files are allowed in block subdirectories.
+
+     Parameters
+     ----------
+     filepath : str
+         Path to the directory containing block subdirectories.
+
+     Raises
+     ------
+     NotADirectoryError
+         If the provided path is not a directory.
+     FileNotFoundError
+         If a required records or pairs file is missing in any block.
+     LookupError
+         If required columns are missing in records or pairs files.
+     ValueError
+         If:
+         - ("Input Record Dataset", "Input Record ID") is not unique in the records file.
+         - ("Left Record Dataset", "Left Record ID") or ("Right Record Dataset", "Right Record ID") in the pairs file do not exist in the records file.
+         - "Left Record Dataset" == "Right Record Dataset" and "Left Record ID" == "Right Record ID" in any row of the pairs file.
+         - Duplicate rows exist in the pairs file.
+         - "Left Record Dataset" is not alphabetically before or equal to "Right Record Dataset" in any row.
+         - "Left Record ID" is not less than "Right Record ID" when datasets match.
+         - Extra files are present in a block subdirectory.
+     """
+     input_path = Path(filepath)
+
+     if not input_path.is_dir():
+         raise NotADirectoryError(f"The path {filepath} is not a directory.")
+
+     for block_dir in filter(lambda d: d.is_dir(), input_path.iterdir()):
+         files = {file.stem: file for file in block_dir.iterdir() if file.is_file()}
+         records_file = next((f for name, f in files.items() if "records" in name), None)
+         pairs_file = next((f for name, f in files.items() if "pairs" in name), None)
+
+         if len(files) > 2:
+             raise ValueError(f"Extra file(s) found in block directory {block_dir}.")
+
+         if not records_file or not pairs_file:
+             raise FileNotFoundError(
+                 f"Block directory {block_dir} must contain both a records file and a pairs file."
+             )
+
+         # Validate records file
+         _validate_required_columns(records_file, {"Input Record Dataset", "Input Record ID"})
+         records_df = _read_file(records_file)
+         _validate_unique_column_set(
+             records_df, {"Input Record Dataset", "Input Record ID"}, records_file
+         )
+
+         # Validate pairs file
+         _validate_required_columns(
+             pairs_file,
+             {
+                 "Left Record Dataset",
+                 "Left Record ID",
+                 "Right Record Dataset",
+                 "Right Record ID",
+             },
+         )
+         pairs_df = _read_file(pairs_file)
+
+         # Check that all (dataset, ID) tuples in pairs exist in records
+         record_tuples = set(
+             records_df[["Input Record Dataset", "Input Record ID"]].itertuples(
+                 index=False, name=None
+             )
+         )
+         missing_left = (
+             set(
+                 pairs_df[["Left Record Dataset", "Left Record ID"]].itertuples(
+                     index=False, name=None
+                 )
+             )
+             - record_tuples
+         )
+         missing_right = (
+             set(
+                 pairs_df[["Right Record Dataset", "Right Record ID"]].itertuples(
+                     index=False, name=None
+                 )
+             )
+             - record_tuples
+         )
+         if missing_left or missing_right:
+             raise ValueError(
+                 f"In block {block_dir}, pairs file {pairs_file} contains records not found in records file {records_file}. "
+                 f"Missing left records: {missing_left}, missing right records: {missing_right}"
+             )
+
+         _validate_pairs(pairs_df, pairs_file)
+
+
  def validate_dir(filepath: str) -> None:
+     """
+     Validates that the given path is a directory.
+
+     Parameters
+     ----------
+     filepath : str
+         The path to check.
+
+     Raises
+     ------
+     NotADirectoryError
+         If the path is not a directory.
+     """
      input_path = Path(filepath)
      if not input_path.is_dir():
          raise NotADirectoryError(f"The path {filepath} is not a directory.")
+
+
+ def validate_dataset_dir(filepath: str) -> None:
+     """
+     Validates a directory containing a single dataset file.
+
+     Parameters
+     ----------
+     filepath : str
+         The path to the directory.
+
+     Raises
+     ------
+     NotADirectoryError
+         If the path is not a directory.
+     ValueError
+         If the directory contains more than one file.
+     FileNotFoundError
+         If the directory does not contain any files.
+     """
+     input_path = Path(filepath)
+     if not input_path.is_dir():
+         raise NotADirectoryError(f"The path {filepath} is not a directory.")
+
+     file_paths = [f for f in input_path.iterdir() if not str(f.stem).startswith(".")]
+     if len(file_paths) > 1:
+         raise ValueError(f"The directory {input_path} contains more than one file.")
+     if len(file_paths) == 0:
+         raise FileNotFoundError(f"The directory {input_path} does not contain any files.")
+
+     file_path = file_paths[0]
+     validate_dataset(file_path)
+
+
+ def dont_validate(filepath: str) -> None:
+     """Placeholder function that performs no validation.
+
+     Parameters
+     ----------
+     filepath : str
+         The path to the file (not used).
+     """
+     pass
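The validators added above define the tabular contracts that easylink 0.1.19 applies to dataset, known-clusters, and links files. The short sketch below is illustrative only and is not part of the package: the import path and function names (validate_dataset, validate_clusters, validate_links) come from the diff above, but the temporary directory, file names, and data values are made-up assumptions used to show which inputs pass and which raise.

# Minimal sketch, assuming easylink 0.1.19 is installed; file names and values are illustrative.
import tempfile
from pathlib import Path

import pandas as pd

from easylink.utilities.validation_utils import (
    validate_clusters,
    validate_dataset,
    validate_links,
)

tmpdir = Path(tempfile.mkdtemp())

# Dataset file: needs a unique, integer-typed "Record ID" column.
dataset_path = tmpdir / "input_file_1.parquet"
pd.DataFrame({"Record ID": [1, 2, 3], "first_name": ["a", "b", "c"]}).to_parquet(dataset_path)
validate_dataset(str(dataset_path))  # passes silently

# Clusters file: ("Input Record Dataset", "Input Record ID") pairs must be unique.
clusters_path = tmpdir / "known_clusters.parquet"
pd.DataFrame(
    {
        "Input Record Dataset": ["input_file_1", "input_file_1"],
        "Input Record ID": [1, 2],
        "Cluster ID": [1, 1],
    }
).to_parquet(clusters_path)
validate_clusters(str(clusters_path))  # passes silently

# Links file: left/right ordering rules plus probabilities in [0, 1].
links_path = tmpdir / "links.parquet"
pd.DataFrame(
    {
        "Left Record Dataset": ["input_file_1"],
        "Left Record ID": [1],
        "Right Record Dataset": ["input_file_2"],
        "Right Record ID": [1],
        "Probability": [0.87],
    }
).to_parquet(links_path)
validate_links(str(links_path))  # passes silently

# An out-of-range probability violates the last rule and raises a ValueError.
pd.DataFrame(
    {
        "Left Record Dataset": ["input_file_1"],
        "Left Record ID": [1],
        "Right Record Dataset": ["input_file_2"],
        "Right Record ID": [1],
        "Probability": [1.5],
    }
).to_parquet(links_path)
try:
    validate_links(str(links_path))
except ValueError as e:
    print(e)

These functions raise on the first violated rule and return None otherwise, which matches how they are wired in as InputSlot validators elsewhere in the package.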
{easylink-0.1.17.dist-info → easylink-0.1.19.dist-info}/METADATA
@@ -1,12 +1,13 @@
  Metadata-Version: 2.4
  Name: easylink
- Version: 0.1.17
+ Version: 0.1.19
  Summary: Research repository for the EasyLink ER ecosystem project.
  Home-page: https://github.com/ihmeuw/easylink
  Author: The EasyLink developers
  Author-email: vivarium.dev@gmail.com
  License: BSD-3-Clause
  Description-Content-Type: text/x-rst
+ License-File: LICENSE
  Requires-Dist: click
  Requires-Dist: docker
  Requires-Dist: graphviz
@@ -16,7 +17,9 @@ Requires-Dist: networkx
  Requires-Dist: pandas
  Requires-Dist: pyyaml
  Requires-Dist: pyarrow
+ Requires-Dist: requests
  Requires-Dist: snakemake>=8.0.0
+ Requires-Dist: tqdm
  Requires-Dist: snakemake-interface-executor-plugins<9.0.0
  Requires-Dist: snakemake-executor-plugin-slurm
  Requires-Dist: pandas-stubs
@@ -52,6 +55,7 @@ Dynamic: description
  Dynamic: description-content-type
  Dynamic: home-page
  Dynamic: license
+ Dynamic: license-file
  Dynamic: provides-extra
  Dynamic: requires-dist
  Dynamic: summary
easylink-0.1.19.dist-info/RECORD
@@ -0,0 +1,91 @@
1
+ easylink/__about__.py,sha256=2-oxCfu9t9yUJouLDwqYRZ0eii8kN25SxRzsawjWjho,440
2
+ easylink/__init__.py,sha256=gGMcIVfiVnHtlDw5mZwhevcDb2wt-kuP6F64gnkFack,159
3
+ easylink/_version.py,sha256=cAJAbAh288a9AL-3yxwFzEM1L26izSJ6wma5aiml_9Y,23
4
+ easylink/cli.py,sha256=zQO4lOVoZ3eVgPVWT2sCF4zNoKgiDJP9ReTh2Myr9jc,10307
5
+ easylink/configuration.py,sha256=hgmG5SIbYqnHDHfk44Gr3QX7C3yTaEVW6GuKeMqvu6c,12689
6
+ easylink/graph_components.py,sha256=zZDZXg5smReHO3ryQC4pao24wyKXzWDe6jS3C6fM2ak,13892
7
+ easylink/implementation.py,sha256=H46WjW9O3csaVAU7qLto3aOu1bSfVOBS0ZySBBX05o0,14544
8
+ easylink/implementation_metadata.yaml,sha256=GoU_aWjVryG8-xjUHkC2nCUeznmYD0BwfJYnNrpZ8P4,10670
9
+ easylink/pipeline.py,sha256=LC0mwboLfe84Mbju9manJjN00Kup4jauiugLlgGCz6I,17884
10
+ easylink/pipeline_graph.py,sha256=9ysX4wAkA-WkUoo15jSLAErncybE4tJwznVx7N_kwIA,23922
11
+ easylink/pipeline_schema.py,sha256=FieJBa3rKgaCIB9QDuQEfWJ9joNBUUp6iHT6xmns-Vk,6886
12
+ easylink/rule.py,sha256=NusEUtBxx18L7UCcgDi3KKooFxSUgyS4eisVM5aPqFE,16770
13
+ easylink/runner.py,sha256=Z9GKgiUAWtp0dW1cyAp86MGthIDeABJtHDXUtzv0-kE,6714
14
+ easylink/step.py,sha256=NGy1KNqM4eXP7kP0kdfcfyGc4K_ExSCSidCdW3h0Qg8,89902
15
+ easylink/devtools/implementation_creator.py,sha256=1WQOOrjQYOhjjp8MQM9j1xoeAp-SW51A1f1oW4G792I,18251
16
+ easylink/images/spark_cluster/Dockerfile,sha256=3PHotbR4jdjVYRHOJ0VQW55b5Qd4tQ1pLLQMrTKWVA0,576
17
+ easylink/images/spark_cluster/README.md,sha256=KdgSttZRplNNWqHn4K1GTsTIab3dTOSG4V99QPLxSp8,569
18
+ easylink/pipeline_schema_constants/__init__.py,sha256=xYymSjTeH3prvQL_rgGFVrriohANFtW_cy0vDwlF3ds,1355
19
+ easylink/pipeline_schema_constants/development.py,sha256=XxcYYZDZM4IADp3eFPQCchD6-OtMp99GiyZBfSswzFo,12640
20
+ easylink/pipeline_schema_constants/main.py,sha256=9IxAjgQej7AaV-zYZEFhG8U-v_rYBFaPuNS3Y3m4Sho,22929
21
+ easylink/pipeline_schema_constants/testing.py,sha256=UDmVVjI1SiDktMbJ2CrSb7amHSYNwhgqNkXhl4lYxQw,20459
22
+ easylink/steps/cascading/exclude_clustered.def,sha256=GfoDqO2Vtsh7VI8SwGaJtv_KtKjs-UmBcivqQ7OPkjk,503
23
+ easylink/steps/cascading/exclude_clustered.py,sha256=NSA6GZBzGa7e6CH4tacCGfr0Y9sUM29g9Nf8NquHB44,2612
24
+ easylink/steps/cascading/exclude_none.def,sha256=iFUhUMocxtkA0NErkjVrBxY0MUdS3DIPNsbCpTJRP0k,488
25
+ easylink/steps/cascading/exclude_none.py,sha256=KntBX3q-V47d96ztOlPNRY_kCFJNi1LNYQ7UNs5wB4c,2507
26
+ easylink/steps/cascading/update_clusters_by_connected_components.def,sha256=sAAAWOod8EuAnotR1cayaGAvs7x6xoMVlwmLso_a9Cc,578
27
+ easylink/steps/cascading/update_clusters_by_connected_components.py,sha256=w7tAOs2QtIIcpTDxw2P_dqMIR-BFa-wi-OmZwrKyhmg,3309
28
+ easylink/steps/default/default_clusters_to_links.def,sha256=9PjUygLvsoYMUZDznceuuv55t8fPs473P57J_RMl3U0,527
29
+ easylink/steps/default/default_clusters_to_links.py,sha256=EIYeP0lj0plBl2OpTRuv3iDEQl-zNVJONUg0kgKSEF0,2848
30
+ easylink/steps/default/default_determining_exclusions.def,sha256=zZUEHDdrpLxzx3gTm-dki2ge5ivCCg4ziIwTErqCII0,542
31
+ easylink/steps/default/default_determining_exclusions.py,sha256=tF2lcga-6n99shgYEmhpNuqok33u7dcW9q5wV3xgp5w,2661
32
+ easylink/steps/default/default_removing_records.def,sha256=QqacmOu6myxFSULHRKeKsVD8l73KDm4VEkPkPlovwqs,524
33
+ easylink/steps/default/default_removing_records.py,sha256=LIlFS8EvJ6h5XqEfgWZYyIAjcKj7Oo8_I5a-vXHOozs,1938
34
+ easylink/steps/default/default_schema_alignment.def,sha256=hFHJkos0Fhe4LvpjLOCd6klIaIqOKqECDDSTVu3G03Y,524
35
+ easylink/steps/default/default_schema_alignment.py,sha256=Uxi6uTFveFKSiiRZG9MnTXOklQngSKGMafqnvKDc0rY,1459
36
+ easylink/steps/default/default_updating_clusters.def,sha256=vDzSkTknDfeiXeHREpw4BkUxFcTWamxr81c3rZ7_enY,527
37
+ easylink/steps/default/default_updating_clusters.py,sha256=A-lO3ussM1Ntffp-ZyPQGbbxZg4QNiZ8AvSOGVJDXnA,2139
38
+ easylink/steps/dev/README.md,sha256=u9dZUggpY2Lf2qb-xkDLWWgHjcmi4osbQtzSNo4uklE,4549
39
+ easylink/steps/dev/build-containers-local.sh,sha256=Wy3pfcyt7I-BNvHcr7ZXDe0g5Ihd00BIPqt9YuRbLeA,259
40
+ easylink/steps/dev/build-containers-remote.sh,sha256=Hy-kaaXf-ta6n8SzOz_ahByjMY5T7J71MvzXRXDvQw8,271
41
+ easylink/steps/dev/test.py,sha256=4iudKBD6CFz2CxbjSBUkc8LCWlMl-Nmw_rB35ZN6TrQ,6835
42
+ easylink/steps/dev/input_data/create_input_files.ipynb,sha256=rHRUFXwTuNXWm8TyaZWFCOewZMDw1G6yiioF8h90erY,2123
43
+ easylink/steps/dev/input_data/input_file_1.csv,sha256=mtu3_ldNTxS9PFtLsZzRspaCCuG_yLxhUdl2ZQFwqrE,88906
44
+ easylink/steps/dev/input_data/input_file_1.parquet,sha256=Km8jRyfGNdq0MFdz_-bewlAc06cFjWn2dWQ7YKKGa5U,60399
45
+ easylink/steps/dev/input_data/input_file_2.csv,sha256=YqKLZDC4d-aYN8Dh9OB6iQWWUKmvueu5CszckH1AApU,100016
46
+ easylink/steps/dev/input_data/input_file_2.parquet,sha256=Vpo0sUqQ78qlWLRk8p303Nh89BVcK4uvXJljRGHmsWk,60392
47
+ easylink/steps/dev/python_pandas/README.md,sha256=c_FbtkKKOTjt2R_LfHUo5lBga1qHiYkxLdQeewRr45g,977
48
+ easylink/steps/dev/python_pandas/dummy_step.py,sha256=NvhLUZu40B3Xbj_S-chQ6IkYUPr6X2aGBxYUa3DqwmY,4362
49
+ easylink/steps/dev/python_pandas/python_pandas.def,sha256=24cxwGF8Cqkv2a1zVsu94MfC_bAXBqAINLwfW2zyB_0,769
50
+ easylink/steps/dev/python_pyspark/README.md,sha256=di29SAfcdTTpar7gdoJRLqKrL8DEfNeayYUyaywdhUg,1563
51
+ easylink/steps/dev/python_pyspark/dummy_step.py,sha256=wxHHI3Uv8MTipKG2ffHbT_eL4JkoNpx49bJoErXumdc,5003
52
+ easylink/steps/dev/python_pyspark/python_pyspark.def,sha256=j_RmVjspmXGOhJTr10ED13RYfbimgxRU3WVTL7VOIUQ,915
53
+ easylink/steps/dev/r/README.md,sha256=dPjZdDTqcJsZCiwhddzlOj1ob0P7YocZUNFrLIGM1-0,1201
54
+ easylink/steps/dev/r/dummy_step.R,sha256=1TWZY8CEkT6gavrulBxFsKbDSKJJjk0NtJrGH7TIikE,4975
55
+ easylink/steps/dev/r/r-image.def,sha256=LrhXlt0C3k7d_VJWopRPEVARnFWSuq_oILlwo7g03bE,627
56
+ easylink/steps/fastLink/fastLink_evaluating_pairs.R,sha256=fQRrTPrgb1t5hrQi0V5H55J-PHdWjsATrVRYdXNYtdU,4603
57
+ easylink/steps/fastLink/fastLink_evaluating_pairs.def,sha256=5rDi-cmWhyuFEsiGFPpTxtySMqq5TpgJG-y8g_MtEvA,509
58
+ easylink/steps/fastLink/fastLink_links_to_clusters.R,sha256=exVzJl4r7k7cRlMCHSmigOqTlxShqzK-FO3EDhlPksg,4087
59
+ easylink/steps/fastLink/fastLink_links_to_clusters.def,sha256=1xYjOMsHtSS2-AI4EC2r6kL8ZX5F2JhmvESefEKeJVY,512
60
+ easylink/steps/output_dir/dummy_step_1_for_output_dir_example.def,sha256=CkQVG-uDRQ9spAavdkZbhx2GD_fRsKZGELPrr8yltsc,550
61
+ easylink/steps/output_dir/dummy_step_1_for_output_dir_example.py,sha256=dI0OWugE35ABLcSwsI-T3C4dvuPTKXwjE52dtSsCo8Y,428
62
+ easylink/steps/output_dir/dummy_step_2_for_output_dir_example.def,sha256=9gShg1EDJEHZcz7Z5VfZ1A4Gpm9XQes8ezn6rAZDgDM,550
63
+ easylink/steps/output_dir/dummy_step_2_for_output_dir_example.py,sha256=DMJW5TXjhELxhY4U9q2RpLjqxlS1YSosTGL2AfRnaZM,521
64
+ easylink/steps/rl-dummy/canonicalizing_and_downstream_analysis/dummy_canonicalizing_and_downstream_analysis.def,sha256=YOWtJZxoe-kHFeEyrgGcVGfdqcbD_Fg17A9shOaK-yc,584
65
+ easylink/steps/rl-dummy/canonicalizing_and_downstream_analysis/dummy_canonicalizing_and_downstream_analysis.py,sha256=skZUiZWcSXAOqq8TAlN5I0wztXgCWHQYA_xkuiL5s28,1202
66
+ easylink/steps/rl-dummy/input_data/create_input_files.ipynb,sha256=uXvJ8zTTPg0077HgA7MhQ_9jD-aeISFLeMeEBbSnOu8,54498
67
+ easylink/steps/rl-dummy/input_data/input_file_1.parquet,sha256=GQ_7v7ucwdJn-9mTgKVcvqkJ5gTkwb0B7y38mfOYbic,15200
68
+ easylink/steps/rl-dummy/input_data/input_file_2.parquet,sha256=Y4eseBm0HmFroksQr_VApgozRL8h8u7nQO6x_Utyns8,14902
69
+ easylink/steps/rl-dummy/input_data/known_clusters.parquet,sha256=Ysodu65toHZN4AgjVJsm0ueUxPIZAJjbtRm9SVM08JE,2598
70
+ easylink/steps/rl-dummy/pre-processing/dummy_pre-processing.def,sha256=HeUSv2QvMOQzsyVktYR1xYoEqwiNpDo-p7IRcGSMspE,512
71
+ easylink/steps/rl-dummy/pre-processing/dummy_pre-processing.py,sha256=aeDgn9z2um0oTPNSwPcTkBou3-1ajud_MWhkuRoHdOU,1884
72
+ easylink/steps/splink/splink_blocking_and_filtering.def,sha256=foAQAPvhDEkXkevpghS-uftsTbIQnQy9PvTkyldQeAA,539
73
+ easylink/steps/splink/splink_blocking_and_filtering.py,sha256=8-_a9PkOmKSa-8TJ9YMjqI7gLo-YD9JCAO1f8uBhdoE,4469
74
+ easylink/steps/splink/splink_evaluating_pairs.def,sha256=DN3Ohy9qJOAyK58v164neP23HDVYpedMqzCu4eQh4Hg,521
75
+ easylink/steps/splink/splink_evaluating_pairs.py,sha256=JR2qVgb14cNZKozDyOrN11nr1mXOwWv69E6WP0pRlMw,5713
76
+ easylink/steps/splink/splink_links_to_clusters.def,sha256=RurvOYyGjNs9tx64DTXwI-GSgHD4T7SzDfhAH18pTEM,524
77
+ easylink/steps/splink/splink_links_to_clusters.py,sha256=z5ymdYl9ytp1e5MA6vn8wpGRFWVuhh23LqGq8NJJxZQ,1936
78
+ easylink/utilities/__init__.py,sha256=0U33kbv4hoMfFQ_lh5hLwifxRPzOgkLkjKLYxmaK10g,196
79
+ easylink/utilities/aggregator_utils.py,sha256=pqBog6kEX4MXBBMjQtHFlE5gEMqRWb5VFl64u0Lr__g,972
80
+ easylink/utilities/data_utils.py,sha256=XPRjq3qW_fN0xQ23Jms_xBzpTHbRwqZWDP1AW0nYkP0,6926
81
+ easylink/utilities/general_utils.py,sha256=El1W0nn4P27sRBGotNQb-9du-Gbhk9ggSuu4vmGDfwo,4591
82
+ easylink/utilities/paths.py,sha256=9inDKMPorAaWh5vhjc3j1Tj_aXVKhLdodiJO9H1nNes,947
83
+ easylink/utilities/spark.smk,sha256=kGtpem7LfQc71tMh5WAYaqKnHQKFvcdhPQSdumOP70k,5799
84
+ easylink/utilities/splitter_utils.py,sha256=UOz4hjkEPqaAz0RrDkDYYej79lLSaq0VVVSH_tF1z0o,3838
85
+ easylink/utilities/validation_utils.py,sha256=DBJB2TLVklgYw1WaaPse9vqtwPLMGmZNYM2cbCZsoHI,18417
86
+ easylink-0.1.19.dist-info/licenses/LICENSE,sha256=z6NBo3hY2Pbh2Wjm9IcHLyQoG-kd816uASh8PbwhTcQ,1530
87
+ easylink-0.1.19.dist-info/METADATA,sha256=nFZA-jZKgZUG4DdiDqY-pNOTfdt1H3QeiwNzvo27vpg,3565
88
+ easylink-0.1.19.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
89
+ easylink-0.1.19.dist-info/entry_points.txt,sha256=OGMZDFltg3yMboT7XjJt3joiPhRfV_7jnREVtrAIQNU,51
90
+ easylink-0.1.19.dist-info/top_level.txt,sha256=oHcOpcF_jDMWFiJRzfGQvuskENGDjSPC_Agu9Z_Xvik,9
91
+ easylink-0.1.19.dist-info/RECORD,,