easylink-0.1.17-py3-none-any.whl → easylink-0.1.19-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- easylink/_version.py +1 -1
- easylink/cli.py +24 -3
- easylink/configuration.py +43 -36
- easylink/devtools/implementation_creator.py +71 -22
- easylink/implementation.py +88 -11
- easylink/implementation_metadata.yaml +177 -29
- easylink/pipeline.py +15 -6
- easylink/pipeline_schema.py +12 -13
- easylink/pipeline_schema_constants/__init__.py +4 -5
- easylink/pipeline_schema_constants/main.py +489 -0
- easylink/runner.py +11 -7
- easylink/step.py +89 -0
- easylink/steps/cascading/exclude_clustered.def +22 -0
- easylink/steps/cascading/exclude_clustered.py +76 -0
- easylink/steps/cascading/exclude_none.def +22 -0
- easylink/steps/cascading/exclude_none.py +76 -0
- easylink/steps/cascading/update_clusters_by_connected_components.def +22 -0
- easylink/steps/cascading/update_clusters_by_connected_components.py +101 -0
- easylink/steps/default/default_clusters_to_links.def +22 -0
- easylink/steps/default/default_clusters_to_links.py +91 -0
- easylink/steps/default/default_determining_exclusions.def +22 -0
- easylink/steps/default/default_determining_exclusions.py +81 -0
- easylink/steps/default/default_removing_records.def +22 -0
- easylink/steps/default/default_removing_records.py +59 -0
- easylink/steps/default/default_schema_alignment.def +22 -0
- easylink/steps/default/default_schema_alignment.py +53 -0
- easylink/steps/default/default_updating_clusters.def +22 -0
- easylink/steps/default/default_updating_clusters.py +67 -0
- easylink/steps/fastLink/fastLink_evaluating_pairs.R +136 -0
- easylink/steps/fastLink/fastLink_evaluating_pairs.def +21 -0
- easylink/steps/fastLink/fastLink_links_to_clusters.R +128 -0
- easylink/steps/fastLink/fastLink_links_to_clusters.def +21 -0
- easylink/steps/rl-dummy/canonicalizing_and_downstream_analysis/dummy_canonicalizing_and_downstream_analysis.def +22 -0
- easylink/steps/rl-dummy/canonicalizing_and_downstream_analysis/dummy_canonicalizing_and_downstream_analysis.py +42 -0
- easylink/steps/rl-dummy/input_data/create_input_files.ipynb +1433 -0
- easylink/steps/rl-dummy/input_data/input_file_1.parquet +0 -0
- easylink/steps/rl-dummy/input_data/input_file_2.parquet +0 -0
- easylink/steps/rl-dummy/input_data/known_clusters.parquet +0 -0
- easylink/steps/rl-dummy/pre-processing/dummy_pre-processing.def +22 -0
- easylink/steps/rl-dummy/pre-processing/dummy_pre-processing.py +59 -0
- easylink/steps/splink/splink_blocking_and_filtering.def +22 -0
- easylink/steps/splink/splink_blocking_and_filtering.py +130 -0
- easylink/steps/splink/splink_evaluating_pairs.def +22 -0
- easylink/steps/splink/splink_evaluating_pairs.py +164 -0
- easylink/steps/splink/splink_links_to_clusters.def +22 -0
- easylink/steps/splink/splink_links_to_clusters.py +63 -0
- easylink/utilities/data_utils.py +72 -0
- easylink/utilities/paths.py +4 -3
- easylink/utilities/validation_utils.py +509 -11
- {easylink-0.1.17.dist-info → easylink-0.1.19.dist-info}/METADATA +5 -1
- easylink-0.1.19.dist-info/RECORD +91 -0
- {easylink-0.1.17.dist-info → easylink-0.1.19.dist-info}/WHEEL +1 -1
- easylink-0.1.19.dist-info/licenses/LICENSE +28 -0
- easylink-0.1.17.dist-info/RECORD +0 -55
- {easylink-0.1.17.dist-info → easylink-0.1.19.dist-info}/entry_points.txt +0 -0
- {easylink-0.1.17.dist-info → easylink-0.1.19.dist-info}/top_level.txt +0 -0
easylink/utilities/validation_utils.py

```diff
@@ -11,40 +11,66 @@ function(s) for processed data being passed out of one pipeline step and into th
 from pathlib import Path
 
 import pandas as pd
+from pandas.api.types import is_integer_dtype
 from pyarrow import parquet as pq
 
 
-def validate_input_file_dummy(filepath: str) -> None:
-    """
+def _read_file(filepath: str) -> pd.DataFrame:
+    """Reads a file.
+
+    Parameters
+    ----------
+    filepath : str
+        The path to the file to read.
+
+    Returns
+    -------
+        The loaded DataFrame.
+
+    Raises
+    ------
+    NotImplementedError
+        If the file type is not supported.
+    """
+    extension = Path(filepath).suffix
+    if extension == ".parquet":
+        return pd.read_parquet(filepath)
+    elif extension == ".csv":
+        return pd.read_csv(filepath)
+    else:
+        raise NotImplementedError(
+            f"Data file type {extension} is not supported. Convert to Parquet or CSV instead."
+        )
 
-
-
-
-
+
+def _validate_required_columns(filepath: str, required_columns: set[str]) -> None:
+    """
+    Validates that the file at `filepath` contains all columns in `required_columns`.
 
     Parameters
     ----------
-    filepath
-        The path to the
+    filepath : str
+        The path to the file to validate.
+    required_columns : set[str]
+        The set of required column names.
 
     Raises
     ------
     NotImplementedError
         If the file type is not supported.
     LookupError
-        If
+        If any required columns are missing.
     """
     extension = Path(filepath).suffix
     if extension == ".parquet":
         output_columns = set(pq.ParquetFile(filepath).schema.names)
     elif extension == ".csv":
-        output_columns = set(pd.read_csv(filepath).columns)
+        output_columns = set(pd.read_csv(filepath, nrows=5).columns)
     else:
         raise NotImplementedError(
             f"Data file type {extension} is not supported. Convert to Parquet or CSV instead"
         )
 
-    required_columns = {"foo", "bar", "counter"}
     missing_columns = required_columns - output_columns
     if missing_columns:
         raise LookupError(
@@ -52,7 +78,479 @@ def validate_input_file_dummy(filepath: str) -> None:
         )
 
 
+def _validate_unique_column(df: pd.DataFrame, column_name: str, filepath: str) -> None:
+    """Validates that a column in a DataFrame has unique values.
+
+    Parameters
+    ----------
+    df : pandas.DataFrame
+        The DataFrame to validate.
+    column_name : str
+        The name of the column to check.
+    filepath : str
+        The path to the file being validated.
+
+    Raises
+    ------
+    ValueError
+        If the column contains duplicate values.
+    """
+    if not df[column_name].is_unique:
+        raise ValueError(
+            f"Data file {filepath} contains duplicate values in the '{column_name}' column."
+        )
+
+
+def _validate_unique_column_set(df: pd.DataFrame, columns: set[str], filepath: str) -> None:
+    """
+    Validates that the combination of columns in `columns` is unique in the DataFrame.
+
+    Parameters
+    ----------
+    df : pandas.DataFrame
+        The DataFrame to validate.
+    columns : set[str]
+        The set of column names to check for uniqueness as a group.
+    filepath : str
+        The path to the file being validated.
+
+    Raises
+    ------
+    ValueError
+        If duplicate rows exist for the given columns.
+    """
+    if len(df[list(columns)].drop_duplicates()) < len(df):
+        raise ValueError(
+            f"Data file {filepath} contains duplicate rows with the same values for {columns}."
+        )
+
+
+def validate_input_file_dummy(filepath: str) -> None:
+    """Validates an input file to a dummy :class:`~easylink.step.Step`.
+
+    The file must contain the columns: "foo", "bar", and "counter".
+
+    Parameters
+    ----------
+    filepath : str
+        The path to the input file.
+
+    Raises
+    ------
+    LookupError
+        If the file is missing required columns.
+    """
+    _validate_required_columns(filepath, required_columns={"foo", "bar", "counter"})
+
+
+def validate_input_dataset_or_known_clusters(filepath: str) -> None:
+    """
+    Validates a dataset or clusters file based on its filename.
+
+    Parameters
+    ----------
+    filepath : str
+        The path to the input file.
+
+    Raises
+    ------
+    LookupError, ValueError
+        If the file fails validation as a dataset or clusters file.
+    """
+    filepath = Path(filepath)
+    if "clusters" in filepath.stem:
+        validate_clusters(filepath)
+    else:
+        validate_dataset(filepath)
+
+
+def validate_dataset(filepath: str) -> None:
+    """Validates a dataset file.
+
+    - Must be in a tabular format and contain a "Record ID" column.
+    - The "Record ID" column must have unique integer values.
+
+    Parameters
+    ----------
+    filepath : str
+        The path to the input dataset file.
+
+    Raises
+    ------
+    LookupError
+        If the file is missing the required "Record ID" column.
+    ValueError
+        If the "Record ID" column is not unique or not integer dtype.
+    """
+    _validate_required_columns(filepath, {"Record ID"})
+    df = _read_file(filepath)
+    _validate_unique_column(df, "Record ID", filepath)
+
+    if not is_integer_dtype(df["Record ID"]):
+        raise ValueError(
+            f"Data file {filepath} contains non-integer values in the 'Record ID' column."
+        )
+
+
+def validate_datasets_directory(filepath: str) -> None:
+    """Validates a directory of input dataset files.
+
+    - Each file in the directory must be in a tabular format and contain a "Record ID" column.
+    - The "Record ID" column must have unique values.
+
+    Parameters
+    ----------
+    filepath : str
+        The path to the directory containing input dataset files.
+
+    Raises
+    ------
+    NotADirectoryError
+        If the provided path is not a directory.
+    LookupError
+        If any file is missing the required "Record ID" column.
+    ValueError
+        If the "Record ID" column is not unique in any file or if a non-file is present.
+    """
+    input_path = Path(filepath)
+    if not input_path.is_dir():
+        raise NotADirectoryError(f"The path {filepath} is not a directory.")
+
+    for file in input_path.iterdir():
+        if not file.is_file():
+            raise ValueError(f"The path {file} is not a file.")
+        validate_dataset(file.name)
+
+
+def validate_clusters(filepath: str) -> None:
+    """Validates a file containing cluster information.
+
+    - The file must contain three columns: "Input Record Dataset", "Input Record ID", and "Cluster ID".
+    - "Input Record Dataset" and "Input Record ID", considered as a pair, must have unique values.
+
+    Parameters
+    ----------
+    filepath : str
+        The path to the file containing cluster data.
+
+    Raises
+    ------
+    LookupError
+        If the file is missing required columns.
+    ValueError
+        If the ("Input Record Dataset", "Input Record ID") pair is not unique.
+    """
+    _validate_required_columns(
+        filepath, {"Input Record Dataset", "Input Record ID", "Cluster ID"}
+    )
+    df = _read_file(filepath)
+    _validate_unique_column_set(df, {"Input Record Dataset", "Input Record ID"}, filepath)
+
+
+def validate_links(filepath: str) -> None:
+    """Validates a file containing link information.
+
+    - The file must contain five columns: "Left Record Dataset", "Left Record ID", "Right Record Dataset", "Right Record ID", and "Probability".
+    - "Left Record ID" and "Right Record ID" cannot be equal in a row where "Left Record Dataset" also equals "Right Record Dataset".
+    - Rows must be unique, ignoring the Probability column.
+    - "Left Record Dataset" must be alphabetically before (or equal to) "Right Record Dataset."
+    - "Left Record ID" must be less than "Right Record ID" if "Left Record Dataset" equals "Right Record Dataset".
+    - "Probability" values must be between 0 and 1 (inclusive).
+
+    Parameters
+    ----------
+    filepath : str
+        The path to the file containing link data.
+
+    Raises
+    ------
+    LookupError
+        If the file is missing required columns.
+    ValueError
+        If:
+        - "Left Record ID" equals "Right Record ID" in any row where datasets match.
+        - Duplicate rows exist with the same "Left Record Dataset", "Left Record ID", "Right Record Dataset", and "Right Record ID".
+        - "Left Record Dataset" is not alphabetically before or equal to "Right Record Dataset".
+        - "Left Record ID" is not less than "Right Record ID" when datasets match.
+        - Values in the "Probability" column are not between 0 and 1 (inclusive).
+    """
+    _validate_required_columns(
+        filepath,
+        {
+            "Left Record Dataset",
+            "Left Record ID",
+            "Right Record Dataset",
+            "Right Record ID",
+            "Probability",
+        },
+    )
+    df = _read_file(filepath)
+
+    _validate_pairs(df, filepath)
+
+    if not df["Probability"].between(0, 1).all():
+        raise ValueError(
+            f"Data file {filepath} contains values in the 'Probability' column that are not between 0 and 1 (inclusive)."
+        )
+
+
+def _validate_pairs(df: pd.DataFrame, filepath: str) -> None:
+    """
+    Validates pairs in a DataFrame for link or pairs files.
+
+    Parameters
+    ----------
+    df : pandas.DataFrame
+        The DataFrame to validate.
+    filepath : str
+        The path to the file being validated.
+
+    Raises
+    ------
+    ValueError
+        If any validation rule for pairs is violated.
+    """
+    if (
+        (df["Left Record Dataset"] == df["Right Record Dataset"])
+        & (df["Left Record ID"] == df["Right Record ID"])
+    ).any():
+        raise ValueError(
+            f"Data file {filepath} contains rows where 'Left Record ID' is equal to 'Right Record ID' and 'Left Record Dataset' is equal to 'Right Record Dataset'."
+        )
+
+    _validate_unique_column_set(
+        df,
+        {"Left Record Dataset", "Left Record ID", "Right Record Dataset", "Right Record ID"},
+        filepath,
+    )
+
+    if not all(df["Left Record Dataset"] <= df["Right Record Dataset"]):
+        raise ValueError(
+            f"Data file {filepath} contains rows where 'Left Record Dataset' is not alphabetically before or equal to 'Right Record Dataset'."
+        )
+
+    if not all(
+        (df["Left Record ID"] < df["Right Record ID"])
+        | (df["Left Record Dataset"] != df["Right Record Dataset"])
+    ):
+        raise ValueError(
+            f"Data file {filepath} contains rows where 'Left Record ID' is not less than 'Right Record ID', though the records are from the same dataset."
+        )
+
+
+def validate_ids_to_remove(filepath: str) -> None:
+    """Validates a file containing IDs to remove.
+
+    - The file must contain a single column: "Record ID".
+    - "Record ID" must have unique values.
+
+    Parameters
+    ----------
+    filepath : str
+        The path to the file containing IDs to remove.
+
+    Raises
+    ------
+    LookupError
+        If the file is missing the "Record ID" column.
+    ValueError
+        If the "Record ID" column is not unique.
+    """
+    _validate_required_columns(filepath, {"Record ID"})
+    df = _read_file(filepath)
+    _validate_unique_column(df, "Record ID", filepath)
+
+
+def validate_records(filepath: str) -> None:
+    """Validates a file containing records.
+
+    - A file in a tabular format.
+    - The file may have any number of columns.
+    - Two columns must be called "Input Record Dataset" and "Input Record ID" and they must have unique values as a pair.
+
+    Parameters
+    ----------
+    filepath : str
+        The path to the file containing records.
+
+    Raises
+    ------
+    LookupError
+        If required columns are missing.
+    ValueError
+        If the ("Input Record Dataset", "Input Record ID") pair is not unique.
+    """
+    _validate_required_columns(filepath, {"Input Record Dataset", "Input Record ID"})
+    df = _read_file(filepath)
+    _validate_unique_column_set(df, {"Input Record Dataset", "Input Record ID"}, filepath)
+
+
+def validate_blocks(filepath: str) -> None:
+    """
+    Validates a directory containing blocks.
+
+    Each block subdirectory must contain exactly two files: a records file and a pairs file, both in tabular format.
+
+    Validation checks include:
+    - The parent directory must exist and be a directory.
+    - Each block subdirectory must contain exactly one records file (filename contains "records") and one pairs file (filename contains "pairs").
+    - The records file must have columns "Input Record Dataset" and "Input Record ID" with unique pairs.
+    - The pairs file must have columns "Left Record Dataset", "Left Record ID", "Right Record Dataset", and "Right Record ID".
+    - All values in ("Left Record Dataset", "Left Record ID") and ("Right Record Dataset", "Right Record ID") must exist in the records file.
+    - No row in the pairs file may have "Left Record Dataset" == "Right Record Dataset" and "Left Record ID" == "Right Record ID".
+    - All rows in the pairs file must be unique with respect to ("Left Record Dataset", "Left Record ID", "Right Record Dataset", "Right Record ID").
+    - "Left Record Dataset" must be alphabetically before or equal to "Right Record Dataset".
+    - "Left Record ID" must be less than "Right Record ID" if datasets match.
+    - No extra files are allowed in block subdirectories.
+
+    Parameters
+    ----------
+    filepath : str
+        Path to the directory containing block subdirectories.
+
+    Raises
+    ------
+    NotADirectoryError
+        If the provided path is not a directory.
+    FileNotFoundError
+        If a required records or pairs file is missing in any block.
+    LookupError
+        If required columns are missing in records or pairs files.
+    ValueError
+        If:
+        - ("Input Record Dataset", "Input Record ID") is not unique in the records file.
+        - ("Left Record Dataset", "Left Record ID") or ("Right Record Dataset", "Right Record ID") in the pairs file do not exist in the records file.
+        - "Left Record Dataset" == "Right Record Dataset" and "Left Record ID" == "Right Record ID" in any row of the pairs file.
+        - Duplicate rows exist in the pairs file.
+        - "Left Record Dataset" is not alphabetically before or equal to "Right Record Dataset" in any row.
+        - "Left Record ID" is not less than "Right Record ID" when datasets match.
+        - Extra files are present in a block subdirectory.
+    """
+    input_path = Path(filepath)
+
+    if not input_path.is_dir():
+        raise NotADirectoryError(f"The path {filepath} is not a directory.")
+
+    for block_dir in filter(lambda d: d.is_dir(), input_path.iterdir()):
+        files = {file.stem: file for file in block_dir.iterdir() if file.is_file()}
+        records_file = next((f for name, f in files.items() if "records" in name), None)
+        pairs_file = next((f for name, f in files.items() if "pairs" in name), None)
+
+        if len(files) > 2:
+            raise ValueError(f"Extra file(s) found in block directory {block_dir}.")
+
+        if not records_file or not pairs_file:
+            raise FileNotFoundError(
+                f"Block directory {block_dir} must contain both a records file and a pairs file."
+            )
+
+        # Validate records file
+        _validate_required_columns(records_file, {"Input Record Dataset", "Input Record ID"})
+        records_df = _read_file(records_file)
+        _validate_unique_column_set(
+            records_df, {"Input Record Dataset", "Input Record ID"}, records_file
+        )
+
+        # Validate pairs file
+        _validate_required_columns(
+            pairs_file,
+            {
+                "Left Record Dataset",
+                "Left Record ID",
+                "Right Record Dataset",
+                "Right Record ID",
+            },
+        )
+        pairs_df = _read_file(pairs_file)
+
+        # Check that all (dataset, ID) tuples in pairs exist in records
+        record_tuples = set(
+            records_df[["Input Record Dataset", "Input Record ID"]].itertuples(
+                index=False, name=None
+            )
+        )
+        missing_left = (
+            set(
+                pairs_df[["Left Record Dataset", "Left Record ID"]].itertuples(
+                    index=False, name=None
+                )
+            )
+            - record_tuples
+        )
+        missing_right = (
+            set(
+                pairs_df[["Right Record Dataset", "Right Record ID"]].itertuples(
+                    index=False, name=None
+                )
+            )
+            - record_tuples
+        )
+        if missing_left or missing_right:
+            raise ValueError(
+                f"In block {block_dir}, pairs file {pairs_file} contains records not found in records file {records_file}. "
+                f"Missing left records: {missing_left}, missing right records: {missing_right}"
+            )
+
+        _validate_pairs(pairs_df, pairs_file)
+
+
 def validate_dir(filepath: str) -> None:
+    """
+    Validates that the given path is a directory.
+
+    Parameters
+    ----------
+    filepath : str
+        The path to check.
+
+    Raises
+    ------
+    NotADirectoryError
+        If the path is not a directory.
+    """
     input_path = Path(filepath)
     if not input_path.is_dir():
         raise NotADirectoryError(f"The path {filepath} is not a directory.")
+
+
+def validate_dataset_dir(filepath: str) -> None:
+    """
+    Validates a directory containing a single dataset file.
+
+    Parameters
+    ----------
+    filepath : str
+        The path to the directory.
+
+    Raises
+    ------
+    NotADirectoryError
+        If the path is not a directory.
+    ValueError
+        If the directory contains more than one file.
+    FileNotFoundError
+        If the directory does not contain any files.
+    """
+    input_path = Path(filepath)
+    if not input_path.is_dir():
+        raise NotADirectoryError(f"The path {filepath} is not a directory.")
+
+    file_paths = [f for f in input_path.iterdir() if not str(f.stem).startswith(".")]
+    if len(file_paths) > 1:
+        raise ValueError(f"The directory {input_path} contains more than one file.")
+    if len(file_paths) == 0:
+        raise FileNotFoundError(f"The directory {input_path} does not contain any files.")
+
+    file_path = file_paths[0]
+    validate_dataset(file_path)
+
+
+def dont_validate(filepath: str) -> None:
+    """Placeholder function that performs no validation.
+
+    Parameters
+    ----------
+    filepath : str
+        The path to the file (not used).
+    """
+    pass
```
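The new validation module above carries most of this release's logic, so a brief usage sketch may help. The snippet below is not part of the diff: it assumes easylink 0.1.19 is installed, uses hypothetical dataset names ("census", "taxes"), and writes toy files to the working directory.

```python
import pandas as pd

from easylink.utilities.validation_utils import validate_dataset, validate_links

# Datasets need a unique, integer-valued "Record ID" column.
pd.DataFrame({"Record ID": [1, 2, 3], "name": ["a", "b", "c"]}).to_parquet(
    "toy_dataset.parquet"
)
validate_dataset("toy_dataset.parquet")  # passes silently

# Links must name both records, keep datasets in alphabetical order
# left-to-right, keep IDs strictly increasing within a dataset, and
# bound "Probability" to [0, 1].
links = pd.DataFrame(
    {
        "Left Record Dataset": ["census", "census"],
        "Left Record ID": [1, 2],
        "Right Record Dataset": ["taxes", "census"],
        "Right Record ID": [1, 3],
        "Probability": [0.9, 0.5],
    }
)
links.to_parquet("toy_links.parquet")
validate_links("toy_links.parquet")  # passes silently

# Violating any rule raises, e.g. a probability above 1:
links.assign(Probability=[0.9, 1.5]).to_parquet("bad_links.parquet")
try:
    validate_links("bad_links.parquet")
except ValueError as error:
    print(error)
```

Note that these validators load files through `_read_file`, so only Parquet and CSV inputs are accepted; anything else raises `NotImplementedError`.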
{easylink-0.1.17.dist-info → easylink-0.1.19.dist-info}/METADATA

```diff
@@ -1,12 +1,13 @@
 Metadata-Version: 2.4
 Name: easylink
-Version: 0.1.17
+Version: 0.1.19
 Summary: Research repository for the EasyLink ER ecosystem project.
 Home-page: https://github.com/ihmeuw/easylink
 Author: The EasyLink developers
 Author-email: vivarium.dev@gmail.com
 License: BSD-3-Clause
 Description-Content-Type: text/x-rst
+License-File: LICENSE
 Requires-Dist: click
 Requires-Dist: docker
 Requires-Dist: graphviz
@@ -16,7 +17,9 @@ Requires-Dist: networkx
 Requires-Dist: pandas
 Requires-Dist: pyyaml
 Requires-Dist: pyarrow
+Requires-Dist: requests
 Requires-Dist: snakemake>=8.0.0
+Requires-Dist: tqdm
 Requires-Dist: snakemake-interface-executor-plugins<9.0.0
 Requires-Dist: snakemake-executor-plugin-slurm
 Requires-Dist: pandas-stubs
@@ -52,6 +55,7 @@ Dynamic: description
 Dynamic: description-content-type
 Dynamic: home-page
 Dynamic: license
+Dynamic: license-file
 Dynamic: provides-extra
 Dynamic: requires-dist
 Dynamic: summary
```
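Beyond the version bump, the metadata now ships a license file and adds `requests` and `tqdm` as runtime requirements. A minimal sketch (not part of the diff) for confirming the installed version and requirement set using only the standard library:

```python
from importlib.metadata import requires, version

print(version("easylink"))  # expect "0.1.19"
# The requirement list should now include "requests" and "tqdm".
for requirement in requires("easylink") or []:
    print(requirement)
```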
easylink-0.1.19.dist-info/RECORD

```diff
@@ -0,0 +1,91 @@
+easylink/__about__.py,sha256=2-oxCfu9t9yUJouLDwqYRZ0eii8kN25SxRzsawjWjho,440
+easylink/__init__.py,sha256=gGMcIVfiVnHtlDw5mZwhevcDb2wt-kuP6F64gnkFack,159
+easylink/_version.py,sha256=cAJAbAh288a9AL-3yxwFzEM1L26izSJ6wma5aiml_9Y,23
+easylink/cli.py,sha256=zQO4lOVoZ3eVgPVWT2sCF4zNoKgiDJP9ReTh2Myr9jc,10307
+easylink/configuration.py,sha256=hgmG5SIbYqnHDHfk44Gr3QX7C3yTaEVW6GuKeMqvu6c,12689
+easylink/graph_components.py,sha256=zZDZXg5smReHO3ryQC4pao24wyKXzWDe6jS3C6fM2ak,13892
+easylink/implementation.py,sha256=H46WjW9O3csaVAU7qLto3aOu1bSfVOBS0ZySBBX05o0,14544
+easylink/implementation_metadata.yaml,sha256=GoU_aWjVryG8-xjUHkC2nCUeznmYD0BwfJYnNrpZ8P4,10670
+easylink/pipeline.py,sha256=LC0mwboLfe84Mbju9manJjN00Kup4jauiugLlgGCz6I,17884
+easylink/pipeline_graph.py,sha256=9ysX4wAkA-WkUoo15jSLAErncybE4tJwznVx7N_kwIA,23922
+easylink/pipeline_schema.py,sha256=FieJBa3rKgaCIB9QDuQEfWJ9joNBUUp6iHT6xmns-Vk,6886
+easylink/rule.py,sha256=NusEUtBxx18L7UCcgDi3KKooFxSUgyS4eisVM5aPqFE,16770
+easylink/runner.py,sha256=Z9GKgiUAWtp0dW1cyAp86MGthIDeABJtHDXUtzv0-kE,6714
+easylink/step.py,sha256=NGy1KNqM4eXP7kP0kdfcfyGc4K_ExSCSidCdW3h0Qg8,89902
+easylink/devtools/implementation_creator.py,sha256=1WQOOrjQYOhjjp8MQM9j1xoeAp-SW51A1f1oW4G792I,18251
+easylink/images/spark_cluster/Dockerfile,sha256=3PHotbR4jdjVYRHOJ0VQW55b5Qd4tQ1pLLQMrTKWVA0,576
+easylink/images/spark_cluster/README.md,sha256=KdgSttZRplNNWqHn4K1GTsTIab3dTOSG4V99QPLxSp8,569
+easylink/pipeline_schema_constants/__init__.py,sha256=xYymSjTeH3prvQL_rgGFVrriohANFtW_cy0vDwlF3ds,1355
+easylink/pipeline_schema_constants/development.py,sha256=XxcYYZDZM4IADp3eFPQCchD6-OtMp99GiyZBfSswzFo,12640
+easylink/pipeline_schema_constants/main.py,sha256=9IxAjgQej7AaV-zYZEFhG8U-v_rYBFaPuNS3Y3m4Sho,22929
+easylink/pipeline_schema_constants/testing.py,sha256=UDmVVjI1SiDktMbJ2CrSb7amHSYNwhgqNkXhl4lYxQw,20459
+easylink/steps/cascading/exclude_clustered.def,sha256=GfoDqO2Vtsh7VI8SwGaJtv_KtKjs-UmBcivqQ7OPkjk,503
+easylink/steps/cascading/exclude_clustered.py,sha256=NSA6GZBzGa7e6CH4tacCGfr0Y9sUM29g9Nf8NquHB44,2612
+easylink/steps/cascading/exclude_none.def,sha256=iFUhUMocxtkA0NErkjVrBxY0MUdS3DIPNsbCpTJRP0k,488
+easylink/steps/cascading/exclude_none.py,sha256=KntBX3q-V47d96ztOlPNRY_kCFJNi1LNYQ7UNs5wB4c,2507
+easylink/steps/cascading/update_clusters_by_connected_components.def,sha256=sAAAWOod8EuAnotR1cayaGAvs7x6xoMVlwmLso_a9Cc,578
+easylink/steps/cascading/update_clusters_by_connected_components.py,sha256=w7tAOs2QtIIcpTDxw2P_dqMIR-BFa-wi-OmZwrKyhmg,3309
+easylink/steps/default/default_clusters_to_links.def,sha256=9PjUygLvsoYMUZDznceuuv55t8fPs473P57J_RMl3U0,527
+easylink/steps/default/default_clusters_to_links.py,sha256=EIYeP0lj0plBl2OpTRuv3iDEQl-zNVJONUg0kgKSEF0,2848
+easylink/steps/default/default_determining_exclusions.def,sha256=zZUEHDdrpLxzx3gTm-dki2ge5ivCCg4ziIwTErqCII0,542
+easylink/steps/default/default_determining_exclusions.py,sha256=tF2lcga-6n99shgYEmhpNuqok33u7dcW9q5wV3xgp5w,2661
+easylink/steps/default/default_removing_records.def,sha256=QqacmOu6myxFSULHRKeKsVD8l73KDm4VEkPkPlovwqs,524
+easylink/steps/default/default_removing_records.py,sha256=LIlFS8EvJ6h5XqEfgWZYyIAjcKj7Oo8_I5a-vXHOozs,1938
+easylink/steps/default/default_schema_alignment.def,sha256=hFHJkos0Fhe4LvpjLOCd6klIaIqOKqECDDSTVu3G03Y,524
+easylink/steps/default/default_schema_alignment.py,sha256=Uxi6uTFveFKSiiRZG9MnTXOklQngSKGMafqnvKDc0rY,1459
+easylink/steps/default/default_updating_clusters.def,sha256=vDzSkTknDfeiXeHREpw4BkUxFcTWamxr81c3rZ7_enY,527
+easylink/steps/default/default_updating_clusters.py,sha256=A-lO3ussM1Ntffp-ZyPQGbbxZg4QNiZ8AvSOGVJDXnA,2139
+easylink/steps/dev/README.md,sha256=u9dZUggpY2Lf2qb-xkDLWWgHjcmi4osbQtzSNo4uklE,4549
+easylink/steps/dev/build-containers-local.sh,sha256=Wy3pfcyt7I-BNvHcr7ZXDe0g5Ihd00BIPqt9YuRbLeA,259
+easylink/steps/dev/build-containers-remote.sh,sha256=Hy-kaaXf-ta6n8SzOz_ahByjMY5T7J71MvzXRXDvQw8,271
+easylink/steps/dev/test.py,sha256=4iudKBD6CFz2CxbjSBUkc8LCWlMl-Nmw_rB35ZN6TrQ,6835
+easylink/steps/dev/input_data/create_input_files.ipynb,sha256=rHRUFXwTuNXWm8TyaZWFCOewZMDw1G6yiioF8h90erY,2123
+easylink/steps/dev/input_data/input_file_1.csv,sha256=mtu3_ldNTxS9PFtLsZzRspaCCuG_yLxhUdl2ZQFwqrE,88906
+easylink/steps/dev/input_data/input_file_1.parquet,sha256=Km8jRyfGNdq0MFdz_-bewlAc06cFjWn2dWQ7YKKGa5U,60399
+easylink/steps/dev/input_data/input_file_2.csv,sha256=YqKLZDC4d-aYN8Dh9OB6iQWWUKmvueu5CszckH1AApU,100016
+easylink/steps/dev/input_data/input_file_2.parquet,sha256=Vpo0sUqQ78qlWLRk8p303Nh89BVcK4uvXJljRGHmsWk,60392
+easylink/steps/dev/python_pandas/README.md,sha256=c_FbtkKKOTjt2R_LfHUo5lBga1qHiYkxLdQeewRr45g,977
+easylink/steps/dev/python_pandas/dummy_step.py,sha256=NvhLUZu40B3Xbj_S-chQ6IkYUPr6X2aGBxYUa3DqwmY,4362
+easylink/steps/dev/python_pandas/python_pandas.def,sha256=24cxwGF8Cqkv2a1zVsu94MfC_bAXBqAINLwfW2zyB_0,769
+easylink/steps/dev/python_pyspark/README.md,sha256=di29SAfcdTTpar7gdoJRLqKrL8DEfNeayYUyaywdhUg,1563
+easylink/steps/dev/python_pyspark/dummy_step.py,sha256=wxHHI3Uv8MTipKG2ffHbT_eL4JkoNpx49bJoErXumdc,5003
+easylink/steps/dev/python_pyspark/python_pyspark.def,sha256=j_RmVjspmXGOhJTr10ED13RYfbimgxRU3WVTL7VOIUQ,915
+easylink/steps/dev/r/README.md,sha256=dPjZdDTqcJsZCiwhddzlOj1ob0P7YocZUNFrLIGM1-0,1201
+easylink/steps/dev/r/dummy_step.R,sha256=1TWZY8CEkT6gavrulBxFsKbDSKJJjk0NtJrGH7TIikE,4975
+easylink/steps/dev/r/r-image.def,sha256=LrhXlt0C3k7d_VJWopRPEVARnFWSuq_oILlwo7g03bE,627
+easylink/steps/fastLink/fastLink_evaluating_pairs.R,sha256=fQRrTPrgb1t5hrQi0V5H55J-PHdWjsATrVRYdXNYtdU,4603
+easylink/steps/fastLink/fastLink_evaluating_pairs.def,sha256=5rDi-cmWhyuFEsiGFPpTxtySMqq5TpgJG-y8g_MtEvA,509
+easylink/steps/fastLink/fastLink_links_to_clusters.R,sha256=exVzJl4r7k7cRlMCHSmigOqTlxShqzK-FO3EDhlPksg,4087
+easylink/steps/fastLink/fastLink_links_to_clusters.def,sha256=1xYjOMsHtSS2-AI4EC2r6kL8ZX5F2JhmvESefEKeJVY,512
+easylink/steps/output_dir/dummy_step_1_for_output_dir_example.def,sha256=CkQVG-uDRQ9spAavdkZbhx2GD_fRsKZGELPrr8yltsc,550
+easylink/steps/output_dir/dummy_step_1_for_output_dir_example.py,sha256=dI0OWugE35ABLcSwsI-T3C4dvuPTKXwjE52dtSsCo8Y,428
+easylink/steps/output_dir/dummy_step_2_for_output_dir_example.def,sha256=9gShg1EDJEHZcz7Z5VfZ1A4Gpm9XQes8ezn6rAZDgDM,550
+easylink/steps/output_dir/dummy_step_2_for_output_dir_example.py,sha256=DMJW5TXjhELxhY4U9q2RpLjqxlS1YSosTGL2AfRnaZM,521
+easylink/steps/rl-dummy/canonicalizing_and_downstream_analysis/dummy_canonicalizing_and_downstream_analysis.def,sha256=YOWtJZxoe-kHFeEyrgGcVGfdqcbD_Fg17A9shOaK-yc,584
+easylink/steps/rl-dummy/canonicalizing_and_downstream_analysis/dummy_canonicalizing_and_downstream_analysis.py,sha256=skZUiZWcSXAOqq8TAlN5I0wztXgCWHQYA_xkuiL5s28,1202
+easylink/steps/rl-dummy/input_data/create_input_files.ipynb,sha256=uXvJ8zTTPg0077HgA7MhQ_9jD-aeISFLeMeEBbSnOu8,54498
+easylink/steps/rl-dummy/input_data/input_file_1.parquet,sha256=GQ_7v7ucwdJn-9mTgKVcvqkJ5gTkwb0B7y38mfOYbic,15200
+easylink/steps/rl-dummy/input_data/input_file_2.parquet,sha256=Y4eseBm0HmFroksQr_VApgozRL8h8u7nQO6x_Utyns8,14902
+easylink/steps/rl-dummy/input_data/known_clusters.parquet,sha256=Ysodu65toHZN4AgjVJsm0ueUxPIZAJjbtRm9SVM08JE,2598
+easylink/steps/rl-dummy/pre-processing/dummy_pre-processing.def,sha256=HeUSv2QvMOQzsyVktYR1xYoEqwiNpDo-p7IRcGSMspE,512
+easylink/steps/rl-dummy/pre-processing/dummy_pre-processing.py,sha256=aeDgn9z2um0oTPNSwPcTkBou3-1ajud_MWhkuRoHdOU,1884
+easylink/steps/splink/splink_blocking_and_filtering.def,sha256=foAQAPvhDEkXkevpghS-uftsTbIQnQy9PvTkyldQeAA,539
+easylink/steps/splink/splink_blocking_and_filtering.py,sha256=8-_a9PkOmKSa-8TJ9YMjqI7gLo-YD9JCAO1f8uBhdoE,4469
+easylink/steps/splink/splink_evaluating_pairs.def,sha256=DN3Ohy9qJOAyK58v164neP23HDVYpedMqzCu4eQh4Hg,521
+easylink/steps/splink/splink_evaluating_pairs.py,sha256=JR2qVgb14cNZKozDyOrN11nr1mXOwWv69E6WP0pRlMw,5713
+easylink/steps/splink/splink_links_to_clusters.def,sha256=RurvOYyGjNs9tx64DTXwI-GSgHD4T7SzDfhAH18pTEM,524
+easylink/steps/splink/splink_links_to_clusters.py,sha256=z5ymdYl9ytp1e5MA6vn8wpGRFWVuhh23LqGq8NJJxZQ,1936
+easylink/utilities/__init__.py,sha256=0U33kbv4hoMfFQ_lh5hLwifxRPzOgkLkjKLYxmaK10g,196
+easylink/utilities/aggregator_utils.py,sha256=pqBog6kEX4MXBBMjQtHFlE5gEMqRWb5VFl64u0Lr__g,972
+easylink/utilities/data_utils.py,sha256=XPRjq3qW_fN0xQ23Jms_xBzpTHbRwqZWDP1AW0nYkP0,6926
+easylink/utilities/general_utils.py,sha256=El1W0nn4P27sRBGotNQb-9du-Gbhk9ggSuu4vmGDfwo,4591
+easylink/utilities/paths.py,sha256=9inDKMPorAaWh5vhjc3j1Tj_aXVKhLdodiJO9H1nNes,947
+easylink/utilities/spark.smk,sha256=kGtpem7LfQc71tMh5WAYaqKnHQKFvcdhPQSdumOP70k,5799
+easylink/utilities/splitter_utils.py,sha256=UOz4hjkEPqaAz0RrDkDYYej79lLSaq0VVVSH_tF1z0o,3838
+easylink/utilities/validation_utils.py,sha256=DBJB2TLVklgYw1WaaPse9vqtwPLMGmZNYM2cbCZsoHI,18417
+easylink-0.1.19.dist-info/licenses/LICENSE,sha256=z6NBo3hY2Pbh2Wjm9IcHLyQoG-kd816uASh8PbwhTcQ,1530
+easylink-0.1.19.dist-info/METADATA,sha256=nFZA-jZKgZUG4DdiDqY-pNOTfdt1H3QeiwNzvo27vpg,3565
+easylink-0.1.19.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+easylink-0.1.19.dist-info/entry_points.txt,sha256=OGMZDFltg3yMboT7XjJt3joiPhRfV_7jnREVtrAIQNU,51
+easylink-0.1.19.dist-info/top_level.txt,sha256=oHcOpcF_jDMWFiJRzfGQvuskENGDjSPC_Agu9Z_Xvik,9
+easylink-0.1.19.dist-info/RECORD,,
```