metameq-2026.1.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- metameq/__init__.py +42 -0
- metameq/_version.py +21 -0
- metameq/config/__init__.py +0 -0
- metameq/config/config.yml +3 -0
- metameq/config/standards.yml +1648 -0
- metameq/src/__init__.py +0 -0
- metameq/src/__main__.py +34 -0
- metameq/src/metadata_configurator.py +512 -0
- metameq/src/metadata_extender.py +1168 -0
- metameq/src/metadata_merger.py +362 -0
- metameq/src/metadata_transformers.py +335 -0
- metameq/src/metadata_validator.py +387 -0
- metameq/src/util.py +299 -0
- metameq/tests/__init__.py +0 -0
- metameq/tests/data/invalid.yml +1 -0
- metameq/tests/data/test_config.yml +9 -0
- metameq/tests/test_metadata_configurator.py +2334 -0
- metameq/tests/test_metadata_extender.py +2610 -0
- metameq/tests/test_metadata_merger.py +657 -0
- metameq/tests/test_metadata_transformers.py +277 -0
- metameq/tests/test_metadata_validator.py +1191 -0
- metameq/tests/test_util.py +436 -0
- metameq-2026.1.1.dist-info/METADATA +21 -0
- metameq-2026.1.1.dist-info/RECORD +27 -0
- metameq-2026.1.1.dist-info/WHEEL +5 -0
- metameq-2026.1.1.dist-info/entry_points.txt +2 -0
- metameq-2026.1.1.dist-info/top_level.txt +1 -0
metameq/src/metadata_validator.py
ADDED
@@ -0,0 +1,387 @@
import cerberus
import copy
from datetime import datetime
from dateutil import parser
import logging
import os
from pathlib import Path
from metameq.src.util import SAMPLE_NAME_KEY, get_extension

_TYPE_KEY = "type"
_ANYOF_KEY = "anyof"

# Define a logger for this module
logger = logging.getLogger(__name__)


class MetameqValidator(cerberus.Validator):
    """Custom cerberus Validator with metameq-specific validation rules.

    Extends the cerberus Validator class to add custom check_with rules
    for validating metadata fields according to metameq requirements.
    Custom rules are invoked by including "check_with" in a field's
    cerberus schema definition.

    See Also
    --------
    https://docs.python-cerberus.org/customize.html
    """

    def _check_with_date_not_in_future(self, field, value):
        """Validate that a date field value is not in the future.

        This method is automatically invoked by cerberus when a field's schema
        includes "check_with": "date_not_in_future". It parses the value as a
        date and validates that it is not after the current date/time.

        Parameters
        ----------
        field : str
            The name of the field being validated.
        value : str
            The date string to validate.

        Notes
        -----
        Adds a validation error if:
        - The value cannot be parsed as a valid date
        - The parsed date is in the future
        """
        # convert the field string to a date
        try:
            putative_date = parser.parse(value, fuzzy=True, dayfirst=False)
        except Exception:  # noqa: E722
            self._error(field, "Must be a valid date")
            return

        if putative_date > datetime.now():
            self._error(field, "Date cannot be in the future")

def validate_metadata_df(metadata_df, sample_type_full_metadata_fields_dict):
    """Validate a metadata DataFrame against a field definition schema.

    Converts the metadata fields dictionary into a cerberus schema, casts
    each field in the DataFrame to its expected type, and validates all rows
    against the schema. Fields defined in the schema but missing from the
    DataFrame are logged and skipped.

    Parameters
    ----------
    metadata_df : pandas.DataFrame
        The metadata DataFrame to validate. Must contain a SAMPLE_NAME_KEY
        column for identifying samples in validation error messages.
    sample_type_full_metadata_fields_dict : dict
        A dictionary defining metadata fields and their validation rules.
        May contain metameq-specific keys (is_phi, field_desc, units,
        min_exclusive, unique) which will be stripped before cerberus
        validation, as well as standard cerberus keys (type, required,
        allowed, regex, etc.).

    Returns
    -------
    list
        A list of dictionaries containing validation errors. Each dictionary
        contains SAMPLE_NAME_KEY, "field_name", and "error_message" keys.
        Returns an empty list if all rows pass validation.
    """
    config = _make_cerberus_schema(sample_type_full_metadata_fields_dict)

    # NB: typed_metadata_df (the type-cast version of metadata_df) is only
    # used for generating validation messages, after which it is discarded.
    typed_metadata_df = metadata_df.copy()
    for curr_field, curr_definition in \
            sample_type_full_metadata_fields_dict.items():

        if curr_field not in typed_metadata_df.columns:
            logging.info(
                f"Standard field {curr_field} not in metadata file")
            continue

        curr_allowed_types = _get_allowed_pandas_types(
            curr_field, curr_definition)
        typed_metadata_df[curr_field] = typed_metadata_df[curr_field].apply(
            lambda x: _cast_field_to_type(x, curr_allowed_types))
    # next field in config

    validation_msgs = _generate_validation_msg(typed_metadata_df, config)
    return validation_msgs
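
A usage sketch (field definitions and values invented for the example; in practice they come from the package's standards and the study configuration):

    import pandas

    fields = {
        "sample_name": {"type": "string", "required": True},
        "sample_site": {"type": "string", "allowed": ["gut", "skin"]},
    }
    metadata_df = pandas.DataFrame([
        {"sample_name": "s1", "sample_site": "gut"},
        {"sample_name": "s2", "sample_site": "soil"},
    ])
    validate_metadata_df(metadata_df, fields)
    # -> [{'sample_name': 's2', 'field_name': 'sample_site',
    #      'error_message': ['unallowed value soil']}]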

def output_validation_msgs(validation_msgs_df, out_dir, out_base, sep="\t",
                           suppress_empty_fails=False):
    """Write validation messages to a timestamped file.

    Outputs the validation messages DataFrame to a file with a timestamp
    prefix. If the DataFrame is empty and suppress_empty_fails is False,
    creates an empty file. If suppress_empty_fails is True and the DataFrame
    is empty, no file is created.

    Parameters
    ----------
    validation_msgs_df : pandas.DataFrame
        DataFrame containing validation error messages.
    out_dir : str
        Directory where the output file will be written.
    out_base : str
        Base name for the output file. The full filename will be
        "{timestamp}_{out_base}_validation_errors.{extension}".
    sep : str, default="\t"
        Separator to use in the output file. Determines file extension
        (tab -> .txt, comma -> .csv).
    suppress_empty_fails : bool, default=False
        If True, no file is created when validation_msgs_df is empty.
        If False, an empty file is created when there are no validation
        errors.
    """
    timestamp_str = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
    extension = get_extension(sep)
    out_fp = os.path.join(
        out_dir, f"{timestamp_str}_{out_base}_validation_errors.{extension}")

    if validation_msgs_df.empty:
        if not suppress_empty_fails:
            Path(out_fp).touch()
        # else, just do nothing
    else:
        validation_msgs_df.to_csv(out_fp, sep=sep, index=False)


def _make_cerberus_schema(sample_type_metadata_dict):
    """Convert a metadata fields dictionary into a cerberus-compatible
    validation schema.

    Creates a deep copy of the input dictionary and removes keys that are not
    recognized by the cerberus validation library (is_phi, field_desc, units,
    min_exclusive, unique). The resulting dictionary can be used directly with
    cerberus for validation.

    Parameters
    ----------
    sample_type_metadata_dict : dict
        A dictionary containing metadata field definitions, potentially
        including keys that are not recognized by cerberus.

    Returns
    -------
    dict
        A cerberus-compatible schema with unrecognized keys removed.
    """
    unrecognized_keys = ['is_phi', 'field_desc', 'units',
                         'min_exclusive', 'unique']
    # traverse the host_fields_config dict and remove any keys that are not
    # recognized by cerberus
    cerberus_config = copy.deepcopy(sample_type_metadata_dict)
    cerberus_config = _remove_leaf_keys_from_dict(
        cerberus_config, unrecognized_keys)

    return cerberus_config
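
To illustrate the stripping step, a hypothetical field definition and its cerberus-ready form:

    field_defs = {
        "host_weight": {
            "type": "number",
            "units": "kg",           # metameq-only key, stripped
            "is_phi": False,         # metameq-only key, stripped
            "field_desc": "weight",  # metameq-only key, stripped
            "required": True,
        }
    }
    _make_cerberus_schema(field_defs)
    # -> {'host_weight': {'type': 'number', 'required': True}}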

def _remove_leaf_keys_from_dict(input_dict, keys_to_remove):
    """Remove specified leaf keys from a dictionary, recursively processing
    nested structures.

    Traverses the input dictionary and removes any keys with non-dict,
    non-list (leaf) values that are in the keys_to_remove list. Keys with
    dict or list values are always preserved (even if they match one of the
    keys_to_remove), with their contents recursively processed. For lists,
    delegates to _remove_leaf_keys_from_dict_in_list. Non-dict, non-list
    values are deep-copied if their key is not being removed.

    Parameters
    ----------
    input_dict : dict
        The dictionary to process.
    keys_to_remove : list
        List of key names to remove from the dictionary and any nested dicts.
        Only keys with non-dict, non-list values will be removed.

    Returns
    -------
    dict
        A new dictionary with the specified leaf keys removed at all nesting
        levels.
    """
    output_dict = {}
    for curr_key, curr_val in input_dict.items():
        if isinstance(curr_val, dict):
            output_dict[curr_key] = \
                _remove_leaf_keys_from_dict(curr_val, keys_to_remove)
        elif isinstance(curr_val, list):
            output_dict[curr_key] = \
                _remove_leaf_keys_from_dict_in_list(curr_val, keys_to_remove)
        else:
            if curr_key not in keys_to_remove:
                output_dict[curr_key] = copy.deepcopy(curr_val)
    return output_dict


def _remove_leaf_keys_from_dict_in_list(input_list, keys_to_remove):
    """Remove specified leaf keys from all dictionaries within a list.

    Recursively processes the input list and removes any keys with non-dict,
    non-list (leaf) values that are in the keys_to_remove list from any
    dictionaries found using _remove_leaf_keys_from_dict. Handles nested
    lists and dictionaries at any depth. Non-dict, non-list items are
    preserved unchanged.

    Parameters
    ----------
    input_list : list
        The list to process. May contain dicts, nested lists, or other values.
    keys_to_remove : list
        List of key names to remove from any dictionaries found.
        Only keys with non-dict, non-list values will be removed.

    Returns
    -------
    list
        A new list with the specified leaf keys removed from all contained
        dicts.
    """
    output_list = []
    for curr_val in input_list:
        if isinstance(curr_val, dict):
            output_list.append(
                _remove_leaf_keys_from_dict(curr_val, keys_to_remove))
        elif isinstance(curr_val, list):
            output_list.append(
                _remove_leaf_keys_from_dict_in_list(curr_val, keys_to_remove))
        else:
            output_list.append(curr_val)
    return output_list


def _cast_field_to_type(raw_field_val, allowed_pandas_types):
    """Cast a field value to one of the allowed Python types.

    Attempts to cast the raw field value to each type in allowed_pandas_types
    in order, returning the first successful cast. This allows flexible type
    coercion where a value might be validly interpreted as multiple types.

    Parameters
    ----------
    raw_field_val : any
        The raw value to cast.
    allowed_pandas_types : list
        A list of Python type callables (e.g., str, int, float) to attempt
        casting to, in order of preference.

    Returns
    -------
    any
        The field value cast to the first successfully matched type.

    Raises
    ------
    ValueError
        If the value cannot be cast to any of the allowed types.
    """
    typed_field_val = None
    for curr_type in allowed_pandas_types:
        # noinspection PyBroadException
        try:
            typed_field_val = curr_type(raw_field_val)
            break
        except Exception:  # noqa: E722
            pass
    # next allowed type

    if typed_field_val is None:
        raise ValueError(
            f"Unable to cast '{raw_field_val}' to any of the allowed "
            f"types: {allowed_pandas_types}")

    return typed_field_val
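
A small illustration of the order-of-preference behavior (values invented):

    _cast_field_to_type("7", [int, str])      # -> 7 (int succeeds first)
    _cast_field_to_type("7.5", [int, str])    # -> "7.5" (int("7.5") fails)
    _cast_field_to_type("n/a", [int, float])  # raises ValueError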

def _get_allowed_pandas_types(field_name, field_definition):
    """Extract allowed Python types from a cerberus field definition.

    Reads the type specification from a cerberus field definition and converts
    the cerberus type names to their corresponding Python types. Handles both
    single-type definitions (using "type" key) and multiple-type definitions
    (using "anyof" key with a list of type options).

    Parameters
    ----------
    field_name : str
        The name of the field being processed. Used only for error messages.
    field_definition : dict
        A cerberus field definition dictionary containing either a "type" key
        with a single type name, or an "anyof" key with a list of type
        options.

    Returns
    -------
    list
        A list of Python type callables (str, int, float, bool, or
        datetime.date) corresponding to the allowed cerberus types for this
        field.

    Raises
    ------
    ValueError
        If the field definition contains neither a "type" nor an "anyof" key.
    """
    cerberus_to_python_types = {
        "string": str,
        "integer": int,
        "float": float,
        "number": float,
        "bool": bool,
        "datetime": datetime.date}

    allowed_cerberus_types = []
    if _TYPE_KEY in field_definition:
        allowed_cerberus_types.append(field_definition.get(_TYPE_KEY))
    elif _ANYOF_KEY in field_definition:
        for curr_allowed_type_entry in field_definition[_ANYOF_KEY]:
            allowed_cerberus_types.append(
                curr_allowed_type_entry[_TYPE_KEY])
        # next anyof entry
    else:
        raise ValueError(
            f"Unable to find type definition for field '{field_name}'")
    # if type or anyof key in definition

    allowed_pandas_types = \
        [cerberus_to_python_types[x] for x in allowed_cerberus_types]
    return allowed_pandas_types
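
For example, a hypothetical definition allowing either a string or a number maps as follows:

    _get_allowed_pandas_types(
        "host_weight", {"anyof": [{"type": "string"}, {"type": "number"}]})
    # -> [str, float]
    _get_allowed_pandas_types("host_age", {"type": "integer"})
    # -> [int]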

def _generate_validation_msg(typed_metadata_df, config):
    """Generate validation error messages for a metadata DataFrame.

    Validates each row of the metadata DataFrame against the provided cerberus
    schema configuration and collects any validation errors into a list of
    dictionaries.

    Parameters
    ----------
    typed_metadata_df : pandas.DataFrame
        A metadata DataFrame with values already cast to their expected types.
        Must contain a SAMPLE_NAME_KEY column for identifying samples.
    config : dict
        A cerberus-compatible validation schema dictionary defining the
        validation rules for each metadata field.

    Returns
    -------
    list
        A list of dictionaries, where each dictionary contains:
        - SAMPLE_NAME_KEY: The sample name for the row with the error
        - "field_name": The name of the field that failed validation
        - "error_message": The validation error message(s) from cerberus as a
          list of strings
        Returns an empty list if all rows pass validation.
    """
    v = MetameqValidator()
    v.allow_unknown = True

    validation_msgs = []
    raw_metadata_dict = typed_metadata_df.to_dict(orient="records")
    for _, curr_row in enumerate(raw_metadata_dict):
        if not v.validate(curr_row, config):
            curr_sample_name = curr_row[SAMPLE_NAME_KEY]
            for curr_field_name, curr_err_msg in v.errors.items():
                validation_msgs.append({
                    SAMPLE_NAME_KEY: curr_sample_name,
                    "field_name": curr_field_name,
                    "error_message": curr_err_msg})
            # next error for curr row
        # endif row is not valid
    # next row

    return validation_msgs

metameq/src/util.py
ADDED
@@ -0,0 +1,299 @@
import copy
from importlib.resources import files
import pandas
from typing import List, Optional, Union, Callable
import yaml

CONFIG_MODULE_PATH = "metameq.config"

# config keys
METADATA_FIELDS_KEY = "metadata_fields"
STUDY_SPECIFIC_METADATA_KEY = "study_specific_metadata"
HOST_TYPE_SPECIFIC_METADATA_KEY = "host_type_specific_metadata"
SAMPLE_TYPE_KEY = "sample_type"
QIITA_SAMPLE_TYPE = "qiita_sample_type"
SAMPLE_TYPE_SPECIFIC_METADATA_KEY = "sample_type_specific_metadata"
METADATA_TRANSFORMERS_KEY = "metadata_transformers"
PRE_TRANSFORMERS_KEY = "pre_transformers"
POST_TRANSFORMERS_KEY = "post_transformers"
ALIAS_KEY = "alias"
BASE_TYPE_KEY = "base_type"
DEFAULT_KEY = "default"
REQUIRED_KEY = "required"
ALLOWED_KEY = "allowed"
ANYOF_KEY = "anyof"
TYPE_KEY = "type"
SOURCES_KEY = "sources"
FUNCTION_KEY = "function"
LEAVE_REQUIREDS_BLANK_KEY = "leave_requireds_blank"
OVERWRITE_NON_NANS_KEY = "overwrite_non_nans"

# internal code keys
HOSTTYPE_SHORTHAND_KEY = "hosttype_shorthand"
SAMPLETYPE_SHORTHAND_KEY = "sampletype_shorthand"
QC_NOTE_KEY = "qc_note"

# metadata keys
SAMPLE_NAME_KEY = "sample_name"
COLLECTION_TIMESTAMP_KEY = "collection_timestamp"
HOST_SUBJECT_ID_KEY = "host_subject_id"

# constant field values
NOT_PROVIDED_VAL = "not provided"
LEAVE_BLANK_VAL = "leaveblank"
DO_NOT_USE_VAL = "donotuse"

# required raw metadata fields
REQUIRED_RAW_METADATA_FIELDS = [SAMPLE_NAME_KEY,
                                HOSTTYPE_SHORTHAND_KEY,
                                SAMPLETYPE_SHORTHAND_KEY]


def extract_config_dict(
        config_fp: Union[str, None]) -> dict:
    """Extract configuration dictionary from a YAML file.

    If no config file path is provided, looks for config.yml in the "config"
    module of the package.

    Parameters
    ----------
    config_fp : Union[str, None]
        Path to the configuration YAML file. If None, will look for config.yml
        in the "config" module of the package.

    Returns
    -------
    dict
        Configuration dictionary loaded from the YAML file.

    Raises
    ------
    FileNotFoundError
        If the config file cannot be found.
    yaml.YAMLError
        If the YAML file is invalid.
    """
    if config_fp is None:
        config_dir = files(CONFIG_MODULE_PATH)
        config_fp = config_dir.joinpath("config.yml")

    # read in config file
    config_dict = extract_yaml_dict(config_fp)
    return config_dict


def extract_yaml_dict(yaml_fp: str) -> dict:
    """Extract dictionary from a YAML file.

    Parameters
    ----------
    yaml_fp : str
        Path to the YAML file.

    Returns
    -------
    dict
        Dictionary loaded from the YAML file.

    Raises
    ------
    FileNotFoundError
        If the YAML file cannot be found.
    yaml.YAMLError
        If the YAML file is invalid.
    """
    with open(yaml_fp, "r") as f:
        yaml_dict = yaml.safe_load(f)
    return yaml_dict


def extract_stds_config(stds_fp: Union[str, None]) -> dict:
    """Extract standards dictionary from a YAML file.

    If no standards file path is provided, looks for standards.yml in the
    "config" module of the package.

    Parameters
    ----------
    stds_fp : Union[str, None]
        Path to the standards YAML file. If None, will look for
        standards.yml in the "config" module.

    Returns
    -------
    dict
        Standards dictionary loaded from the YAML file.

    Raises
    ------
    FileNotFoundError
        If the standards file cannot be found.
    yaml.YAMLError
        If the YAML file is invalid.
    """
    if not stds_fp:
        config_dir = files(CONFIG_MODULE_PATH)
        stds_fp = config_dir.joinpath("standards.yml")
    return extract_config_dict(stds_fp)


def deepcopy_dict(input_dict: dict) -> dict:
    """Create a deep copy of a dictionary, including nested dictionaries.

    Parameters
    ----------
    input_dict : dict
        Dictionary to be copied.

    Returns
    -------
    dict
        Deep copy of the input dictionary.
    """
    output_dict = {}
    for curr_key, curr_val in input_dict.items():
        if isinstance(curr_val, dict):
            output_dict[curr_key] = deepcopy_dict(curr_val)
        else:
            output_dict[curr_key] = copy.deepcopy(curr_val)
    return output_dict


def load_df_with_best_fit_encoding(
        an_fp: str, a_file_separator: str, dtype: Optional[str] = None) -> \
        pandas.DataFrame:
    """Load a DataFrame from a file, trying multiple encodings.

    Attempts to load the file using various common encodings (utf-8,
    utf-8-sig, iso-8859-1, latin1, cp1252) until successful.

    Parameters
    ----------
    an_fp : str
        Path to the file to load.
    a_file_separator : str
        Separator character used in the file (e.g., ',' for CSV).
    dtype : Optional[str]
        Data type to use for the DataFrame. If None, pandas will infer types.

    Returns
    -------
    pandas.DataFrame
        DataFrame loaded from the file.

    Raises
    ------
    ValueError
        If the file cannot be decoded with any of the available encodings.
    """
    result = None

    # from https://stackoverflow.com/a/76366653
    encodings = ["utf-8", "utf-8-sig", "iso-8859-1", "latin1", "cp1252"]
    for encoding in encodings:
        # noinspection PyBroadException
        try:
            result = pandas.read_csv(
                an_fp, sep=a_file_separator, encoding=encoding, dtype=dtype)
            break
        except Exception:  # noqa: E722
            pass

    if result is None:
        raise ValueError(f"Unable to decode {an_fp} "
                         f"with any available encoder")

    return result
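
A usage sketch (the file path is hypothetical); the separator and dtype are passed straight through to pandas.read_csv:

    # Tab-separated metadata file whose encoding is not known up front
    metadata_df = load_df_with_best_fit_encoding("raw_metadata.txt", "\t")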

def validate_required_columns_exist(
        input_df: pandas.DataFrame, required_cols_list: List[str],
        error_msg: str) -> None:
    """Validate that a DataFrame contains all required columns.

    Parameters
    ----------
    input_df : pandas.DataFrame
        DataFrame to validate.
    required_cols_list : List[str]
        List of column names that must be present in the DataFrame.
    error_msg : str
        Error message to be raised if any required columns are missing.

    Raises
    ------
    ValueError
        If any of the required columns are missing from the DataFrame.
    """
    missing_cols = set(required_cols_list) - set(input_df.columns)
    if len(missing_cols) > 0:
        missing_cols = sorted(missing_cols)
        raise ValueError(
            f"{error_msg}: {missing_cols}")

def get_extension(sep: str) -> str:
    """Get the appropriate file extension based on the separator character.

    Parameters
    ----------
    sep : str
        Separator character used in the file.

    Returns
    -------
    str
        File extension: 'csv' for comma-separated files, 'txt' for others.
    """
    return "csv" if sep == "," else "txt"


def update_metadata_df_field(
        metadata_df: pandas.DataFrame, field_name: str,
        field_val_or_func: Union[
            str, Callable[[pandas.Series, List[str]], str]],
        source_fields: Optional[List[str]] = None,
        overwrite_non_nans: bool = True) -> None:
    """Update or add a field in an existing metadata DataFrame.

    Can update an existing field or add a new one, using either a constant
    value or a function to compute values based on other fields.

    Parameters
    ----------
    metadata_df : pandas.DataFrame
        DataFrame to update. Modified in place.
    field_name : str
        Name of the field to update or add.
    field_val_or_func : Union[str, Callable]
        Either a constant value to set, or a function that takes a row and
        source fields as input and returns a value.
    source_fields : Optional[List[str]]
        List of field names to use as input for the function. Required if
        field_val_or_func is a function.
    overwrite_non_nans : bool
        If True, overwrites all values in the field. If False, only updates
        NaN values.
    """
    # Note: function doesn't return anything. Work is done in-place on the
    # metadata_df passed in.

    # If the field does not already exist in the metadata OR if we have
    # been told to overwrite existing (i.e., non-NaN) values, we will set its
    # value in all rows; otherwise, will only set it where it is currently NaN
    set_all = overwrite_non_nans or (field_name not in metadata_df.columns)
    row_mask = \
        metadata_df.index if set_all else metadata_df[field_name].isnull()

    # If source fields were passed in, the field_val_or_func must be a function
    if source_fields:
        metadata_df.loc[row_mask, field_name] = \
            metadata_df.apply(
                lambda row: field_val_or_func(row, source_fields),
                axis=1)
    else:
        # Otherwise, it is a constant value
        metadata_df.loc[row_mask, field_name] = field_val_or_func
    # endif using a function/a constant value

metameq/tests/__init__.py
File without changes

metameq/tests/data/invalid.yml
ADDED
@@ -0,0 +1 @@
invalid: yaml: content: - [