metameq 2026.1.1__py3-none-any.whl
- metameq/__init__.py +42 -0
- metameq/_version.py +21 -0
- metameq/config/__init__.py +0 -0
- metameq/config/config.yml +3 -0
- metameq/config/standards.yml +1648 -0
- metameq/src/__init__.py +0 -0
- metameq/src/__main__.py +34 -0
- metameq/src/metadata_configurator.py +512 -0
- metameq/src/metadata_extender.py +1168 -0
- metameq/src/metadata_merger.py +362 -0
- metameq/src/metadata_transformers.py +335 -0
- metameq/src/metadata_validator.py +387 -0
- metameq/src/util.py +299 -0
- metameq/tests/__init__.py +0 -0
- metameq/tests/data/invalid.yml +1 -0
- metameq/tests/data/test_config.yml +9 -0
- metameq/tests/test_metadata_configurator.py +2334 -0
- metameq/tests/test_metadata_extender.py +2610 -0
- metameq/tests/test_metadata_merger.py +657 -0
- metameq/tests/test_metadata_transformers.py +277 -0
- metameq/tests/test_metadata_validator.py +1191 -0
- metameq/tests/test_util.py +436 -0
- metameq-2026.1.1.dist-info/METADATA +21 -0
- metameq-2026.1.1.dist-info/RECORD +27 -0
- metameq-2026.1.1.dist-info/WHEEL +5 -0
- metameq-2026.1.1.dist-info/entry_points.txt +2 -0
- metameq-2026.1.1.dist-info/top_level.txt +1 -0

metameq/src/metadata_extender.py
@@ -0,0 +1,1168 @@
import logging
import numpy as np
import os
import pandas
from pathlib import Path
from datetime import datetime
from typing import List, Dict, Optional, Tuple, Any
from metameq.src.util import extract_config_dict, \
    deepcopy_dict, validate_required_columns_exist, get_extension, \
    load_df_with_best_fit_encoding, update_metadata_df_field, \
    HOSTTYPE_SHORTHAND_KEY, SAMPLETYPE_SHORTHAND_KEY, \
    QC_NOTE_KEY, METADATA_FIELDS_KEY, HOST_TYPE_SPECIFIC_METADATA_KEY, \
    SAMPLE_TYPE_SPECIFIC_METADATA_KEY, SAMPLE_TYPE_KEY, QIITA_SAMPLE_TYPE, \
    DEFAULT_KEY, REQUIRED_KEY, ALIAS_KEY, BASE_TYPE_KEY, \
    LEAVE_BLANK_VAL, SAMPLE_NAME_KEY, \
    ALLOWED_KEY, TYPE_KEY, LEAVE_REQUIREDS_BLANK_KEY, OVERWRITE_NON_NANS_KEY, \
    METADATA_TRANSFORMERS_KEY, PRE_TRANSFORMERS_KEY, POST_TRANSFORMERS_KEY, \
    SOURCES_KEY, FUNCTION_KEY, REQUIRED_RAW_METADATA_FIELDS
from metameq.src.metadata_configurator import update_wip_metadata_dict, \
    build_full_flat_config_dict
from metameq.src.metadata_validator import validate_metadata_df, \
    output_validation_msgs
import metameq.src.metadata_transformers as transformers


# columns added to the metadata that are not actually part of it
INTERNAL_COL_KEYS = [HOSTTYPE_SHORTHAND_KEY, SAMPLETYPE_SHORTHAND_KEY,
                     QC_NOTE_KEY]

REQ_PLACEHOLDER = "_METAMEQ_REQUIRED"

# Define a logger for this module
logger = logging.getLogger(__name__)

pandas.set_option("future.no_silent_downcasting", True)

# TODO: find a way to inform user that they *are not allowed* to have a 'sample_id' column
# (Per Antonio 10/28/24, this is a reserved name for Qiita and may not be
# in the metadata).


def get_reserved_cols(
        raw_metadata_df: pandas.DataFrame,
        study_specific_config_dict: Dict[str, Any],
        study_specific_transformers_dict: Optional[Dict[str, Any]] = None) -> List[str]:
    """Get a list of all reserved column names for all host+sample type combinations in the metadata.

    Note that 'reserved' is not the same as 'required'. Some column names (e.g.,
    irb_institute for human host types) are not *required*, but are *reserved*, so they can
    only be used to name columns that hold standardized info, not for arbitrary metadata.

    Parameters
    ----------
    raw_metadata_df : pandas.DataFrame
        The input metadata DataFrame.
    study_specific_config_dict : Dict[str, Any]
        Study-specific flat-host-type config dictionary.
    study_specific_transformers_dict : Optional[Dict[str, Any]], default=None
        Dictionary of custom transformers for this study (only).

    Returns
    -------
    List[str]
        Sorted list of all reserved column names.
        Empty if there are no reserved columns.

    Raises
    ------
    ValueError
        If required columns are missing from the metadata.
    """
    validate_required_columns_exist(
        raw_metadata_df, [HOSTTYPE_SHORTHAND_KEY, SAMPLETYPE_SHORTHAND_KEY],
        "metadata missing required columns")

    # Essentially, mock a minimal valid metadata dataframe and then
    # use extend_metadata_df to add all the required columns to it (either empty
    # or with default values but we don't care about the actual values), then
    # return the list of column names from that extended df.

    # get unique HOSTTYPE_SHORTHAND_KEY, SAMPLETYPE_SHORTHAND_KEY combinations
    temp_df = raw_metadata_df[
        [HOSTTYPE_SHORTHAND_KEY, SAMPLETYPE_SHORTHAND_KEY]].copy()
    temp_df.drop_duplicates(inplace=True)

    # add a bogus SAMPLE_NAME_KEY column to the df that just holds sequential integers
    temp_df[SAMPLE_NAME_KEY] = range(1, len(temp_df) + 1)

    temp_df = _catch_nan_required_fields(temp_df)

    # extend the metadata_df to get all the required columns for all host+sample type combinations;
    # we don't really care about the contents of these columns, just their names.
    # (I doubt it is necessary to pass the actual study_specific_transformers_dict; could probably just use None)
    metadata_df, _ = extend_metadata_df(
        temp_df, study_specific_config_dict,
        study_specific_transformers_dict)

    return sorted(metadata_df.columns.to_list())
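
# Example (illustrative sketch; raw_df and study_config are hypothetical
# placeholders for a loaded metadata DataFrame and a parsed study config):
#   >>> reserved = get_reserved_cols(raw_df, study_config)
#   >>> sorted(reserved)[:3]  # first few reserved names, alphabetically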


def id_missing_cols(a_df: pandas.DataFrame) -> List[str]:
    """Identify required columns that are missing from the DataFrame.

    Parameters
    ----------
    a_df : pandas.DataFrame
        The metadata DataFrame to check for missing columns.

    Returns
    -------
    List[str]
        Sorted list of required column names that are missing from the DataFrame.
        Empty if there are no missing columns.
    """
    missing_cols = set(REQUIRED_RAW_METADATA_FIELDS) - set(a_df.columns)
    return sorted(list(missing_cols))


def find_standard_cols(
        a_df: pandas.DataFrame,
        study_specific_config_dict: Dict[str, Any],
        study_specific_transformers_dict: Optional[Dict[str, Any]] = None,
        suppress_missing_name_err: bool = False) -> List[str]:
    """Find all the standard columns in the metadata DataFrame.

    Parameters
    ----------
    a_df : pandas.DataFrame
        The metadata DataFrame to analyze.
    study_specific_config_dict : Dict[str, Any]
        Study-specific flat-host-type config dictionary.
    study_specific_transformers_dict : Optional[Dict[str, Any]], default=None
        Dictionary of custom transformers for this study (only).
    suppress_missing_name_err : bool, default=False
        Whether to suppress errors about missing sample name.

    Returns
    -------
    List[str]
        List of standard column names found in the DataFrame.
        Empty if there are no standard columns.

    Raises
    ------
    ValueError
        If required columns are missing from the metadata.
    """
    err_msg = "metadata missing required columns"
    required_cols = REQUIRED_RAW_METADATA_FIELDS.copy()
    if suppress_missing_name_err:
        # remove the sample name from the required columns list
        required_cols.remove(SAMPLE_NAME_KEY)
    # endif
    validate_required_columns_exist(a_df, required_cols, err_msg)

    # get the intersection of the reserved standard columns and
    # the columns in the input dataframe
    standard_cols = get_reserved_cols(
        a_df, study_specific_config_dict,
        study_specific_transformers_dict=study_specific_transformers_dict)

    standard_cols_set = (set(standard_cols) - set(INTERNAL_COL_KEYS))

    return list(standard_cols_set & set(a_df.columns))


def find_nonstandard_cols(
        a_df: pandas.DataFrame,
        study_specific_config_dict: Dict[str, Any],
        study_specific_transformers_dict: Optional[Dict[str, Any]] = None) -> List[str]:
    """Find any non-standard columns in the metadata DataFrame.

    Parameters
    ----------
    a_df : pandas.DataFrame
        The metadata DataFrame to analyze.
    study_specific_config_dict : Dict[str, Any]
        Study-specific flat-host-type config dictionary.
    study_specific_transformers_dict : Optional[Dict[str, Any]], default=None
        Dictionary of custom transformers for this study (only).

    Returns
    -------
    List[str]
        List of non-standard column names found in the DataFrame.
        Empty if there are no non-standard columns.

    Raises
    ------
    ValueError
        If required columns are missing from the metadata.
    """
    validate_required_columns_exist(a_df, REQUIRED_RAW_METADATA_FIELDS,
                                    "metadata missing required columns")

    # get the reserved standard columns for this metadata's host+sample types
    standard_cols = get_reserved_cols(
        a_df, study_specific_config_dict,
        study_specific_transformers_dict=study_specific_transformers_dict)

    return list(set(a_df.columns) - set(standard_cols))
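
# Example (illustrative sketch; raw_df and study_config are hypothetical
# placeholders): partition a study's columns into standard vs. custom ones.
#   >>> std_cols = find_standard_cols(raw_df, study_config)
#   >>> custom_cols = find_nonstandard_cols(raw_df, study_config)
#   >>> set(std_cols).isdisjoint(custom_cols)
#   True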


def get_extended_metadata_from_df_and_yaml(
        raw_metadata_df: pandas.DataFrame,
        study_specific_config_fp: Optional[str]) -> Tuple[pandas.DataFrame, pandas.DataFrame]:
    """Extend metadata using configuration from a study-specific YAML config file.

    Parameters
    ----------
    raw_metadata_df : pandas.DataFrame
        The raw metadata DataFrame to extend.
    study_specific_config_fp : Optional[str]
        Path to the study-specific configuration YAML file.

    Returns
    -------
    Tuple[pandas.DataFrame, pandas.DataFrame]
        A tuple containing:
        - The extended metadata DataFrame
        - A DataFrame containing validation messages
    """
    # get the study-specific flat-host-type config dictionary from the input yaml file
    study_specific_config_dict = \
        _get_study_specific_config(study_specific_config_fp)

    # extend the metadata DataFrame using the study-specific flat-host-type config dictionary
    metadata_df, validation_msgs_df = \
        extend_metadata_df(raw_metadata_df, study_specific_config_dict)

    return metadata_df, validation_msgs_df


def get_qc_failures(a_df: pandas.DataFrame) -> pandas.DataFrame:
    """Get rows from the extended metadata DataFrame that have QC failures.

    Parameters
    ----------
    a_df : pandas.DataFrame
        The extended metadata DataFrame to check for QC failures.

    Returns
    -------
    pandas.DataFrame
        A new DataFrame containing only the rows that failed QC checks.
    """
    fails_qc_mask = a_df[QC_NOTE_KEY] != ""
    qc_fails_df = a_df.loc[fails_qc_mask, :].copy()
    return qc_fails_df
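
# Example (illustrative sketch; extended_df is a hypothetical placeholder
# holding output from extend_metadata_df): rows whose QC note is non-empty,
# e.g. "invalid host_type", are the ones that failed QC.
#   >>> fails_df = get_qc_failures(extended_df)
#   >>> len(fails_df)  # number of samples needing attention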


def write_extended_metadata(
        raw_metadata_fp: str,
        study_specific_config_fp: str,
        out_dir: str,
        out_name_base: str,
        sep: str = "\t",
        remove_internals: bool = True,
        suppress_empty_fails: bool = False) -> pandas.DataFrame:
    """Write extended metadata to files starting from input file paths to metadata and config.

    Parameters
    ----------
    raw_metadata_fp : str
        Path to the raw metadata file (.csv, .txt, or .xlsx).
    study_specific_config_fp : str
        Path to the study-specific configuration YAML file.
    out_dir : str
        Directory where output files will be written.
    out_name_base : str
        Base name for output files.
    sep : str, default="\t"
        Separator to use in output files.
    remove_internals : bool, default=True
        Whether to remove internal columns.
    suppress_empty_fails : bool, default=False
        Whether to suppress empty failure files.

    Returns
    -------
    pandas.DataFrame
        The extended metadata DataFrame.

    Raises
    ------
    ValueError
        If the input file extension is not recognized.
    """
    # extract the extension from the raw_metadata_fp file path
    extension = os.path.splitext(raw_metadata_fp)[1]
    if extension == ".csv":
        raw_metadata_df = load_df_with_best_fit_encoding(raw_metadata_fp, ",")
    elif extension == ".txt":
        raw_metadata_df = load_df_with_best_fit_encoding(raw_metadata_fp, "\t")
    elif extension == ".xlsx":
        # NB: this loads (only) the first sheet of the input excel file.
        # If needed, can expand with pandas.read_excel sheet_name parameter.
        raw_metadata_df = pandas.read_excel(raw_metadata_fp)
    else:
        raise ValueError("Unrecognized input file extension; "
                         "must be .csv, .txt, or .xlsx")

    # get the study-specific flat-host-type config dictionary from the input yaml file
    study_specific_config_dict = \
        _get_study_specific_config(study_specific_config_fp)

    # write the extended metadata to files
    extended_df = write_extended_metadata_from_df(
        raw_metadata_df, study_specific_config_dict,
        out_dir, out_name_base, sep=sep,
        remove_internals=remove_internals,
        suppress_empty_fails=suppress_empty_fails)

    # for good measure, return the extended metadata DataFrame
    return extended_df
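
# Example usage (illustrative sketch; the paths below are hypothetical
# placeholders, not files shipped with this package):
#   >>> extended_df = write_extended_metadata(
#   ...     "raw_metadata.xlsx", "study_config.yml",
#   ...     out_dir="outputs", out_name_base="my_study")
# This writes timestamped extended-metadata, QC-failures, and
# validation-messages files under outputs/ and returns the extended df.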


def _get_study_specific_config(study_specific_config_fp: Optional[str]) -> Optional[Dict[str, Any]]:
    """Load study-specific flat-host-type configuration from a YAML file.

    Parameters
    ----------
    study_specific_config_fp : Optional[str]
        Path to the study-specific configuration YAML file.
        This file should contain study-specific values for top-level settings (e.g., default
        value) and, if necessary, a HOST_TYPE_SPECIFIC_METADATA_KEY holding a *flat*
        dictionary of host types, defining only their study-specific host and sample type
        metadata fields.

    Returns
    -------
    Optional[Dict[str, Any]]
        The loaded flat-host-type configuration dictionary, or None if no file path provided.
    """
    if study_specific_config_fp:
        study_specific_config_dict = \
            extract_config_dict(study_specific_config_fp)
    else:
        study_specific_config_dict = None

    return study_specific_config_dict


def write_extended_metadata_from_df(
        raw_metadata_df: pandas.DataFrame,
        study_specific_config_dict: Dict[str, Any],
        out_dir: str,
        out_name_base: str,
        study_specific_transformers_dict: Optional[Dict[str, Any]] = None,
        sep: str = "\t",
        remove_internals: bool = True,
        suppress_empty_fails: bool = False,
        internal_col_names: Optional[List[str]] = None) -> pandas.DataFrame:
    """Write extended metadata to files starting from a metadata DataFrame and config dictionary.

    Parameters
    ----------
    raw_metadata_df : pandas.DataFrame
        The raw metadata DataFrame to extend.
    study_specific_config_dict : Dict[str, Any]
        Study-specific configuration dictionary.
    out_dir : str
        Directory where output files will be written.
    out_name_base : str
        Base name for output files.
    study_specific_transformers_dict : Optional[Dict[str, Any]], default=None
        Dictionary of custom transformers.
    sep : str, default="\t"
        Separator to use in output files.
    remove_internals : bool, default=True
        Whether to remove internal columns.
    suppress_empty_fails : bool, default=False
        Whether to suppress empty failure files.
    internal_col_names : Optional[List[str]], default=None
        List of internal column names.

    Returns
    -------
    pandas.DataFrame
        The extended metadata DataFrame.
    """
    # extend the metadata DataFrame using the study-specific flat-host-type config dictionary
    metadata_df, validation_msgs_df = extend_metadata_df(
        raw_metadata_df, study_specific_config_dict,
        study_specific_transformers_dict)

    # write the metadata and validation results to files
    write_metadata_results(
        metadata_df, validation_msgs_df, out_dir, out_name_base,
        sep=sep, remove_internals=remove_internals,
        suppress_empty_fails=suppress_empty_fails,
        internal_col_names=internal_col_names)

    # for good measure, return the extended metadata DataFrame
    return metadata_df


def extend_metadata_df(
        raw_metadata_df: pandas.DataFrame,
        study_specific_config_dict: Optional[Dict[str, Any]],
        study_specific_transformers_dict: Optional[Dict[str, Any]] = None,
        software_config_dict: Optional[Dict[str, Any]] = None,
        stds_fp: Optional[str] = None
) -> Tuple[pandas.DataFrame, pandas.DataFrame]:
    """Extend a metadata DataFrame based on metadata standards and study-specific configurations.

    Parameters
    ----------
    raw_metadata_df : pandas.DataFrame
        The raw metadata DataFrame to extend.
    study_specific_config_dict : Optional[Dict[str, Any]]
        Study-specific flat-host-type config dictionary.
    study_specific_transformers_dict : Optional[Dict[str, Any]], default=None
        Dictionary of custom transformers for this study (only).
    software_config_dict : Optional[Dict[str, Any]], default=None
        Software configuration dictionary. If None, the default software
        config pulled from the config.yml file will be used.
    stds_fp : Optional[str], default=None
        Path to standards dictionary file. If None, the default standards
        config pulled from the standards.yml file will be used.

    Returns
    -------
    Tuple[pandas.DataFrame, pandas.DataFrame]
        A tuple containing:
        - The extended metadata DataFrame
        - A DataFrame containing validation messages

    Raises
    ------
    ValueError
        If required columns are missing from the metadata.
    """
    validate_required_columns_exist(
        raw_metadata_df, REQUIRED_RAW_METADATA_FIELDS,
        "metadata missing required columns")

    full_flat_config_dict = build_full_flat_config_dict(
        study_specific_config_dict, software_config_dict, stds_fp)

    metadata_df, validation_msgs_df = _populate_metadata_df(
        raw_metadata_df, full_flat_config_dict,
        study_specific_transformers_dict)

    return metadata_df, validation_msgs_df
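
# Example usage (illustrative sketch; the shorthand values "human" and
# "stool" are hypothetical, and this assumes REQUIRED_RAW_METADATA_FIELDS
# comprises the sample-name and the two shorthand columns):
#   >>> raw_df = pandas.DataFrame({
#   ...     SAMPLE_NAME_KEY: ["s1", "s2"],
#   ...     HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
#   ...     SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"]})
#   >>> extended_df, validation_msgs_df = extend_metadata_df(raw_df, None)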


def write_metadata_results(
        metadata_df: pandas.DataFrame,
        validation_msgs_df: pandas.DataFrame,
        out_dir: str,
        out_name_base: str,
        sep: str = "\t",
        remove_internals: bool = True,
        suppress_empty_fails: bool = False,
        internal_col_names: Optional[List[str]] = None) -> None:
    """Write metadata and validation results to files.

    Parameters
    ----------
    metadata_df : pandas.DataFrame
        The metadata DataFrame to write.
    validation_msgs_df : pandas.DataFrame
        DataFrame containing validation messages.
    out_dir : str
        Directory where output files will be written.
    out_name_base : str
        Base name for output files.
    sep : str, default="\t"
        Separator to use in output files.
    remove_internals : bool, default=True
        Whether to remove internal columns.
    suppress_empty_fails : bool, default=False
        Whether to suppress empty failure files.
    internal_col_names : Optional[List[str]], default=None
        List of internal column names.
    """
    if internal_col_names is None:
        internal_col_names = INTERNAL_COL_KEYS

    _output_metadata_df_to_files(
        metadata_df, out_dir, out_name_base, internal_col_names,
        remove_internals_and_fails=remove_internals, sep=sep,
        suppress_empty_fails=suppress_empty_fails)

    output_validation_msgs(validation_msgs_df, out_dir, out_name_base, sep=",",
                           suppress_empty_fails=suppress_empty_fails)


def _populate_metadata_df(
        raw_metadata_df: pandas.DataFrame,
        full_flat_config_dict: Dict[str, Any],
        transformer_funcs_dict: Optional[Dict[str, Any]]) -> Tuple[pandas.DataFrame, pandas.DataFrame]:
    """Populate columns and fields in a metadata DataFrame.

    Parameters
    ----------
    raw_metadata_df : pandas.DataFrame
        The raw metadata DataFrame to populate, which must contain at least
        the columns in REQUIRED_RAW_METADATA_FIELDS.
    full_flat_config_dict : Dict[str, Any]
        Fully combined flat-host-type config dictionary.
    transformer_funcs_dict : Optional[Dict[str, Any]]
        Dictionary of transformer functions, keyed by field name,
        with each value being a dict with keys SOURCES_KEY and FUNCTION_KEY,
        which map to lists of source field names for the transformer to use
        and an existing transformer function name, respectively.

    Returns
    -------
    Tuple[pandas.DataFrame, pandas.DataFrame]
        A tuple containing:
        - The populated metadata DataFrame
        - A DataFrame containing validation messages
    """
    metadata_df = raw_metadata_df.copy()
    # Initialize the QC_NOTE_KEY field to blank; it is an internal field,
    # so it is not populated from the config.
    update_metadata_df_field(metadata_df, QC_NOTE_KEY, LEAVE_BLANK_VAL)

    # Error for NaNs in sample name, warn for NaNs in host- and sample-type- shorthand fields.
    metadata_df = _catch_nan_required_fields(metadata_df)

    # Apply pre-transformers to the metadata. Pre-transformers run BEFORE host- and sample-type
    # specific generation (which also includes validation), so they can transform raw input fields
    # into values that the config validation expects (for example, converting a study's custom sex
    # format like "M"/"F" into standardized values like "male"/"female" before validation occurs).
    metadata_df = _transform_metadata(
        metadata_df, full_flat_config_dict,
        PRE_TRANSFORMERS_KEY, transformer_funcs_dict)

    # Add specific metadata based on each host type present in the metadata.
    # This step also validates the metadata against the config requirements.
    metadata_df, validation_msgs = _generate_metadata_for_host_types(
        metadata_df, full_flat_config_dict)

    # Apply post-transformers to the metadata. Post-transformers run AFTER host- and sample-type
    # specific generation, so they can use fields that only exist or were only filled in
    # after that step, such as passing through a value filled in by the defaults to another field.
    metadata_df = _transform_metadata(
        metadata_df, full_flat_config_dict,
        POST_TRANSFORMERS_KEY, transformer_funcs_dict)

    # Reorder the metadata columns for better readability.
    metadata_df = _reorder_df(metadata_df, INTERNAL_COL_KEYS)

    # Turn the validation messages into a DataFrame for easier use downstream.
    validation_msgs_df = pandas.DataFrame(validation_msgs)

    return metadata_df, validation_msgs_df


def _catch_nan_required_fields(metadata_df: pandas.DataFrame) -> pandas.DataFrame:
    """Error for NaNs in sample name, warn for NaNs in host- and sample-type- shorthand fields.

    Parameters
    ----------
    metadata_df : pandas.DataFrame
        The metadata DataFrame to process.

    Returns
    -------
    pandas.DataFrame
        The processed DataFrame. NaNs in host- and sample-type-shorthand fields are set to "empty".

    Raises
    ------
    ValueError
        If any sample names are NaN.
    """
    # if there are any sample_name fields that are NaN, raise an error
    nan_sample_name_mask = metadata_df[SAMPLE_NAME_KEY].isna()
    if nan_sample_name_mask.any():
        raise ValueError("Metadata contains NaN sample names")

    # if there are any hosttype_shorthand or sampletype_shorthand fields
    # that are NaN, set them to "empty" and raise a warning
    for curr_key in [HOSTTYPE_SHORTHAND_KEY, SAMPLETYPE_SHORTHAND_KEY]:
        nan_mask = metadata_df[curr_key].isna()
        if nan_mask.any():
            metadata_df.loc[nan_mask, curr_key] = "empty"
            # use the module-level logger defined above
            logger.warning(f"Metadata contains NaN {curr_key}s; "
                           "these have been set to 'empty'")

    return metadata_df
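
# Example behavior (illustrative sketch; the column values are hypothetical):
#   >>> df = pandas.DataFrame({
#   ...     SAMPLE_NAME_KEY: ["s1"],
#   ...     HOSTTYPE_SHORTHAND_KEY: [np.nan],
#   ...     SAMPLETYPE_SHORTHAND_KEY: ["stool"]})
#   >>> checked = _catch_nan_required_fields(df)  # warns; NaN becomes "empty"
#   >>> checked[HOSTTYPE_SHORTHAND_KEY].iloc[0]
#   'empty'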


# transformer runner function
def _transform_metadata(
        metadata_df: pandas.DataFrame,
        full_flat_config_dict: Dict[str, Any],
        stage_key: str,
        transformer_funcs_dict: Optional[Dict[str, Any]]) -> pandas.DataFrame:
    """Apply transformations defined in full_flat_config_dict to metadata fields using dict of transformer functions.

    Parameters
    ----------
    metadata_df : pandas.DataFrame
        The metadata DataFrame to transform, which must contain at least
        the columns in REQUIRED_RAW_METADATA_FIELDS.
    full_flat_config_dict : Dict[str, Any]
        Fully combined flat-host-type config dictionary.
    stage_key : str
        Key indicating the transformation stage (pre or post).
    transformer_funcs_dict : Optional[Dict[str, Any]]
        Dictionary of transformer functions, keyed by field name,
        with each value being a dict with keys SOURCES_KEY and FUNCTION_KEY,
        which map to lists of source field names for the transformer to use
        and an existing transformer function name, respectively.

    Returns
    -------
    pandas.DataFrame
        The transformed metadata DataFrame.

    Raises
    ------
    ValueError
        If a specified transformer function cannot be found.
    """
    if transformer_funcs_dict is None:
        transformer_funcs_dict = {}
    # If the necessary keys aren't already in the config, set them to do-nothing defaults
    overwrite_non_nans = full_flat_config_dict.get(OVERWRITE_NON_NANS_KEY, False)
    metadata_transformers = full_flat_config_dict.get(METADATA_TRANSFORMERS_KEY, None)
    if metadata_transformers:
        stage_transformers = metadata_transformers.get(stage_key, None)
        # If there are transformers for the stage we're at, apply them
        if stage_transformers:
            for curr_target_field, curr_transformer_dict in \
                    stage_transformers.items():
                curr_source_fields = curr_transformer_dict[SOURCES_KEY]
                curr_func_name = curr_transformer_dict[FUNCTION_KEY]

                try:
                    curr_func = transformer_funcs_dict[curr_func_name]
                except KeyError:
                    try:
                        # if the transformer function isn't in the dictionary
                        # that was passed in, probably it is a built-in one,
                        # so look for it in the metameq transformers module
                        curr_func = getattr(transformers, curr_func_name)
                    except AttributeError:
                        raise ValueError(
                            f"Unable to find transformer '{curr_func_name}'")
                    # end try to find in metameq transformers
                # end try to find in input (study-specific) transformers

                # apply the function named curr_func_name to the column(s) of the
                # metadata_df named curr_source_fields to fill curr_target_field
                update_metadata_df_field(metadata_df, curr_target_field,
                                         curr_func, curr_source_fields,
                                         overwrite_non_nans=overwrite_non_nans)
            # next stage transformer
        # end if there are stage transformers for this stage
    # end if there are any metadata transformers

    return metadata_df
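
# Example shape of the transformer configuration this runner consumes
# (illustrative sketch; the field names "sex" and "sex_raw" and the function
# name "standardize_sex" are hypothetical, and the literal strings behind
# the key constants live in metameq.src.util):
#   full_flat_config_dict[METADATA_TRANSFORMERS_KEY] = {
#       PRE_TRANSFORMERS_KEY: {
#           "sex": {SOURCES_KEY: ["sex_raw"],
#                   FUNCTION_KEY: "standardize_sex"}}}
# "standardize_sex" is looked up first in the study-specific
# transformer_funcs_dict, then in metameq.src.metadata_transformers.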


def _generate_metadata_for_host_types(
        metadata_df: pandas.DataFrame,
        full_flat_config_dict: Dict[str, Any]) -> Tuple[pandas.DataFrame, List[str]]:
    """Generate metadata for samples of all host types in the DataFrame.

    Parameters
    ----------
    metadata_df : pandas.DataFrame
        The metadata DataFrame to process, which must contain at least
        the columns in REQUIRED_RAW_METADATA_FIELDS.
    full_flat_config_dict : Dict[str, Any]
        Fully combined flat-host-type config dictionary.

    Returns
    -------
    Tuple[pandas.DataFrame, List[str]]
        A tuple containing:
        - The processed DataFrame with specific metadata added to each sample of each host type
        - A list of validation messages
    """
    # gather global settings
    settings_dict = {DEFAULT_KEY: full_flat_config_dict.get(DEFAULT_KEY),
                     LEAVE_REQUIREDS_BLANK_KEY:
                         full_flat_config_dict.get(LEAVE_REQUIREDS_BLANK_KEY),
                     OVERWRITE_NON_NANS_KEY:
                         full_flat_config_dict.get(OVERWRITE_NON_NANS_KEY)}

    validation_msgs = []
    host_type_dfs = []
    # For all the host types present in the metadata, generate the specific metadata
    host_type_shorthands = pandas.unique(metadata_df[HOSTTYPE_SHORTHAND_KEY])
    for curr_host_type_shorthand in host_type_shorthands:
        concatted_dfs, curr_validation_msgs = _generate_metadata_for_a_host_type(
            metadata_df, curr_host_type_shorthand, settings_dict, full_flat_config_dict)

        host_type_dfs.append(concatted_dfs)
        validation_msgs.extend(curr_validation_msgs)
    # next host type

    # Concatenate the processed host-type-specific metadata DataFrames into a single output DataFrame
    output_df = pandas.concat(host_type_dfs, ignore_index=True)

    # concatting dfs from different hosts can create large numbers of NAs--
    # for example, if concatting a host-associated df with a control df, where
    # the control df doesn't have values for any of the host-related columns.
    # Fill those NAs with whatever the general default is.
    # NB: passing in the same dict twice here is not a mistake, just a
    # convenience since we don't have a more specific dict at this point.
    output_df = _fill_na_if_default(
        output_df, settings_dict, settings_dict)

    # TODO: this is setting a value in the output; should it be centralized
    # so it is easy to find?
    # Replace the LEAVE_BLANK_VAL with an empty string in the output DataFrame
    output_df.replace(LEAVE_BLANK_VAL, "", inplace=True)
    return output_df, validation_msgs


def _generate_metadata_for_a_host_type(
        metadata_df: pandas.DataFrame,
        a_host_type: str,
        settings_dict: Dict[str, Any],
        full_flat_config_dict: Dict[str, Any]) -> Tuple[pandas.DataFrame, List[str]]:
    """Generate metadata df for samples with a specific host type.

    Parameters
    ----------
    metadata_df : pandas.DataFrame
        The metadata DataFrame to process, which must contain at least
        the columns in REQUIRED_RAW_METADATA_FIELDS.
    a_host_type : str
        The specific host type for which to process samples.
    settings_dict : Dict[str, Any]
        Dictionary containing global settings for default/nan/etc.
    full_flat_config_dict : Dict[str, Any]
        Fully combined flat-host-type config dictionary.

    Returns
    -------
    Tuple[pandas.DataFrame, List[str]]
        A tuple containing:
        - The processed DataFrame with specific metadata added to each sample of the input host type
        - A list of validation messages
    """
    # get the subset of the metadata DataFrame that contains samples of the input host type
    host_type_mask = \
        metadata_df[HOSTTYPE_SHORTHAND_KEY] == a_host_type
    host_type_df = metadata_df.loc[host_type_mask, :].copy()

    validation_msgs = []
    known_host_shorthands = full_flat_config_dict[HOST_TYPE_SPECIFIC_METADATA_KEY].keys()
    if a_host_type not in known_host_shorthands:
        # if the input host type is not in the config, add a QC note to the metadata
        # for these samples but do not error out; move on to the next host type
        update_metadata_df_field(
            host_type_df, QC_NOTE_KEY, "invalid host_type")
        concatted_df = host_type_df
    else:
        # gather host-type-specific settings and overwrite the global settings with them, if any
        a_host_type_config_dict = \
            full_flat_config_dict[HOST_TYPE_SPECIFIC_METADATA_KEY][a_host_type]
        global_plus_host_settings_dict = deepcopy_dict(settings_dict)
        # if this host type has a default value for empty fields, use it; otherwise, use the global default
        global_plus_host_settings_dict[DEFAULT_KEY] = a_host_type_config_dict.get(
            DEFAULT_KEY, global_plus_host_settings_dict[DEFAULT_KEY])

        dfs_to_concat = []
        # loop through each sample type in the metadata for this host type
        found_host_sample_types = \
            pandas.unique(host_type_df[SAMPLETYPE_SHORTHAND_KEY])
        for curr_sample_type in found_host_sample_types:
            # generate the specific metadata for this sample type *in this host type*
            curr_sample_type_df, curr_validation_msgs = \
                _generate_metadata_for_a_sample_type_in_a_host_type(
                    host_type_df, curr_sample_type, global_plus_host_settings_dict,
                    a_host_type_config_dict)

            dfs_to_concat.append(curr_sample_type_df)
            validation_msgs.extend(curr_validation_msgs)
        # next sample type in metadata for this host type

        # Concatenate the processed sample-type-specific metadata DataFrames
        # for the host type into a single output DataFrame
        concatted_df = pandas.concat(dfs_to_concat, ignore_index=True)
    # endif host_type is valid

    return concatted_df, validation_msgs


def _generate_metadata_for_a_sample_type_in_a_host_type(
        host_type_metadata_df: pandas.DataFrame,
        a_sample_type: str,
        global_plus_host_settings_dict: Dict[str, Any],
        a_host_type_config_dict: Dict[str, Any]) -> Tuple[pandas.DataFrame, List[str]]:
    """Generate metadata df for samples with a specific sample type within a specific host type.

    Parameters
    ----------
    host_type_metadata_df : pandas.DataFrame
        DataFrame containing metadata samples for a specific host type.
    a_sample_type : str
        The sample type to process.
    global_plus_host_settings_dict : Dict[str, Any]
        Dictionary containing default/nan/etc settings for current context.
    a_host_type_config_dict : Dict[str, Any]
        Dictionary containing config for this host type.

    Returns
    -------
    Tuple[pandas.DataFrame, List[str]]
        A tuple containing:
        - The updated metadata DataFrame with sample-type-specific elements added
        - A list of validation messages
    """
    # copy the metadata fields dict from the host type config to be the
    # basis of the work-in-progress metadata dict--these are the default fields
    # that will be overwritten, if necessary, by sample type-specific fields
    wip_metadata_fields_dict = deepcopy_dict(
        a_host_type_config_dict.get(METADATA_FIELDS_KEY, {}))

    # get the config section for *all* sample types within this host type
    host_sample_types_config_dict = \
        a_host_type_config_dict[SAMPLE_TYPE_SPECIFIC_METADATA_KEY]

    # get df of records for this sample type in this host type
    sample_type_mask = \
        host_type_metadata_df[SAMPLETYPE_SHORTHAND_KEY] == a_sample_type
    sample_type_df = host_type_metadata_df.loc[sample_type_mask, :].copy()

    validation_msgs = []
    known_sample_types = host_sample_types_config_dict.keys()
    if a_sample_type not in known_sample_types:
        # if the input sample type is not in the config, add a QC note to the metadata
        # for these samples but do not error out; move on to the next sample type
        update_metadata_df_field(
            sample_type_df, QC_NOTE_KEY, "invalid sample_type")
    else:
        # resolve any aliases and base types for the sample type and combine its
        # specific metadata fields with the host type's metadata fields
        # to get the full set of config info for this host+sample type
        full_sample_type_metadata_fields_dict = \
            _construct_sample_type_metadata_fields_dict(
                a_sample_type, host_sample_types_config_dict, wip_metadata_fields_dict)

        # update the metadata df with the sample type specific metadata fields
        # TODO: this is taking in wip_metadata_fields_dict instead of full_sample_type_metadata_fields_dict,
        # which only works because the code underlying _construct_sample_type_metadata_fields_dict
        # is *modifying* wip_metadata_fields_dict in place. This should be corrected, but that
        # needs to wait until there are tests to make sure doing so doesn't break anything.
        sample_type_df = _update_metadata_from_dict(
            sample_type_df, wip_metadata_fields_dict, dict_is_metadata_fields=True,
            overwrite_non_nans=global_plus_host_settings_dict[OVERWRITE_NON_NANS_KEY])

        # for fields that are required but not yet filled, replace the placeholder with
        # either an indicator that it should be blank or else
        # fill with NA (replaced with default just below), based on config setting
        leave_reqs_blank = global_plus_host_settings_dict[LEAVE_REQUIREDS_BLANK_KEY]
        reqs_val = LEAVE_BLANK_VAL if leave_reqs_blank else np.nan
        sample_type_df.replace(
            to_replace=REQ_PLACEHOLDER, value=reqs_val, inplace=True)

        # fill NAs with appropriate default value if any is set
        sample_type_df = _fill_na_if_default(
            sample_type_df, full_sample_type_metadata_fields_dict, global_plus_host_settings_dict)

        # validate the metadata df based on the specific requirements
        # for this host+sample type
        validation_msgs = validate_metadata_df(
            sample_type_df, full_sample_type_metadata_fields_dict)

    return sample_type_df, validation_msgs


def _construct_sample_type_metadata_fields_dict(
        sample_type: str,
        host_sample_types_config_dict: Dict[str, Any],
        a_host_type_metadata_fields_dict: Dict[str, Any]) -> Dict[str, Any]:
    """Construct metadata fields dictionary for a specific host+sample type, resolving aliases and base types.

    Parameters
    ----------
    sample_type : str
        The sample type to process.
    host_sample_types_config_dict : Dict[str, Any]
        Dictionary containing config for *all* sample types in
        the host type in question.
    a_host_type_metadata_fields_dict : Dict[str, Any]
        Dictionary containing metadata fields for the host type in question.

    Returns
    -------
    Dict[str, Any]
        The constructed metadata fields dictionary for this host-and-sample-type combination.

    Raises
    ------
    ValueError
        If there are invalid alias chains or base type configurations.
    """
    sample_type_for_metadata = sample_type

    # get dict associated with the naive sample type
    sample_type_specific_dict = \
        host_sample_types_config_dict[sample_type]

    # if naive sample type contains an alias
    sample_type_alias = sample_type_specific_dict.get(ALIAS_KEY)
    if sample_type_alias:
        # change the sample type to the alias sample type
        # and use the alias's sample type dict
        sample_type_for_metadata = sample_type_alias
        sample_type_specific_dict = \
            host_sample_types_config_dict[sample_type_alias]
        if METADATA_FIELDS_KEY not in sample_type_specific_dict:
            raise ValueError(f"May not chain aliases "
                             f"('{sample_type}' to '{sample_type_alias}')")
    # endif sample type is an alias

    # if the sample type has a base type
    sample_type_base = sample_type_specific_dict.get(BASE_TYPE_KEY)
    if sample_type_base:
        # get the base's sample type dict and add this sample type's
        # info on top of it
        base_sample_dict = host_sample_types_config_dict[sample_type_base]
        if list(base_sample_dict.keys()) != [METADATA_FIELDS_KEY]:
            raise ValueError(f"Base sample type '{sample_type_base}' "
                             "must only have metadata fields")
        sample_type_specific_dict_metadata = update_wip_metadata_dict(
            sample_type_specific_dict.get(METADATA_FIELDS_KEY, {}),
            base_sample_dict[METADATA_FIELDS_KEY])
        sample_type_specific_dict[METADATA_FIELDS_KEY] = \
            sample_type_specific_dict_metadata
    # endif sample type has a base type

    # add the sample-type-specific info generated above on top of the host info
    sample_type_metadata_dict = update_wip_metadata_dict(
        a_host_type_metadata_fields_dict,
        sample_type_specific_dict.get(METADATA_FIELDS_KEY, {}))

    # set sample_type, and qiita_sample_type if it is not already set
    sample_type_definition = {
        ALLOWED_KEY: [sample_type_for_metadata],
        DEFAULT_KEY: sample_type_for_metadata,
        TYPE_KEY: "string"
    }
    sample_type_metadata_dict = update_wip_metadata_dict(
        sample_type_metadata_dict, {SAMPLE_TYPE_KEY: sample_type_definition})
    if QIITA_SAMPLE_TYPE not in sample_type_metadata_dict:
        sample_type_metadata_dict = update_wip_metadata_dict(
            sample_type_metadata_dict, {QIITA_SAMPLE_TYPE: sample_type_definition})
    # end if qiita_sample_type not already set

    return sample_type_metadata_dict
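
# Example of the config shapes this resolution handles (illustrative sketch;
# the sample-type names are hypothetical and the keys are shown via their
# constants from metameq.src.util rather than their literal YAML spellings):
#   host_sample_types_config_dict = {
#       "stool": {METADATA_FIELDS_KEY: {...}},        # ordinary definition
#       "feces": {ALIAS_KEY: "stool"},                # resolved to "stool"
#       "stool_slurry": {BASE_TYPE_KEY: "stool",      # starts from "stool"'s
#                        METADATA_FIELDS_KEY: {...}}  # fields, then overrides
#   }
# An alias must point at a sample type that defines METADATA_FIELDS_KEY
# directly (no alias chains), and a base type's dict may contain nothing
# but METADATA_FIELDS_KEY.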


def _update_metadata_from_dict(
        metadata_df: pandas.DataFrame,
        config_section_dict: Dict[str, Any],
        dict_is_metadata_fields: bool = False,
        overwrite_non_nans: bool = False) -> pandas.DataFrame:
    """Create an updated copy of the metadata DataFrame based on an input dictionary.

    Parameters
    ----------
    metadata_df : pandas.DataFrame
        The metadata DataFrame to update.
    config_section_dict : Dict[str, Any]
        The relevant section of a config dictionary to use.
    dict_is_metadata_fields : bool, default=False
        Whether the config dict contains a METADATA_FIELDS_KEY
        (in which case False) or is itself the contents of
        a METADATA_FIELDS_KEY (in which case True).
    overwrite_non_nans : bool, default=False
        Whether to overwrite non-NaN values with default values.

    Returns
    -------
    pandas.DataFrame
        An updated copy of the metadata DataFrame.
    """
    if not dict_is_metadata_fields:
        metadata_fields_dict = config_section_dict.get(METADATA_FIELDS_KEY)
    else:
        metadata_fields_dict = config_section_dict

    output_df = _update_metadata_from_metadata_fields_dict(
        metadata_df, metadata_fields_dict,
        overwrite_non_nans=overwrite_non_nans)
    return output_df


def _update_metadata_from_metadata_fields_dict(
        metadata_df: pandas.DataFrame,
        metadata_fields_dict: Dict[str, Any],
        overwrite_non_nans: bool) -> pandas.DataFrame:
    """Create an updated copy of the metadata DataFrame based on a metadata fields dictionary.

    Parameters
    ----------
    metadata_df : pandas.DataFrame
        The metadata DataFrame to update.
    metadata_fields_dict : Dict[str, Any]
        Dictionary containing metadata field definitions and required values.
    overwrite_non_nans : bool
        Whether to overwrite non-NaN values with default values.

    Returns
    -------
    pandas.DataFrame
        An updated copy of the metadata DataFrame.
    """
    output_df = metadata_df.copy()

    # loop through each metadata field in the metadata fields dict
    for curr_field_name, curr_field_vals_dict in metadata_fields_dict.items():
        # if the field has a default value (regardless of whether it is
        # required), update the metadata df with it (this includes adding the
        # field if it does not already exist). For existing fields, what exactly
        # will be updated depends on the value of overwrite_non_nans:
        # if overwrite_non_nans is True, then all values will be updated;
        # if overwrite_non_nans is False, then only NA values will be updated
        # if the field already exists in the metadata; otherwise, the field
        # will be added to the metadata with the default value throughout.
        if DEFAULT_KEY in curr_field_vals_dict:
            curr_default_val = curr_field_vals_dict[DEFAULT_KEY]
            update_metadata_df_field(
                output_df, curr_field_name, curr_default_val,
                overwrite_non_nans=overwrite_non_nans)
        # if the field is required BUT has no default value, then if the field does not
        # already exist in the metadata, add the field to the metadata with a placeholder value.
        elif REQUIRED_KEY in curr_field_vals_dict:
            curr_required_val = curr_field_vals_dict[REQUIRED_KEY]
            if curr_required_val and curr_field_name not in output_df:
                update_metadata_df_field(
                    output_df, curr_field_name, REQ_PLACEHOLDER,
                    overwrite_non_nans=overwrite_non_nans)
        # note that if the field is (a) required, (b) does not have a
        # default value, and (c) IS already in the metadata, it will
        # be left alone, with no changes made to it!
    return output_df
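
# Example field definitions and their effects (illustrative sketch; the
# field names and values are hypothetical):
#   metadata_fields_dict = {
#       "geo_loc_name": {DEFAULT_KEY: "USA"},  # added/filled with "USA"
#       "host_age": {REQUIRED_KEY: True},      # added as REQ_PLACEHOLDER if
#   }                                          # absent; untouched if present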


# fill NAs with default value if any is set
def _fill_na_if_default(
        metadata_df: pandas.DataFrame,
        specific_dict: Dict[str, Any],
        settings_dict: Dict[str, Any]) -> pandas.DataFrame:
    """Fill NaN values in metadata df with default values if available.

    Parameters
    ----------
    metadata_df : pandas.DataFrame
        The metadata DataFrame to process.
    specific_dict : Dict[str, Any]
        Dictionary containing context-specific settings. Will be used first as a source of default values.
    settings_dict : Dict[str, Any]
        Dictionary containing global settings. Will be used as a
        source of default values if specific_dict does not contain a DEFAULT_KEY.

    Returns
    -------
    pandas.DataFrame
        The updated DataFrame with NaN values filled. Unchanged if no default values are set.
    """
    default_val = specific_dict.get(DEFAULT_KEY, settings_dict[DEFAULT_KEY])
    if default_val:
        # TODO: this is setting a value in the output; should it be
        # centralized so it is easy to find?
        metadata_df = metadata_df.fillna(default_val)

    return metadata_df


def _output_metadata_df_to_files(
        a_df: pandas.DataFrame,
        out_dir: str,
        out_base: str,
        internal_col_names: List[str],
        sep: str = "\t",
        remove_internals_and_fails: bool = False,
        suppress_empty_fails: bool = False) -> None:
    """Output DataFrame to files, optionally removing internal columns and failures.

    Parameters
    ----------
    a_df : pandas.DataFrame
        The metadata DataFrame to output.
    out_dir : str
        Directory where output files will be written.
    out_base : str
        Base name for output files.
    internal_col_names : List[str]
        List of internal column names that will be moved
        to the end of the DataFrame.
    sep : str, default="\t"
        Separator to use in output files.
    remove_internals_and_fails : bool, default=False
        Whether to remove internal columns and failures.
    suppress_empty_fails : bool, default=False
        Whether to suppress empty failure files.
    """
    timestamp_str = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
    extension = get_extension(sep)

    # if we've been told to remove the qc fails and the internal columns
    if remove_internals_and_fails:
        # output a file of any qc failures
        qc_fails_df = get_qc_failures(a_df)
        qc_fails_fp = os.path.join(
            out_dir, f"{timestamp_str}_{out_base}_fails.csv")
        if qc_fails_df.empty:
            # unless we've been told to suppress empty files
            if not suppress_empty_fails:
                # if there are no failures, create a completely empty file
                # (not even a header line)--bc it is easy to
                # eyeball "zero bytes"
                Path(qc_fails_fp).touch()
            # else, just do nothing
        else:
            qc_fails_df.to_csv(qc_fails_fp, sep=",", index=False)

        # then remove the qc fails and the internal columns from the metadata
        # TODO: I'd like to avoid repeating this mask here + in get_qc_failures
        fails_qc_mask = a_df[QC_NOTE_KEY] != ""
        a_df = a_df.loc[~fails_qc_mask, :].copy()
        a_df = a_df.drop(columns=internal_col_names)

    # output the metadata
    out_fp = os.path.join(out_dir, f"{timestamp_str}_{out_base}.{extension}")
    a_df.to_csv(out_fp, sep=sep, index=False)


def _reorder_df(a_df: pandas.DataFrame, internal_col_names: List[str]) -> pandas.DataFrame:
    """Reorder DataFrame columns according to standard rules.

    Parameters
    ----------
    a_df : pandas.DataFrame
        The DataFrame to reorder.
    internal_col_names : List[str]
        List of internal column names that will be moved to the end of the DataFrame.

    Returns
    -------
    pandas.DataFrame
        A reordered copy of the input DataFrame with:
        - sample_name as the first column
        - remaining columns except for internal columns in alphabetical order
        - internal columns at the end in the order they were provided
    """
    # sort columns alphabetically
    working_df = a_df.copy().reindex(sorted(a_df.columns), axis=1)

    # move the internal columns to the end of the list of cols to output
    col_names = list(working_df)
    for curr_internal_col_name in internal_col_names:
        # TODO: throw an error if the internal col name is not present
        col_names.pop(col_names.index(curr_internal_col_name))
        col_names.append(curr_internal_col_name)

    # move sample name to the first column
    col_names.insert(0, col_names.pop(col_names.index(SAMPLE_NAME_KEY)))
    output_df = working_df.loc[:, col_names].copy()
    return output_df
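
# Example of the resulting column order (illustrative sketch; "a" and "b"
# are hypothetical study columns):
#   >>> df = pandas.DataFrame(
#   ...     columns=["b", SAMPLE_NAME_KEY, "a"] + INTERNAL_COL_KEYS)
#   >>> list(_reorder_df(df, INTERNAL_COL_KEYS).columns) == \
#   ...     [SAMPLE_NAME_KEY, "a", "b"] + INTERNAL_COL_KEYS
#   True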