metameq 2026.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1168 @@
+ import logging
+ import numpy as np
+ import os
+ import pandas
+ from pathlib import Path
+ from datetime import datetime
+ from typing import List, Dict, Optional, Tuple, Any
+ from metameq.src.util import extract_config_dict, \
+     deepcopy_dict, validate_required_columns_exist, get_extension, \
+     load_df_with_best_fit_encoding, update_metadata_df_field, \
+     HOSTTYPE_SHORTHAND_KEY, SAMPLETYPE_SHORTHAND_KEY, \
+     QC_NOTE_KEY, METADATA_FIELDS_KEY, HOST_TYPE_SPECIFIC_METADATA_KEY, \
+     SAMPLE_TYPE_SPECIFIC_METADATA_KEY, SAMPLE_TYPE_KEY, QIITA_SAMPLE_TYPE, \
+     DEFAULT_KEY, REQUIRED_KEY, ALIAS_KEY, BASE_TYPE_KEY, \
+     LEAVE_BLANK_VAL, SAMPLE_NAME_KEY, \
+     ALLOWED_KEY, TYPE_KEY, LEAVE_REQUIREDS_BLANK_KEY, OVERWRITE_NON_NANS_KEY, \
+     METADATA_TRANSFORMERS_KEY, PRE_TRANSFORMERS_KEY, POST_TRANSFORMERS_KEY, \
+     SOURCES_KEY, FUNCTION_KEY, REQUIRED_RAW_METADATA_FIELDS
+ from metameq.src.metadata_configurator import update_wip_metadata_dict, \
+     build_full_flat_config_dict
+ from metameq.src.metadata_validator import validate_metadata_df, \
+     output_validation_msgs
+ import metameq.src.metadata_transformers as transformers
+
+
+ # columns added to the metadata that are not actually part of it
+ INTERNAL_COL_KEYS = [HOSTTYPE_SHORTHAND_KEY, SAMPLETYPE_SHORTHAND_KEY,
+                      QC_NOTE_KEY]
+
+ REQ_PLACEHOLDER = "_METAMEQ_REQUIRED"
+
+ # Define a logger for this module
+ logger = logging.getLogger(__name__)
+
+ pandas.set_option("future.no_silent_downcasting", True)
+
+ # TODO: find a way to inform user that they *are not allowed* to have a 'sample_id' column
+ # (Per Antonio 10/28/24, this is a reserved name for Qiita and may not be
+ # in the metadata).
+
+
+ def get_reserved_cols(
+         raw_metadata_df: pandas.DataFrame,
+         study_specific_config_dict: Dict[str, Any],
+         study_specific_transformers_dict: Optional[Dict[str, Any]] = None) -> List[str]:
+     """Get a list of all reserved column names for all host+sample type combinations in the metadata.
+
+     Note that 'reserved' is not the same as 'required'. Some column names (e.g.,
+     irb_institute for human host types) are not *required*, but are *reserved*, so they can
+     only be used to name columns that hold standardized info, not for arbitrary metadata.
+
+     Parameters
+     ----------
+     raw_metadata_df : pandas.DataFrame
+         The input metadata DataFrame.
+     study_specific_config_dict : Dict[str, Any]
+         Study-specific flat-host-type config dictionary.
+     study_specific_transformers_dict : Optional[Dict[str, Any]], default=None
+         Dictionary of custom transformers for this study (only).
+
+     Returns
+     -------
+     List[str]
+         Sorted list of all reserved column names.
+         Empty if there are no reserved columns.
+
+     Raises
+     ------
+     ValueError
+         If required columns are missing from the metadata.
+     """
+     validate_required_columns_exist(
+         raw_metadata_df, [HOSTTYPE_SHORTHAND_KEY, SAMPLETYPE_SHORTHAND_KEY],
+         "metadata missing required columns")
+
+     # Essentially, mock up a minimal valid metadata dataframe and then
+     # use extend_metadata_df to add all the required columns to it (either
+     # empty or with default values; we don't care about the actual values),
+     # then return the list of column names from that extended df.
+
+     # get unique HOSTTYPE_SHORTHAND_KEY, SAMPLETYPE_SHORTHAND_KEY combinations
+     temp_df = raw_metadata_df[
+         [HOSTTYPE_SHORTHAND_KEY, SAMPLETYPE_SHORTHAND_KEY]].copy()
+     temp_df.drop_duplicates(inplace=True)
+
+     # add a bogus SAMPLE_NAME_KEY column to the df that just holds sequential integers
+     temp_df[SAMPLE_NAME_KEY] = range(1, len(temp_df) + 1)
+
+     temp_df = _catch_nan_required_fields(temp_df)
+
+     # extend the metadata_df to get all the required columns for all host+sample type combinations;
+     # we don't really care about the contents of these columns, just their names.
+     # (I doubt it is necessary to pass the actual study_specific_transformers_dict; could probably just use None)
+     metadata_df, _ = extend_metadata_df(
+         temp_df, study_specific_config_dict,
+         study_specific_transformers_dict)
+
+     return sorted(metadata_df.columns.to_list())
+
+
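+ # A minimal usage sketch for get_reserved_cols (illustrative only; the
+ # host/sample type values below are hypothetical, not shipped defaults):
+ #
+ #     raw_df = pandas.DataFrame({
+ #         SAMPLE_NAME_KEY: ["s1", "s2"],
+ #         HOSTTYPE_SHORTHAND_KEY: ["human", "control"],
+ #         SAMPLETYPE_SHORTHAND_KEY: ["stool", "blank"]})
+ #     reserved = get_reserved_cols(raw_df, study_config_dict)
+ #     # 'reserved' now names every column the configs standardize for the
+ #     # human+stool and control+blank combinations.
+
+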
+ def id_missing_cols(a_df: pandas.DataFrame) -> List[str]:
+     """Identify required columns that are missing from the DataFrame.
+
+     Parameters
+     ----------
+     a_df : pandas.DataFrame
+         The metadata DataFrame to check for missing columns.
+
+     Returns
+     -------
+     List[str]
+         Sorted list of required column names that are missing from the DataFrame.
+         Empty if there are no missing columns.
+     """
+     missing_cols = set(REQUIRED_RAW_METADATA_FIELDS) - set(a_df.columns)
+     return sorted(list(missing_cols))
+
+
+ def find_standard_cols(
+         a_df: pandas.DataFrame,
+         study_specific_config_dict: Dict[str, Any],
+         study_specific_transformers_dict: Optional[Dict[str, Any]] = None,
+         suppress_missing_name_err: bool = False) -> List[str]:
+     """Find all the standard columns in the metadata DataFrame.
+
+     Parameters
+     ----------
+     a_df : pandas.DataFrame
+         The metadata DataFrame to analyze.
+     study_specific_config_dict : Dict[str, Any]
+         Study-specific flat-host-type config dictionary.
+     study_specific_transformers_dict : Optional[Dict[str, Any]], default=None
+         Dictionary of custom transformers for this study (only).
+     suppress_missing_name_err : bool, default=False
+         Whether to suppress errors about missing sample name.
+
+     Returns
+     -------
+     List[str]
+         List of standard column names found in the DataFrame.
+         Empty if there are no standard columns.
+
+     Raises
+     ------
+     ValueError
+         If required columns are missing from the metadata.
+     """
+     err_msg = "metadata missing required columns"
+     required_cols = REQUIRED_RAW_METADATA_FIELDS.copy()
+     if suppress_missing_name_err:
+         # remove the sample name from the required columns list
+         required_cols.remove(SAMPLE_NAME_KEY)
+     # endif
+     validate_required_columns_exist(a_df, required_cols, err_msg)
+
+     # get the intersection of the reserved standard columns and
+     # the columns in the input dataframe
+     standard_cols = get_reserved_cols(
+         a_df, study_specific_config_dict,
+         study_specific_transformers_dict=study_specific_transformers_dict)
+
+     standard_cols_set = set(standard_cols) - set(INTERNAL_COL_KEYS)
+
+     return list(standard_cols_set & set(a_df.columns))
+
+
+ def find_nonstandard_cols(
+         a_df: pandas.DataFrame,
+         study_specific_config_dict: Dict[str, Any],
+         study_specific_transformers_dict: Optional[Dict[str, Any]] = None) -> List[str]:
+     """Find any non-standard columns in the metadata DataFrame.
+
+     Parameters
+     ----------
+     a_df : pandas.DataFrame
+         The metadata DataFrame to analyze.
+     study_specific_config_dict : Dict[str, Any]
+         Study-specific flat-host-type config dictionary.
+     study_specific_transformers_dict : Optional[Dict[str, Any]], default=None
+         Dictionary of custom transformers for this study (only).
+
+     Returns
+     -------
+     List[str]
+         List of non-standard column names found in the DataFrame.
+         Empty if there are no non-standard columns.
+
+     Raises
+     ------
+     ValueError
+         If required columns are missing from the metadata.
+     """
+     validate_required_columns_exist(a_df, REQUIRED_RAW_METADATA_FIELDS,
+                                     "metadata missing required columns")
+
+     # get the reserved standard columns for this metadata
+     standard_cols = get_reserved_cols(
+         a_df, study_specific_config_dict,
+         study_specific_transformers_dict=study_specific_transformers_dict)
+
+     return list(set(a_df.columns) - set(standard_cols))
+
+
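+ # Together, the two functions above partition a metadata df's columns
+ # (sketch; 'a_df' and 'study_config_dict' are placeholder names):
+ #
+ #     std_cols = find_standard_cols(a_df, study_config_dict)
+ #     nonstd_cols = find_nonstandard_cols(a_df, study_config_dict)
+ #     # Every column of a_df lands in exactly one of the two lists, apart
+ #     # from the internal bookkeeping columns in INTERNAL_COL_KEYS.
+
+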
+ def get_extended_metadata_from_df_and_yaml(
+         raw_metadata_df: pandas.DataFrame,
+         study_specific_config_fp: Optional[str]) -> Tuple[pandas.DataFrame, pandas.DataFrame]:
+     """Extend metadata using configuration from a study-specific YAML config file.
+
+     Parameters
+     ----------
+     raw_metadata_df : pandas.DataFrame
+         The raw metadata DataFrame to extend.
+     study_specific_config_fp : Optional[str]
+         Path to the study-specific configuration YAML file.
+
+     Returns
+     -------
+     Tuple[pandas.DataFrame, pandas.DataFrame]
+         A tuple containing:
+         - The extended metadata DataFrame
+         - A DataFrame containing validation messages
+     """
+     # get the study-specific flat-host-type config dictionary from the input yaml file
+     study_specific_config_dict = \
+         _get_study_specific_config(study_specific_config_fp)
+
+     # extend the metadata DataFrame using the study-specific flat-host-type config dictionary
+     metadata_df, validation_msgs_df = \
+         extend_metadata_df(raw_metadata_df, study_specific_config_dict)
+
+     return metadata_df, validation_msgs_df
+
+
+ def get_qc_failures(a_df: pandas.DataFrame) -> pandas.DataFrame:
+     """Get rows from the extended metadata DataFrame that have QC failures.
+
+     Parameters
+     ----------
+     a_df : pandas.DataFrame
+         The extended metadata DataFrame to check for QC failures.
+
+     Returns
+     -------
+     pandas.DataFrame
+         A new DataFrame containing only the rows that failed QC checks.
+     """
+     fails_qc_mask = a_df[QC_NOTE_KEY] != ""
+     qc_fails_df = a_df.loc[fails_qc_mask, :].copy()
+     return qc_fails_df
+
+
+ def write_extended_metadata(
+         raw_metadata_fp: str,
+         study_specific_config_fp: str,
+         out_dir: str,
+         out_name_base: str,
+         sep: str = "\t",
+         remove_internals: bool = True,
+         suppress_empty_fails: bool = False) -> pandas.DataFrame:
+     """Write extended metadata to files starting from input file paths to metadata and config.
+
+     Parameters
+     ----------
+     raw_metadata_fp : str
+         Path to the raw metadata file (.csv, .txt, or .xlsx).
+     study_specific_config_fp : str
+         Path to the study-specific configuration YAML file.
+     out_dir : str
+         Directory where output files will be written.
+     out_name_base : str
+         Base name for output files.
+     sep : str, default="\t"
+         Separator to use in output files.
+     remove_internals : bool, default=True
+         Whether to remove internal columns.
+     suppress_empty_fails : bool, default=False
+         Whether to suppress empty failure files.
+
+     Returns
+     -------
+     pandas.DataFrame
+         The extended metadata DataFrame.
+
+     Raises
+     ------
+     ValueError
+         If the input file extension is not recognized.
+     """
+     # extract the extension from the raw_metadata_fp file path
+     extension = os.path.splitext(raw_metadata_fp)[1]
+     if extension == ".csv":
+         raw_metadata_df = load_df_with_best_fit_encoding(raw_metadata_fp, ",")
+     elif extension == ".txt":
+         raw_metadata_df = load_df_with_best_fit_encoding(raw_metadata_fp, "\t")
+     elif extension == ".xlsx":
+         # NB: this loads (only) the first sheet of the input excel file.
+         # If needed, can expand with pandas.read_excel sheet_name parameter.
+         raw_metadata_df = pandas.read_excel(raw_metadata_fp)
+     else:
+         raise ValueError("Unrecognized input file extension; "
+                          "must be .csv, .txt, or .xlsx")
+
+     # get the study-specific flat-host-type config dictionary from the input yaml file
+     study_specific_config_dict = \
+         _get_study_specific_config(study_specific_config_fp)
+
+     # write the extended metadata to files
+     extended_df = write_extended_metadata_from_df(
+         raw_metadata_df, study_specific_config_dict,
+         out_dir, out_name_base, sep=sep,
+         remove_internals=remove_internals,
+         suppress_empty_fails=suppress_empty_fails)
+
+     # for good measure, return the extended metadata DataFrame
+     return extended_df
+
+
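+ # Hypothetical end-to-end call (paths and names are made up for
+ # illustration):
+ #
+ #     extended_df = write_extended_metadata(
+ #         "raw_metadata.xlsx", "my_study.yml", "/tmp/out", "my_study")
+ #
+ # This would write a timestamped extended-metadata file, a QC-failures
+ # .csv, and a validation-messages file into /tmp/out.
+
+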
+ def _get_study_specific_config(study_specific_config_fp: Optional[str]) -> Optional[Dict[str, Any]]:
+     """Load study-specific flat-host-type configuration from a YAML file.
+
+     Parameters
+     ----------
+     study_specific_config_fp : Optional[str]
+         Path to the study-specific configuration YAML file.
+         This file should contain study-specific values for top-level settings (e.g., default
+         value) and, if necessary, a HOST_TYPE_SPECIFIC_METADATA_KEY holding a *flat*
+         dictionary of host types, defining only their study-specific host and sample type
+         metadata fields.
+
+     Returns
+     -------
+     Optional[Dict[str, Any]]
+         The loaded flat-host-type configuration dictionary, or None if no file path provided.
+     """
+     if study_specific_config_fp:
+         study_specific_config_dict = \
+             extract_config_dict(study_specific_config_fp)
+     else:
+         study_specific_config_dict = None
+
+     return study_specific_config_dict
+
+
+ def write_extended_metadata_from_df(
+         raw_metadata_df: pandas.DataFrame,
+         study_specific_config_dict: Dict[str, Any],
+         out_dir: str,
+         out_name_base: str,
+         study_specific_transformers_dict: Optional[Dict[str, Any]] = None,
+         sep: str = "\t",
+         remove_internals: bool = True,
+         suppress_empty_fails: bool = False,
+         internal_col_names: Optional[List[str]] = None) -> pandas.DataFrame:
+     """Write extended metadata to files starting from a metadata DataFrame and config dictionary.
+
+     Parameters
+     ----------
+     raw_metadata_df : pandas.DataFrame
+         The raw metadata DataFrame to extend.
+     study_specific_config_dict : Dict[str, Any]
+         Study-specific configuration dictionary.
+     out_dir : str
+         Directory where output files will be written.
+     out_name_base : str
+         Base name for output files.
+     study_specific_transformers_dict : Optional[Dict[str, Any]], default=None
+         Dictionary of custom transformers.
+     sep : str, default="\t"
+         Separator to use in output files.
+     remove_internals : bool, default=True
+         Whether to remove internal columns.
+     suppress_empty_fails : bool, default=False
+         Whether to suppress empty failure files.
+     internal_col_names : Optional[List[str]], default=None
+         List of internal column names.
+
+     Returns
+     -------
+     pandas.DataFrame
+         The extended metadata DataFrame.
+     """
+     # extend the metadata DataFrame using the study-specific flat-host-type config dictionary
+     metadata_df, validation_msgs_df = extend_metadata_df(
+         raw_metadata_df, study_specific_config_dict,
+         study_specific_transformers_dict)
+
+     # write the metadata and validation results to files
+     write_metadata_results(
+         metadata_df, validation_msgs_df, out_dir, out_name_base,
+         sep=sep, remove_internals=remove_internals,
+         suppress_empty_fails=suppress_empty_fails,
+         internal_col_names=internal_col_names)
+
+     # for good measure, return the extended metadata DataFrame
+     return metadata_df
+
+
+ def extend_metadata_df(
+         raw_metadata_df: pandas.DataFrame,
+         study_specific_config_dict: Optional[Dict[str, Any]],
+         study_specific_transformers_dict: Optional[Dict[str, Any]] = None,
+         software_config_dict: Optional[Dict[str, Any]] = None,
+         stds_fp: Optional[str] = None
+ ) -> Tuple[pandas.DataFrame, pandas.DataFrame]:
+     """Extend a metadata DataFrame based on metadata standards and study-specific configurations.
+
+     Parameters
+     ----------
+     raw_metadata_df : pandas.DataFrame
+         The raw metadata DataFrame to extend.
+     study_specific_config_dict : Optional[Dict[str, Any]]
+         Study-specific flat-host-type config dictionary.
+     study_specific_transformers_dict : Optional[Dict[str, Any]], default=None
+         Dictionary of custom transformers for this study (only).
+     software_config_dict : Optional[Dict[str, Any]], default=None
+         Software configuration dictionary. If None, the default software
+         config pulled from the config.yml file will be used.
+     stds_fp : Optional[str], default=None
+         Path to standards dictionary file. If None, the default standards
+         config pulled from the standards.yml file will be used.
+
+     Returns
+     -------
+     Tuple[pandas.DataFrame, pandas.DataFrame]
+         A tuple containing:
+         - The extended metadata DataFrame
+         - A DataFrame containing validation messages
+
+     Raises
+     ------
+     ValueError
+         If required columns are missing from the metadata.
+     """
+     validate_required_columns_exist(
+         raw_metadata_df, REQUIRED_RAW_METADATA_FIELDS,
+         "metadata missing required columns")
+
+     full_flat_config_dict = build_full_flat_config_dict(
+         study_specific_config_dict, software_config_dict, stds_fp)
+
+     metadata_df, validation_msgs_df = _populate_metadata_df(
+         raw_metadata_df, full_flat_config_dict,
+         study_specific_transformers_dict)
+
+     return metadata_df, validation_msgs_df
+
+
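+ # Minimal sketch of calling the core API (assumes raw_metadata_df already
+ # has the columns in REQUIRED_RAW_METADATA_FIELDS; names are placeholders):
+ #
+ #     metadata_df, validation_msgs_df = extend_metadata_df(
+ #         raw_metadata_df, study_specific_config_dict)
+ #     if not validation_msgs_df.empty:
+ #         logger.warning("some fields failed validation")
+
+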
+ def write_metadata_results(
+         metadata_df: pandas.DataFrame,
+         validation_msgs_df: pandas.DataFrame,
+         out_dir: str,
+         out_name_base: str,
+         sep: str = "\t",
+         remove_internals: bool = True,
+         suppress_empty_fails: bool = False,
+         internal_col_names: Optional[List[str]] = None) -> None:
+     """Write metadata and validation results to files.
+
+     Parameters
+     ----------
+     metadata_df : pandas.DataFrame
+         The metadata DataFrame to write.
+     validation_msgs_df : pandas.DataFrame
+         DataFrame containing validation messages.
+     out_dir : str
+         Directory where output files will be written.
+     out_name_base : str
+         Base name for output files.
+     sep : str, default="\t"
+         Separator to use in output files.
+     remove_internals : bool, default=True
+         Whether to remove internal columns.
+     suppress_empty_fails : bool, default=False
+         Whether to suppress empty failure files.
+     internal_col_names : Optional[List[str]], default=None
+         List of internal column names.
+     """
+     if internal_col_names is None:
+         internal_col_names = INTERNAL_COL_KEYS
+
+     _output_metadata_df_to_files(
+         metadata_df, out_dir, out_name_base, internal_col_names,
+         remove_internals_and_fails=remove_internals, sep=sep,
+         suppress_empty_fails=suppress_empty_fails)
+
+     output_validation_msgs(validation_msgs_df, out_dir, out_name_base, sep=",",
+                            suppress_empty_fails=suppress_empty_fails)
+
+
+ def _populate_metadata_df(
+         raw_metadata_df: pandas.DataFrame,
+         full_flat_config_dict: Dict[str, Any],
+         transformer_funcs_dict: Optional[Dict[str, Any]]) -> Tuple[pandas.DataFrame, pandas.DataFrame]:
+     """Populate columns and fields in a metadata DataFrame.
+
+     Parameters
+     ----------
+     raw_metadata_df : pandas.DataFrame
+         The raw metadata DataFrame to populate, which must contain at least
+         the columns in REQUIRED_RAW_METADATA_FIELDS.
+     full_flat_config_dict : Dict[str, Any]
+         Fully combined flat-host-type config dictionary.
+     transformer_funcs_dict : Optional[Dict[str, Any]]
+         Dictionary of custom transformer functions, keyed by transformer
+         function name, with each value being the function itself. Functions
+         named in the config's transformers section are looked up here first,
+         then in the metameq.src.metadata_transformers module.
+
+     Returns
+     -------
+     Tuple[pandas.DataFrame, pandas.DataFrame]
+         A tuple containing:
+         - The populated metadata DataFrame
+         - A DataFrame containing validation messages
+     """
+     metadata_df = raw_metadata_df.copy()
+     # Initialize the internal QC_NOTE_KEY field to blank; it is not
+     # populated from the config.
+     update_metadata_df_field(metadata_df, QC_NOTE_KEY, LEAVE_BLANK_VAL)
+
+     # Error for NaNs in sample name, warn for NaNs in host- and sample-type-shorthand fields.
+     metadata_df = _catch_nan_required_fields(metadata_df)
+
+     # Apply pre-transformers to the metadata. Pre-transformers run BEFORE host- and sample-type
+     # specific generation (which also includes validation), so they can transform raw input fields
+     # into values that the config validation expects (for example, converting a study's custom sex
+     # format like "M"/"F" into standardized values like "male"/"female" before validation occurs).
+     metadata_df = _transform_metadata(
+         metadata_df, full_flat_config_dict,
+         PRE_TRANSFORMERS_KEY, transformer_funcs_dict)
+
+     # Add specific metadata based on each host type present in the metadata.
+     # This step also validates the metadata against the config requirements.
+     metadata_df, validation_msgs = _generate_metadata_for_host_types(
+         metadata_df, full_flat_config_dict)
+
+     # Apply post-transformers to the metadata. Post-transformers run AFTER host- and sample-type
+     # specific generation, so they can use fields that only exist or were only filled in
+     # after that step, such as passing through a value filled in by the defaults to another field.
+     metadata_df = _transform_metadata(
+         metadata_df, full_flat_config_dict,
+         POST_TRANSFORMERS_KEY, transformer_funcs_dict)
+
+     # Reorder the metadata columns for better readability.
+     metadata_df = _reorder_df(metadata_df, INTERNAL_COL_KEYS)
+
+     # Turn the validation messages into a DataFrame for easier use downstream.
+     validation_msgs_df = pandas.DataFrame(validation_msgs)
+
+     return metadata_df, validation_msgs_df
+
+
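+ # The population pipeline above runs, in order: pre-transformers,
+ # host+sample-type generation (with validation), then post-transformers.
+ # A config dict wired for both stages might look like this (field and
+ # function names are hypothetical):
+ #
+ #     {METADATA_TRANSFORMERS_KEY: {
+ #         PRE_TRANSFORMERS_KEY: {
+ #             "sex": {SOURCES_KEY: ["raw_sex"],
+ #                     FUNCTION_KEY: "standardize_sex"}},
+ #         POST_TRANSFORMERS_KEY: {
+ #             "description": {SOURCES_KEY: ["sample_type"],
+ #                             FUNCTION_KEY: "pass_through"}}}}
+
+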
+ def _catch_nan_required_fields(metadata_df: pandas.DataFrame) -> pandas.DataFrame:
+     """Error for NaNs in sample name, warn for NaNs in host- and sample-type-shorthand fields.
+
+     Parameters
+     ----------
+     metadata_df : pandas.DataFrame
+         The metadata DataFrame to process.
+
+     Returns
+     -------
+     pandas.DataFrame
+         The processed DataFrame. NaNs in host- and sample-type-shorthand fields are set to "empty".
+
+     Raises
+     ------
+     ValueError
+         If any sample names are NaN.
+     """
+     # if there are any sample_name fields that are NaN, raise an error
+     nan_sample_name_mask = metadata_df[SAMPLE_NAME_KEY].isna()
+     if nan_sample_name_mask.any():
+         raise ValueError("Metadata contains NaN sample names")
+
+     # if there are any hosttype_shorthand or sampletype_shorthand fields
+     # that are NaN, set them to "empty" and log a warning
+     for curr_key in [HOSTTYPE_SHORTHAND_KEY, SAMPLETYPE_SHORTHAND_KEY]:
+         nan_mask = metadata_df[curr_key].isna()
+         if nan_mask.any():
+             metadata_df.loc[nan_mask, curr_key] = "empty"
+             # use the module-level logger defined at the top of this file
+             logger.warning(f"Metadata contains NaN {curr_key}s; "
+                            "these have been set to 'empty'")
+
+     return metadata_df
+
+
+ # transformer runner function
+ def _transform_metadata(
+         metadata_df: pandas.DataFrame,
+         full_flat_config_dict: Dict[str, Any],
+         stage_key: str,
+         transformer_funcs_dict: Optional[Dict[str, Any]]) -> pandas.DataFrame:
+     """Apply transformations defined in full_flat_config_dict to metadata fields using dict of transformer functions.
+
+     Parameters
+     ----------
+     metadata_df : pandas.DataFrame
+         The metadata DataFrame to transform, which must contain at least
+         the columns in REQUIRED_RAW_METADATA_FIELDS.
+     full_flat_config_dict : Dict[str, Any]
+         Fully combined flat-host-type config dictionary.
+     stage_key : str
+         Key indicating the transformation stage (pre or post).
+     transformer_funcs_dict : Optional[Dict[str, Any]]
+         Dictionary of custom transformer functions, keyed by transformer
+         function name, with each value being the function itself. Functions
+         named in the config's transformers section are looked up here first,
+         then in the metameq.src.metadata_transformers module.
+
+     Returns
+     -------
+     pandas.DataFrame
+         The transformed metadata DataFrame.
+
+     Raises
+     ------
+     ValueError
+         If a specified transformer function cannot be found.
+     """
+     if transformer_funcs_dict is None:
+         transformer_funcs_dict = {}
+     # If the necessary keys aren't already in the config, set them to do-nothing defaults
+     overwrite_non_nans = full_flat_config_dict.get(OVERWRITE_NON_NANS_KEY, False)
+     metadata_transformers = full_flat_config_dict.get(METADATA_TRANSFORMERS_KEY, None)
+     if metadata_transformers:
+         stage_transformers = metadata_transformers.get(stage_key, None)
+         # If there are transformers for the stage we're at, apply them
+         if stage_transformers:
+             for curr_target_field, curr_transformer_dict in \
+                     stage_transformers.items():
+                 curr_source_fields = curr_transformer_dict[SOURCES_KEY]
+                 curr_func_name = curr_transformer_dict[FUNCTION_KEY]
+
+                 try:
+                     curr_func = transformer_funcs_dict[curr_func_name]
+                 except KeyError:
+                     try:
+                         # if the transformer function isn't in the dictionary
+                         # that was passed in, it is probably a built-in one,
+                         # so look for it in the metameq transformers module
+                         curr_func = getattr(transformers, curr_func_name)
+                     except AttributeError:
+                         raise ValueError(
+                             f"Unable to find transformer '{curr_func_name}'")
+                     # end try to find in metameq transformers
+                 # end try to find in input (study-specific) transformers
+
+                 # apply the function named curr_func_name to the column(s) of the
+                 # metadata_df named curr_source_fields to fill curr_target_field
+                 update_metadata_df_field(metadata_df, curr_target_field,
+                                          curr_func, curr_source_fields,
+                                          overwrite_non_nans=overwrite_non_nans)
+             # next stage transformer
+         # end if there are stage transformers for this stage
+     # end if there are any metadata transformers
+
+     return metadata_df
+
+
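+ # Study-specific transformers reach _transform_metadata via
+ # extend_metadata_df's study_specific_transformers_dict, keyed by the name
+ # the config's FUNCTION_KEY refers to (sketch; the name and callable are
+ # hypothetical, and the callable's signature must be whatever
+ # update_metadata_df_field expects of a transformer):
+ #
+ #     study_transformers = {"standardize_sex": standardize_sex_func}
+ #
+ # Names not found in this dict fall back to attribute lookup on
+ # metameq.src.metadata_transformers, as implemented above.
+
+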
+ def _generate_metadata_for_host_types(
+         metadata_df: pandas.DataFrame,
+         full_flat_config_dict: Dict[str, Any]) -> Tuple[pandas.DataFrame, List[str]]:
+     """Generate metadata for samples of all host types in the DataFrame.
+
+     Parameters
+     ----------
+     metadata_df : pandas.DataFrame
+         The metadata DataFrame to process, which must contain at least
+         the columns in REQUIRED_RAW_METADATA_FIELDS.
+     full_flat_config_dict : Dict[str, Any]
+         Fully combined flat-host-type config dictionary.
+
+     Returns
+     -------
+     Tuple[pandas.DataFrame, List[str]]
+         A tuple containing:
+         - The processed DataFrame with specific metadata added to each sample of each host type
+         - A list of validation messages
+     """
+     # gather global settings
+     settings_dict = {DEFAULT_KEY: full_flat_config_dict.get(DEFAULT_KEY),
+                      LEAVE_REQUIREDS_BLANK_KEY:
+                          full_flat_config_dict.get(LEAVE_REQUIREDS_BLANK_KEY),
+                      OVERWRITE_NON_NANS_KEY:
+                          full_flat_config_dict.get(OVERWRITE_NON_NANS_KEY)}
+
+     validation_msgs = []
+     host_type_dfs = []
+     # For all the host types present in the metadata, generate the specific metadata
+     host_type_shorthands = pandas.unique(metadata_df[HOSTTYPE_SHORTHAND_KEY])
+     for curr_host_type_shorthand in host_type_shorthands:
+         concatted_dfs, curr_validation_msgs = _generate_metadata_for_a_host_type(
+             metadata_df, curr_host_type_shorthand, settings_dict, full_flat_config_dict)
+
+         host_type_dfs.append(concatted_dfs)
+         validation_msgs.extend(curr_validation_msgs)
+     # next host type
+
+     # Concatenate the processed host-type-specific metadata DataFrames into a single output DataFrame
+     output_df = pandas.concat(host_type_dfs, ignore_index=True)
+
+     # concatting dfs from different hosts can create large numbers of NAs--
+     # for example, if concatting a host-associated df with a control df, where
+     # the control df doesn't have values for any of the host-related columns.
+     # Fill those NAs with whatever the general default is.
+     # NB: passing in the same dict twice here is not a mistake, just a
+     # convenience since we don't have a more specific dict at this point.
+     output_df = _fill_na_if_default(
+         output_df, settings_dict, settings_dict)
+
+     # TODO: this is setting a value in the output; should it be centralized
+     # so it is easy to find?
+     # Replace the LEAVE_BLANK_VAL with an empty string in the output DataFrame
+     output_df.replace(LEAVE_BLANK_VAL, "", inplace=True)
+     return output_df, validation_msgs
+
+
+ def _generate_metadata_for_a_host_type(
+         metadata_df: pandas.DataFrame,
+         a_host_type: str,
+         settings_dict: Dict[str, Any],
+         full_flat_config_dict: Dict[str, Any]) -> Tuple[pandas.DataFrame, List[str]]:
+     """Generate metadata df for samples with a specific host type.
+
+     Parameters
+     ----------
+     metadata_df : pandas.DataFrame
+         The metadata DataFrame to process, which must contain at least
+         the columns in REQUIRED_RAW_METADATA_FIELDS.
+     a_host_type : str
+         The specific host type for which to process samples.
+     settings_dict : Dict[str, Any]
+         Dictionary containing global settings for default/nan/etc.
+     full_flat_config_dict : Dict[str, Any]
+         Fully combined flat-host-type config dictionary.
+
+     Returns
+     -------
+     Tuple[pandas.DataFrame, List[str]]
+         A tuple containing:
+         - The processed DataFrame with specific metadata added to each sample of the input host type
+         - A list of validation messages
+     """
+     # get the subset of the metadata DataFrame that contains samples of the input host type
+     host_type_mask = \
+         metadata_df[HOSTTYPE_SHORTHAND_KEY] == a_host_type
+     host_type_df = metadata_df.loc[host_type_mask, :].copy()
+
+     validation_msgs = []
+     known_host_shorthands = full_flat_config_dict[HOST_TYPE_SPECIFIC_METADATA_KEY].keys()
+     if a_host_type not in known_host_shorthands:
+         # if the input host type is not in the config, add a QC note to the metadata
+         # for these samples but do not error out; move on to the next host type
+         update_metadata_df_field(
+             host_type_df, QC_NOTE_KEY, "invalid host_type")
+         concatted_df = host_type_df
+     else:
+         # gather host-type-specific settings and overwrite the global settings with them, if any
+         a_host_type_config_dict = \
+             full_flat_config_dict[HOST_TYPE_SPECIFIC_METADATA_KEY][a_host_type]
+         global_plus_host_settings_dict = deepcopy_dict(settings_dict)
+         # if this host type has a default value for empty fields, use it; otherwise, use the global default
+         global_plus_host_settings_dict[DEFAULT_KEY] = a_host_type_config_dict.get(
+             DEFAULT_KEY, global_plus_host_settings_dict[DEFAULT_KEY])
+
+         dfs_to_concat = []
+         # loop through each sample type in the metadata for this host type
+         found_host_sample_types = \
+             pandas.unique(host_type_df[SAMPLETYPE_SHORTHAND_KEY])
+         for curr_sample_type in found_host_sample_types:
+             # generate the specific metadata for this sample type *in this host type*
+             curr_sample_type_df, curr_validation_msgs = \
+                 _generate_metadata_for_a_sample_type_in_a_host_type(
+                     host_type_df, curr_sample_type, global_plus_host_settings_dict,
+                     a_host_type_config_dict)
+
+             dfs_to_concat.append(curr_sample_type_df)
+             validation_msgs.extend(curr_validation_msgs)
+         # next sample type in metadata for this host type
+
+         # Concatenate the processed sample-type-specific metadata DataFrames
+         # for the host type into a single output DataFrame
+         concatted_df = pandas.concat(dfs_to_concat, ignore_index=True)
+     # endif host_type is valid
+
+     return concatted_df, validation_msgs
+
+
+ def _generate_metadata_for_a_sample_type_in_a_host_type(
+         host_type_metadata_df: pandas.DataFrame,
+         a_sample_type: str,
+         global_plus_host_settings_dict: Dict[str, Any],
+         a_host_type_config_dict: Dict[str, Any]) -> Tuple[pandas.DataFrame, List[str]]:
+     """Generate metadata df for samples with a specific sample type within a specific host type.
+
+     Parameters
+     ----------
+     host_type_metadata_df : pandas.DataFrame
+         DataFrame containing metadata samples for a specific host type.
+     a_sample_type : str
+         The sample type to process.
+     global_plus_host_settings_dict : Dict[str, Any]
+         Dictionary containing default/nan/etc settings for current context.
+     a_host_type_config_dict : Dict[str, Any]
+         Dictionary containing config for this host type.
+
+     Returns
+     -------
+     Tuple[pandas.DataFrame, List[str]]
+         A tuple containing:
+         - The updated metadata DataFrame with sample-type-specific elements added
+         - A list of validation messages
+     """
+     # copy the metadata fields dict from the host type config to be the
+     # basis of the work-in-progress metadata dict--these are the default fields
+     # that will be overwritten, if necessary, by sample type-specific fields
+     wip_metadata_fields_dict = deepcopy_dict(
+         a_host_type_config_dict.get(METADATA_FIELDS_KEY, {}))
+
+     # get the config section for *all* sample types within this host type
+     host_sample_types_config_dict = \
+         a_host_type_config_dict[SAMPLE_TYPE_SPECIFIC_METADATA_KEY]
+
+     # get df of records for this sample type in this host type
+     sample_type_mask = \
+         host_type_metadata_df[SAMPLETYPE_SHORTHAND_KEY] == a_sample_type
+     sample_type_df = host_type_metadata_df.loc[sample_type_mask, :].copy()
+
+     validation_msgs = []
+     known_sample_types = host_sample_types_config_dict.keys()
+     if a_sample_type not in known_sample_types:
+         # if the input sample type is not in the config, add a QC note to the metadata
+         # for these samples but do not error out; move on to the next sample type
+         update_metadata_df_field(
+             sample_type_df, QC_NOTE_KEY, "invalid sample_type")
+     else:
+         # resolve any aliases and base types for the sample type and combine its
+         # specific metadata fields with the host type's metadata fields
+         # to get the full set of config info for this host+sample type
+         full_sample_type_metadata_fields_dict = \
+             _construct_sample_type_metadata_fields_dict(
+                 a_sample_type, host_sample_types_config_dict, wip_metadata_fields_dict)
+
+         # update the metadata df with the sample type specific metadata fields
+         # TODO: this is taking in wip_metadata_fields_dict instead of full_sample_type_metadata_fields_dict,
+         # which only works because the code underlying _construct_sample_type_metadata_fields_dict
+         # is *modifying* wip_metadata_fields_dict in place. This should be corrected, but that
+         # needs to wait until there are tests to make sure doing so doesn't break anything.
+         sample_type_df = _update_metadata_from_dict(
+             sample_type_df, wip_metadata_fields_dict, dict_is_metadata_fields=True,
+             overwrite_non_nans=global_plus_host_settings_dict[OVERWRITE_NON_NANS_KEY])
+
+         # for fields that are required but not yet filled, replace the placeholder
+         # with either an indicator that the field should be left blank or else
+         # with NA (replaced with the default just below), based on config setting
+         leave_reqs_blank = global_plus_host_settings_dict[LEAVE_REQUIREDS_BLANK_KEY]
+         reqs_val = LEAVE_BLANK_VAL if leave_reqs_blank else np.nan
+         sample_type_df.replace(
+             to_replace=REQ_PLACEHOLDER, value=reqs_val, inplace=True)
+
+         # fill NAs with appropriate default value if any is set
+         sample_type_df = _fill_na_if_default(
+             sample_type_df, full_sample_type_metadata_fields_dict, global_plus_host_settings_dict)
+
+         # validate the metadata df based on the specific requirements
+         # for this host+sample type
+         validation_msgs = validate_metadata_df(
+             sample_type_df, full_sample_type_metadata_fields_dict)
+
+     return sample_type_df, validation_msgs
+
+
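+ # For orientation, the host+sample-type section of a flat config consumed
+ # above might look like this (host/sample type names and fields are
+ # hypothetical, not shipped defaults):
+ #
+ #     {HOST_TYPE_SPECIFIC_METADATA_KEY: {
+ #         "human": {
+ #             METADATA_FIELDS_KEY: {
+ #                 "host_taxid": {DEFAULT_KEY: "9606"}},
+ #             SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
+ #                 "stool": {
+ #                     METADATA_FIELDS_KEY: {
+ #                         "physical_specimen_remaining": {
+ #                             REQUIRED_KEY: True, TYPE_KEY: "string"}}}}}}}
+
+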
+ def _construct_sample_type_metadata_fields_dict(
+         sample_type: str,
+         host_sample_types_config_dict: Dict[str, Any],
+         a_host_type_metadata_fields_dict: Dict[str, Any]) -> Dict[str, Any]:
+     """Construct metadata fields dictionary for a specific host+sample type, resolving aliases and base types.
+
+     Parameters
+     ----------
+     sample_type : str
+         The sample type to process.
+     host_sample_types_config_dict : Dict[str, Any]
+         Dictionary containing config for *all* sample types in
+         the host type in question.
+     a_host_type_metadata_fields_dict : Dict[str, Any]
+         Dictionary containing metadata fields for the host type in question.
+
+     Returns
+     -------
+     Dict[str, Any]
+         The constructed metadata fields dictionary for this host-and-sample-type combination.
+
+     Raises
+     ------
+     ValueError
+         If there are invalid alias chains or base type configurations.
+     """
+     sample_type_for_metadata = sample_type
+
+     # get dict associated with the naive sample type
+     sample_type_specific_dict = \
+         host_sample_types_config_dict[sample_type]
+
+     # if naive sample type contains an alias
+     sample_type_alias = sample_type_specific_dict.get(ALIAS_KEY)
+     if sample_type_alias:
+         # change the sample type to the alias sample type
+         # and use the alias's sample type dict
+         sample_type_for_metadata = sample_type_alias
+         sample_type_specific_dict = \
+             host_sample_types_config_dict[sample_type_alias]
+         if METADATA_FIELDS_KEY not in sample_type_specific_dict:
+             raise ValueError(f"May not chain aliases "
+                              f"('{sample_type}' to '{sample_type_alias}')")
+     # endif sample type is an alias
+
+     # if the sample type has a base type
+     sample_type_base = sample_type_specific_dict.get(BASE_TYPE_KEY)
+     if sample_type_base:
+         # get the base's sample type dict and add this sample type's
+         # info on top of it
+         base_sample_dict = host_sample_types_config_dict[sample_type_base]
+         if list(base_sample_dict.keys()) != [METADATA_FIELDS_KEY]:
+             raise ValueError(f"Base sample type '{sample_type_base}' "
+                              f"must only have metadata fields")
+         sample_type_specific_dict_metadata = update_wip_metadata_dict(
+             sample_type_specific_dict.get(METADATA_FIELDS_KEY, {}),
+             base_sample_dict[METADATA_FIELDS_KEY])
+         sample_type_specific_dict[METADATA_FIELDS_KEY] = \
+             sample_type_specific_dict_metadata
+     # endif sample type has a base type
+
+     # add the sample-type-specific info generated above on top of the host info
+     sample_type_metadata_dict = update_wip_metadata_dict(
+         a_host_type_metadata_fields_dict,
+         sample_type_specific_dict.get(METADATA_FIELDS_KEY, {}))
+
+     # set sample_type, and qiita_sample_type if it is not already set
+     sample_type_definition = {
+         ALLOWED_KEY: [sample_type_for_metadata],
+         DEFAULT_KEY: sample_type_for_metadata,
+         TYPE_KEY: "string"
+     }
+     sample_type_metadata_dict = update_wip_metadata_dict(
+         sample_type_metadata_dict, {SAMPLE_TYPE_KEY: sample_type_definition})
+     if QIITA_SAMPLE_TYPE not in sample_type_metadata_dict:
+         sample_type_metadata_dict = update_wip_metadata_dict(
+             sample_type_metadata_dict, {QIITA_SAMPLE_TYPE: sample_type_definition})
+     # end if qiita_sample_type not already set
+
+     return sample_type_metadata_dict
+
+
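+ # Alias and base-type resolution, sketched with hypothetical sample types:
+ #
+ #     "st_swab": {ALIAS_KEY: "swab"}           # resolves to swab's config
+ #     "swab": {BASE_TYPE_KEY: "generic",       # layered on generic's fields
+ #              METADATA_FIELDS_KEY: {...}}
+ #     "generic": {METADATA_FIELDS_KEY: {...}}  # base: metadata fields only
+ #
+ # An alias pointing at another alias, or a base type holding anything
+ # besides METADATA_FIELDS_KEY, raises ValueError above.
+
+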
+ def _update_metadata_from_dict(
+         metadata_df: pandas.DataFrame,
+         config_section_dict: Dict[str, Any],
+         dict_is_metadata_fields: bool = False,
+         overwrite_non_nans: bool = False) -> pandas.DataFrame:
+     """Create an updated copy of the metadata DataFrame based on an input dictionary.
+
+     Parameters
+     ----------
+     metadata_df : pandas.DataFrame
+         The metadata DataFrame to update.
+     config_section_dict : Dict[str, Any]
+         The relevant section of a config dictionary to use.
+     dict_is_metadata_fields : bool, default=False
+         Whether the config dict contains a METADATA_FIELDS_KEY
+         (in which case False) or is itself the contents of
+         a METADATA_FIELDS_KEY (in which case True).
+     overwrite_non_nans : bool, default=False
+         Whether to overwrite non-NaN values with default values.
+
+     Returns
+     -------
+     pandas.DataFrame
+         An updated copy of the metadata DataFrame.
+     """
+     if not dict_is_metadata_fields:
+         metadata_fields_dict = config_section_dict.get(METADATA_FIELDS_KEY)
+     else:
+         metadata_fields_dict = config_section_dict
+
+     output_df = _update_metadata_from_metadata_fields_dict(
+         metadata_df, metadata_fields_dict,
+         overwrite_non_nans=overwrite_non_nans)
+     return output_df
+
+
+ def _update_metadata_from_metadata_fields_dict(
+         metadata_df: pandas.DataFrame,
+         metadata_fields_dict: Dict[str, Any],
+         overwrite_non_nans: bool) -> pandas.DataFrame:
+     """Create an updated copy of the metadata DataFrame based on a metadata fields dictionary.
+
+     Parameters
+     ----------
+     metadata_df : pandas.DataFrame
+         The metadata DataFrame to update.
+     metadata_fields_dict : Dict[str, Any]
+         Dictionary containing metadata field definitions and required values.
+     overwrite_non_nans : bool
+         Whether to overwrite non-NaN values with default values.
+
+     Returns
+     -------
+     pandas.DataFrame
+         An updated copy of the metadata DataFrame.
+     """
+     output_df = metadata_df.copy()
+
+     # loop through each metadata field in the metadata fields dict
+     for curr_field_name, curr_field_vals_dict in metadata_fields_dict.items():
+         # if the field has a default value (regardless of whether it is
+         # required), update the metadata df with it (this includes adding the
+         # field if it does not already exist). For existing fields, what exactly
+         # will be updated depends on the value of overwrite_non_nans:
+         # if overwrite_non_nans is True, then all values will be updated;
+         # if overwrite_non_nans is False, then only NA values will be updated
+         # if the field already exists in the metadata; otherwise, the field
+         # will be added to the metadata with the default value throughout.
+         if DEFAULT_KEY in curr_field_vals_dict:
+             curr_default_val = curr_field_vals_dict[DEFAULT_KEY]
+             update_metadata_df_field(
+                 output_df, curr_field_name, curr_default_val,
+                 overwrite_non_nans=overwrite_non_nans)
+         # if the field is required BUT has no default value, then if the field does not
+         # already exist in the metadata, add the field to the metadata with a placeholder value.
+         elif REQUIRED_KEY in curr_field_vals_dict:
+             curr_required_val = curr_field_vals_dict[REQUIRED_KEY]
+             if curr_required_val and curr_field_name not in output_df:
+                 update_metadata_df_field(
+                     output_df, curr_field_name, REQ_PLACEHOLDER,
+                     overwrite_non_nans=overwrite_non_nans)
+         # note that if the field is (a) required, (b) does not have a
+         # default value, and (c) IS already in the metadata, it will
+         # be left alone, with no changes made to it!
+     return output_df
+
+
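+ # Behavior summary for the field loop above, by field definition:
+ #     {DEFAULT_KEY: x}                  column added/filled with x (NaNs
+ #                                       only, unless overwrite_non_nans)
+ #     {REQUIRED_KEY: True}, no default  column added with REQ_PLACEHOLDER
+ #                                       only if not already present
+ #     required, no default, present     left untouched
+
+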
+ # fill NAs with default value if any is set
+ def _fill_na_if_default(
+         metadata_df: pandas.DataFrame,
+         specific_dict: Dict[str, Any],
+         settings_dict: Dict[str, Any]) -> pandas.DataFrame:
+     """Fill NaN values in metadata df with default values if available.
+
+     Parameters
+     ----------
+     metadata_df : pandas.DataFrame
+         The metadata DataFrame to process.
+     specific_dict : Dict[str, Any]
+         Dictionary containing context-specific settings. Will be used first as a source of default values.
+     settings_dict : Dict[str, Any]
+         Dictionary containing global settings. Will be used as a
+         source of default values if specific_dict does not contain a DEFAULT_KEY.
+
+     Returns
+     -------
+     pandas.DataFrame
+         The updated DataFrame with NaN values filled. Unchanged if no default values are set.
+     """
+     default_val = specific_dict.get(DEFAULT_KEY, settings_dict[DEFAULT_KEY])
+     if default_val:
+         # TODO: this is setting a value in the output; should it be
+         # centralized so it is easy to find?
+         metadata_df = metadata_df.fillna(default_val)
+         # metadata_df.astype("string").fillna(default_val)
+
+     return metadata_df
+
+
+ def _output_metadata_df_to_files(
+         a_df: pandas.DataFrame,
+         out_dir: str,
+         out_base: str,
+         internal_col_names: List[str],
+         sep: str = "\t",
+         remove_internals_and_fails: bool = False,
+         suppress_empty_fails: bool = False) -> None:
+     """Output DataFrame to files, optionally removing internal columns and failures.
+
+     Parameters
+     ----------
+     a_df : pandas.DataFrame
+         The metadata DataFrame to output.
+     out_dir : str
+         Directory where output files will be written.
+     out_base : str
+         Base name for output files.
+     internal_col_names : List[str]
+         List of internal column names that will be moved
+         to the end of the DataFrame.
+     sep : str, default="\t"
+         Separator to use in output files.
+     remove_internals_and_fails : bool, default=False
+         Whether to remove internal columns and failures.
+     suppress_empty_fails : bool, default=False
+         Whether to suppress empty failure files.
+     """
+     timestamp_str = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
+     extension = get_extension(sep)
+
+     # if we've been told to remove the qc fails and the internal columns
+     if remove_internals_and_fails:
+         # output a file of any qc failures
+         qc_fails_df = get_qc_failures(a_df)
+         qc_fails_fp = os.path.join(
+             out_dir, f"{timestamp_str}_{out_base}_fails.csv")
+         if qc_fails_df.empty:
+             # unless we've been told to suppress empty files, create an
+             # empty file (not even a header line) when there are no
+             # failures--bc it is easy to eyeball "zero bytes"
+             if not suppress_empty_fails:
+                 Path(qc_fails_fp).touch()
+             # else, just do nothing
+         else:
+             qc_fails_df.to_csv(qc_fails_fp, sep=",", index=False)
+
+         # then remove the qc fails and the internal columns from the metadata
+         # TODO: I'd like to avoid repeating this mask here + in get_qc_failures
+         fails_qc_mask = a_df[QC_NOTE_KEY] != ""
+         a_df = a_df.loc[~fails_qc_mask, :].copy()
+         a_df = a_df.drop(columns=internal_col_names)
+
+     # output the metadata
+     out_fp = os.path.join(out_dir, f"{timestamp_str}_{out_base}.{extension}")
+     a_df.to_csv(out_fp, sep=sep, index=False)
+
+
+ def _reorder_df(a_df: pandas.DataFrame, internal_col_names: List[str]) -> pandas.DataFrame:
+     """Reorder DataFrame columns according to standard rules.
+
+     Parameters
+     ----------
+     a_df : pandas.DataFrame
+         The DataFrame to reorder.
+     internal_col_names : List[str]
+         List of internal column names that will be moved to the end of the DataFrame.
+
+     Returns
+     -------
+     pandas.DataFrame
+         A reordered copy of the input DataFrame with:
+         - sample_name as the first column
+         - remaining columns except for internal columns in alphabetical order
+         - internal columns at the end in the order they were provided
+     """
+     # sort columns alphabetically
+     working_df = a_df.copy().reindex(sorted(a_df.columns), axis=1)
+
+     # move the internal columns to the end of the list of cols to output
+     col_names = list(working_df)
+     for curr_internal_col_name in internal_col_names:
+         # TODO: throw an error if the internal col name is not present
+         col_names.pop(col_names.index(curr_internal_col_name))
+         col_names.append(curr_internal_col_name)
+
+     # move sample name to the first column
+     col_names.insert(0, col_names.pop(col_names.index(SAMPLE_NAME_KEY)))
+     output_df = working_df.loc[:, col_names].copy()
+     return output_df
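+
+
+ # Illustrative ordering (hypothetical columns): given columns
+ # ["tube_id", SAMPLE_NAME_KEY, "age"] plus INTERNAL_COL_KEYS, _reorder_df
+ # yields [SAMPLE_NAME_KEY, "age", "tube_id",
+ #         HOSTTYPE_SHORTHAND_KEY, SAMPLETYPE_SHORTHAND_KEY, QC_NOTE_KEY].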