metameq 2026.1.2__tar.gz → 2026.2.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {metameq-2026.1.2/metameq.egg-info → metameq-2026.2.2}/PKG-INFO +1 -1
- {metameq-2026.1.2 → metameq-2026.2.2}/metameq/_version.py +3 -3
- metameq-2026.2.2/metameq/config/config.yml +7 -0
- {metameq-2026.1.2 → metameq-2026.2.2}/metameq/src/metadata_configurator.py +53 -6
- {metameq-2026.1.2 → metameq-2026.2.2}/metameq/src/metadata_extender.py +58 -36
- {metameq-2026.1.2 → metameq-2026.2.2}/metameq/src/util.py +9 -0
- {metameq-2026.1.2 → metameq-2026.2.2}/metameq/tests/test_metadata_configurator.py +188 -2
- {metameq-2026.1.2 → metameq-2026.2.2}/metameq/tests/test_metadata_extender.py +246 -93
- {metameq-2026.1.2 → metameq-2026.2.2/metameq.egg-info}/PKG-INFO +1 -1
- metameq-2026.1.2/metameq/config/config.yml +0 -3
- {metameq-2026.1.2 → metameq-2026.2.2}/.gitattributes +0 -0
- {metameq-2026.1.2 → metameq-2026.2.2}/.github/workflows/main.yaml +0 -0
- {metameq-2026.1.2 → metameq-2026.2.2}/.gitignore +0 -0
- {metameq-2026.1.2 → metameq-2026.2.2}/README.md +0 -0
- {metameq-2026.1.2 → metameq-2026.2.2}/assets/metameq.png +0 -0
- {metameq-2026.1.2 → metameq-2026.2.2}/assets/metameq_dark.svg +0 -0
- {metameq-2026.1.2 → metameq-2026.2.2}/assets/metameq_light.svg +0 -0
- {metameq-2026.1.2 → metameq-2026.2.2}/assets/metameq_medium.png +0 -0
- {metameq-2026.1.2 → metameq-2026.2.2}/environment.yml +0 -0
- {metameq-2026.1.2 → metameq-2026.2.2}/metameq/__init__.py +0 -0
- {metameq-2026.1.2 → metameq-2026.2.2}/metameq/config/__init__.py +0 -0
- {metameq-2026.1.2 → metameq-2026.2.2}/metameq/config/standards.yml +0 -0
- {metameq-2026.1.2 → metameq-2026.2.2}/metameq/src/__init__.py +0 -0
- {metameq-2026.1.2 → metameq-2026.2.2}/metameq/src/__main__.py +0 -0
- {metameq-2026.1.2 → metameq-2026.2.2}/metameq/src/metadata_merger.py +0 -0
- {metameq-2026.1.2 → metameq-2026.2.2}/metameq/src/metadata_transformers.py +0 -0
- {metameq-2026.1.2 → metameq-2026.2.2}/metameq/src/metadata_validator.py +0 -0
- {metameq-2026.1.2 → metameq-2026.2.2}/metameq/tests/__init__.py +0 -0
- {metameq-2026.1.2 → metameq-2026.2.2}/metameq/tests/data/invalid.yml +0 -0
- {metameq-2026.1.2 → metameq-2026.2.2}/metameq/tests/data/test_config.yml +0 -0
- {metameq-2026.1.2 → metameq-2026.2.2}/metameq/tests/test_metadata_merger.py +0 -0
- {metameq-2026.1.2 → metameq-2026.2.2}/metameq/tests/test_metadata_transformers.py +0 -0
- {metameq-2026.1.2 → metameq-2026.2.2}/metameq/tests/test_metadata_validator.py +0 -0
- {metameq-2026.1.2 → metameq-2026.2.2}/metameq/tests/test_util.py +0 -0
- {metameq-2026.1.2 → metameq-2026.2.2}/metameq.egg-info/SOURCES.txt +0 -0
- {metameq-2026.1.2 → metameq-2026.2.2}/metameq.egg-info/dependency_links.txt +0 -0
- {metameq-2026.1.2 → metameq-2026.2.2}/metameq.egg-info/entry_points.txt +0 -0
- {metameq-2026.1.2 → metameq-2026.2.2}/metameq.egg-info/requires.txt +0 -0
- {metameq-2026.1.2 → metameq-2026.2.2}/metameq.egg-info/top_level.txt +0 -0
- {metameq-2026.1.2 → metameq-2026.2.2}/setup.cfg +0 -0
- {metameq-2026.1.2 → metameq-2026.2.2}/setup.py +0 -0
- {metameq-2026.1.2 → metameq-2026.2.2}/versioneer.py +0 -0
|
@@ -8,11 +8,11 @@ import json
|
|
|
8
8
|
|
|
9
9
|
version_json = '''
|
|
10
10
|
{
|
|
11
|
-
"date": "2026-
|
|
11
|
+
"date": "2026-02-02T16:43:52-0800",
|
|
12
12
|
"dirty": false,
|
|
13
13
|
"error": null,
|
|
14
|
-
"full-revisionid": "
|
|
15
|
-
"version": "2026.
|
|
14
|
+
"full-revisionid": "4fe1396e1007820dc7a4bdb58708fff0df6b9a57",
|
|
15
|
+
"version": "2026.02.2"
|
|
16
16
|
}
|
|
17
17
|
''' # END VERSION_JSON
|
|
18
18
|
|
|
@@ -5,7 +5,7 @@ from metameq.src.util import extract_config_dict, extract_stds_config, \
|
|
|
5
5
|
HOST_TYPE_SPECIFIC_METADATA_KEY, \
|
|
6
6
|
SAMPLE_TYPE_SPECIFIC_METADATA_KEY, ALIAS_KEY, BASE_TYPE_KEY, \
|
|
7
7
|
DEFAULT_KEY, ALLOWED_KEY, ANYOF_KEY, TYPE_KEY, \
|
|
8
|
-
SAMPLE_TYPE_KEY, QIITA_SAMPLE_TYPE
|
|
8
|
+
SAMPLE_TYPE_KEY, QIITA_SAMPLE_TYPE, GLOBAL_SETTINGS_KEYS
|
|
9
9
|
|
|
10
10
|
|
|
11
11
|
def combine_stds_and_study_config(
|
|
@@ -257,11 +257,12 @@ def _combine_base_and_added_host_type(
|
|
|
257
257
|
host_type_wip_nested_dict = \
|
|
258
258
|
deepcopy_dict(host_type_base_dict)
|
|
259
259
|
|
|
260
|
-
# look for
|
|
261
|
-
#
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
|
|
260
|
+
# look for global settings in the add dict for this host; if
|
|
261
|
+
# any exists, add it to the wip dict (ok to overwrite existing)
|
|
262
|
+
for curr_global_setting_key in GLOBAL_SETTINGS_KEYS:
|
|
263
|
+
if curr_global_setting_key in host_type_add_dict:
|
|
264
|
+
host_type_wip_nested_dict[curr_global_setting_key] = \
|
|
265
|
+
host_type_add_dict.get(curr_global_setting_key)
|
|
265
266
|
|
|
266
267
|
# combine add metadata fields with the wip metadata fields
|
|
267
268
|
# for the current host type and assign to wip if not empty
|
|
@@ -636,6 +637,10 @@ def build_full_flat_config_dict(
|
|
|
636
637
|
# since the software config doesn't include any host type specific info
|
|
637
638
|
full_nested_hosts_dict = extract_stds_config(stds_fp)
|
|
638
639
|
|
|
640
|
+
full_nested_hosts_dict = _push_global_settings_into_top_host(
|
|
641
|
+
full_nested_hosts_dict,
|
|
642
|
+
software_plus_study_flat_config_dict)
|
|
643
|
+
|
|
639
644
|
full_flat_hosts_dict = flatten_nested_stds_dict(
|
|
640
645
|
full_nested_hosts_dict, None)
|
|
641
646
|
software_plus_study_flat_config_dict[HOST_TYPE_SPECIFIC_METADATA_KEY] = \
|
|
@@ -655,3 +660,45 @@ def build_full_flat_config_dict(
|
|
|
655
660
|
full_flat_config_dict = software_plus_study_flat_config_dict
|
|
656
661
|
|
|
657
662
|
return full_flat_config_dict
|
|
663
|
+
|
|
664
|
+
|
|
665
|
+
def _push_global_settings_into_top_host(
|
|
666
|
+
a_full_nested_hosts_dict: Dict[str, Any],
|
|
667
|
+
a_software_plus_study_flat_config_dict: Dict[str, Any]) -> Dict[str, Any]:
|
|
668
|
+
"""Push global settings from flat config into top-level host in nested hosts dict.
|
|
669
|
+
|
|
670
|
+
Parameters
|
|
671
|
+
----------
|
|
672
|
+
a_full_nested_hosts_dict : Dict[str, Any]
|
|
673
|
+
Nested hosts dictionary to update.
|
|
674
|
+
a_software_plus_study_flat_config_dict : Dict[str, Any]
|
|
675
|
+
Flat configuration dictionary containing global settings.
|
|
676
|
+
|
|
677
|
+
Returns
|
|
678
|
+
-------
|
|
679
|
+
Dict[str, Any]
|
|
680
|
+
Updated nested hosts dictionary with global settings added to top-level host.
|
|
681
|
+
|
|
682
|
+
Raises
|
|
683
|
+
------
|
|
684
|
+
ValueError
|
|
685
|
+
If there is not exactly one top-level host in the nested hosts dictionary.
|
|
686
|
+
"""
|
|
687
|
+
result = deepcopy_dict(a_full_nested_hosts_dict)
|
|
688
|
+
|
|
689
|
+
# get the top level host(s) in full_nested_hosts_dict
|
|
690
|
+
# (should be only one because it is nested)
|
|
691
|
+
top_level_host_keys = list(a_full_nested_hosts_dict[HOST_TYPE_SPECIFIC_METADATA_KEY].keys())
|
|
692
|
+
if len(top_level_host_keys) != 1:
|
|
693
|
+
raise ValueError(f"Expected exactly one top-level key in "
|
|
694
|
+
f"full_nested_hosts_dict but found: {top_level_host_keys}")
|
|
695
|
+
top_level_host_key = top_level_host_keys[0]
|
|
696
|
+
|
|
697
|
+
# check for each top-level setting from the software+study dictionary
|
|
698
|
+
# and add it under the top level host key in a_full_nested_hosts_dict
|
|
699
|
+
for curr_setting_key in GLOBAL_SETTINGS_KEYS:
|
|
700
|
+
if curr_setting_key in a_software_plus_study_flat_config_dict:
|
|
701
|
+
result[HOST_TYPE_SPECIFIC_METADATA_KEY][top_level_host_key][curr_setting_key] = \
|
|
702
|
+
a_software_plus_study_flat_config_dict[curr_setting_key]
|
|
703
|
+
|
|
704
|
+
return result
|
|
@@ -6,7 +6,7 @@ from pathlib import Path
|
|
|
6
6
|
from datetime import datetime
|
|
7
7
|
from typing import List, Dict, Optional, Tuple, Any
|
|
8
8
|
from metameq.src.util import extract_config_dict, \
|
|
9
|
-
|
|
9
|
+
validate_required_columns_exist, get_extension, \
|
|
10
10
|
load_df_with_best_fit_encoding, update_metadata_df_field, \
|
|
11
11
|
HOSTTYPE_SHORTHAND_KEY, SAMPLETYPE_SHORTHAND_KEY, \
|
|
12
12
|
QC_NOTE_KEY, METADATA_FIELDS_KEY, HOST_TYPE_SPECIFIC_METADATA_KEY, \
|
|
@@ -15,7 +15,8 @@ from metameq.src.util import extract_config_dict, \
|
|
|
15
15
|
LEAVE_BLANK_VAL, SAMPLE_NAME_KEY, \
|
|
16
16
|
ALLOWED_KEY, TYPE_KEY, LEAVE_REQUIREDS_BLANK_KEY, OVERWRITE_NON_NANS_KEY, \
|
|
17
17
|
METADATA_TRANSFORMERS_KEY, PRE_TRANSFORMERS_KEY, POST_TRANSFORMERS_KEY, \
|
|
18
|
-
SOURCES_KEY, FUNCTION_KEY, REQUIRED_RAW_METADATA_FIELDS
|
|
18
|
+
SOURCES_KEY, FUNCTION_KEY, REQUIRED_RAW_METADATA_FIELDS, \
|
|
19
|
+
HOSTTYPE_COL_OPTIONS_KEY, SAMPLETYPE_COL_OPTIONS_KEY
|
|
19
20
|
from metameq.src.metadata_configurator import update_wip_metadata_dict, \
|
|
20
21
|
build_full_flat_config_dict
|
|
21
22
|
from metameq.src.metadata_validator import validate_metadata_df, \
|
|
@@ -447,13 +448,22 @@ def extend_metadata_df(
|
|
|
447
448
|
ValueError
|
|
448
449
|
If required columns are missing from the metadata.
|
|
449
450
|
"""
|
|
451
|
+
full_flat_config_dict = build_full_flat_config_dict(
|
|
452
|
+
study_specific_config_dict, software_config_dict, stds_fp)
|
|
453
|
+
|
|
454
|
+
needed_cols = [(HOSTTYPE_SHORTHAND_KEY, HOSTTYPE_COL_OPTIONS_KEY),
|
|
455
|
+
(SAMPLETYPE_SHORTHAND_KEY, SAMPLETYPE_COL_OPTIONS_KEY)]
|
|
456
|
+
for curr_key, curr_options_key in needed_cols:
|
|
457
|
+
if curr_key not in raw_metadata_df.columns:
|
|
458
|
+
specified_name = _get_specified_column_name(
|
|
459
|
+
curr_options_key, raw_metadata_df, full_flat_config_dict)
|
|
460
|
+
if specified_name:
|
|
461
|
+
raw_metadata_df[curr_key] = raw_metadata_df[specified_name]
|
|
462
|
+
|
|
450
463
|
validate_required_columns_exist(
|
|
451
464
|
raw_metadata_df, REQUIRED_RAW_METADATA_FIELDS,
|
|
452
465
|
"metadata missing required columns")
|
|
453
466
|
|
|
454
|
-
full_flat_config_dict = build_full_flat_config_dict(
|
|
455
|
-
study_specific_config_dict, software_config_dict, stds_fp)
|
|
456
|
-
|
|
457
467
|
metadata_df, validation_msgs_df = _populate_metadata_df(
|
|
458
468
|
raw_metadata_df, full_flat_config_dict,
|
|
459
469
|
study_specific_transformers_dict)
|
|
@@ -461,6 +471,41 @@ def extend_metadata_df(
|
|
|
461
471
|
return metadata_df, validation_msgs_df
|
|
462
472
|
|
|
463
473
|
|
|
474
|
+
def _get_specified_column_name(
|
|
475
|
+
col_options_key: str,
|
|
476
|
+
raw_metadata_df: pandas.DataFrame,
|
|
477
|
+
config_dict: Dict[str, Any] = None) -> Optional[str]:
|
|
478
|
+
"""Get the specified type of column name from the metadata DataFrame based on possible options.
|
|
479
|
+
|
|
480
|
+
Parameters
|
|
481
|
+
----------
|
|
482
|
+
col_options_key : str
|
|
483
|
+
Key in the config dict that holds the list of possible column names to check.
|
|
484
|
+
raw_metadata_df : pandas.DataFrame
|
|
485
|
+
The metadata DataFrame to check.
|
|
486
|
+
config_dict : Dict[str, Any], default=None
|
|
487
|
+
Configuration dictionary. If provided, may contain a list of possible
|
|
488
|
+
column names under the key specified by col_options_key.
|
|
489
|
+
If None, defaults to values from the main config.yml file.
|
|
490
|
+
Returns
|
|
491
|
+
-------
|
|
492
|
+
Optional[str]
|
|
493
|
+
The specified column name found in the DataFrame, or None if not found.
|
|
494
|
+
"""
|
|
495
|
+
found_name = None
|
|
496
|
+
|
|
497
|
+
if not config_dict:
|
|
498
|
+
config_dict = extract_config_dict(None)
|
|
499
|
+
col_options = config_dict.get(col_options_key)
|
|
500
|
+
if col_options:
|
|
501
|
+
for col_name in col_options:
|
|
502
|
+
if col_name in raw_metadata_df.columns:
|
|
503
|
+
found_name = col_name
|
|
504
|
+
break
|
|
505
|
+
|
|
506
|
+
return found_name
|
|
507
|
+
|
|
508
|
+
|
|
464
509
|
def write_metadata_results(
|
|
465
510
|
metadata_df: pandas.DataFrame,
|
|
466
511
|
validation_msgs_df: pandas.DataFrame,
|
|
@@ -694,12 +739,6 @@ def _generate_metadata_for_host_types(
|
|
|
694
739
|
- The processed DataFrame with specific metadata added to each sample of each host type
|
|
695
740
|
- A list of validation messages
|
|
696
741
|
"""
|
|
697
|
-
# gather global settings
|
|
698
|
-
settings_dict = {DEFAULT_KEY: full_flat_config_dict.get(DEFAULT_KEY),
|
|
699
|
-
LEAVE_REQUIREDS_BLANK_KEY:
|
|
700
|
-
full_flat_config_dict.get(LEAVE_REQUIREDS_BLANK_KEY),
|
|
701
|
-
OVERWRITE_NON_NANS_KEY:
|
|
702
|
-
full_flat_config_dict.get(OVERWRITE_NON_NANS_KEY)}
|
|
703
742
|
|
|
704
743
|
validation_msgs = []
|
|
705
744
|
host_type_dfs = []
|
|
@@ -707,7 +746,7 @@ def _generate_metadata_for_host_types(
|
|
|
707
746
|
host_type_shorthands = pandas.unique(metadata_df[HOSTTYPE_SHORTHAND_KEY])
|
|
708
747
|
for curr_host_type_shorthand in host_type_shorthands:
|
|
709
748
|
concatted_dfs, curr_validation_msgs = _generate_metadata_for_a_host_type(
|
|
710
|
-
metadata_df, curr_host_type_shorthand,
|
|
749
|
+
metadata_df, curr_host_type_shorthand, full_flat_config_dict)
|
|
711
750
|
|
|
712
751
|
host_type_dfs.append(concatted_dfs)
|
|
713
752
|
validation_msgs.extend(curr_validation_msgs)
|
|
@@ -723,7 +762,7 @@ def _generate_metadata_for_host_types(
|
|
|
723
762
|
# NB: passing in the same dict twice here is not a mistake, just a
|
|
724
763
|
# convenience since we don't have a more specific dict at this point.
|
|
725
764
|
output_df = _fill_na_if_default(
|
|
726
|
-
output_df,
|
|
765
|
+
output_df, full_flat_config_dict)
|
|
727
766
|
|
|
728
767
|
# TODO: this is setting a value in the output; should it be centralized
|
|
729
768
|
# so it is easy to find?
|
|
@@ -735,7 +774,6 @@ def _generate_metadata_for_host_types(
|
|
|
735
774
|
def _generate_metadata_for_a_host_type(
|
|
736
775
|
metadata_df: pandas.DataFrame,
|
|
737
776
|
a_host_type: str,
|
|
738
|
-
settings_dict: Dict[str, Any],
|
|
739
777
|
full_flat_config_dict: Dict[str, Any]) -> Tuple[pandas.DataFrame, List[str]]:
|
|
740
778
|
"""Generate metadata df for samples with a specific host type.
|
|
741
779
|
|
|
@@ -746,8 +784,6 @@ def _generate_metadata_for_a_host_type(
|
|
|
746
784
|
the columns in REQUIRED_RAW_METADATA_FIELDS.
|
|
747
785
|
a_host_type : str
|
|
748
786
|
The specific host type for which to process samples.
|
|
749
|
-
settings_dict : Dict[str, Any]
|
|
750
|
-
Dictionary containing global settings for default/nan/etc.
|
|
751
787
|
full_flat_config_dict : Dict[str, Any]
|
|
752
788
|
Fully combined flat-host-type config dictionary.
|
|
753
789
|
|
|
@@ -770,16 +806,11 @@ def _generate_metadata_for_a_host_type(
|
|
|
770
806
|
# for these samples but do not error out; move on to the next host type
|
|
771
807
|
update_metadata_df_field(
|
|
772
808
|
host_type_df, QC_NOTE_KEY, "invalid host_type")
|
|
773
|
-
# host_type_df[QC_NOTE_KEY] = "invalid host_type"
|
|
774
809
|
concatted_df = host_type_df
|
|
775
810
|
else:
|
|
776
811
|
# gather host-type-specific settings and overwrite the global settings with them, if any
|
|
777
812
|
a_host_type_config_dict = \
|
|
778
813
|
full_flat_config_dict[HOST_TYPE_SPECIFIC_METADATA_KEY][a_host_type]
|
|
779
|
-
global_plus_host_settings_dict = deepcopy_dict(settings_dict)
|
|
780
|
-
# if this host type has a default value for empty fields, use it; otherwise, use the global default
|
|
781
|
-
global_plus_host_settings_dict[DEFAULT_KEY] = a_host_type_config_dict.get(
|
|
782
|
-
DEFAULT_KEY, global_plus_host_settings_dict[DEFAULT_KEY])
|
|
783
814
|
|
|
784
815
|
dfs_to_concat = []
|
|
785
816
|
# loop through each sample type in the metadata for this host type
|
|
@@ -789,8 +820,7 @@ def _generate_metadata_for_a_host_type(
|
|
|
789
820
|
# generate the specific metadata for this sample type *in this host type*
|
|
790
821
|
curr_sample_type_df, curr_validation_msgs = \
|
|
791
822
|
_generate_metadata_for_a_sample_type_in_a_host_type(
|
|
792
|
-
host_type_df, curr_sample_type,
|
|
793
|
-
a_host_type_config_dict)
|
|
823
|
+
host_type_df, curr_sample_type, a_host_type_config_dict)
|
|
794
824
|
|
|
795
825
|
dfs_to_concat.append(curr_sample_type_df)
|
|
796
826
|
validation_msgs.extend(curr_validation_msgs)
|
|
@@ -807,7 +837,6 @@ def _generate_metadata_for_a_host_type(
|
|
|
807
837
|
def _generate_metadata_for_a_sample_type_in_a_host_type(
|
|
808
838
|
host_type_metadata_df: pandas.DataFrame,
|
|
809
839
|
a_sample_type: str,
|
|
810
|
-
global_plus_host_settings_dict: Dict[str, Any],
|
|
811
840
|
a_host_type_config_dict: Dict[str, Any]) -> Tuple[pandas.DataFrame, List[str]]:
|
|
812
841
|
"""Generate metadata df for samples with a specific sample type within a specific host type.
|
|
813
842
|
|
|
@@ -817,8 +846,6 @@ def _generate_metadata_for_a_sample_type_in_a_host_type(
|
|
|
817
846
|
DataFrame containing metadata samples for a specific host type.
|
|
818
847
|
a_sample_type : str
|
|
819
848
|
The sample type to process.
|
|
820
|
-
global_plus_host_settings_dict : Dict[str, Any]
|
|
821
|
-
Dictionary containing default/nan/etc settings for current context.
|
|
822
849
|
a_host_type_config_dict : Dict[str, Any]
|
|
823
850
|
Dictionary containing config for this host type.
|
|
824
851
|
|
|
@@ -857,19 +884,19 @@ def _generate_metadata_for_a_sample_type_in_a_host_type(
|
|
|
857
884
|
sample_type_df = _update_metadata_from_dict(
|
|
858
885
|
sample_type_df, full_sample_type_metadata_fields_dict,
|
|
859
886
|
dict_is_metadata_fields=True,
|
|
860
|
-
overwrite_non_nans=
|
|
887
|
+
overwrite_non_nans=a_host_type_config_dict[OVERWRITE_NON_NANS_KEY])
|
|
861
888
|
|
|
862
889
|
# for fields that are required but not yet filled, replace the placeholder with
|
|
863
890
|
# either an indicator that it should be blank or else
|
|
864
891
|
# fill with NA (replaced with default just below), based on config setting
|
|
865
|
-
leave_reqs_blank =
|
|
892
|
+
leave_reqs_blank = a_host_type_config_dict[LEAVE_REQUIREDS_BLANK_KEY]
|
|
866
893
|
reqs_val = LEAVE_BLANK_VAL if leave_reqs_blank else np.nan
|
|
867
894
|
sample_type_df.replace(
|
|
868
895
|
to_replace=REQ_PLACEHOLDER, value=reqs_val, inplace=True)
|
|
869
896
|
|
|
870
897
|
# fill NAs with appropriate default value if any is set
|
|
871
898
|
sample_type_df = _fill_na_if_default(
|
|
872
|
-
sample_type_df,
|
|
899
|
+
sample_type_df, a_host_type_config_dict)
|
|
873
900
|
|
|
874
901
|
# validate the metadata df based on the specific requirements
|
|
875
902
|
# for this host+sample type
|
|
@@ -1051,7 +1078,6 @@ def _update_metadata_from_metadata_fields_dict(
|
|
|
1051
1078
|
# fill NAs with default value if any is set
|
|
1052
1079
|
def _fill_na_if_default(
|
|
1053
1080
|
metadata_df: pandas.DataFrame,
|
|
1054
|
-
specific_dict: Dict[str, Any],
|
|
1055
1081
|
settings_dict: Dict[str, Any]) -> pandas.DataFrame:
|
|
1056
1082
|
"""Fill NaN values in metadata df with default values if available.
|
|
1057
1083
|
|
|
@@ -1059,24 +1085,20 @@ def _fill_na_if_default(
|
|
|
1059
1085
|
----------
|
|
1060
1086
|
metadata_df : pandas.DataFrame
|
|
1061
1087
|
The metadata DataFrame to process.
|
|
1062
|
-
specific_dict : Dict[str, Any]
|
|
1063
|
-
Dictionary containing context-specific settings. Will be used first as a source of default values.
|
|
1064
1088
|
settings_dict : Dict[str, Any]
|
|
1065
|
-
Dictionary containing
|
|
1066
|
-
source of default values if specific_dict does not contain a DEFAULT_KEY.
|
|
1089
|
+
Dictionary containing settings.
|
|
1067
1090
|
|
|
1068
1091
|
Returns
|
|
1069
1092
|
-------
|
|
1070
1093
|
pandas.DataFrame
|
|
1071
1094
|
The updated DataFrame with NaN values filled. Unchanged if no default values are set.
|
|
1072
1095
|
"""
|
|
1073
|
-
default_val =
|
|
1096
|
+
default_val = settings_dict.get(DEFAULT_KEY)
|
|
1074
1097
|
if default_val:
|
|
1075
1098
|
# TODO: this is setting a value in the output; should it be
|
|
1076
1099
|
# centralized so it is easy to find?
|
|
1077
1100
|
metadata_df = \
|
|
1078
1101
|
metadata_df.fillna(default_val)
|
|
1079
|
-
# metadata_df.astype("string").fillna(default_val)
|
|
1080
1102
|
|
|
1081
1103
|
return metadata_df
|
|
1082
1104
|
|
|
@@ -27,6 +27,8 @@ SOURCES_KEY = "sources"
|
|
|
27
27
|
FUNCTION_KEY = "function"
|
|
28
28
|
LEAVE_REQUIREDS_BLANK_KEY = "leave_requireds_blank"
|
|
29
29
|
OVERWRITE_NON_NANS_KEY = "overwrite_non_nans"
|
|
30
|
+
HOSTTYPE_COL_OPTIONS_KEY = "hosttype_column_options"
|
|
31
|
+
SAMPLETYPE_COL_OPTIONS_KEY = "sampletype_column_options"
|
|
30
32
|
|
|
31
33
|
# internal code keys
|
|
32
34
|
HOSTTYPE_SHORTHAND_KEY = "hosttype_shorthand"
|
|
@@ -49,6 +51,13 @@ REQUIRED_RAW_METADATA_FIELDS = [SAMPLE_NAME_KEY,
|
|
|
49
51
|
SAMPLETYPE_SHORTHAND_KEY]
|
|
50
52
|
|
|
51
53
|
|
|
54
|
+
GLOBAL_SETTINGS_KEYS = [
|
|
55
|
+
DEFAULT_KEY,
|
|
56
|
+
LEAVE_REQUIREDS_BLANK_KEY,
|
|
57
|
+
OVERWRITE_NON_NANS_KEY
|
|
58
|
+
]
|
|
59
|
+
|
|
60
|
+
|
|
52
61
|
def extract_config_dict(
|
|
53
62
|
config_fp: Union[str, None]) -> dict:
|
|
54
63
|
"""Extract configuration dictionary from a YAML file.
|