metameq 2026.2.1__tar.gz → 2026.2.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41)
  1. {metameq-2026.2.1/metameq.egg-info → metameq-2026.2.3}/PKG-INFO +2 -1
  2. {metameq-2026.2.1 → metameq-2026.2.3}/README.md +1 -1
  3. {metameq-2026.2.1 → metameq-2026.2.3}/environment.yml +1 -0
  4. {metameq-2026.2.1 → metameq-2026.2.3}/metameq/__init__.py +3 -2
  5. {metameq-2026.2.1 → metameq-2026.2.3}/metameq/_version.py +3 -3
  6. {metameq-2026.2.1 → metameq-2026.2.3}/metameq/src/metadata_configurator.py +53 -6
  7. {metameq-2026.2.1 → metameq-2026.2.3}/metameq/src/metadata_extender.py +16 -38
  8. {metameq-2026.2.1 → metameq-2026.2.3}/metameq/src/util.py +7 -0
  9. {metameq-2026.2.1 → metameq-2026.2.3}/metameq/tests/test_metadata_configurator.py +184 -1
  10. {metameq-2026.2.1 → metameq-2026.2.3}/metameq/tests/test_metadata_extender.py +306 -117
  11. {metameq-2026.2.1 → metameq-2026.2.3}/metameq/tests/test_metadata_validator.py +2 -2
  12. {metameq-2026.2.1 → metameq-2026.2.3/metameq.egg-info}/PKG-INFO +2 -1
  13. {metameq-2026.2.1 → metameq-2026.2.3}/metameq.egg-info/requires.txt +1 -0
  14. {metameq-2026.2.1 → metameq-2026.2.3}/setup.py +1 -0
  15. {metameq-2026.2.1 → metameq-2026.2.3}/.gitattributes +0 -0
  16. {metameq-2026.2.1 → metameq-2026.2.3}/.github/workflows/main.yaml +0 -0
  17. {metameq-2026.2.1 → metameq-2026.2.3}/.gitignore +0 -0
  18. {metameq-2026.2.1 → metameq-2026.2.3}/assets/metameq.png +0 -0
  19. {metameq-2026.2.1 → metameq-2026.2.3}/assets/metameq_dark.svg +0 -0
  20. {metameq-2026.2.1 → metameq-2026.2.3}/assets/metameq_light.svg +0 -0
  21. {metameq-2026.2.1 → metameq-2026.2.3}/assets/metameq_medium.png +0 -0
  22. {metameq-2026.2.1 → metameq-2026.2.3}/metameq/config/__init__.py +0 -0
  23. {metameq-2026.2.1 → metameq-2026.2.3}/metameq/config/config.yml +0 -0
  24. {metameq-2026.2.1 → metameq-2026.2.3}/metameq/config/standards.yml +0 -0
  25. {metameq-2026.2.1 → metameq-2026.2.3}/metameq/src/__init__.py +0 -0
  26. {metameq-2026.2.1 → metameq-2026.2.3}/metameq/src/__main__.py +0 -0
  27. {metameq-2026.2.1 → metameq-2026.2.3}/metameq/src/metadata_merger.py +0 -0
  28. {metameq-2026.2.1 → metameq-2026.2.3}/metameq/src/metadata_transformers.py +0 -0
  29. {metameq-2026.2.1 → metameq-2026.2.3}/metameq/src/metadata_validator.py +0 -0
  30. {metameq-2026.2.1 → metameq-2026.2.3}/metameq/tests/__init__.py +0 -0
  31. {metameq-2026.2.1 → metameq-2026.2.3}/metameq/tests/data/invalid.yml +0 -0
  32. {metameq-2026.2.1 → metameq-2026.2.3}/metameq/tests/data/test_config.yml +0 -0
  33. {metameq-2026.2.1 → metameq-2026.2.3}/metameq/tests/test_metadata_merger.py +0 -0
  34. {metameq-2026.2.1 → metameq-2026.2.3}/metameq/tests/test_metadata_transformers.py +0 -0
  35. {metameq-2026.2.1 → metameq-2026.2.3}/metameq/tests/test_util.py +0 -0
  36. {metameq-2026.2.1 → metameq-2026.2.3}/metameq.egg-info/SOURCES.txt +0 -0
  37. {metameq-2026.2.1 → metameq-2026.2.3}/metameq.egg-info/dependency_links.txt +0 -0
  38. {metameq-2026.2.1 → metameq-2026.2.3}/metameq.egg-info/entry_points.txt +0 -0
  39. {metameq-2026.2.1 → metameq-2026.2.3}/metameq.egg-info/top_level.txt +0 -0
  40. {metameq-2026.2.1 → metameq-2026.2.3}/setup.cfg +0 -0
  41. {metameq-2026.2.1 → metameq-2026.2.3}/versioneer.py +0 -0
@@ -1,12 +1,13 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: metameq
3
- Version: 2026.2.1
3
+ Version: 2026.2.3
4
4
  Summary: Qiita-compliant metadata generation and validation tool
5
5
  Home-page: https://github.com/AmandaBirmingham/metameq
6
6
  Author: Amanda Birmingham
7
7
  Author-email: abirmingham@ucsd.edu
8
8
  License: BSD-3-Clause
9
9
  Requires-Dist: click>=8.0.0
10
+ Requires-Dist: openpyxl>=3.0.0
10
11
  Requires-Dist: pandas>=1.3.0
11
12
  Requires-Dist: PyYAML>=5.4.0
12
13
  Requires-Dist: Cerberus>=1.3.4
@@ -106,7 +106,7 @@ from metameq import (
106
106
  )
107
107
 
108
108
  # Load your raw metadata into a DataFrame
109
- raw_metadata_df = pd.read_csv("my_samples.csv")
109
+ raw_metadata_df = pd.read_csv("my_samples.csv", dtype=str)
110
110
 
111
111
  # Ensure required columns exist
112
112
  raw_metadata_df[HOSTTYPE_SHORTHAND_KEY] = "human"
@@ -6,6 +6,7 @@ dependencies:
6
6
  - python
7
7
  - click
8
8
  - pandas
9
+ - openpyxl
9
10
  - pip
10
11
  - pyyaml
11
12
  - flake8
@@ -9,7 +9,7 @@ from metameq.src.metadata_extender import \
9
9
  write_extended_metadata, write_extended_metadata_from_df, \
10
10
  get_reserved_cols, get_extended_metadata_from_df_and_yaml, \
11
11
  write_metadata_results, id_missing_cols, find_standard_cols, \
12
- find_nonstandard_cols, get_qc_failures
12
+ find_nonstandard_cols, get_qc_failures, extend_metadata_df
13
13
  from metameq.src.metadata_merger import merge_sample_and_subject_metadata, \
14
14
  merge_many_to_one_metadata, merge_one_to_one_metadata, \
15
15
  find_common_col_names, find_common_df_cols
@@ -36,7 +36,8 @@ __all__ = ["HOSTTYPE_SHORTHAND_KEY", "SAMPLETYPE_SHORTHAND_KEY",
36
36
  "find_nonstandard_cols", "get_qc_failures",
37
37
  "format_a_datetime", "standardize_input_sex",
38
38
  "set_life_stage_from_age_yrs", "transform_input_sex_to_std_sex",
39
- "transform_age_to_life_stage", "transform_date_to_formatted_date"]
39
+ "transform_age_to_life_stage", "transform_date_to_formatted_date",
40
+ "extend_metadata_df"]
40
41
 
41
42
  from . import _version
42
43
  __version__ = _version.get_versions()['version']
@@ -8,11 +8,11 @@ import json
8
8
 
9
9
  version_json = '''
10
10
  {
11
- "date": "2026-02-01T21:32:06-0800",
11
+ "date": "2026-02-03T15:03:32-0800",
12
12
  "dirty": false,
13
13
  "error": null,
14
- "full-revisionid": "87171fd73f555e2c03a15fa36ed9b5a912b824e9",
15
- "version": "2026.02.1"
14
+ "full-revisionid": "89687d23015566a7583179a69f92c2e1d1adcf61",
15
+ "version": "2026.02.3"
16
16
  }
17
17
  ''' # END VERSION_JSON
18
18
 
@@ -5,7 +5,7 @@ from metameq.src.util import extract_config_dict, extract_stds_config, \
5
5
  HOST_TYPE_SPECIFIC_METADATA_KEY, \
6
6
  SAMPLE_TYPE_SPECIFIC_METADATA_KEY, ALIAS_KEY, BASE_TYPE_KEY, \
7
7
  DEFAULT_KEY, ALLOWED_KEY, ANYOF_KEY, TYPE_KEY, \
8
- SAMPLE_TYPE_KEY, QIITA_SAMPLE_TYPE
8
+ SAMPLE_TYPE_KEY, QIITA_SAMPLE_TYPE, GLOBAL_SETTINGS_KEYS
9
9
 
10
10
 
11
11
  def combine_stds_and_study_config(
@@ -257,11 +257,12 @@ def _combine_base_and_added_host_type(
257
257
  host_type_wip_nested_dict = \
258
258
  deepcopy_dict(host_type_base_dict)
259
259
 
260
- # look for a default key in the add dict for this host; if
261
- # it exists, add it to the wip dict (ok to overwrite existing)
262
- if DEFAULT_KEY in host_type_add_dict:
263
- host_type_wip_nested_dict[DEFAULT_KEY] = \
264
- host_type_add_dict.get(DEFAULT_KEY)
260
+ # look for global settings in the add dict for this host; if
261
+ # any exists, add it to the wip dict (ok to overwrite existing)
262
+ for curr_global_setting_key in GLOBAL_SETTINGS_KEYS:
263
+ if curr_global_setting_key in host_type_add_dict:
264
+ host_type_wip_nested_dict[curr_global_setting_key] = \
265
+ host_type_add_dict.get(curr_global_setting_key)
265
266
 
266
267
  # combine add metadata fields with the wip metadata fields
267
268
  # for the current host type and assign to wip if not empty
@@ -636,6 +637,10 @@ def build_full_flat_config_dict(
636
637
  # since the software config doesn't include any host type specific info
637
638
  full_nested_hosts_dict = extract_stds_config(stds_fp)
638
639
 
640
+ full_nested_hosts_dict = _push_global_settings_into_top_host(
641
+ full_nested_hosts_dict,
642
+ software_plus_study_flat_config_dict)
643
+
639
644
  full_flat_hosts_dict = flatten_nested_stds_dict(
640
645
  full_nested_hosts_dict, None)
641
646
  software_plus_study_flat_config_dict[HOST_TYPE_SPECIFIC_METADATA_KEY] = \
@@ -655,3 +660,45 @@ def build_full_flat_config_dict(
655
660
  full_flat_config_dict = software_plus_study_flat_config_dict
656
661
 
657
662
  return full_flat_config_dict
663
+
664
+
665
+ def _push_global_settings_into_top_host(
666
+ a_full_nested_hosts_dict: Dict[str, Any],
667
+ a_software_plus_study_flat_config_dict: Dict[str, Any]) -> Dict[str, Any]:
668
+ """Push global settings from flat config into top-level host in nested hosts dict.
669
+
670
+ Parameters
671
+ ----------
672
+ a_full_nested_hosts_dict : Dict[str, Any]
673
+ Nested hosts dictionary to update.
674
+ a_software_plus_study_flat_config_dict : Dict[str, Any]
675
+ Flat configuration dictionary containing global settings.
676
+
677
+ Returns
678
+ -------
679
+ Dict[str, Any]
680
+ Updated nested hosts dictionary with global settings added to top-level host.
681
+
682
+ Raises
683
+ ------
684
+ ValueError
685
+ If there is not exactly one top-level host in the nested hosts dictionary.
686
+ """
687
+ result = deepcopy_dict(a_full_nested_hosts_dict)
688
+
689
+ # get the top level host(s) in full_nested_hosts_dict
690
+ # (should be only one because it is nested)
691
+ top_level_host_keys = list(a_full_nested_hosts_dict[HOST_TYPE_SPECIFIC_METADATA_KEY].keys())
692
+ if len(top_level_host_keys) != 1:
693
+ raise ValueError(f"Expected exactly one top-level key in "
694
+ f"full_nested_hosts_dict but found: {top_level_host_keys}")
695
+ top_level_host_key = top_level_host_keys[0]
696
+
697
+ # check for each top-level setting from the software+study dictionary
698
+ # and add it under the top level host key in a_full_nested_hosts_dict
699
+ for curr_setting_key in GLOBAL_SETTINGS_KEYS:
700
+ if curr_setting_key in a_software_plus_study_flat_config_dict:
701
+ result[HOST_TYPE_SPECIFIC_METADATA_KEY][top_level_host_key][curr_setting_key] = \
702
+ a_software_plus_study_flat_config_dict[curr_setting_key]
703
+
704
+ return result
@@ -6,7 +6,7 @@ from pathlib import Path
6
6
  from datetime import datetime
7
7
  from typing import List, Dict, Optional, Tuple, Any
8
8
  from metameq.src.util import extract_config_dict, \
9
- deepcopy_dict, validate_required_columns_exist, get_extension, \
9
+ validate_required_columns_exist, get_extension, \
10
10
  load_df_with_best_fit_encoding, update_metadata_df_field, \
11
11
  HOSTTYPE_SHORTHAND_KEY, SAMPLETYPE_SHORTHAND_KEY, \
12
12
  QC_NOTE_KEY, METADATA_FIELDS_KEY, HOST_TYPE_SPECIFIC_METADATA_KEY, \
@@ -301,13 +301,13 @@ def write_extended_metadata(
301
301
  # extract the extension from the raw_metadata_fp file path
302
302
  extension = os.path.splitext(raw_metadata_fp)[1]
303
303
  if extension == ".csv":
304
- raw_metadata_df = load_df_with_best_fit_encoding(raw_metadata_fp, ",")
304
+ raw_metadata_df = load_df_with_best_fit_encoding(raw_metadata_fp, ",", str)
305
305
  elif extension == ".txt":
306
- raw_metadata_df = load_df_with_best_fit_encoding(raw_metadata_fp, "\t")
306
+ raw_metadata_df = load_df_with_best_fit_encoding(raw_metadata_fp, "\t", str)
307
307
  elif extension == ".xlsx":
308
308
  # NB: this loads (only) the first sheet of the input excel file.
309
309
  # If needed, can expand with pandas.read_excel sheet_name parameter.
310
- raw_metadata_df = pandas.read_excel(raw_metadata_fp)
310
+ raw_metadata_df = pandas.read_excel(raw_metadata_fp, dtype=str)
311
311
  else:
312
312
  raise ValueError("Unrecognized input file extension; "
313
313
  "must be .csv, .txt, or .xlsx")
@@ -451,7 +451,7 @@ def extend_metadata_df(
451
451
  full_flat_config_dict = build_full_flat_config_dict(
452
452
  study_specific_config_dict, software_config_dict, stds_fp)
453
453
 
454
- needed_cols = [(HOSTTYPE_SHORTHAND_KEY, HOSTTYPE_COL_OPTIONS_KEY),
454
+ needed_cols = [(HOSTTYPE_SHORTHAND_KEY, HOSTTYPE_COL_OPTIONS_KEY),
455
455
  (SAMPLETYPE_SHORTHAND_KEY, SAMPLETYPE_COL_OPTIONS_KEY)]
456
456
  for curr_key, curr_options_key in needed_cols:
457
457
  if curr_key not in raw_metadata_df.columns:
@@ -485,7 +485,7 @@ def _get_specified_column_name(
485
485
  The metadata DataFrame to check.
486
486
  config_dict : Dict[str, Any], default=None
487
487
  Configuration dictionary. If provided, may contain a list of possible
488
- column names under the key specified by col_options_key.
488
+ column names under the key specified by col_options_key.
489
489
  If None, defaults to values from the main config.yml file.
490
490
  Returns
491
491
  -------
@@ -503,7 +503,8 @@ def _get_specified_column_name(
503
503
  found_name = col_name
504
504
  break
505
505
 
506
- return found_name
506
+ return found_name
507
+
507
508
 
508
509
  def write_metadata_results(
509
510
  metadata_df: pandas.DataFrame,
@@ -738,12 +739,6 @@ def _generate_metadata_for_host_types(
738
739
  - The processed DataFrame with specific metadata added to each sample of each host type
739
740
  - A list of validation messages
740
741
  """
741
- # gather global settings
742
- settings_dict = {DEFAULT_KEY: full_flat_config_dict.get(DEFAULT_KEY),
743
- LEAVE_REQUIREDS_BLANK_KEY:
744
- full_flat_config_dict.get(LEAVE_REQUIREDS_BLANK_KEY),
745
- OVERWRITE_NON_NANS_KEY:
746
- full_flat_config_dict.get(OVERWRITE_NON_NANS_KEY)}
747
742
 
748
743
  validation_msgs = []
749
744
  host_type_dfs = []
@@ -751,7 +746,7 @@ def _generate_metadata_for_host_types(
751
746
  host_type_shorthands = pandas.unique(metadata_df[HOSTTYPE_SHORTHAND_KEY])
752
747
  for curr_host_type_shorthand in host_type_shorthands:
753
748
  concatted_dfs, curr_validation_msgs = _generate_metadata_for_a_host_type(
754
- metadata_df, curr_host_type_shorthand, settings_dict, full_flat_config_dict)
749
+ metadata_df, curr_host_type_shorthand, full_flat_config_dict)
755
750
 
756
751
  host_type_dfs.append(concatted_dfs)
757
752
  validation_msgs.extend(curr_validation_msgs)
@@ -767,7 +762,7 @@ def _generate_metadata_for_host_types(
767
762
  # NB: passing in the same dict twice here is not a mistake, just a
768
763
  # convenience since we don't have a more specific dict at this point.
769
764
  output_df = _fill_na_if_default(
770
- output_df, settings_dict, settings_dict)
765
+ output_df, full_flat_config_dict)
771
766
 
772
767
  # TODO: this is setting a value in the output; should it be centralized
773
768
  # so it is easy to find?
@@ -779,7 +774,6 @@ def _generate_metadata_for_host_types(
779
774
  def _generate_metadata_for_a_host_type(
780
775
  metadata_df: pandas.DataFrame,
781
776
  a_host_type: str,
782
- settings_dict: Dict[str, Any],
783
777
  full_flat_config_dict: Dict[str, Any]) -> Tuple[pandas.DataFrame, List[str]]:
784
778
  """Generate metadata df for samples with a specific host type.
785
779
 
@@ -790,8 +784,6 @@ def _generate_metadata_for_a_host_type(
790
784
  the columns in REQUIRED_RAW_METADATA_FIELDS.
791
785
  a_host_type : str
792
786
  The specific host type for which to process samples.
793
- settings_dict : Dict[str, Any]
794
- Dictionary containing global settings for default/nan/etc.
795
787
  full_flat_config_dict : Dict[str, Any]
796
788
  Fully combined flat-host-type config dictionary.
797
789
 
@@ -814,16 +806,11 @@ def _generate_metadata_for_a_host_type(
814
806
  # for these samples but do not error out; move on to the next host type
815
807
  update_metadata_df_field(
816
808
  host_type_df, QC_NOTE_KEY, "invalid host_type")
817
- # host_type_df[QC_NOTE_KEY] = "invalid host_type"
818
809
  concatted_df = host_type_df
819
810
  else:
820
811
  # gather host-type-specific settings and overwrite the global settings with them, if any
821
812
  a_host_type_config_dict = \
822
813
  full_flat_config_dict[HOST_TYPE_SPECIFIC_METADATA_KEY][a_host_type]
823
- global_plus_host_settings_dict = deepcopy_dict(settings_dict)
824
- # if this host type has a default value for empty fields, use it; otherwise, use the global default
825
- global_plus_host_settings_dict[DEFAULT_KEY] = a_host_type_config_dict.get(
826
- DEFAULT_KEY, global_plus_host_settings_dict[DEFAULT_KEY])
827
814
 
828
815
  dfs_to_concat = []
829
816
  # loop through each sample type in the metadata for this host type
@@ -833,8 +820,7 @@ def _generate_metadata_for_a_host_type(
833
820
  # generate the specific metadata for this sample type *in this host type*
834
821
  curr_sample_type_df, curr_validation_msgs = \
835
822
  _generate_metadata_for_a_sample_type_in_a_host_type(
836
- host_type_df, curr_sample_type, global_plus_host_settings_dict,
837
- a_host_type_config_dict)
823
+ host_type_df, curr_sample_type, a_host_type_config_dict)
838
824
 
839
825
  dfs_to_concat.append(curr_sample_type_df)
840
826
  validation_msgs.extend(curr_validation_msgs)
@@ -851,7 +837,6 @@ def _generate_metadata_for_a_host_type(
851
837
  def _generate_metadata_for_a_sample_type_in_a_host_type(
852
838
  host_type_metadata_df: pandas.DataFrame,
853
839
  a_sample_type: str,
854
- global_plus_host_settings_dict: Dict[str, Any],
855
840
  a_host_type_config_dict: Dict[str, Any]) -> Tuple[pandas.DataFrame, List[str]]:
856
841
  """Generate metadata df for samples with a specific sample type within a specific host type.
857
842
 
@@ -861,8 +846,6 @@ def _generate_metadata_for_a_sample_type_in_a_host_type(
861
846
  DataFrame containing metadata samples for a specific host type.
862
847
  a_sample_type : str
863
848
  The sample type to process.
864
- global_plus_host_settings_dict : Dict[str, Any]
865
- Dictionary containing default/nan/etc settings for current context.
866
849
  a_host_type_config_dict : Dict[str, Any]
867
850
  Dictionary containing config for this host type.
868
851
 
@@ -901,19 +884,19 @@ def _generate_metadata_for_a_sample_type_in_a_host_type(
901
884
  sample_type_df = _update_metadata_from_dict(
902
885
  sample_type_df, full_sample_type_metadata_fields_dict,
903
886
  dict_is_metadata_fields=True,
904
- overwrite_non_nans=global_plus_host_settings_dict[OVERWRITE_NON_NANS_KEY])
887
+ overwrite_non_nans=a_host_type_config_dict[OVERWRITE_NON_NANS_KEY])
905
888
 
906
889
  # for fields that are required but not yet filled, replace the placeholder with
907
890
  # either an indicator that it should be blank or else
908
891
  # fill with NA (replaced with default just below), based on config setting
909
- leave_reqs_blank = global_plus_host_settings_dict[LEAVE_REQUIREDS_BLANK_KEY]
892
+ leave_reqs_blank = a_host_type_config_dict[LEAVE_REQUIREDS_BLANK_KEY]
910
893
  reqs_val = LEAVE_BLANK_VAL if leave_reqs_blank else np.nan
911
894
  sample_type_df.replace(
912
895
  to_replace=REQ_PLACEHOLDER, value=reqs_val, inplace=True)
913
896
 
914
897
  # fill NAs with appropriate default value if any is set
915
898
  sample_type_df = _fill_na_if_default(
916
- sample_type_df, full_sample_type_metadata_fields_dict, global_plus_host_settings_dict)
899
+ sample_type_df, a_host_type_config_dict)
917
900
 
918
901
  # validate the metadata df based on the specific requirements
919
902
  # for this host+sample type
@@ -1095,7 +1078,6 @@ def _update_metadata_from_metadata_fields_dict(
1095
1078
  # fill NAs with default value if any is set
1096
1079
  def _fill_na_if_default(
1097
1080
  metadata_df: pandas.DataFrame,
1098
- specific_dict: Dict[str, Any],
1099
1081
  settings_dict: Dict[str, Any]) -> pandas.DataFrame:
1100
1082
  """Fill NaN values in metadata df with default values if available.
1101
1083
 
@@ -1103,24 +1085,20 @@ def _fill_na_if_default(
1103
1085
  ----------
1104
1086
  metadata_df : pandas.DataFrame
1105
1087
  The metadata DataFrame to process.
1106
- specific_dict : Dict[str, Any]
1107
- Dictionary containing context-specific settings. Will be used first as a source of default values.
1108
1088
  settings_dict : Dict[str, Any]
1109
- Dictionary containing global settings. Will be used as a
1110
- source of default values if specific_dict does not contain a DEFAULT_KEY.
1089
+ Dictionary containing settings.
1111
1090
 
1112
1091
  Returns
1113
1092
  -------
1114
1093
  pandas.DataFrame
1115
1094
  The updated DataFrame with NaN values filled. Unchanged if no default values are set.
1116
1095
  """
1117
- default_val = specific_dict.get(DEFAULT_KEY, settings_dict[DEFAULT_KEY])
1096
+ default_val = settings_dict.get(DEFAULT_KEY)
1118
1097
  if default_val:
1119
1098
  # TODO: this is setting a value in the output; should it be
1120
1099
  # centralized so it is easy to find?
1121
1100
  metadata_df = \
1122
1101
  metadata_df.fillna(default_val)
1123
- # metadata_df.astype("string").fillna(default_val)
1124
1102
 
1125
1103
  return metadata_df
1126
1104
 
@@ -51,6 +51,13 @@ REQUIRED_RAW_METADATA_FIELDS = [SAMPLE_NAME_KEY,
51
51
  SAMPLETYPE_SHORTHAND_KEY]
52
52
 
53
53
 
54
+ GLOBAL_SETTINGS_KEYS = [
55
+ DEFAULT_KEY,
56
+ LEAVE_REQUIREDS_BLANK_KEY,
57
+ OVERWRITE_NON_NANS_KEY
58
+ ]
59
+
60
+
54
61
  def extract_config_dict(
55
62
  config_fp: Union[str, None]) -> dict:
56
63
  """Extract configuration dictionary from a YAML file.
@@ -17,7 +17,8 @@ from metameq.src.metadata_configurator import \
17
17
  _id_sample_type_definition, \
18
18
  update_wip_metadata_dict, \
19
19
  build_full_flat_config_dict, \
20
- _resolve_sample_type_aliases_and_bases
20
+ _resolve_sample_type_aliases_and_bases, \
21
+ _push_global_settings_into_top_host
21
22
 
22
23
 
23
24
  class TestMetadataConfigurator(TestCase):
@@ -3847,6 +3848,9 @@ class TestMetadataConfigurator(TestCase):
3847
3848
  HOST_TYPE_SPECIFIC_METADATA_KEY: {
3848
3849
  # base: top level in test_standards.yml, no default
3849
3850
  "base": {
3851
+ DEFAULT_KEY: "software_default",
3852
+ LEAVE_REQUIREDS_BLANK_KEY: True,
3853
+ OVERWRITE_NON_NANS_KEY: False,
3850
3854
  METADATA_FIELDS_KEY: {
3851
3855
  # sample_name defined at base level
3852
3856
  "sample_name": {
@@ -3865,6 +3869,8 @@ class TestMetadataConfigurator(TestCase):
3865
3869
  "host_associated": {
3866
3870
  # default defined at host_associated level
3867
3871
  DEFAULT_KEY: "not provided",
3872
+ LEAVE_REQUIREDS_BLANK_KEY: True,
3873
+ OVERWRITE_NON_NANS_KEY: False,
3868
3874
  METADATA_FIELDS_KEY: {
3869
3875
  # description defined at host_associated level
3870
3876
  "description": {
@@ -3919,6 +3925,8 @@ class TestMetadataConfigurator(TestCase):
3919
3925
  "human": {
3920
3926
  # default inherited from host_associated
3921
3927
  DEFAULT_KEY: "not provided",
3928
+ LEAVE_REQUIREDS_BLANK_KEY: True,
3929
+ OVERWRITE_NON_NANS_KEY: False,
3922
3930
  METADATA_FIELDS_KEY: {
3923
3931
  # custom_field added from study_specific_metadata
3924
3932
  "custom_field": {
@@ -4037,6 +4045,8 @@ class TestMetadataConfigurator(TestCase):
4037
4045
  "mouse": {
4038
4046
  # default inherited from host_associated
4039
4047
  DEFAULT_KEY: "not provided",
4048
+ LEAVE_REQUIREDS_BLANK_KEY: True,
4049
+ OVERWRITE_NON_NANS_KEY: False,
4040
4050
  METADATA_FIELDS_KEY: {
4041
4051
  # description inherited from host_associated (not overridden)
4042
4052
  "description": {
@@ -4103,6 +4113,7 @@ class TestMetadataConfigurator(TestCase):
4103
4113
  }
4104
4114
  }
4105
4115
  }
4116
+
4106
4117
  self.assertEqual(expected, result)
4107
4118
 
4108
4119
  def test_build_full_flat_config_dict_without_study_config(self):
@@ -4130,6 +4141,9 @@ class TestMetadataConfigurator(TestCase):
4130
4141
  HOST_TYPE_SPECIFIC_METADATA_KEY: {
4131
4142
  # base: top level, no default, just sample_name/sample_type
4132
4143
  "base": {
4144
+ DEFAULT_KEY: "software_default",
4145
+ LEAVE_REQUIREDS_BLANK_KEY: True,
4146
+ OVERWRITE_NON_NANS_KEY: False,
4133
4147
  METADATA_FIELDS_KEY: {
4134
4148
  "sample_name": {
4135
4149
  REQUIRED_KEY: True,
@@ -4145,6 +4159,8 @@ class TestMetadataConfigurator(TestCase):
4145
4159
  # host_associated: inherits from base, adds default and description
4146
4160
  "host_associated": {
4147
4161
  DEFAULT_KEY: "not provided",
4162
+ LEAVE_REQUIREDS_BLANK_KEY: True,
4163
+ OVERWRITE_NON_NANS_KEY: False,
4148
4164
  METADATA_FIELDS_KEY: {
4149
4165
  "description": {
4150
4166
  DEFAULT_KEY: "host associated sample",
@@ -4194,6 +4210,8 @@ class TestMetadataConfigurator(TestCase):
4194
4210
  # human: inherits from host_associated, overrides description
4195
4211
  "human": {
4196
4212
  DEFAULT_KEY: "not provided",
4213
+ LEAVE_REQUIREDS_BLANK_KEY: True,
4214
+ OVERWRITE_NON_NANS_KEY: False,
4197
4215
  METADATA_FIELDS_KEY: {
4198
4216
  "description": {
4199
4217
  DEFAULT_KEY: "human sample",
@@ -4291,6 +4309,8 @@ class TestMetadataConfigurator(TestCase):
4291
4309
  # mouse: inherits from host_associated, keeps parent description
4292
4310
  "mouse": {
4293
4311
  DEFAULT_KEY: "not provided",
4312
+ LEAVE_REQUIREDS_BLANK_KEY: True,
4313
+ OVERWRITE_NON_NANS_KEY: False,
4294
4314
  METADATA_FIELDS_KEY: {
4295
4315
  "description": {
4296
4316
  DEFAULT_KEY: "host associated sample",
@@ -4395,6 +4415,12 @@ class TestMetadataConfigurator(TestCase):
4395
4415
  # Flattened host types
4396
4416
  HOST_TYPE_SPECIFIC_METADATA_KEY: {
4397
4417
  "base": {
4418
+ # default from study_config overrides software_config
4419
+ DEFAULT_KEY: "study_default",
4420
+ # leave_requireds_blank from study_config overrides software_config
4421
+ LEAVE_REQUIREDS_BLANK_KEY: True,
4422
+ # overwrite_non_nans from software_config (not overridden by study)
4423
+ OVERWRITE_NON_NANS_KEY: True,
4398
4424
  METADATA_FIELDS_KEY: {
4399
4425
  "sample_name": {
4400
4426
  REQUIRED_KEY: True,
@@ -4409,6 +4435,10 @@ class TestMetadataConfigurator(TestCase):
4409
4435
  },
4410
4436
  "host_associated": {
4411
4437
  DEFAULT_KEY: "not provided",
4438
+ # leave_requireds_blank from study_config overrides software_config
4439
+ LEAVE_REQUIREDS_BLANK_KEY: True,
4440
+ # overwrite_non_nans from software_config (not overridden by study)
4441
+ OVERWRITE_NON_NANS_KEY: True,
4412
4442
  METADATA_FIELDS_KEY: {
4413
4443
  "description": {
4414
4444
  DEFAULT_KEY: "host associated sample",
@@ -4457,6 +4487,10 @@ class TestMetadataConfigurator(TestCase):
4457
4487
  },
4458
4488
  "human": {
4459
4489
  DEFAULT_KEY: "not provided",
4490
+ # leave_requireds_blank from study_config overrides software_config
4491
+ LEAVE_REQUIREDS_BLANK_KEY: True,
4492
+ # overwrite_non_nans from software_config (not overridden by study)
4493
+ OVERWRITE_NON_NANS_KEY: True,
4460
4494
  METADATA_FIELDS_KEY: {
4461
4495
  "description": {
4462
4496
  DEFAULT_KEY: "human sample",
@@ -4553,6 +4587,10 @@ class TestMetadataConfigurator(TestCase):
4553
4587
  },
4554
4588
  "mouse": {
4555
4589
  DEFAULT_KEY: "not provided",
4590
+ # leave_requireds_blank from study_config overrides software_config
4591
+ LEAVE_REQUIREDS_BLANK_KEY: True,
4592
+ # overwrite_non_nans from software_config (not overridden by study)
4593
+ OVERWRITE_NON_NANS_KEY: True,
4556
4594
  METADATA_FIELDS_KEY: {
4557
4595
  "description": {
4558
4596
  DEFAULT_KEY: "host associated sample",
@@ -4649,6 +4687,9 @@ class TestMetadataConfigurator(TestCase):
4649
4687
  # Flattened host types
4650
4688
  HOST_TYPE_SPECIFIC_METADATA_KEY: {
4651
4689
  "base": {
4690
+ DEFAULT_KEY: "not applicable",
4691
+ LEAVE_REQUIREDS_BLANK_KEY: False,
4692
+ OVERWRITE_NON_NANS_KEY: False,
4652
4693
  METADATA_FIELDS_KEY: {
4653
4694
  "sample_name": {
4654
4695
  REQUIRED_KEY: True,
@@ -4663,6 +4704,8 @@ class TestMetadataConfigurator(TestCase):
4663
4704
  },
4664
4705
  "host_associated": {
4665
4706
  DEFAULT_KEY: "not provided",
4707
+ LEAVE_REQUIREDS_BLANK_KEY: False,
4708
+ OVERWRITE_NON_NANS_KEY: False,
4666
4709
  METADATA_FIELDS_KEY: {
4667
4710
  "description": {
4668
4711
  DEFAULT_KEY: "host associated sample",
@@ -4711,6 +4754,8 @@ class TestMetadataConfigurator(TestCase):
4711
4754
  },
4712
4755
  "human": {
4713
4756
  DEFAULT_KEY: "not provided",
4757
+ LEAVE_REQUIREDS_BLANK_KEY: False,
4758
+ OVERWRITE_NON_NANS_KEY: False,
4714
4759
  METADATA_FIELDS_KEY: {
4715
4760
  "description": {
4716
4761
  DEFAULT_KEY: "human sample",
@@ -4807,6 +4852,8 @@ class TestMetadataConfigurator(TestCase):
4807
4852
  },
4808
4853
  "mouse": {
4809
4854
  DEFAULT_KEY: "not provided",
4855
+ LEAVE_REQUIREDS_BLANK_KEY: False,
4856
+ OVERWRITE_NON_NANS_KEY: False,
4810
4857
  METADATA_FIELDS_KEY: {
4811
4858
  "description": {
4812
4859
  DEFAULT_KEY: "host associated sample",
@@ -4867,4 +4914,140 @@ class TestMetadataConfigurator(TestCase):
4867
4914
  }
4868
4915
  }
4869
4916
  }
4917
+
4918
+ self.assertEqual(expected, result)
4919
+
4920
+ # Tests for _push_global_settings_into_top_host
4921
+
4922
+ def test__push_global_settings_into_top_host_single_setting(self):
4923
+ """Test pushing a single global setting into the top-level host."""
4924
+ nested_hosts_dict = {
4925
+ HOST_TYPE_SPECIFIC_METADATA_KEY: {
4926
+ "base": {
4927
+ METADATA_FIELDS_KEY: {
4928
+ "field1": {TYPE_KEY: "string"}
4929
+ }
4930
+ }
4931
+ }
4932
+ }
4933
+ flat_config_dict = {
4934
+ DEFAULT_KEY: "custom_default"
4935
+ }
4936
+
4937
+ expected = {
4938
+ HOST_TYPE_SPECIFIC_METADATA_KEY: {
4939
+ "base": {
4940
+ DEFAULT_KEY: "custom_default",
4941
+ METADATA_FIELDS_KEY: {
4942
+ "field1": {TYPE_KEY: "string"}
4943
+ }
4944
+ }
4945
+ }
4946
+ }
4947
+
4948
+ result = _push_global_settings_into_top_host(
4949
+ nested_hosts_dict, flat_config_dict)
4950
+
4951
+ self.assertEqual(expected, result)
4952
+ # Original should be unchanged
4953
+ self.assertNotIn(
4954
+ DEFAULT_KEY,
4955
+ nested_hosts_dict[HOST_TYPE_SPECIFIC_METADATA_KEY]["base"])
4956
+
4957
+ def test__push_global_settings_into_top_host_multiple_settings(self):
4958
+ """Test pushing multiple global settings into the top-level host."""
4959
+ nested_hosts_dict = {
4960
+ HOST_TYPE_SPECIFIC_METADATA_KEY: {
4961
+ "base": {
4962
+ METADATA_FIELDS_KEY: {
4963
+ "field1": {TYPE_KEY: "string"}
4964
+ }
4965
+ }
4966
+ }
4967
+ }
4968
+ flat_config_dict = {
4969
+ DEFAULT_KEY: "custom_default",
4970
+ LEAVE_REQUIREDS_BLANK_KEY: True,
4971
+ OVERWRITE_NON_NANS_KEY: True
4972
+ }
4973
+
4974
+ expected = {
4975
+ HOST_TYPE_SPECIFIC_METADATA_KEY: {
4976
+ "base": {
4977
+ DEFAULT_KEY: "custom_default",
4978
+ LEAVE_REQUIREDS_BLANK_KEY: True,
4979
+ OVERWRITE_NON_NANS_KEY: True,
4980
+ METADATA_FIELDS_KEY: {
4981
+ "field1": {TYPE_KEY: "string"}
4982
+ }
4983
+ }
4984
+ }
4985
+ }
4986
+
4987
+ result = _push_global_settings_into_top_host(
4988
+ nested_hosts_dict, flat_config_dict)
4989
+
4870
4990
  self.assertEqual(expected, result)
4991
+
4992
+ def test__push_global_settings_into_top_host_no_settings(self):
4993
+ """Test that function returns copy when no global settings present."""
4994
+ nested_hosts_dict = {
4995
+ HOST_TYPE_SPECIFIC_METADATA_KEY: {
4996
+ "base": {
4997
+ METADATA_FIELDS_KEY: {
4998
+ "field1": {TYPE_KEY: "string"}
4999
+ }
5000
+ }
5001
+ }
5002
+ }
5003
+ flat_config_dict = {
5004
+ "some_other_key": "value"
5005
+ }
5006
+
5007
+ expected = {
5008
+ HOST_TYPE_SPECIFIC_METADATA_KEY: {
5009
+ "base": {
5010
+ METADATA_FIELDS_KEY: {
5011
+ "field1": {TYPE_KEY: "string"}
5012
+ }
5013
+ }
5014
+ }
5015
+ }
5016
+
5017
+ result = _push_global_settings_into_top_host(
5018
+ nested_hosts_dict, flat_config_dict)
5019
+
5020
+ self.assertEqual(expected, result)
5021
+
5022
+ def test__push_global_settings_into_top_host_raises_on_zero_hosts(self):
5023
+ """Test that ValueError is raised when no top-level hosts exist."""
5024
+ nested_hosts_dict = {
5025
+ HOST_TYPE_SPECIFIC_METADATA_KEY: {}
5026
+ }
5027
+ flat_config_dict = {
5028
+ DEFAULT_KEY: "custom_default"
5029
+ }
5030
+
5031
+ with self.assertRaisesRegex(
5032
+ ValueError,
5033
+ r"Expected exactly one top-level key.*found: \[\]"):
5034
+ _push_global_settings_into_top_host(
5035
+ nested_hosts_dict, flat_config_dict)
5036
+
5037
+ def test__push_global_settings_into_top_host_raises_on_multiple_hosts(self):
5038
+ """Test that ValueError is raised when multiple top-level hosts exist."""
5039
+ nested_hosts_dict = {
5040
+ HOST_TYPE_SPECIFIC_METADATA_KEY: {
5041
+ "host1": {METADATA_FIELDS_KEY: {}},
5042
+ "host2": {METADATA_FIELDS_KEY: {}}
5043
+ }
5044
+ }
5045
+ flat_config_dict = {
5046
+ DEFAULT_KEY: "custom_default"
5047
+ }
5048
+
5049
+ with self.assertRaisesRegex(
5050
+ ValueError,
5051
+ r"Expected exactly one top-level key"):
5052
+ _push_global_settings_into_top_host(
5053
+ nested_hosts_dict, flat_config_dict)