PyPI - metameq - Versions diffs - 2026.1.1__py3-none-any.whl → 2026.1.2__py3-none-any.whl - Mend

metameq 2026.1.1__py3-none-any.whl → 2026.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (12) hide show

metameq/_version.py +3 -3
metameq/src/metadata_configurator.py +146 -1
metameq/src/metadata_extender.py +44 -38
metameq/tests/test_metadata_configurator.py +2741 -208
metameq/tests/test_metadata_extender.py +2034 -497
metameq/tests/test_metadata_merger.py +1 -1
metameq/tests/test_util.py +1 -1
{metameq-2026.1.1.dist-info → metameq-2026.1.2.dist-info}/METADATA +1 -1
{metameq-2026.1.1.dist-info → metameq-2026.1.2.dist-info}/RECORD +12 -12
{metameq-2026.1.1.dist-info → metameq-2026.1.2.dist-info}/WHEEL +0 -0
{metameq-2026.1.1.dist-info → metameq-2026.1.2.dist-info}/entry_points.txt +0 -0
{metameq-2026.1.1.dist-info → metameq-2026.1.2.dist-info}/top_level.txt +0 -0

metameq/_version.py CHANGED Viewed

@@ -8,11 +8,11 @@ import json
 version_json = '''
 {
- "date": "2026-01-28T14:30:42-0800",
+ "date": "2026-01-31T12:28:01-0800",
  "dirty": false,
  "error": null,
- "full-revisionid": "b60757af0c4b7b16d71119971565d9991779f6d2",
- "version": "2026.01.1"
+ "full-revisionid": "889941fbd7d28889867e3f4b6edba2d50dbc5956",
+ "version": "2026.01.2"
 }
 '''  # END VERSION_JSON

metameq/src/metadata_configurator.py CHANGED Viewed

@@ -4,7 +4,8 @@ from metameq.src.util import extract_config_dict, extract_stds_config, \
     METADATA_FIELDS_KEY, STUDY_SPECIFIC_METADATA_KEY, \
     HOST_TYPE_SPECIFIC_METADATA_KEY, \
     SAMPLE_TYPE_SPECIFIC_METADATA_KEY, ALIAS_KEY, BASE_TYPE_KEY, \
-    DEFAULT_KEY, ALLOWED_KEY, ANYOF_KEY, TYPE_KEY
+    DEFAULT_KEY, ALLOWED_KEY, ANYOF_KEY, TYPE_KEY, \
+    SAMPLE_TYPE_KEY, QIITA_SAMPLE_TYPE
 def combine_stds_and_study_config(
@@ -89,6 +90,15 @@ def flatten_nested_stds_dict(
         if curr_host_type_sub_host_dict:
             wip_host_types_dict.update(curr_host_type_sub_host_dict)
+        # resolve aliases and base types for this host's sample types
+        # This happens AFTER recursion so children inherit unresolved aliases,
+        # ensuring correct bottom-up resolution order
+        if SAMPLE_TYPE_SPECIFIC_METADATA_KEY in curr_host_type_wip_flat_dict:
+            curr_host_type_wip_flat_dict[SAMPLE_TYPE_SPECIFIC_METADATA_KEY] = \
+                _resolve_sample_type_aliases_and_bases(
+                    curr_host_type_wip_flat_dict[SAMPLE_TYPE_SPECIFIC_METADATA_KEY],
+                    curr_host_type_wip_flat_dict.get(METADATA_FIELDS_KEY, {}))
         # assign the flattened wip dict for the current host type to the result
         # (which now contains flat records for the hosts lower down than
         # this, if there are any)
@@ -270,8 +280,11 @@ def _combine_base_and_added_host_type(
         _combine_base_and_added_sample_type_specific_metadata(
             host_type_wip_nested_dict,
             host_type_add_dict)
     # if we got back a non-empty dictionary of sample types,
     # add it to the wip for this host type dict
+    # Note: resolution of aliases/base types happens in flatten_nested_stds_dict
+    # AFTER recursion, to ensure correct bottom-up resolution order
     if curr_host_wip_sample_types_dict:
         host_type_wip_nested_dict[
             SAMPLE_TYPE_SPECIFIC_METADATA_KEY] = \
@@ -450,6 +463,130 @@ def _id_sample_type_definition(sample_type_name: str, sample_type_dict: Dict[str
                          "the same sample type dict")
+def _construct_sample_type_metadata_fields_dict(
+        sample_type: str,
+        host_sample_types_config_dict: Dict[str, Any],
+        a_host_type_metadata_fields_dict: Dict[str, Any]) -> Dict[str, Any]:
+    """Construct metadata fields dictionary for a specific host+sample type, resolving aliases and base types.
+    Parameters
+    ----------
+    sample_type : str
+        The sample type to process.
+    host_sample_types_config_dict : Dict[str, Any]
+        Dictionary containing config for *all* sample types in
+        the host type in question.
+    a_host_type_metadata_fields_dict : Dict[str, Any]
+        Dictionary containing metadata fields for the host type in question.
+    Returns
+    -------
+    Dict[str, Any]
+        The constructed metadata fields dictionary for this host-and-sample-type combination.
+    Raises
+    ------
+    ValueError
+        If there are invalid alias chains or base type configurations.
+    """
+    sample_type_for_metadata = sample_type
+    # get dict associated with the naive sample type
+    sample_type_specific_dict = \
+        host_sample_types_config_dict[sample_type]
+    # if naive sample type contains an alias
+    sample_type_alias = sample_type_specific_dict.get(ALIAS_KEY)
+    if sample_type_alias:
+        # change the sample type to the alias sample type
+        # and use the alias's sample type dict
+        sample_type_for_metadata = sample_type_alias
+        sample_type_specific_dict = \
+            host_sample_types_config_dict[sample_type_alias]
+        if METADATA_FIELDS_KEY not in sample_type_specific_dict:
+            raise ValueError(f"May not chain aliases "
+                             f"('{sample_type}' to '{sample_type_alias}')")
+    # endif sample type is an alias
+    # if the sample type has a base type
+    sample_type_base = sample_type_specific_dict.get(BASE_TYPE_KEY)
+    if sample_type_base:
+        # get the base's sample type dict and add this sample type's
+        # info on top of it
+        base_sample_dict = host_sample_types_config_dict[sample_type_base]
+        if list(base_sample_dict.keys()) != [METADATA_FIELDS_KEY]:
+            raise ValueError(f"Base sample type '{sample_type_base}' "
+                             f"must only have metadata fields")
+        sample_type_specific_dict_metadata = update_wip_metadata_dict(
+            deepcopy_dict(base_sample_dict[METADATA_FIELDS_KEY]),
+            sample_type_specific_dict.get(METADATA_FIELDS_KEY, {}))
+        sample_type_specific_dict = deepcopy_dict(sample_type_specific_dict)
+        sample_type_specific_dict[METADATA_FIELDS_KEY] = \
+            sample_type_specific_dict_metadata
+    # endif sample type has a base type
+    # add the sample-type-specific info generated above on top of the host info
+    sample_type_metadata_dict = update_wip_metadata_dict(
+        deepcopy_dict(a_host_type_metadata_fields_dict),
+        sample_type_specific_dict.get(METADATA_FIELDS_KEY, {}))
+    # set sample_type, and qiita_sample_type if it is not already set
+    sample_type_definition = {
+        ALLOWED_KEY: [sample_type_for_metadata],
+        DEFAULT_KEY: sample_type_for_metadata,
+        TYPE_KEY: "string"
+    }
+    sample_type_metadata_dict = update_wip_metadata_dict(
+        sample_type_metadata_dict, {SAMPLE_TYPE_KEY: sample_type_definition})
+    if QIITA_SAMPLE_TYPE not in sample_type_metadata_dict:
+        sample_type_metadata_dict = update_wip_metadata_dict(
+            sample_type_metadata_dict, {QIITA_SAMPLE_TYPE: sample_type_definition})
+    # end if qiita_sample_type not already set
+    return sample_type_metadata_dict
+def _resolve_sample_type_aliases_and_bases(
+        sample_types_dict: Dict[str, Any],
+        host_metadata_fields_dict: Dict[str, Any]) -> Dict[str, Any]:
+    """Resolve aliases and base types in sample type definitions.
+    For each sample type in the input dictionary:
+    1. If it's an alias, follow the alias and resolve the target's metadata
+    2. If it has a base_type, inherit metadata fields from the base
+    3. Merge sample-type metadata fields with host-level metadata fields
+    4. Add sample_type and qiita_sample_type fields
+    Parameters
+    ----------
+    sample_types_dict : Dict[str, Any]
+        Dictionary of sample type configurations (from sample_type_specific_metadata).
+    host_metadata_fields_dict : Dict[str, Any]
+        Host-level metadata fields to merge into each sample type.
+    Returns
+    -------
+    Dict[str, Any]
+        Dictionary with all sample types resolved.
+    Raises
+    ------
+    ValueError
+        If chained aliases are detected or base type has invalid structure.
+    """
+    result = {}
+    for sample_type_name in sample_types_dict.keys():
+        resolved_metadata = _construct_sample_type_metadata_fields_dict(
+            sample_type_name, sample_types_dict, host_metadata_fields_dict)
+        result[sample_type_name] = {
+            METADATA_FIELDS_KEY: resolved_metadata
+        }
+    return result
 def build_full_flat_config_dict(
         study_specific_config_dict: Optional[Dict[str, Any]] = None,
         software_config_dict: Optional[Dict[str, Any]] = None,
@@ -503,6 +640,14 @@ def build_full_flat_config_dict(
         full_nested_hosts_dict, None)
     software_plus_study_flat_config_dict[HOST_TYPE_SPECIFIC_METADATA_KEY] = \
         full_flat_hosts_dict
+    # drop the STUDY_SPECIFIC_METADATA_KEY from the final output dict (because
+    # its contents have already been incorporated into the
+    # HOST_TYPE_SPECIFIC_METADATA_KEY section); note we keep all the other
+    # top-level keys from the study-specific config dict
+    if STUDY_SPECIFIC_METADATA_KEY in software_plus_study_flat_config_dict:
+        del software_plus_study_flat_config_dict[STUDY_SPECIFIC_METADATA_KEY]
     # this is just a renaming to indicate that, having overwritten any original
     # HOST_TYPE_SPECIFIC_METADATA_KEY in the software_plus_study_flat_config_dict
     # with the complete and flattened combination of software+study+standards, it is now

metameq/src/metadata_extender.py CHANGED Viewed

@@ -42,7 +42,7 @@ pandas.set_option("future.no_silent_downcasting", True)
 def get_reserved_cols(
         raw_metadata_df: pandas.DataFrame,
         study_specific_config_dict: Dict[str, Any],
-        study_specific_transformers_dict: Optional[Dict[str, Any]] = None) -> List[str]:
+        stds_fp: Optional[str] = None) -> List[str]:
     """Get a list of all reserved column names for all host+sample type combinations in the metadata.
     Note that 'reserved' is not the same as 'required'.  Some column names (e.g.,
@@ -55,8 +55,9 @@ def get_reserved_cols(
         The input metadata DataFrame.
     study_specific_config_dict : Dict[str, Any]
         Study-specific flat-host-type config dictionary.
-    study_specific_transformers_dict : Optional[Dict[str, Any]], default=None
-        Dictionary of custom transformers for this study (only).
+    stds_fp : Optional[str], default=None
+        Path to standards dictionary file. If None, the default standards
+        config pulled from the standards.yml file will be used.
     Returns
     -------
@@ -90,10 +91,10 @@ def get_reserved_cols(
     # extend the metadata_df to get all the required columns for all host+sample type combinations;
     # we don't really care about the contents of these columns, just their names.
-    # (I doubt it is necessary to pass the actual study_specific_transformers_dict; could probably just use None)
+    # (Likewise, it is not necessary to pass the actual study_specific_transformers_dict so
+    # just use None)
     metadata_df, _ = extend_metadata_df(
-        temp_df, study_specific_config_dict,
-        study_specific_transformers_dict)
+        temp_df, study_specific_config_dict, None, None, stds_fp)
     return sorted(metadata_df.columns.to_list())
@@ -119,7 +120,7 @@ def id_missing_cols(a_df: pandas.DataFrame) -> List[str]:
 def find_standard_cols(
         a_df: pandas.DataFrame,
         study_specific_config_dict: Dict[str, Any],
-        study_specific_transformers_dict: Optional[Dict[str, Any]] = None,
+        stds_fp: Optional[str] = None,
         suppress_missing_name_err: bool = False) -> List[str]:
     """Find all the standard columns in the metadata DataFrame.
@@ -129,8 +130,9 @@ def find_standard_cols(
         The metadata DataFrame to analyze.
     study_specific_config_dict : Dict[str, Any]
         Study-specific flat-host-type config dictionary.
-    study_specific_transformers_dict : Optional[Dict[str, Any]], default=None
-        Dictionary of custom transformers for this study (only).
+    stds_fp : Optional[str], default=None
+        Path to standards dictionary file. If None, the default standards
+        config pulled from the standards.yml file will be used.
     suppress_missing_name_err : bool, default=False
         Whether to suppress errors about missing sample name.
@@ -156,8 +158,7 @@ def find_standard_cols(
     # get the intersection of the reserved standard columns and
     # the columns in the input dataframe
     standard_cols = get_reserved_cols(
-        a_df, study_specific_config_dict,
-        study_specific_transformers_dict=study_specific_transformers_dict)
+        a_df, study_specific_config_dict, stds_fp)
     standard_cols_set = (set(standard_cols) - set(INTERNAL_COL_KEYS))
@@ -167,7 +168,7 @@ def find_standard_cols(
 def find_nonstandard_cols(
         a_df: pandas.DataFrame,
         study_specific_config_dict: Dict[str, Any],
-        study_specific_transformers_dict: Optional[Dict[str, Any]] = None) -> List[str]:
+        stds_fp: Optional[str] = None) -> List[str]:
     """Find any non-standard columns in the metadata DataFrame.
     Parameters
@@ -176,8 +177,9 @@ def find_nonstandard_cols(
         The metadata DataFrame to analyze.
     study_specific_config_dict : Dict[str, Any]
         Study-specific flat-host-type config dictionary.
-    study_specific_transformers_dict : Optional[Dict[str, Any]], default=None
-        Dictionary of custom transformers for this study (only).
+    stds_fp : Optional[str], default=None
+        Path to standards dictionary file. If None, the default standards
+        config pulled from the standards.yml file will be used.
     Returns
     -------
@@ -195,15 +197,15 @@ def find_nonstandard_cols(
     # get the columns in
     standard_cols = get_reserved_cols(
-        a_df, study_specific_config_dict,
-        study_specific_transformers_dict=study_specific_transformers_dict)
+        a_df, study_specific_config_dict, stds_fp)
     return list(set(a_df.columns) - set(standard_cols))
 def get_extended_metadata_from_df_and_yaml(
         raw_metadata_df: pandas.DataFrame,
-        study_specific_config_fp: Optional[str]) -> Tuple[pandas.DataFrame, pandas.DataFrame]:
+        study_specific_config_fp: Optional[str],
+        stds_fp: Optional[str] = None) -> Tuple[pandas.DataFrame, pandas.DataFrame]:
     """Extend metadata using configuration from a study-specific YAML config file.
     Parameters
@@ -212,6 +214,9 @@ def get_extended_metadata_from_df_and_yaml(
         The raw metadata DataFrame to extend.
     study_specific_config_fp : Optional[str]
         Path to the study-specific configuration YAML file.
+    stds_fp : Optional[str], default=None
+        Path to standards dictionary file. If None, the default standards
+        config pulled from the standards.yml file will be used.
     Returns
     -------
@@ -226,7 +231,8 @@ def get_extended_metadata_from_df_and_yaml(
     # extend the metadata DataFrame using the study-specific flat-host-type config dictionary
     metadata_df, validation_msgs_df = \
-        extend_metadata_df(raw_metadata_df, study_specific_config_dict)
+        extend_metadata_df(raw_metadata_df, study_specific_config_dict,
+                           None, None, stds_fp)
     return metadata_df, validation_msgs_df
@@ -257,7 +263,8 @@ def write_extended_metadata(
         out_name_base: str,
         sep: str = "\t",
         remove_internals: bool = True,
-        suppress_empty_fails: bool = False) -> pandas.DataFrame:
+        suppress_empty_fails: bool = False,
+        stds_fp: Optional[str] = None) -> pandas.DataFrame:
     """Write extended metadata to files starting from input file paths to metadata and config.
     Parameters
@@ -276,6 +283,9 @@ def write_extended_metadata(
         Whether to remove internal columns.
     suppress_empty_fails : bool, default=False
         Whether to suppress empty failure files.
+    stds_fp : Optional[str], default=None
+        Path to standards dictionary file. If None, the default standards
+        config pulled from the standards.yml file will be used.
     Returns
     -------
@@ -310,7 +320,8 @@ def write_extended_metadata(
         raw_metadata_df, study_specific_config_dict,
         out_dir, out_name_base, sep=sep,
         remove_internals=remove_internals,
-        suppress_empty_fails=suppress_empty_fails)
+        suppress_empty_fails=suppress_empty_fails,
+        stds_fp=stds_fp)
     # for good measure, return the extended metadata DataFrame
     return extended_df
@@ -351,7 +362,8 @@ def write_extended_metadata_from_df(
         sep: str = "\t",
         remove_internals: bool = True,
         suppress_empty_fails: bool = False,
-        internal_col_names: Optional[List[str]] = None) -> pandas.DataFrame:
+        internal_col_names: Optional[List[str]] = None,
+        stds_fp: Optional[str] = None) -> pandas.DataFrame:
     """Write extended metadata to files starting from a metadata DataFrame and config dictionary.
     Parameters
@@ -374,6 +386,9 @@ def write_extended_metadata_from_df(
         Whether to suppress empty failure files.
     internal_col_names : Optional[List[str]], default=None
         List of internal column names.
+    stds_fp : Optional[str], default=None
+        Path to standards dictionary file. If None, the default standards
+        config pulled from the standards.yml file will be used.
     Returns
     -------
@@ -383,7 +398,7 @@ def write_extended_metadata_from_df(
     # extend the metadata DataFrame using the study-specific flat-host-type config dictionary
     metadata_df, validation_msgs_df = extend_metadata_df(
         raw_metadata_df, study_specific_config_dict,
-        study_specific_transformers_dict)
+        study_specific_transformers_dict, None, stds_fp)
     # write the metadata and validation results to files
     write_metadata_results(
@@ -814,12 +829,6 @@ def _generate_metadata_for_a_sample_type_in_a_host_type(
             - The updated metadata DataFrame with sample-type-specific elements added
             - A list of validation messages
     """
-    # copy the metadata fields dict from the host type config to be the
-    # basis of the work-in-progress metadata dict--these are the default fields
-    # that will be overwritten, if necessary, by sample type-specific fields
-    wip_metadata_fields_dict = deepcopy_dict(
-        a_host_type_config_dict.get(METADATA_FIELDS_KEY, {}))
     # get the config section for *all* sample types within this host type
     host_sample_types_config_dict = \
         a_host_type_config_dict[SAMPLE_TYPE_SPECIFIC_METADATA_KEY]
@@ -837,20 +846,17 @@ def _generate_metadata_for_a_sample_type_in_a_host_type(
         update_metadata_df_field(
             sample_type_df, QC_NOTE_KEY, "invalid sample_type")
     else:
-        # resolve any aliases and base types for the sample type and combine its
-        # specific metadata fields with the host type's metadata fields
-        # to get the full set of config info for this host+sample type
+        # Get the already-resolved metadata fields dict for this sample type.
+        # The config is pre-resolved: aliases/base types are merged and
+        # host metadata is combined.
+        sample_type_config = host_sample_types_config_dict[a_sample_type]
         full_sample_type_metadata_fields_dict = \
-            _construct_sample_type_metadata_fields_dict(
-                a_sample_type, host_sample_types_config_dict, wip_metadata_fields_dict)
+            sample_type_config.get(METADATA_FIELDS_KEY, {})
         # update the metadata df with the sample type specific metadata fields
-        # TODO: this is taking in wip_metadata_fields_dict instead of full_sample_type_metadata_fields_dict,
-        # which only works because the code underlying _construct_sample_type_metadata_fields_dict
-        # is *modifying* wip_metadata_fields_dict in place. This should be corrected, but that
-        # needs to wait until there are tests to make sure doing so doesn't break anything.
         sample_type_df = _update_metadata_from_dict(
-            sample_type_df, wip_metadata_fields_dict, dict_is_metadata_fields=True,
+            sample_type_df, full_sample_type_metadata_fields_dict,
+            dict_is_metadata_fields=True,
             overwrite_non_nans=global_plus_host_settings_dict[OVERWRITE_NON_NANS_KEY])
         # for fields that are required but not yet filled, replace the placeholder with

metameq 2026.1.1__py3-none-any.whl → 2026.1.2__py3-none-any.whl

metameq 2026.1.1__py3-none-any.whl → 2026.1.2__py3-none-any.whl