PyPI - metameq - Versions diffs - 2026.1.1__py3-none-any.whl → 2026.2.1__py3-none-any.whl - Mend

metameq 2026.1.1-py3-none-any.whl → 2026.2.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (14)

metameq/_version.py +3 -3
metameq/config/config.yml +5 -1
metameq/src/metadata_configurator.py +146 -1
metameq/src/metadata_extender.py +92 -42
metameq/src/util.py +2 -0
metameq/tests/test_metadata_configurator.py +2744 -208
metameq/tests/test_metadata_extender.py +1801 -126
metameq/tests/test_metadata_merger.py +1 -1
metameq/tests/test_util.py +1 -1
{metameq-2026.1.1.dist-info → metameq-2026.2.1.dist-info}/METADATA +1 -1
{metameq-2026.1.1.dist-info → metameq-2026.2.1.dist-info}/RECORD +14 -14
{metameq-2026.1.1.dist-info → metameq-2026.2.1.dist-info}/WHEEL +0 -0
{metameq-2026.1.1.dist-info → metameq-2026.2.1.dist-info}/entry_points.txt +0 -0
{metameq-2026.1.1.dist-info → metameq-2026.2.1.dist-info}/top_level.txt +0 -0

metameq/tests/test_metadata_extender.py CHANGED

@@ -14,17 +14,20 @@ from metameq.src.util import \
     OVERWRITE_NON_NANS_KEY, LEAVE_REQUIREDS_BLANK_KEY, LEAVE_BLANK_VAL, \
     HOST_TYPE_SPECIFIC_METADATA_KEY, METADATA_TRANSFORMERS_KEY, \
     SOURCES_KEY, FUNCTION_KEY, PRE_TRANSFORMERS_KEY, POST_TRANSFORMERS_KEY, \
-    STUDY_SPECIFIC_METADATA_KEY
+    STUDY_SPECIFIC_METADATA_KEY, HOSTTYPE_COL_OPTIONS_KEY, \
+    SAMPLETYPE_COL_OPTIONS_KEY
 from metameq.src.metadata_extender import \
-    id_missing_cols, get_qc_failures, _reorder_df, \
-    _catch_nan_required_fields, _fill_na_if_default, \
-    _update_metadata_from_metadata_fields_dict, _update_metadata_from_dict, \
-    _construct_sample_type_metadata_fields_dict, \
+    id_missing_cols, get_qc_failures, get_reserved_cols, find_standard_cols, \
+    find_nonstandard_cols, write_metadata_results, \
+    get_extended_metadata_from_df_and_yaml, write_extended_metadata_from_df, \
+    write_extended_metadata, _reorder_df, _catch_nan_required_fields, \
+    _fill_na_if_default, _update_metadata_from_metadata_fields_dict, \
+    _update_metadata_from_dict, _construct_sample_type_metadata_fields_dict, \
     _generate_metadata_for_a_sample_type_in_a_host_type, \
     _generate_metadata_for_a_host_type, _generate_metadata_for_host_types, \
     _transform_metadata, _populate_metadata_df, extend_metadata_df, \
     _get_study_specific_config, _output_metadata_df_to_files, \
-    INTERNAL_COL_KEYS, REQ_PLACEHOLDER
+    _get_specified_column_name, INTERNAL_COL_KEYS, REQ_PLACEHOLDER
 class TestMetadataExtender(TestCase):
@@ -67,6 +70,536 @@ class TestMetadataExtender(TestCase):
         expected = sorted(REQUIRED_RAW_METADATA_FIELDS)
         self.assertEqual(expected, result)
+    # Tests for get_reserved_cols
+    def test_get_reserved_cols_single_host_sample_type(self):
+        """Test returns sorted list of reserved column names for a single host/sample type."""
+        input_df = pandas.DataFrame({
+            SAMPLE_NAME_KEY: ["sample1"],
+            HOSTTYPE_SHORTHAND_KEY: ["human"],
+            SAMPLETYPE_SHORTHAND_KEY: ["stool"]
+        })
+        study_config = {
+            DEFAULT_KEY: "not provided",
+            LEAVE_REQUIREDS_BLANK_KEY: True,
+            OVERWRITE_NON_NANS_KEY: False,
+            STUDY_SPECIFIC_METADATA_KEY: {
+                HOST_TYPE_SPECIFIC_METADATA_KEY: {
+                    "human": {
+                        METADATA_FIELDS_KEY: {
+                            "host_common_name": {
+                                DEFAULT_KEY: "human",
+                                TYPE_KEY: "string"
+                            }
+                        },
+                        SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
+                            "stool": {
+                                METADATA_FIELDS_KEY: {
+                                    "body_site": {
+                                        DEFAULT_KEY: "gut",
+                                        TYPE_KEY: "string"
+                                    },
+                                    "stool_consistency": {
+                                        DEFAULT_KEY: "normal",
+                                        TYPE_KEY: "string"
+                                    }
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        }
+        result = get_reserved_cols(input_df, study_config, self.TEST_STDS_FP)
+        # Expected columns are union of study_config fields and test_standards.yml fields
+        # From standards: sample_name, sample_type (base), description (human overrides host_associated),
+        # body_site (host_associated stool), body_product (human stool), host_common_name (human)
+        expected = [
+            "body_product",  # from human stool in test_standards.yml
+            "body_site",
+            "description",  # from human in test_standards.yml (overrides host_associated)
+            "host_common_name",
+            HOSTTYPE_SHORTHAND_KEY,
+            QC_NOTE_KEY,
+            QIITA_SAMPLE_TYPE,
+            SAMPLE_NAME_KEY,
+            SAMPLE_TYPE_KEY,
+            SAMPLETYPE_SHORTHAND_KEY,
+            "stool_consistency"
+        ]
+        self.assertEqual(expected, result)
+    def test_get_reserved_cols_missing_hosttype_shorthand_raises(self):
+        """Test raises ValueError when hosttype_shorthand column is missing."""
+        input_df = pandas.DataFrame({
+            SAMPLE_NAME_KEY: ["sample1"],
+            SAMPLETYPE_SHORTHAND_KEY: ["stool"]
+        })
+        study_config = {}
+        with self.assertRaisesRegex(ValueError, HOSTTYPE_SHORTHAND_KEY):
+            get_reserved_cols(input_df, study_config)
+    def test_get_reserved_cols_missing_sampletype_shorthand_raises(self):
+        """Test raises ValueError when sampletype_shorthand column is missing."""
+        input_df = pandas.DataFrame({
+            SAMPLE_NAME_KEY: ["sample1"],
+            HOSTTYPE_SHORTHAND_KEY: ["human"]
+        })
+        study_config = {}
+        with self.assertRaisesRegex(ValueError, SAMPLETYPE_SHORTHAND_KEY):
+            get_reserved_cols(input_df, study_config)
+    def test_get_reserved_cols_multiple_host_sample_types(self):
+        """Test returns deduped union of reserved columns for multiple host/sample type combinations."""
+        input_df = pandas.DataFrame({
+            SAMPLE_NAME_KEY: ["sample1", "sample2", "sample3"],
+            HOSTTYPE_SHORTHAND_KEY: ["human", "human", "mouse"],
+            SAMPLETYPE_SHORTHAND_KEY: ["stool", "blood", "stool"]
+        })
+        # Both human and mouse define host_common_name and body_site - should appear only once each
+        study_config = {
+            DEFAULT_KEY: "not provided",
+            LEAVE_REQUIREDS_BLANK_KEY: True,
+            OVERWRITE_NON_NANS_KEY: False,
+            STUDY_SPECIFIC_METADATA_KEY: {
+                HOST_TYPE_SPECIFIC_METADATA_KEY: {
+                    "human": {
+                        METADATA_FIELDS_KEY: {
+                            "host_common_name": {
+                                DEFAULT_KEY: "human",
+                                TYPE_KEY: "string"
+                            },
+                            "human_field": {
+                                DEFAULT_KEY: "human_value",
+                                TYPE_KEY: "string"
+                            }
+                        },
+                        SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
+                            "stool": {
+                                METADATA_FIELDS_KEY: {
+                                    "body_site": {
+                                        DEFAULT_KEY: "gut",
+                                        TYPE_KEY: "string"
+                                    },
+                                    "stool_consistency": {
+                                        DEFAULT_KEY: "normal",
+                                        TYPE_KEY: "string"
+                                    }
+                                }
+                            },
+                            "blood": {
+                                METADATA_FIELDS_KEY: {
+                                    "body_site": {
+                                        DEFAULT_KEY: "blood",
+                                        TYPE_KEY: "string"
+                                    },
+                                    "blood_type": {
+                                        DEFAULT_KEY: "unknown",
+                                        TYPE_KEY: "string"
+                                    }
+                                }
+                            }
+                        }
+                    },
+                    "mouse": {
+                        METADATA_FIELDS_KEY: {
+                            "host_common_name": {
+                                DEFAULT_KEY: "mouse",
+                                TYPE_KEY: "string"
+                            },
+                            "mouse_field": {
+                                DEFAULT_KEY: "mouse_value",
+                                TYPE_KEY: "string"
+                            }
+                        },
+                        SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
+                            "stool": {
+                                METADATA_FIELDS_KEY: {
+                                    "body_site": {
+                                        DEFAULT_KEY: "gut",
+                                        TYPE_KEY: "string"
+                                    },
+                                    "mouse_stool_field": {
+                                        DEFAULT_KEY: "mouse_stool_value",
+                                        TYPE_KEY: "string"
+                                    }
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        }
+        result = get_reserved_cols(input_df, study_config, self.TEST_STDS_FP)
+        # Expected columns are union of study_config fields and test_standards.yml fields
+        # From standards for human/stool: sample_name, sample_type (base), description (human),
+        #   body_site (host_associated stool), body_product (human stool), host_common_name (human)
+        # From standards for human/blood: body_site (human blood), body_product (human blood),
+        #   description (human), host_common_name (human)
+        # From standards for mouse/stool: sample_name, sample_type (base), description (host_associated),
+        #   body_site (host_associated stool), host_common_name (mouse)
+        # TODO: cage_id from mouse stool in test_standards.yml SHOULD be included here
+        # but is currently excluded because it has required: false and no default.
+        # The function under test needs to be changed to include fields even when
+        # they have required: false and no default.
+        expected = [
+            "blood_type",
+            "body_product",  # from human stool and human blood in test_standards.yml
+            "body_site",
+            "description",  # from human (overrides host_associated) and host_associated (mouse inherits)
+            "host_common_name",
+            HOSTTYPE_SHORTHAND_KEY,
+            "human_field",
+            "mouse_field",
+            "mouse_stool_field",
+            QC_NOTE_KEY,
+            QIITA_SAMPLE_TYPE,
+            SAMPLE_NAME_KEY,
+            SAMPLE_TYPE_KEY,
+            SAMPLETYPE_SHORTHAND_KEY,
+            "stool_consistency"
+        ]
+        self.assertEqual(expected, result)
+    # Tests for find_standard_cols
+    def test_find_standard_cols_returns_standard_cols_in_df(self):
+        """Test returns standard columns that exist in the input DataFrame, excluding internals."""
+        input_df = pandas.DataFrame({
+            SAMPLE_NAME_KEY: ["sample1"],
+            HOSTTYPE_SHORTHAND_KEY: ["human"],
+            SAMPLETYPE_SHORTHAND_KEY: ["stool"],
+            "body_site": ["gut"],
+            "host_common_name": ["human"],
+            "my_custom_column": ["custom_value"]
+        })
+        study_config = {
+            DEFAULT_KEY: "not provided",
+            LEAVE_REQUIREDS_BLANK_KEY: True,
+            OVERWRITE_NON_NANS_KEY: False,
+            STUDY_SPECIFIC_METADATA_KEY: {
+                HOST_TYPE_SPECIFIC_METADATA_KEY: {
+                    "human": {
+                        METADATA_FIELDS_KEY: {
+                            "host_common_name": {
+                                DEFAULT_KEY: "human",
+                                TYPE_KEY: "string"
+                            }
+                        },
+                        SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
+                            "stool": {
+                                METADATA_FIELDS_KEY: {
+                                    "body_site": {
+                                        DEFAULT_KEY: "gut",
+                                        TYPE_KEY: "string"
+                                    }
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        }
+        result = find_standard_cols(input_df, study_config, self.TEST_STDS_FP)
+        # Returns intersection of reserved cols (minus internals) with df columns.
+        # body_site, host_common_name, sample_name are standard and in df
+        # hosttype_shorthand, sampletype_shorthand are internal (excluded)
+        # my_custom_column is nonstandard (excluded)
+        expected = ["body_site", "host_common_name", SAMPLE_NAME_KEY]
+        self.assertEqual(sorted(expected), sorted(result))
+    def test_find_standard_cols_missing_hosttype_shorthand_raises(self):
+        """Test raises ValueError when hosttype_shorthand column is missing."""
+        input_df = pandas.DataFrame({
+            SAMPLE_NAME_KEY: ["sample1"],
+            SAMPLETYPE_SHORTHAND_KEY: ["stool"]
+        })
+        study_config = {}
+        with self.assertRaisesRegex(ValueError, HOSTTYPE_SHORTHAND_KEY):
+            find_standard_cols(input_df, study_config, self.TEST_STDS_FP)
+    def test_find_standard_cols_missing_sampletype_shorthand_raises(self):
+        """Test raises ValueError when sampletype_shorthand column is missing."""
+        input_df = pandas.DataFrame({
+            SAMPLE_NAME_KEY: ["sample1"],
+            HOSTTYPE_SHORTHAND_KEY: ["human"]
+        })
+        study_config = {}
+        with self.assertRaisesRegex(ValueError, SAMPLETYPE_SHORTHAND_KEY):
+            find_standard_cols(input_df, study_config, self.TEST_STDS_FP)
+    def test_find_standard_cols_missing_sample_name_raises(self):
+        """Test raises ValueError when sample_name column is missing."""
+        input_df = pandas.DataFrame({
+            HOSTTYPE_SHORTHAND_KEY: ["human"],
+            SAMPLETYPE_SHORTHAND_KEY: ["stool"]
+        })
+        study_config = {}
+        with self.assertRaisesRegex(ValueError, SAMPLE_NAME_KEY):
+            find_standard_cols(input_df, study_config, self.TEST_STDS_FP)
+    def test_find_standard_cols_suppress_missing_name_err(self):
+        """Test that suppress_missing_name_err=True allows missing sample_name."""
+        input_df = pandas.DataFrame({
+            HOSTTYPE_SHORTHAND_KEY: ["human"],
+            SAMPLETYPE_SHORTHAND_KEY: ["stool"],
+            "body_site": ["gut"]
+        })
+        study_config = {
+            DEFAULT_KEY: "not provided",
+            LEAVE_REQUIREDS_BLANK_KEY: True,
+            OVERWRITE_NON_NANS_KEY: False,
+            STUDY_SPECIFIC_METADATA_KEY: {
+                HOST_TYPE_SPECIFIC_METADATA_KEY: {
+                    "human": {
+                        METADATA_FIELDS_KEY: {},
+                        SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
+                            "stool": {
+                                METADATA_FIELDS_KEY: {
+                                    "body_site": {
+                                        DEFAULT_KEY: "gut",
+                                        TYPE_KEY: "string"
+                                    }
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        }
+        result = find_standard_cols(
+            input_df, study_config, self.TEST_STDS_FP,
+            suppress_missing_name_err=True)
+        # Only body_site is a standard col in df (sample_name is missing but allowed)
+        expected = ["body_site"]
+        self.assertEqual(expected, sorted(result))
+    # Tests for find_nonstandard_cols
+    def test_find_nonstandard_cols_returns_nonstandard_cols(self):
+        """Test returns columns in df that are not in the reserved columns list."""
+        input_df = pandas.DataFrame({
+            SAMPLE_NAME_KEY: ["sample1"],
+            HOSTTYPE_SHORTHAND_KEY: ["human"],
+            SAMPLETYPE_SHORTHAND_KEY: ["stool"],
+            "body_site": ["gut"],
+            "host_common_name": ["human"],
+            "my_custom_column": ["custom_value"],
+            "another_nonstandard": ["value"]
+        })
+        study_config = {
+            DEFAULT_KEY: "not provided",
+            LEAVE_REQUIREDS_BLANK_KEY: True,
+            OVERWRITE_NON_NANS_KEY: False,
+            STUDY_SPECIFIC_METADATA_KEY: {
+                HOST_TYPE_SPECIFIC_METADATA_KEY: {
+                    "human": {
+                        METADATA_FIELDS_KEY: {
+                            "host_common_name": {
+                                DEFAULT_KEY: "human",
+                                TYPE_KEY: "string"
+                            }
+                        },
+                        SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
+                            "stool": {
+                                METADATA_FIELDS_KEY: {
+                                    "body_site": {
+                                        DEFAULT_KEY: "gut",
+                                        TYPE_KEY: "string"
+                                    }
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        }
+        result = find_nonstandard_cols(input_df, study_config, self.TEST_STDS_FP)
+        # Only my_custom_column and another_nonstandard are not in the reserved list
+        # sample_name, body_site, host_common_name, hosttype_shorthand,
+        # sampletype_shorthand are all reserved
+        expected = ["another_nonstandard", "my_custom_column"]
+        self.assertEqual(sorted(expected), sorted(result))
+    def test_find_nonstandard_cols_missing_required_col_raises(self):
+        """Test raises ValueError when a required column is missing."""
+        input_df = pandas.DataFrame({
+            SAMPLE_NAME_KEY: ["sample1"],
+            SAMPLETYPE_SHORTHAND_KEY: ["stool"]
+            # missing HOSTTYPE_SHORTHAND_KEY
+        })
+        study_config = {}
+        with self.assertRaisesRegex(ValueError, HOSTTYPE_SHORTHAND_KEY):
+            find_nonstandard_cols(input_df, study_config, self.TEST_STDS_FP)
+    # Tests for write_metadata_results
+    def test_write_metadata_results_creates_all_files(self):
+        """Test creates metadata file and validation errors file, includes failed rows."""
+        metadata_df = pandas.DataFrame({
+            SAMPLE_NAME_KEY: ["sample1", "sample2", "sample3"],
+            "field_a": ["a1", "a2", "a3"],
+            HOSTTYPE_SHORTHAND_KEY: ["human", "human", "human"],
+            SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool", "stool"],
+            QC_NOTE_KEY: ["", "invalid host_type", ""]
+        })
+        validation_msgs_df = pandas.DataFrame({
+            "field": ["field_a"],
+            "error": ["some validation error"]
+        })
+        with tempfile.TemporaryDirectory() as tmpdir:
+            write_metadata_results(
+                metadata_df, validation_msgs_df, tmpdir, "test_output",
+                sep="\t", remove_internals=False)
+            # Find the main metadata file
+            metadata_files = glob.glob(os.path.join(tmpdir, "*_test_output.txt"))
+            self.assertEqual(1, len(metadata_files))
+            # Verify metadata file contents - includes failed row when remove_internals=False
+            result_df = pandas.read_csv(
+                metadata_files[0], sep="\t", keep_default_na=False)
+            assert_frame_equal(metadata_df, result_df)
+            # Find the validation errors file (uses comma separator)
+            validation_files = glob.glob(
+                os.path.join(tmpdir, "*_test_output_validation_errors.csv"))
+            self.assertEqual(1, len(validation_files))
+            # Verify validation errors file contents
+            result_validation_df = pandas.read_csv(validation_files[0], sep=",")
+            assert_frame_equal(validation_msgs_df, result_validation_df)
+            # No fails file should be created when remove_internals=False
+            fails_files = glob.glob(
+                os.path.join(tmpdir, "*_test_output_fails.csv"))
+            self.assertEqual(0, len(fails_files))
+    def test_write_metadata_results_remove_internals_creates_fails_file(self):
+        """Test with remove_internals=True creates fails file and removes internal cols."""
+        metadata_df = pandas.DataFrame({
+            SAMPLE_NAME_KEY: ["sample1", "sample2", "sample3"],
+            "field_a": ["a1", "a2", "a3"],
+            HOSTTYPE_SHORTHAND_KEY: ["human", "human", "human"],
+            SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool", "stool"],
+            QC_NOTE_KEY: ["", "invalid host_type", ""]
+        })
+        validation_msgs_df = pandas.DataFrame()
+        with tempfile.TemporaryDirectory() as tmpdir:
+            write_metadata_results(
+                metadata_df, validation_msgs_df, tmpdir, "test_output",
+                sep="\t", remove_internals=True)
+            # Find the main metadata file
+            metadata_files = glob.glob(os.path.join(tmpdir, "*_test_output.txt"))
+            self.assertEqual(1, len(metadata_files))
+            # Verify metadata has internal cols removed and no failures
+            result_df = pandas.read_csv(metadata_files[0], sep="\t")
+            expected_df = pandas.DataFrame({
+                SAMPLE_NAME_KEY: ["sample1", "sample3"],
+                "field_a": ["a1", "a3"]
+            })
+            assert_frame_equal(expected_df, result_df)
+            # Find the fails file
+            fails_files = glob.glob(
+                os.path.join(tmpdir, "*_test_output_fails.csv"))
+            self.assertEqual(1, len(fails_files))
+            # Verify fails file contains the failed row
+            fails_df = pandas.read_csv(fails_files[0], sep=",")
+            expected_fails_df = pandas.DataFrame({
+                SAMPLE_NAME_KEY: ["sample2"],
+                "field_a": ["a2"],
+                HOSTTYPE_SHORTHAND_KEY: ["human"],
+                SAMPLETYPE_SHORTHAND_KEY: ["stool"],
+                QC_NOTE_KEY: ["invalid host_type"]
+            })
+            assert_frame_equal(expected_fails_df, fails_df)
+            # Validation errors file should be empty (touched)
+            validation_files = glob.glob(
+                os.path.join(tmpdir, "*_test_output_validation_errors.csv"))
+            self.assertEqual(1, len(validation_files))
+            self.assertEqual(0, os.path.getsize(validation_files[0]))
+    def test_write_metadata_results_suppress_empty_fails(self):
+        """Test with suppress_empty_fails=True does not create empty files."""
+        metadata_df = pandas.DataFrame({
+            SAMPLE_NAME_KEY: ["sample1", "sample2"],
+            "field_a": ["a1", "a2"],
+            HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
+            SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
+            QC_NOTE_KEY: ["", ""]
+        })
+        validation_msgs_df = pandas.DataFrame()
+        with tempfile.TemporaryDirectory() as tmpdir:
+            write_metadata_results(
+                metadata_df, validation_msgs_df, tmpdir, "test_output",
+                sep="\t", remove_internals=True, suppress_empty_fails=True)
+            # Main metadata file should exist
+            metadata_files = glob.glob(os.path.join(tmpdir, "*_test_output.txt"))
+            self.assertEqual(1, len(metadata_files))
+            # Fails file should NOT exist (no failures, suppressed)
+            fails_files = glob.glob(
+                os.path.join(tmpdir, "*_test_output_fails.csv"))
+            self.assertEqual(0, len(fails_files))
+            # Validation errors file should NOT exist (empty, suppressed)
+            validation_files = glob.glob(
+                os.path.join(tmpdir, "*_test_output_validation_errors.csv"))
+            self.assertEqual(0, len(validation_files))
+    def test_write_metadata_results_custom_internal_col_names(self):
+        """Test with custom internal_col_names parameter."""
+        metadata_df = pandas.DataFrame({
+            SAMPLE_NAME_KEY: ["sample1", "sample2"],
+            "field_a": ["a1", "a2"],
+            "custom_internal": ["x", "y"],
+            QC_NOTE_KEY: ["", ""]
+        })
+        validation_msgs_df = pandas.DataFrame()
+        with tempfile.TemporaryDirectory() as tmpdir:
+            write_metadata_results(
+                metadata_df, validation_msgs_df, tmpdir, "test_output",
+                sep="\t", remove_internals=True, suppress_empty_fails=True,
+                internal_col_names=["custom_internal", QC_NOTE_KEY])
+            # Find the main metadata file
+            metadata_files = glob.glob(os.path.join(tmpdir, "*_test_output.txt"))
+            self.assertEqual(1, len(metadata_files))
+            # Verify custom internal cols are removed
+            result_df = pandas.read_csv(metadata_files[0], sep="\t")
+            expected_df = pandas.DataFrame({
+                SAMPLE_NAME_KEY: ["sample1", "sample2"],
+                "field_a": ["a1", "a2"]
+            })
+            assert_frame_equal(expected_df, result_df)
     # Tests for get_qc_failures
     def test_get_qc_failures_no_failures(self):
@@ -745,6 +1278,8 @@ class TestMetadataExtender(TestCase):
             LEAVE_REQUIREDS_BLANK_KEY: False,
             DEFAULT_KEY: "not provided"
         }
+        # Config is pre-resolved: sample type's metadata_fields already includes
+        # host fields merged in, plus sample_type and qiita_sample_type
         host_type_config_dict = {
             METADATA_FIELDS_KEY: {
                 "host_field": {
@@ -755,9 +1290,23 @@ class TestMetadataExtender(TestCase):
             SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
                 "stool": {
                     METADATA_FIELDS_KEY: {
+                        "host_field": {
+                            DEFAULT_KEY: "host_default",
+                            TYPE_KEY: "string"
+                        },
                         "stool_field": {
                             DEFAULT_KEY: "stool_default",
                             TYPE_KEY: "string"
+                        },
+                        SAMPLE_TYPE_KEY: {
+                            ALLOWED_KEY: ["stool"],
+                            DEFAULT_KEY: "stool",
+                            TYPE_KEY: "string"
+                        },
+                        QIITA_SAMPLE_TYPE: {
+                            ALLOWED_KEY: ["stool"],
+                            DEFAULT_KEY: "stool",
+                            TYPE_KEY: "string"
                         }
                     }
                 }
@@ -996,17 +1545,44 @@ class TestMetadataExtender(TestCase):
             LEAVE_REQUIREDS_BLANK_KEY: False,
             DEFAULT_KEY: "not provided"
         }
+        # Config is pre-resolved: alias "feces" has its own metadata_fields
+        # that is a copy of "stool"'s resolved fields with sample_type="stool"
         host_type_config_dict = {
             METADATA_FIELDS_KEY: {},
             SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
                 "feces": {
-                    ALIAS_KEY: "stool"
+                    METADATA_FIELDS_KEY: {
+                        "stool_field": {
+                            DEFAULT_KEY: "stool_value",
+                            TYPE_KEY: "string"
+                        },
+                        SAMPLE_TYPE_KEY: {
+                            ALLOWED_KEY: ["stool"],
+                            DEFAULT_KEY: "stool",
+                            TYPE_KEY: "string"
+                        },
+                        QIITA_SAMPLE_TYPE: {
+                            ALLOWED_KEY: ["stool"],
+                            DEFAULT_KEY: "stool",
+                            TYPE_KEY: "string"
+                        }
+                    }
                 },
                 "stool": {
                     METADATA_FIELDS_KEY: {
                         "stool_field": {
                             DEFAULT_KEY: "stool_value",
                             TYPE_KEY: "string"
+                        },
+                        SAMPLE_TYPE_KEY: {
+                            ALLOWED_KEY: ["stool"],
+                            DEFAULT_KEY: "stool",
+                            TYPE_KEY: "string"
+                        },
+                        QIITA_SAMPLE_TYPE: {
+                            ALLOWED_KEY: ["stool"],
+                            DEFAULT_KEY: "stool",
+                            TYPE_KEY: "string"
                         }
                     }
                 }
@@ -1035,6 +1611,8 @@ class TestMetadataExtender(TestCase):
             LEAVE_REQUIREDS_BLANK_KEY: False,
             DEFAULT_KEY: "global_default"
         }
+        # Config is pre-resolved: sample type's metadata_fields includes
+        # host fields merged in, plus sample_type and qiita_sample_type
         full_flat_config_dict = {
             HOST_TYPE_SPECIFIC_METADATA_KEY: {
                 "human": {
@@ -1048,9 +1626,23 @@ class TestMetadataExtender(TestCase):
                     SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
                         "stool": {
                             METADATA_FIELDS_KEY: {
+                                "host_field": {
+                                    DEFAULT_KEY: "host_value",
+                                    TYPE_KEY: "string"
+                                },
                                 "stool_field": {
                                     DEFAULT_KEY: "stool_value",
                                     TYPE_KEY: "string"
+                                },
+                                SAMPLE_TYPE_KEY: {
+                                    ALLOWED_KEY: ["stool"],
+                                    DEFAULT_KEY: "stool",
+                                    TYPE_KEY: "string"
+                                },
+                                QIITA_SAMPLE_TYPE: {
+                                    ALLOWED_KEY: ["stool"],
+                                    DEFAULT_KEY: "stool",
+                                    TYPE_KEY: "string"
                                 }
                             }
                         }
@@ -1160,6 +1752,8 @@ class TestMetadataExtender(TestCase):
             LEAVE_REQUIREDS_BLANK_KEY: False,
             DEFAULT_KEY: "global_default"
         }
+        # Config is pre-resolved: sample type's metadata_fields includes
+        # host fields merged in, plus sample_type and qiita_sample_type
         full_flat_config_dict = {
             HOST_TYPE_SPECIFIC_METADATA_KEY: {
                 "human": {
@@ -1171,7 +1765,22 @@ class TestMetadataExtender(TestCase):
                     },
                     SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
                         "stool": {
-                            METADATA_FIELDS_KEY: {}
+                            METADATA_FIELDS_KEY: {
+                                "human_field": {
+                                    DEFAULT_KEY: "human_value",
+                                    TYPE_KEY: "string"
+                                },
+                                SAMPLE_TYPE_KEY: {
+                                    ALLOWED_KEY: ["stool"],
+                                    DEFAULT_KEY: "stool",
+                                    TYPE_KEY: "string"
+                                },
+                                QIITA_SAMPLE_TYPE: {
+                                    ALLOWED_KEY: ["stool"],
+                                    DEFAULT_KEY: "stool",
+                                    TYPE_KEY: "string"
+                                }
+                            }
                         }
                     }
                 },
@@ -1209,6 +1818,8 @@ class TestMetadataExtender(TestCase):
             LEAVE_REQUIREDS_BLANK_KEY: False,
             DEFAULT_KEY: "global_default"
         }
+        # Config is pre-resolved: sample type's metadata_fields includes
+        # host fields merged in, plus sample_type and qiita_sample_type
         full_flat_config_dict = {
             HOST_TYPE_SPECIFIC_METADATA_KEY: {
                 "human": {
@@ -1220,6 +1831,16 @@ class TestMetadataExtender(TestCase):
                                 "required_field": {
                                     REQUIRED_KEY: True,
                                     TYPE_KEY: "string"
+                                },
+                                SAMPLE_TYPE_KEY: {
+                                    ALLOWED_KEY: ["stool"],
+                                    DEFAULT_KEY: "stool",
+                                    TYPE_KEY: "string"
+                                },
+                                QIITA_SAMPLE_TYPE: {
+                                    ALLOWED_KEY: ["stool"],
+                                    DEFAULT_KEY: "stool",
+                                    TYPE_KEY: "string"
                                 }
                             }
                         }
@@ -1255,6 +1876,8 @@ class TestMetadataExtender(TestCase):
             LEAVE_REQUIREDS_BLANK_KEY: False,
             DEFAULT_KEY: "global_default"
         }
+        # Config is pre-resolved: sample type's metadata_fields includes
+        # host fields merged in, plus sample_type and qiita_sample_type
         full_flat_config_dict = {
             HOST_TYPE_SPECIFIC_METADATA_KEY: {
                 "human": {
@@ -1266,6 +1889,16 @@ class TestMetadataExtender(TestCase):
                                 "required_field": {
                                     REQUIRED_KEY: True,
                                     TYPE_KEY: "string"
+                                },
+                                SAMPLE_TYPE_KEY: {
+                                    ALLOWED_KEY: ["stool"],
+                                    DEFAULT_KEY: "stool",
+                                    TYPE_KEY: "string"
+                                },
+                                QIITA_SAMPLE_TYPE: {
+                                    ALLOWED_KEY: ["stool"],
+                                    DEFAULT_KEY: "stool",
+                                    TYPE_KEY: "string"
                                 }
                             }
                         }
@@ -1298,6 +1931,8 @@ class TestMetadataExtender(TestCase):
             SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
             QC_NOTE_KEY: ["", ""]
         })
+        # Config is pre-resolved: sample type's metadata_fields includes
+        # host fields merged in, plus sample_type and qiita_sample_type
         full_flat_config_dict = {
             DEFAULT_KEY: "global_default",
             LEAVE_REQUIREDS_BLANK_KEY: False,
@@ -1313,9 +1948,23 @@ class TestMetadataExtender(TestCase):
                     SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
                         "stool": {
                             METADATA_FIELDS_KEY: {
+                                "host_field": {
+                                    DEFAULT_KEY: "host_value",
+                                    TYPE_KEY: "string"
+                                },
                                 "stool_field": {
                                     DEFAULT_KEY: "stool_value",
                                     TYPE_KEY: "string"
+                                },
+                                SAMPLE_TYPE_KEY: {
+                                    ALLOWED_KEY: ["stool"],
+                                    DEFAULT_KEY: "stool",
+                                    TYPE_KEY: "string"
+                                },
+                                QIITA_SAMPLE_TYPE: {
+                                    ALLOWED_KEY: ["stool"],
+                                    DEFAULT_KEY: "stool",
+                                    TYPE_KEY: "string"
                                 }
                             }
                         }
@@ -1348,6 +1997,8 @@ class TestMetadataExtender(TestCase):
             SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool", "blood"],
             QC_NOTE_KEY: ["", "", ""]
         })
+        # Config is pre-resolved: sample type's metadata_fields includes
+        # host fields merged in, plus sample_type and qiita_sample_type
         full_flat_config_dict = {
             DEFAULT_KEY: "global_default",
             LEAVE_REQUIREDS_BLANK_KEY: False,
@@ -1362,10 +2013,40 @@ class TestMetadataExtender(TestCase):
                     },
                     SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
                         "stool": {
-                            METADATA_FIELDS_KEY: {}
+                            METADATA_FIELDS_KEY: {
+                                "human_field": {
+                                    DEFAULT_KEY: "human_value",
+                                    TYPE_KEY: "string"
+                                },
+                                SAMPLE_TYPE_KEY: {
+                                    ALLOWED_KEY: ["stool"],
+                                    DEFAULT_KEY: "stool",
+                                    TYPE_KEY: "string"
+                                },
+                                QIITA_SAMPLE_TYPE: {
+                                    ALLOWED_KEY: ["stool"],
+                                    DEFAULT_KEY: "stool",
+                                    TYPE_KEY: "string"
+                                }
+                            }
                         },
                         "blood": {
-                            METADATA_FIELDS_KEY: {}
+                            METADATA_FIELDS_KEY: {
+                                "human_field": {
+                                    DEFAULT_KEY: "human_value",
+                                    TYPE_KEY: "string"
+                                },
+                                SAMPLE_TYPE_KEY: {
+                                    ALLOWED_KEY: ["blood"],
+                                    DEFAULT_KEY: "blood",
+                                    TYPE_KEY: "string"
+                                },
+                                QIITA_SAMPLE_TYPE: {
+                                    ALLOWED_KEY: ["blood"],
+                                    DEFAULT_KEY: "blood",
+                                    TYPE_KEY: "string"
+                                }
+                            }
                         }
                     }
                 },
@@ -1378,7 +2059,22 @@ class TestMetadataExtender(TestCase):
                     },
                     SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
                         "stool": {
-                            METADATA_FIELDS_KEY: {}
+                            METADATA_FIELDS_KEY: {
+                                "mouse_field": {
+                                    DEFAULT_KEY: "mouse_value",
+                                    TYPE_KEY: "string"
+                                },
+                                SAMPLE_TYPE_KEY: {
+                                    ALLOWED_KEY: ["stool"],
+                                    DEFAULT_KEY: "stool",
+                                    TYPE_KEY: "string"
+                                },
+                                QIITA_SAMPLE_TYPE: {
+                                    ALLOWED_KEY: ["stool"],
+                                    DEFAULT_KEY: "stool",
+                                    TYPE_KEY: "string"
+                                }
+                            }
                         }
                     }
                 }
@@ -1478,6 +2174,8 @@ class TestMetadataExtender(TestCase):
             SAMPLETYPE_SHORTHAND_KEY: ["stool"],
             QC_NOTE_KEY: [""]
         })
+        # Config is pre-resolved: sample type's metadata_fields includes
+        # host fields merged in, plus sample_type and qiita_sample_type
         full_flat_config_dict = {
             DEFAULT_KEY: "global_default",
             LEAVE_REQUIREDS_BLANK_KEY: True,  # This causes required fields to get LEAVE_BLANK_VAL
@@ -1491,6 +2189,16 @@ class TestMetadataExtender(TestCase):
                                 "required_field": {
                                     REQUIRED_KEY: True,
                                     TYPE_KEY: "string"
+                                },
+                                SAMPLE_TYPE_KEY: {
+                                    ALLOWED_KEY: ["stool"],
+                                    DEFAULT_KEY: "stool",
+                                    TYPE_KEY: "string"
+                                },
+                                QIITA_SAMPLE_TYPE: {
+                                    ALLOWED_KEY: ["stool"],
+                                    DEFAULT_KEY: "stool",
+                                    TYPE_KEY: "string"
                                 }
                             }
                         }
@@ -1790,6 +2498,8 @@ class TestMetadataExtender(TestCase):
             HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
             SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"]
         })
+        # Config is pre-resolved: sample type's metadata_fields includes
+        # host fields merged in, plus sample_type and qiita_sample_type
         full_flat_config_dict = {
             DEFAULT_KEY: "not provided",
             LEAVE_REQUIREDS_BLANK_KEY: False,
@@ -1805,9 +2515,23 @@ class TestMetadataExtender(TestCase):
                     SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
                         "stool": {
                             METADATA_FIELDS_KEY: {
+                                "host_field": {
+                                    DEFAULT_KEY: "host_value",
+                                    TYPE_KEY: "string"
+                                },
                                 "stool_field": {
                                     DEFAULT_KEY: "stool_value",
                                     TYPE_KEY: "string"
+                                },
+                                SAMPLE_TYPE_KEY: {
+                                    ALLOWED_KEY: ["stool"],
+                                    DEFAULT_KEY: "stool",
+                                    TYPE_KEY: "string"
+                                },
+                                QIITA_SAMPLE_TYPE: {
+                                    ALLOWED_KEY: ["stool"],
+                                    DEFAULT_KEY: "stool",
+                                    TYPE_KEY: "string"
                                 }
                             }
                         }
@@ -1840,6 +2564,8 @@ class TestMetadataExtender(TestCase):
             SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
             "input_sex": ["F", "Male"]
         })
+        # Config is pre-resolved: sample type's metadata_fields includes
+        # host fields merged in, plus sample_type and qiita_sample_type
         full_flat_config_dict = {
             DEFAULT_KEY: "not provided",
             LEAVE_REQUIREDS_BLANK_KEY: False,
@@ -1857,7 +2583,18 @@ class TestMetadataExtender(TestCase):
                     METADATA_FIELDS_KEY: {},
                     SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
                         "stool": {
-                            METADATA_FIELDS_KEY: {}
+                            METADATA_FIELDS_KEY: {
+                                SAMPLE_TYPE_KEY: {
+                                    ALLOWED_KEY: ["stool"],
+                                    DEFAULT_KEY: "stool",
+                                    TYPE_KEY: "string"
+                                },
+                                QIITA_SAMPLE_TYPE: {
+                                    ALLOWED_KEY: ["stool"],
+                                    DEFAULT_KEY: "stool",
+                                    TYPE_KEY: "string"
+                                }
+                            }
                         }
                     }
                 }
@@ -1886,6 +2623,8 @@ class TestMetadataExtender(TestCase):
             HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
             SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"]
         })
+        # Config is pre-resolved: sample type's metadata_fields includes
+        # host fields merged in, plus sample_type and qiita_sample_type
         full_flat_config_dict = {
             DEFAULT_KEY: "not provided",
             LEAVE_REQUIREDS_BLANK_KEY: False,
@@ -1903,7 +2642,18 @@ class TestMetadataExtender(TestCase):
                     METADATA_FIELDS_KEY: {},
                     SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
                         "stool": {
-                            METADATA_FIELDS_KEY: {}
+                            METADATA_FIELDS_KEY: {
+                                SAMPLE_TYPE_KEY: {
+                                    ALLOWED_KEY: ["stool"],
+                                    DEFAULT_KEY: "stool",
+                                    TYPE_KEY: "string"
+                                },
+                                QIITA_SAMPLE_TYPE: {
+                                    ALLOWED_KEY: ["stool"],
+                                    DEFAULT_KEY: "stool",
+                                    TYPE_KEY: "string"
+                                }
+                            }
                         }
                     }
                 }
@@ -1963,6 +2713,8 @@ class TestMetadataExtender(TestCase):
             HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
             SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"]
         })
+        # Config is pre-resolved: sample type's metadata_fields includes
+        # host fields merged in, plus sample_type and qiita_sample_type
         full_flat_config_dict = {
             DEFAULT_KEY: "not provided",
             LEAVE_REQUIREDS_BLANK_KEY: False,
@@ -1972,7 +2724,18 @@ class TestMetadataExtender(TestCase):
                     METADATA_FIELDS_KEY: {},
                     SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
                         "stool": {
-                            METADATA_FIELDS_KEY: {}
+                            METADATA_FIELDS_KEY: {
+                                SAMPLE_TYPE_KEY: {
+                                    ALLOWED_KEY: ["stool"],
+                                    DEFAULT_KEY: "stool",
+                                    TYPE_KEY: "string"
+                                },
+                                QIITA_SAMPLE_TYPE: {
+                                    ALLOWED_KEY: ["stool"],
+                                    DEFAULT_KEY: "stool",
+                                    TYPE_KEY: "string"
+                                }
+                            }
                         }
                     }
                 }
@@ -2002,6 +2765,8 @@ class TestMetadataExtender(TestCase):
             SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
             "source_field": ["hello", "world"]
         })
+        # Config is pre-resolved: sample type's metadata_fields includes
+        # host fields merged in, plus sample_type and qiita_sample_type
         full_flat_config_dict = {
             DEFAULT_KEY: "not provided",
             LEAVE_REQUIREDS_BLANK_KEY: False,
@@ -2019,7 +2784,18 @@ class TestMetadataExtender(TestCase):
                     METADATA_FIELDS_KEY: {},
                     SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
                         "stool": {
-                            METADATA_FIELDS_KEY: {}
+                            METADATA_FIELDS_KEY: {
+                                SAMPLE_TYPE_KEY: {
+                                    ALLOWED_KEY: ["stool"],
+                                    DEFAULT_KEY: "stool",
+                                    TYPE_KEY: "string"
+                                },
+                                QIITA_SAMPLE_TYPE: {
+                                    ALLOWED_KEY: ["stool"],
+                                    DEFAULT_KEY: "stool",
+                                    TYPE_KEY: "string"
+                                }
+                            }
                         }
                     }
                 }
@@ -2100,8 +2876,15 @@ class TestMetadataExtender(TestCase):
         expected_df = pandas.DataFrame({
             SAMPLE_NAME_KEY: ["sample1", "sample2"],
+            # body_product from human stool in test_standards.yml
+            "body_product": ["UBERON:feces", "UBERON:feces"],
+            # body_site inherited from host_associated stool
             "body_site": ["gut", "gut"],
+            # custom_field from study_specific_metadata
             "custom_field": ["custom_value", "custom_value"],
+            # description overridden at human level
+            "description": ["human sample", "human sample"],
+            # host_common_name from human level
             "host_common_name": ["human", "human"],
             QIITA_SAMPLE_TYPE: ["stool", "stool"],
             SAMPLE_TYPE_KEY: ["stool", "stool"],
@@ -2151,7 +2934,11 @@ class TestMetadataExtender(TestCase):
         expected_df = pandas.DataFrame({
             SAMPLE_NAME_KEY: ["sample1", "sample2"],
+            # body_product from human stool in test_standards.yml
+            "body_product": ["UBERON:feces", "UBERON:feces"],
             "body_site": ["gut", "gut"],
+            # description overridden at human level
+            "description": ["human sample", "human sample"],
             "host_common_name": ["human", "human"],
             "input_sex": ["F", "Male"],
             QIITA_SAMPLE_TYPE: ["stool", "stool"],
@@ -2207,7 +2994,9 @@ class TestMetadataExtender(TestCase):
         expected_df = pandas.DataFrame({
             SAMPLE_NAME_KEY: ["sample1", "sample2"],
+            "body_product": ["UBERON:feces", "UBERON:feces"],
             "body_site": ["gut", "gut"],
+            "description": ["human sample", "human sample"],
             "host_common_name": ["human", "human"],
             QIITA_SAMPLE_TYPE: ["stool", "stool"],
             SAMPLE_TYPE_KEY: ["stool", "stool"],
@@ -2243,7 +3032,9 @@ class TestMetadataExtender(TestCase):
         expected_df = pandas.DataFrame({
             SAMPLE_NAME_KEY: ["sample1"],
+            "body_product": ["UBERON:feces"],
             "body_site": ["gut"],
+            "description": ["human sample"],
             "host_common_name": ["human"],
             QIITA_SAMPLE_TYPE: ["stool"],
             SAMPLE_TYPE_KEY: ["stool"],
@@ -2332,7 +3123,12 @@ class TestMetadataExtender(TestCase):
         # Human samples are processed together, then mouse samples
         expected_df = pandas.DataFrame({
             SAMPLE_NAME_KEY: ["sample1", "sample3", "sample2"],
+            # body_product: human stool/blood have it, mouse stool uses default
+            "body_product": ["UBERON:feces", "UBERON:blood", "not provided"],
             "body_site": ["gut", "blood", "gut"],
+            # description: human overrides to "human sample",
+            # mouse inherits "host associated sample"
+            "description": ["human sample", "human sample", "host associated sample"],
             "host_common_name": ["human", "human", "mouse"],
             QIITA_SAMPLE_TYPE: ["stool", "blood", "stool"],
             SAMPLE_TYPE_KEY: ["stool", "blood", "stool"],
@@ -2342,30 +3138,74 @@ class TestMetadataExtender(TestCase):
         })
         assert_frame_equal(expected_df, result_df)
-    def test_extend_metadata_df_with_software_config(self):
-        """Test metadata extension with custom software config overrides defaults."""
+    def test_extend_metadata_df_with_software_config(self):
+        """Test metadata extension with custom software config overrides defaults."""
+        input_df = pandas.DataFrame({
+            SAMPLE_NAME_KEY: ["sample1", "sample2"],
+            HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
+            SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"]
+        })
+        # Software config with custom default value
+        software_config = {
+            DEFAULT_KEY: "custom_software_default",
+            LEAVE_REQUIREDS_BLANK_KEY: True,
+            OVERWRITE_NON_NANS_KEY: False
+        }
+        # Study config that doesn't override DEFAULT_KEY
+        study_config = {
+            STUDY_SPECIFIC_METADATA_KEY: {
+                HOST_TYPE_SPECIFIC_METADATA_KEY: {
+                    "human": {
+                        METADATA_FIELDS_KEY: {
+                            "study_field": {
+                                DEFAULT_KEY: "study_value",
+                                TYPE_KEY: "string"
+                            }
+                        },
+                        SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
+                            "stool": {
+                                METADATA_FIELDS_KEY: {}
+                            }
+                        }
+                    }
+                }
+            }
+        }
+        result_df, validation_msgs_df = extend_metadata_df(
+            input_df, study_config, None, software_config, self.TEST_STDS_FP)
+        expected_df = pandas.DataFrame({
+            SAMPLE_NAME_KEY: ["sample1", "sample2"],
+            "body_product": ["UBERON:feces", "UBERON:feces"],
+            "body_site": ["gut", "gut"],
+            "description": ["human sample", "human sample"],
+            "host_common_name": ["human", "human"],
+            QIITA_SAMPLE_TYPE: ["stool", "stool"],
+            SAMPLE_TYPE_KEY: ["stool", "stool"],
+            "study_field": ["study_value", "study_value"],
+            HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
+            SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
+            QC_NOTE_KEY: ["", ""]
+        })
+        assert_frame_equal(expected_df, result_df)
+    def test_extend_metadata_df_with_alternate_column_names(self):
+        """Test metadata extension with alternate hosttype and sampletype column names."""
+        # Use alternate column names instead of hosttype_shorthand and sampletype_shorthand
         input_df = pandas.DataFrame({
             SAMPLE_NAME_KEY: ["sample1", "sample2"],
-            HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
-            SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"]
+            "host_type": ["human", "human"],
+            "sample": ["stool", "stool"]
         })
-        # Software config with custom default value
-        software_config = {
-            DEFAULT_KEY: "custom_software_default",
-            LEAVE_REQUIREDS_BLANK_KEY: True,
-            OVERWRITE_NON_NANS_KEY: False
-        }
-        # Study config that doesn't override DEFAULT_KEY
         study_config = {
+            DEFAULT_KEY: "not provided",
+            LEAVE_REQUIREDS_BLANK_KEY: True,
+            OVERWRITE_NON_NANS_KEY: False,
             STUDY_SPECIFIC_METADATA_KEY: {
                 HOST_TYPE_SPECIFIC_METADATA_KEY: {
                     "human": {
-                        METADATA_FIELDS_KEY: {
-                            "study_field": {
-                                DEFAULT_KEY: "study_value",
-                                TYPE_KEY: "string"
-                            }
-                        },
+                        METADATA_FIELDS_KEY: {},
                         SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
                             "stool": {
                                 METADATA_FIELDS_KEY: {}
@@ -2375,22 +3215,37 @@ class TestMetadataExtender(TestCase):
                 }
             }
         }
+        # Software config specifies alternate column names
+        software_config = {
+            DEFAULT_KEY: "not provided",
+            LEAVE_REQUIREDS_BLANK_KEY: True,
+            OVERWRITE_NON_NANS_KEY: False,
+            HOSTTYPE_COL_OPTIONS_KEY: ["host_type"],
+            SAMPLETYPE_COL_OPTIONS_KEY: ["sample"]
+        }
         result_df, validation_msgs_df = extend_metadata_df(
             input_df, study_config, None, software_config, self.TEST_STDS_FP)
         expected_df = pandas.DataFrame({
             SAMPLE_NAME_KEY: ["sample1", "sample2"],
+            "body_product": ["UBERON:feces", "UBERON:feces"],
             "body_site": ["gut", "gut"],
+            "description": ["human sample", "human sample"],
             "host_common_name": ["human", "human"],
+            # Alternate host type column ("host_type") from the input is preserved
+            "host_type": ["human", "human"],
             QIITA_SAMPLE_TYPE: ["stool", "stool"],
+            # Alternate sample type column ("sample") from the input is preserved
+            "sample": ["stool", "stool"],
             SAMPLE_TYPE_KEY: ["stool", "stool"],
-            "study_field": ["study_value", "study_value"],
+            # Standard internal columns added at end (in order of INTERNAL_COL_KEYS)
             HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
             SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
             QC_NOTE_KEY: ["", ""]
         })
         assert_frame_equal(expected_df, result_df)
+        self.assertTrue(validation_msgs_df.empty)
     # Tests for _get_study_specific_config
@@ -2455,156 +3310,976 @@ class TestMetadataExtender(TestCase):
         })
         with tempfile.TemporaryDirectory() as tmpdir:
-            _output_metadata_df_to_files(
-                input_df, tmpdir, "test_output", INTERNAL_COL_KEYS,
-                sep="\t", remove_internals_and_fails=False)
+            _output_metadata_df_to_files(
+                input_df, tmpdir, "test_output", INTERNAL_COL_KEYS,
+                sep="\t", remove_internals_and_fails=False)
+            # Find the output file (has timestamp prefix)
+            output_files = glob.glob(os.path.join(tmpdir, "*_test_output.txt"))
+            self.assertEqual(1, len(output_files))
+            # Read and verify contents (keep_default_na=False preserves empty strings)
+            result_df = pandas.read_csv(output_files[0], sep="\t", keep_default_na=False)
+            expected_df = input_df
+            assert_frame_equal(expected_df, result_df)
+    def test__output_metadata_df_to_files_remove_internals_and_fails(self):
+        """Test output with internal columns and failures removed."""
+        input_df = pandas.DataFrame({
+            SAMPLE_NAME_KEY: ["sample1", "sample2", "sample3"],
+            "field_a": ["a1", "a2", "a3"],
+            HOSTTYPE_SHORTHAND_KEY: ["human", "human", "human"],
+            SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool", "stool"],
+            QC_NOTE_KEY: ["", "invalid host_type", ""]
+        })
+        with tempfile.TemporaryDirectory() as tmpdir:
+            _output_metadata_df_to_files(
+                input_df, tmpdir, "test_output", INTERNAL_COL_KEYS,
+                sep="\t", remove_internals_and_fails=True)
+            # Find the main output file
+            output_files = glob.glob(os.path.join(tmpdir, "*_test_output.txt"))
+            self.assertEqual(1, len(output_files))
+            # Verify main output has internal cols removed and no failures
+            result_df = pandas.read_csv(output_files[0], sep="\t")
+            expected_df = pandas.DataFrame({
+                SAMPLE_NAME_KEY: ["sample1", "sample3"],
+                "field_a": ["a1", "a3"]
+            })
+            assert_frame_equal(expected_df, result_df)
+            # Find the fails file
+            fails_files = glob.glob(os.path.join(tmpdir, "*_test_output_fails.csv"))
+            self.assertEqual(1, len(fails_files))
+            # Verify fails file contains the failed row
+            fails_df = pandas.read_csv(fails_files[0], sep=",")
+            expected_fails_df = pandas.DataFrame({
+                SAMPLE_NAME_KEY: ["sample2"],
+                "field_a": ["a2"],
+                HOSTTYPE_SHORTHAND_KEY: ["human"],
+                SAMPLETYPE_SHORTHAND_KEY: ["stool"],
+                QC_NOTE_KEY: ["invalid host_type"]
+            })
+            assert_frame_equal(expected_fails_df, fails_df)
+    def test__output_metadata_df_to_files_no_failures_creates_empty_file(self):
+        """Test that empty fails file is created when there are no failures."""
+        input_df = pandas.DataFrame({
+            SAMPLE_NAME_KEY: ["sample1", "sample2"],
+            "field_a": ["a1", "a2"],
+            HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
+            SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
+            QC_NOTE_KEY: ["", ""]
+        })
+        with tempfile.TemporaryDirectory() as tmpdir:
+            _output_metadata_df_to_files(
+                input_df, tmpdir, "test_output", INTERNAL_COL_KEYS,
+                sep="\t", remove_internals_and_fails=True,
+                suppress_empty_fails=False)
+            # Find the fails file
+            fails_files = glob.glob(os.path.join(tmpdir, "*_test_output_fails.csv"))
+            self.assertEqual(1, len(fails_files))
+            # Verify fails file is empty (zero bytes)
+            self.assertEqual(0, os.path.getsize(fails_files[0]))
+    def test__output_metadata_df_to_files_suppress_empty_fails(self):
+        """Test that empty fails file is not created when suppress_empty_fails=True."""
+        input_df = pandas.DataFrame({
+            SAMPLE_NAME_KEY: ["sample1", "sample2"],
+            "field_a": ["a1", "a2"],
+            HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
+            SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
+            QC_NOTE_KEY: ["", ""]
+        })
+        with tempfile.TemporaryDirectory() as tmpdir:
+            _output_metadata_df_to_files(
+                input_df, tmpdir, "test_output", INTERNAL_COL_KEYS,
+                sep="\t", remove_internals_and_fails=True,
+                suppress_empty_fails=True)
+            # Find the fails file - should not exist
+            fails_files = glob.glob(os.path.join(tmpdir, "*_test_output_fails.csv"))
+            self.assertEqual(0, len(fails_files))
+            # Main output file should still exist
+            output_files = glob.glob(os.path.join(tmpdir, "*_test_output.txt"))
+            self.assertEqual(1, len(output_files))
+    def test__output_metadata_df_to_files_csv_separator(self):
+        """Test output with comma separator creates .csv file."""
+        input_df = pandas.DataFrame({
+            SAMPLE_NAME_KEY: ["sample1", "sample2"],
+            "field_a": ["a1", "a2"],
+            HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
+            SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
+            QC_NOTE_KEY: ["", ""]
+        })
+        with tempfile.TemporaryDirectory() as tmpdir:
+            _output_metadata_df_to_files(
+                input_df, tmpdir, "test_output", INTERNAL_COL_KEYS,
+                sep=",", remove_internals_and_fails=False)
+            # Find the output file with .csv extension
+            output_files = glob.glob(os.path.join(tmpdir, "*_test_output.csv"))
+            self.assertEqual(1, len(output_files))
+            # Read and verify contents (keep_default_na=False preserves empty strings)
+            result_df = pandas.read_csv(output_files[0], sep=",", keep_default_na=False)
+            expected_df = input_df
+            assert_frame_equal(expected_df, result_df)
+    def test__output_metadata_df_to_files_all_failures(self):
+        """Test output when all rows are failures."""
+        input_df = pandas.DataFrame({
+            SAMPLE_NAME_KEY: ["sample1", "sample2"],
+            "field_a": ["a1", "a2"],
+            HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
+            SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
+            QC_NOTE_KEY: ["invalid host_type", "invalid sample_type"]
+        })
+        with tempfile.TemporaryDirectory() as tmpdir:
+            _output_metadata_df_to_files(
+                input_df, tmpdir, "test_output", INTERNAL_COL_KEYS,
+                sep="\t", remove_internals_and_fails=True)
+            # Main output file should have only headers (empty data)
+            output_files = glob.glob(os.path.join(tmpdir, "*_test_output.txt"))
+            self.assertEqual(1, len(output_files))
+            result_df = pandas.read_csv(output_files[0], sep="\t")
+            self.assertTrue(result_df.empty)
+            self.assertEqual([SAMPLE_NAME_KEY, "field_a"], list(result_df.columns))
+            # Fails file should have both rows
+            fails_files = glob.glob(os.path.join(tmpdir, "*_test_output_fails.csv"))
+            self.assertEqual(1, len(fails_files))
+            fails_df = pandas.read_csv(fails_files[0], sep=",")
+            self.assertEqual(2, len(fails_df))
+    # Tests for get_extended_metadata_from_df_and_yaml
+    TEST_STUDY_CONFIG_FP = path.join(TEST_DIR, "data/test_study_config.yml")
+    def test_get_extended_metadata_from_df_and_yaml_with_config(self):
+        """Test extending metadata with a study-specific YAML config file."""
+        input_df = pandas.DataFrame({
+            SAMPLE_NAME_KEY: ["sample1", "sample2"],
+            HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
+            SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"]
+        })
+        result_df, validation_msgs_df = get_extended_metadata_from_df_and_yaml(
+            input_df, self.TEST_STUDY_CONFIG_FP, self.TEST_STDS_FP)
+        expected_df = pandas.DataFrame({
+            SAMPLE_NAME_KEY: ["sample1", "sample2"],
+            "body_product": ["UBERON:feces", "UBERON:feces"],
+            "body_site": ["gut", "gut"],
+            "description": ["human sample", "human sample"],
+            "host_common_name": ["human", "human"],
+            QIITA_SAMPLE_TYPE: ["stool", "stool"],
+            SAMPLE_TYPE_KEY: ["stool", "stool"],
+            "study_custom_field": ["custom_value", "custom_value"],
+            "study_stool_field": ["stool_custom", "stool_custom"],
+            HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
+            SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
+            QC_NOTE_KEY: ["", ""]
+        })
+        assert_frame_equal(expected_df, result_df)
+        self.assertTrue(validation_msgs_df.empty)
+    def test_get_extended_metadata_from_df_and_yaml_none_config(self):
+        """Test extending metadata with None for study_specific_config_fp."""
+        input_df = pandas.DataFrame({
+            SAMPLE_NAME_KEY: ["sample1", "sample2"],
+            HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
+            SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"]
+        })
+        result_df, validation_msgs_df = get_extended_metadata_from_df_and_yaml(
+            input_df, None, self.TEST_STDS_FP)
+        expected_df = pandas.DataFrame({
+            SAMPLE_NAME_KEY: ["sample1", "sample2"],
+            "body_product": ["UBERON:feces", "UBERON:feces"],
+            "body_site": ["gut", "gut"],
+            "description": ["human sample", "human sample"],
+            "host_common_name": ["human", "human"],
+            QIITA_SAMPLE_TYPE: ["stool", "stool"],
+            SAMPLE_TYPE_KEY: ["stool", "stool"],
+            HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
+            SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
+            QC_NOTE_KEY: ["", ""]
+        })
+        assert_frame_equal(expected_df, result_df)
+        self.assertTrue(validation_msgs_df.empty)
+    def test_get_extended_metadata_from_df_and_yaml_invalid_host_type(self):
+        """Test that invalid host types are flagged with QC note."""
+        input_df = pandas.DataFrame({
+            SAMPLE_NAME_KEY: ["sample1", "sample2"],
+            HOSTTYPE_SHORTHAND_KEY: ["unknown_host", "human"],
+            SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"]
+        })
+        result_df, validation_msgs_df = get_extended_metadata_from_df_and_yaml(
+            input_df, self.TEST_STUDY_CONFIG_FP, self.TEST_STDS_FP)
+        expected_df = pandas.DataFrame({
+            SAMPLE_NAME_KEY: ["sample1", "sample2"],
+            "body_product": ["not provided", "UBERON:feces"],
+            "body_site": ["not provided", "gut"],
+            "description": ["not provided", "human sample"],
+            "host_common_name": ["not provided", "human"],
+            QIITA_SAMPLE_TYPE: ["not provided", "stool"],
+            SAMPLE_TYPE_KEY: ["not provided", "stool"],
+            "study_custom_field": ["not provided", "custom_value"],
+            "study_stool_field": ["not provided", "stool_custom"],
+            HOSTTYPE_SHORTHAND_KEY: ["unknown_host", "human"],
+            SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
+            QC_NOTE_KEY: ["invalid host_type", ""]
+        })
+        assert_frame_equal(expected_df, result_df)
+        self.assertTrue(validation_msgs_df.empty)
+    # Tests for write_extended_metadata_from_df
+    def test_write_extended_metadata_from_df_basic(self):
+        """Test basic writing of extended metadata to files."""
+        input_df = pandas.DataFrame({
+            SAMPLE_NAME_KEY: ["sample1", "sample2"],
+            HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
+            SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"]
+        })
+        study_config = {
+            DEFAULT_KEY: "not provided",
+            LEAVE_REQUIREDS_BLANK_KEY: True,
+            OVERWRITE_NON_NANS_KEY: False,
+            STUDY_SPECIFIC_METADATA_KEY: {
+                HOST_TYPE_SPECIFIC_METADATA_KEY: {
+                    "human": {
+                        METADATA_FIELDS_KEY: {
+                            "custom_field": {
+                                DEFAULT_KEY: "custom_value",
+                                TYPE_KEY: "string"
+                            }
+                        },
+                        SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
+                            "stool": {
+                                METADATA_FIELDS_KEY: {}
+                            }
+                        }
+                    }
+                }
+            }
+        }
+        with tempfile.TemporaryDirectory() as tmpdir:
+            result_df = write_extended_metadata_from_df(
+                input_df, study_config, tmpdir, "test_output",
+                stds_fp=self.TEST_STDS_FP)
-            # Find the output file (has timestamp prefix)
+            # Verify returned DataFrame
+            expected_df = pandas.DataFrame({
+                SAMPLE_NAME_KEY: ["sample1", "sample2"],
+                "body_product": ["UBERON:feces", "UBERON:feces"],
+                "body_site": ["gut", "gut"],
+                "custom_field": ["custom_value", "custom_value"],
+                "description": ["human sample", "human sample"],
+                "host_common_name": ["human", "human"],
+                QIITA_SAMPLE_TYPE: ["stool", "stool"],
+                SAMPLE_TYPE_KEY: ["stool", "stool"],
+                HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
+                SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
+                QC_NOTE_KEY: ["", ""]
+            })
+            assert_frame_equal(expected_df, result_df)
+            # Verify main output file was created (internal cols removed by default)
             output_files = glob.glob(os.path.join(tmpdir, "*_test_output.txt"))
             self.assertEqual(1, len(output_files))
+            output_df = pandas.read_csv(output_files[0], sep="\t")
+            expected_output_df = pandas.DataFrame({
+                SAMPLE_NAME_KEY: ["sample1", "sample2"],
+                "body_product": ["UBERON:feces", "UBERON:feces"],
+                "body_site": ["gut", "gut"],
+                "custom_field": ["custom_value", "custom_value"],
+                "description": ["human sample", "human sample"],
+                "host_common_name": ["human", "human"],
+                QIITA_SAMPLE_TYPE: ["stool", "stool"],
+                SAMPLE_TYPE_KEY: ["stool", "stool"]
+            })
+            assert_frame_equal(expected_output_df, output_df)
-            # Read and verify contents (keep_default_na=False preserves empty strings)
-            result_df = pandas.read_csv(output_files[0], sep="\t", keep_default_na=False)
-            expected_df = input_df
-            assert_frame_equal(expected_df, result_df)
+            # Verify empty fails file was created
+            fails_files = glob.glob(os.path.join(tmpdir, "*_test_output_fails.csv"))
+            self.assertEqual(1, len(fails_files))
+            self.assertEqual(0, os.path.getsize(fails_files[0]))
-    def test__output_metadata_df_to_files_remove_internals_and_fails(self):
-        """Test output with internal columns and failures removed."""
+            # Verify validation errors file was created (empty)
+            validation_files = glob.glob(
+                os.path.join(tmpdir, "*_test_output_validation_errors.csv"))
+            self.assertEqual(1, len(validation_files))
+            self.assertEqual(0, os.path.getsize(validation_files[0]))
+    def test_write_extended_metadata_from_df_with_qc_failures(self):
+        """Test writing extended metadata when some rows have QC failures."""
         input_df = pandas.DataFrame({
             SAMPLE_NAME_KEY: ["sample1", "sample2", "sample3"],
-            "field_a": ["a1", "a2", "a3"],
-            HOSTTYPE_SHORTHAND_KEY: ["human", "human", "human"],
-            SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool", "stool"],
-            QC_NOTE_KEY: ["", "invalid host_type", ""]
+            HOSTTYPE_SHORTHAND_KEY: ["human", "unknown_host", "human"],
+            SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool", "stool"]
         })
+        study_config = {
+            DEFAULT_KEY: "not provided",
+            LEAVE_REQUIREDS_BLANK_KEY: True,
+            OVERWRITE_NON_NANS_KEY: False,
+            STUDY_SPECIFIC_METADATA_KEY: {
+                HOST_TYPE_SPECIFIC_METADATA_KEY: {
+                    "human": {
+                        METADATA_FIELDS_KEY: {},
+                        SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
+                            "stool": {
+                                METADATA_FIELDS_KEY: {}
+                            }
+                        }
+                    }
+                }
+            }
+        }
         with tempfile.TemporaryDirectory() as tmpdir:
-            _output_metadata_df_to_files(
-                input_df, tmpdir, "test_output", INTERNAL_COL_KEYS,
-                sep="\t", remove_internals_and_fails=True)
+            result_df = write_extended_metadata_from_df(
+                input_df, study_config, tmpdir, "test_output",
+                stds_fp=self.TEST_STDS_FP)
+            # Verify returned DataFrame includes all rows (including failures)
+            # Note: rows are reordered by host type processing (valid hosts first)
+            expected_result_df = pandas.DataFrame({
+                SAMPLE_NAME_KEY: ["sample1", "sample3", "sample2"],
+                "body_product": ["UBERON:feces", "UBERON:feces", "not provided"],
+                "body_site": ["gut", "gut", "not provided"],
+                "description": ["human sample", "human sample", "not provided"],
+                "host_common_name": ["human", "human", "not provided"],
+                QIITA_SAMPLE_TYPE: ["stool", "stool", "not provided"],
+                SAMPLE_TYPE_KEY: ["stool", "stool", "not provided"],
+                HOSTTYPE_SHORTHAND_KEY: ["human", "human", "unknown_host"],
+                SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool", "stool"],
+                QC_NOTE_KEY: ["", "", "invalid host_type"]
+            })
+            assert_frame_equal(expected_result_df, result_df)
-            # Find the main output file
+            # Verify main output file excludes failure rows
             output_files = glob.glob(os.path.join(tmpdir, "*_test_output.txt"))
             self.assertEqual(1, len(output_files))
-            # Verify main output has internal cols removed and no failures
-            result_df = pandas.read_csv(output_files[0], sep="\t")
-            expected_df = pandas.DataFrame({
+            output_df = pandas.read_csv(output_files[0], sep="\t")
+            expected_output_df = pandas.DataFrame({
                 SAMPLE_NAME_KEY: ["sample1", "sample3"],
-                "field_a": ["a1", "a3"]
+                "body_product": ["UBERON:feces", "UBERON:feces"],
+                "body_site": ["gut", "gut"],
+                "description": ["human sample", "human sample"],
+                "host_common_name": ["human", "human"],
+                QIITA_SAMPLE_TYPE: ["stool", "stool"],
+                SAMPLE_TYPE_KEY: ["stool", "stool"]
             })
-            assert_frame_equal(expected_df, result_df)
+            assert_frame_equal(expected_output_df, output_df)
-            # Find the fails file
+            # Verify fails file contains the failed row
             fails_files = glob.glob(os.path.join(tmpdir, "*_test_output_fails.csv"))
             self.assertEqual(1, len(fails_files))
-            # Verify fails file contains the failed row
             fails_df = pandas.read_csv(fails_files[0], sep=",")
             expected_fails_df = pandas.DataFrame({
                 SAMPLE_NAME_KEY: ["sample2"],
-                "field_a": ["a2"],
-                HOSTTYPE_SHORTHAND_KEY: ["human"],
+                "body_product": ["not provided"],
+                "body_site": ["not provided"],
+                "description": ["not provided"],
+                "host_common_name": ["not provided"],
+                QIITA_SAMPLE_TYPE: ["not provided"],
+                SAMPLE_TYPE_KEY: ["not provided"],
+                HOSTTYPE_SHORTHAND_KEY: ["unknown_host"],
                 SAMPLETYPE_SHORTHAND_KEY: ["stool"],
                 QC_NOTE_KEY: ["invalid host_type"]
             })
             assert_frame_equal(expected_fails_df, fails_df)
-    def test__output_metadata_df_to_files_no_failures_creates_empty_file(self):
-        """Test that empty fails file is created when there are no failures."""
+    def test_write_extended_metadata_from_df_with_validation_errors(self):
+        """Test writing extended metadata when validation errors occur."""
         input_df = pandas.DataFrame({
             SAMPLE_NAME_KEY: ["sample1", "sample2"],
-            "field_a": ["a1", "a2"],
             HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
             SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
-            QC_NOTE_KEY: ["", ""]
+            "restricted_field": ["invalid_value", "allowed_value"]
         })
+        study_config = {
+            DEFAULT_KEY: "not provided",
+            LEAVE_REQUIREDS_BLANK_KEY: True,
+            OVERWRITE_NON_NANS_KEY: False,
+            STUDY_SPECIFIC_METADATA_KEY: {
+                HOST_TYPE_SPECIFIC_METADATA_KEY: {
+                    "human": {
+                        METADATA_FIELDS_KEY: {
+                            "restricted_field": {
+                                TYPE_KEY: "string",
+                                ALLOWED_KEY: ["allowed_value"]
+                            }
+                        },
+                        SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
+                            "stool": {
+                                METADATA_FIELDS_KEY: {}
+                            }
+                        }
+                    }
+                }
+            }
+        }
         with tempfile.TemporaryDirectory() as tmpdir:
-            _output_metadata_df_to_files(
-                input_df, tmpdir, "test_output", INTERNAL_COL_KEYS,
-                sep="\t", remove_internals_and_fails=True,
-                suppress_empty_fails=False)
-            # Find the fails file
-            fails_files = glob.glob(os.path.join(tmpdir, "*_test_output_fails.csv"))
-            self.assertEqual(1, len(fails_files))
-            # Verify fails file is empty (zero bytes)
-            self.assertEqual(0, os.path.getsize(fails_files[0]))
+            result_df = write_extended_metadata_from_df(
+                input_df, study_config, tmpdir, "test_output",
+                stds_fp=self.TEST_STDS_FP)
+            # Verify returned DataFrame
+            expected_result_df = pandas.DataFrame({
+                SAMPLE_NAME_KEY: ["sample1", "sample2"],
+                "body_product": ["UBERON:feces", "UBERON:feces"],
+                "body_site": ["gut", "gut"],
+                "description": ["human sample", "human sample"],
+                "host_common_name": ["human", "human"],
+                QIITA_SAMPLE_TYPE: ["stool", "stool"],
+                "restricted_field": ["invalid_value", "allowed_value"],
+                SAMPLE_TYPE_KEY: ["stool", "stool"],
+                HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
+                SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
+                QC_NOTE_KEY: ["", ""]
+            })
+            assert_frame_equal(expected_result_df, result_df)
+            # Verify validation errors file contains the error
+            validation_files = glob.glob(
+                os.path.join(tmpdir, "*_test_output_validation_errors.csv"))
+            self.assertEqual(1, len(validation_files))
+            validation_df = pandas.read_csv(validation_files[0], sep=",")
+            expected_validation_df = pandas.DataFrame({
+                "sample_name": ["sample1"],
+                "field_name": ["restricted_field"],
+                "error_message": ["['unallowed value invalid_value']"]
+            })
+            assert_frame_equal(expected_validation_df, validation_df)
-    def test__output_metadata_df_to_files_suppress_empty_fails(self):
-        """Test that empty fails file is not created when suppress_empty_fails=True."""
+    def test_write_extended_metadata_from_df_remove_internals_false(self):
+        """Test writing extended metadata with remove_internals=False."""
         input_df = pandas.DataFrame({
-            SAMPLE_NAME_KEY: ["sample1", "sample2"],
-            "field_a": ["a1", "a2"],
-            HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
-            SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
-            QC_NOTE_KEY: ["", ""]
+            SAMPLE_NAME_KEY: ["sample1"],
+            HOSTTYPE_SHORTHAND_KEY: ["human"],
+            SAMPLETYPE_SHORTHAND_KEY: ["stool"]
         })
+        study_config = {
+            DEFAULT_KEY: "not provided",
+            LEAVE_REQUIREDS_BLANK_KEY: True,
+            OVERWRITE_NON_NANS_KEY: False,
+            STUDY_SPECIFIC_METADATA_KEY: {
+                HOST_TYPE_SPECIFIC_METADATA_KEY: {
+                    "human": {
+                        METADATA_FIELDS_KEY: {},
+                        SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
+                            "stool": {
+                                METADATA_FIELDS_KEY: {}
+                            }
+                        }
+                    }
+                }
+            }
+        }
         with tempfile.TemporaryDirectory() as tmpdir:
-            _output_metadata_df_to_files(
-                input_df, tmpdir, "test_output", INTERNAL_COL_KEYS,
-                sep="\t", remove_internals_and_fails=True,
-                suppress_empty_fails=True)
+            write_extended_metadata_from_df(
+                input_df, study_config, tmpdir, "test_output",
+                remove_internals=False, stds_fp=self.TEST_STDS_FP)
-            # Find the fails file - should not exist
+            # Verify main output file includes internal columns
+            output_files = glob.glob(os.path.join(tmpdir, "*_test_output.txt"))
+            self.assertEqual(1, len(output_files))
+            output_df = pandas.read_csv(output_files[0], sep="\t", keep_default_na=False)
+            expected_output_df = pandas.DataFrame({
+                SAMPLE_NAME_KEY: ["sample1"],
+                "body_product": ["UBERON:feces"],
+                "body_site": ["gut"],
+                "description": ["human sample"],
+                "host_common_name": ["human"],
+                QIITA_SAMPLE_TYPE: ["stool"],
+                SAMPLE_TYPE_KEY: ["stool"],
+                HOSTTYPE_SHORTHAND_KEY: ["human"],
+                SAMPLETYPE_SHORTHAND_KEY: ["stool"],
+                QC_NOTE_KEY: [""]
+            })
+            assert_frame_equal(expected_output_df, output_df)
+            # Verify no fails file was created (since remove_internals=False)
             fails_files = glob.glob(os.path.join(tmpdir, "*_test_output_fails.csv"))
             self.assertEqual(0, len(fails_files))
-            # Main output file should still exist
+    # Tests for write_extended_metadata
+    TEST_METADATA_CSV_FP = path.join(TEST_DIR, "data/test_metadata.csv")
+    TEST_METADATA_TXT_FP = path.join(TEST_DIR, "data/test_metadata.txt")
+    TEST_METADATA_WITH_ERRORS_FP = path.join(
+        TEST_DIR, "data/test_metadata_with_errors.csv")
+    TEST_STUDY_CONFIG_WITH_VALIDATION_FP = path.join(
+        TEST_DIR, "data/test_study_config_with_validation.yml")
+    def test_write_extended_metadata_csv_input(self):
+        """Test writing extended metadata from a CSV input file."""
+        with tempfile.TemporaryDirectory() as tmpdir:
+            result_df = write_extended_metadata(
+                self.TEST_METADATA_CSV_FP, self.TEST_STUDY_CONFIG_FP,
+                tmpdir, "test_output", stds_fp=self.TEST_STDS_FP)
+            # Verify returned DataFrame
+            expected_result_df = pandas.DataFrame({
+                SAMPLE_NAME_KEY: ["sample1", "sample2"],
+                "body_product": ["UBERON:feces", "UBERON:feces"],
+                "body_site": ["gut", "gut"],
+                "description": ["human sample", "human sample"],
+                "host_common_name": ["human", "human"],
+                QIITA_SAMPLE_TYPE: ["stool", "stool"],
+                SAMPLE_TYPE_KEY: ["stool", "stool"],
+                "study_custom_field": ["custom_value", "custom_value"],
+                "study_stool_field": ["stool_custom", "stool_custom"],
+                HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
+                SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
+                QC_NOTE_KEY: ["", ""]
+            })
+            assert_frame_equal(expected_result_df, result_df)
+            # Verify main output file was created (internal cols removed by default)
             output_files = glob.glob(os.path.join(tmpdir, "*_test_output.txt"))
             self.assertEqual(1, len(output_files))
+            output_df = pandas.read_csv(output_files[0], sep="\t")
+            expected_output_df = pandas.DataFrame({
+                SAMPLE_NAME_KEY: ["sample1", "sample2"],
+                "body_product": ["UBERON:feces", "UBERON:feces"],
+                "body_site": ["gut", "gut"],
+                "description": ["human sample", "human sample"],
+                "host_common_name": ["human", "human"],
+                QIITA_SAMPLE_TYPE: ["stool", "stool"],
+                SAMPLE_TYPE_KEY: ["stool", "stool"],
+                "study_custom_field": ["custom_value", "custom_value"],
+                "study_stool_field": ["stool_custom", "stool_custom"]
+            })
+            assert_frame_equal(expected_output_df, output_df)
-    def test__output_metadata_df_to_files_csv_separator(self):
-        """Test output with comma separator creates .csv file."""
-        input_df = pandas.DataFrame({
-            SAMPLE_NAME_KEY: ["sample1", "sample2"],
-            "field_a": ["a1", "a2"],
-            HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
-            SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
-            QC_NOTE_KEY: ["", ""]
-        })
+            # Verify empty fails file was created
+            fails_files = glob.glob(os.path.join(tmpdir, "*_test_output_fails.csv"))
+            self.assertEqual(1, len(fails_files))
+            self.assertEqual(0, os.path.getsize(fails_files[0]))
+            # Verify empty validation errors file was created
+            validation_files = glob.glob(
+                os.path.join(tmpdir, "*_test_output_validation_errors.csv"))
+            self.assertEqual(1, len(validation_files))
+            self.assertEqual(0, os.path.getsize(validation_files[0]))
+    def test_write_extended_metadata_txt_input(self):
+        """Test writing extended metadata from a tab-delimited TXT input file.
+
+        Calls write_extended_metadata on TEST_METADATA_TXT_FP and checks the
+        returned DataFrame as well as the single "*_test_output.txt" file
+        found in the temporary output directory.
+        """
         with tempfile.TemporaryDirectory() as tmpdir:
-            _output_metadata_df_to_files(
-                input_df, tmpdir, "test_output", INTERNAL_COL_KEYS,
-                sep=",", remove_internals_and_fails=False)
+            result_df = write_extended_metadata(
+                self.TEST_METADATA_TXT_FP, self.TEST_STUDY_CONFIG_FP,
+                tmpdir, "test_output", stds_fp=self.TEST_STDS_FP)
+            # Verify returned DataFrame
+            # (expected to end with the HOSTTYPE_SHORTHAND_KEY,
+            # SAMPLETYPE_SHORTHAND_KEY, and QC_NOTE_KEY columns)
+            expected_result_df = pandas.DataFrame({
+                SAMPLE_NAME_KEY: ["sample1", "sample2"],
+                "body_product": ["UBERON:feces", "UBERON:feces"],
+                "body_site": ["gut", "gut"],
+                "description": ["human sample", "human sample"],
+                "host_common_name": ["human", "human"],
+                QIITA_SAMPLE_TYPE: ["stool", "stool"],
+                SAMPLE_TYPE_KEY: ["stool", "stool"],
+                "study_custom_field": ["custom_value", "custom_value"],
+                "study_stool_field": ["stool_custom", "stool_custom"],
+                HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
+                SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
+                QC_NOTE_KEY: ["", ""]
+            })
+            assert_frame_equal(expected_result_df, result_df)
-            # Find the output file with .csv extension
+            # Verify main output file was created
+            output_files = glob.glob(os.path.join(tmpdir, "*_test_output.txt"))
+            self.assertEqual(1, len(output_files))
+            output_df = pandas.read_csv(output_files[0], sep="\t")
+            # The written file is expected to hold the same columns as the
+            # returned frame minus the last three listed above
+            expected_output_df = pandas.DataFrame({
+                SAMPLE_NAME_KEY: ["sample1", "sample2"],
+                "body_product": ["UBERON:feces", "UBERON:feces"],
+                "body_site": ["gut", "gut"],
+                "description": ["human sample", "human sample"],
+                "host_common_name": ["human", "human"],
+                QIITA_SAMPLE_TYPE: ["stool", "stool"],
+                SAMPLE_TYPE_KEY: ["stool", "stool"],
+                "study_custom_field": ["custom_value", "custom_value"],
+                "study_stool_field": ["stool_custom", "stool_custom"]
+            })
+            assert_frame_equal(expected_output_df, output_df)
+    def test_write_extended_metadata_with_validation_errors(self):
+        """Check the outputs of write_extended_metadata for invalid input.
+
+        Uses TEST_METADATA_WITH_ERRORS_FP with
+        TEST_STUDY_CONFIG_WITH_VALIDATION_FP and verifies the returned
+        DataFrame, the main output file, and the validation errors file.
+        """
+        # Columns expected in both the returned frame and the output file
+        shared_cols = {
+            SAMPLE_NAME_KEY: ["sample1", "sample2"],
+            "body_product": ["UBERON:feces", "UBERON:feces"],
+            "body_site": ["gut", "gut"],
+            "description": ["human sample", "human sample"],
+            "host_common_name": ["human", "human"],
+            QIITA_SAMPLE_TYPE: ["stool", "stool"],
+            "restricted_field": ["invalid_value", "allowed_value"],
+            SAMPLE_TYPE_KEY: ["stool", "stool"]
+        }
+        # Columns expected only at the end of the returned frame
+        returned_only_cols = {
+            HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
+            SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
+            QC_NOTE_KEY: ["", ""]
+        }
+        with tempfile.TemporaryDirectory() as out_dir:
+            returned_df = write_extended_metadata(
+                self.TEST_METADATA_WITH_ERRORS_FP,
+                self.TEST_STUDY_CONFIG_WITH_VALIDATION_FP,
+                out_dir, "test_output", stds_fp=self.TEST_STDS_FP)
+            # Returned frame: shared columns followed by the returned-only ones
+            assert_frame_equal(
+                pandas.DataFrame({**shared_cols, **returned_only_cols}),
+                returned_df)
+            # Exactly one main output file, holding only the shared columns
+            main_pattern = os.path.join(out_dir, "*_test_output.txt")
+            main_fps = glob.glob(main_pattern)
+            self.assertEqual(1, len(main_fps))
+            assert_frame_equal(
+                pandas.DataFrame(shared_cols),
+                pandas.read_csv(main_fps[0], sep="\t"))
+            # Exactly one validation errors file, reporting sample1's value
+            errors_pattern = os.path.join(
+                out_dir, "*_test_output_validation_errors.csv")
+            errors_fps = glob.glob(errors_pattern)
+            self.assertEqual(1, len(errors_fps))
+            expected_errors_df = pandas.DataFrame({
+                "sample_name": ["sample1"],
+                "field_name": ["restricted_field"],
+                "error_message": ["['unallowed value invalid_value']"]
+            })
+            assert_frame_equal(
+                expected_errors_df,
+                pandas.read_csv(errors_fps[0], sep=","))
+    def test_write_extended_metadata_unrecognized_extension_raises(self):
+        """Check that a ".json" input path is rejected with a ValueError."""
+        expected_msg = "Unrecognized input file extension"
+        with tempfile.TemporaryDirectory() as work_dir:
+            # Write a stub file so that the input path exists
+            json_fp = path.join(work_dir, "test.json")
+            with open(json_fp, "w") as stub_file:
+                stub_file.write("{}")
+            with self.assertRaisesRegex(ValueError, expected_msg):
+                write_extended_metadata(
+                    json_fp, self.TEST_STUDY_CONFIG_FP, work_dir,
+                    "test_output", stds_fp=self.TEST_STDS_FP)
+    def test_write_extended_metadata_csv_separator_output(self):
+        """Test writing extended metadata with CSV separator for output.
+
+        Passes sep="," to write_extended_metadata and checks the returned
+        DataFrame as well as the single "*_test_output.csv" file found in
+        the temporary output directory, read back with sep=",".
+        """
+        with tempfile.TemporaryDirectory() as tmpdir:
+            result_df = write_extended_metadata(
+                self.TEST_METADATA_CSV_FP, self.TEST_STUDY_CONFIG_FP,
+                tmpdir, "test_output", sep=",", stds_fp=self.TEST_STDS_FP)
+            # Verify returned DataFrame
+            # (expected to end with the HOSTTYPE_SHORTHAND_KEY,
+            # SAMPLETYPE_SHORTHAND_KEY, and QC_NOTE_KEY columns)
+            expected_result_df = pandas.DataFrame({
+                SAMPLE_NAME_KEY: ["sample1", "sample2"],
+                "body_product": ["UBERON:feces", "UBERON:feces"],
+                "body_site": ["gut", "gut"],
+                "description": ["human sample", "human sample"],
+                "host_common_name": ["human", "human"],
+                QIITA_SAMPLE_TYPE: ["stool", "stool"],
+                SAMPLE_TYPE_KEY: ["stool", "stool"],
+                "study_custom_field": ["custom_value", "custom_value"],
+                "study_stool_field": ["stool_custom", "stool_custom"],
+                HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
+                SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
+                QC_NOTE_KEY: ["", ""]
+            })
+            assert_frame_equal(expected_result_df, result_df)
+            # Verify output file has .csv extension
             output_files = glob.glob(os.path.join(tmpdir, "*_test_output.csv"))
             self.assertEqual(1, len(output_files))
+            output_df = pandas.read_csv(output_files[0], sep=",")
+            # The written file is expected to hold the same columns as the
+            # returned frame minus the last three listed above
+            expected_output_df = pandas.DataFrame({
+                SAMPLE_NAME_KEY: ["sample1", "sample2"],
+                "body_product": ["UBERON:feces", "UBERON:feces"],
+                "body_site": ["gut", "gut"],
+                "description": ["human sample", "human sample"],
+                "host_common_name": ["human", "human"],
+                QIITA_SAMPLE_TYPE: ["stool", "stool"],
+                SAMPLE_TYPE_KEY: ["stool", "stool"],
+                "study_custom_field": ["custom_value", "custom_value"],
+                "study_stool_field": ["stool_custom", "stool_custom"]
+            })
+            assert_frame_equal(expected_output_df, output_df)
-            # Read and verify contents (keep_default_na=False preserves empty strings)
-            result_df = pandas.read_csv(output_files[0], sep=",", keep_default_na=False)
-            expected_df = input_df
-            assert_frame_equal(expected_df, result_df)
+    def test_write_extended_metadata_remove_internals_false(self):
+        """Test writing extended metadata with remove_internals=False.
+
+        Checks the returned DataFrame, checks that the main
+        "*_test_output.txt" file holds the same columns as the returned
+        frame, and checks that no "*_test_output_fails.csv" file exists.
+        """
+        with tempfile.TemporaryDirectory() as tmpdir:
+            result_df = write_extended_metadata(
+                self.TEST_METADATA_CSV_FP, self.TEST_STUDY_CONFIG_FP,
+                tmpdir, "test_output", remove_internals=False,
+                stds_fp=self.TEST_STDS_FP)
+            # Verify returned DataFrame
+            expected_result_df = pandas.DataFrame({
+                SAMPLE_NAME_KEY: ["sample1", "sample2"],
+                "body_product": ["UBERON:feces", "UBERON:feces"],
+                "body_site": ["gut", "gut"],
+                "description": ["human sample", "human sample"],
+                "host_common_name": ["human", "human"],
+                QIITA_SAMPLE_TYPE: ["stool", "stool"],
+                SAMPLE_TYPE_KEY: ["stool", "stool"],
+                "study_custom_field": ["custom_value", "custom_value"],
+                "study_stool_field": ["stool_custom", "stool_custom"],
+                HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
+                SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
+                QC_NOTE_KEY: ["", ""]
+            })
+            assert_frame_equal(expected_result_df, result_df)
-    def test__output_metadata_df_to_files_all_failures(self):
-        """Test output when all rows are failures."""
-        input_df = pandas.DataFrame({
-            SAMPLE_NAME_KEY: ["sample1", "sample2"],
-            "field_a": ["a1", "a2"],
-            HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
-            SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
-            QC_NOTE_KEY: ["invalid host_type", "invalid sample_type"]
-        })
+            # Verify main output file includes internal columns
+            output_files = glob.glob(os.path.join(tmpdir, "*_test_output.txt"))
+            self.assertEqual(1, len(output_files))
+            # keep_default_na=False so the empty QC_NOTE_KEY values are read
+            # back as "" (matching the expected frame) rather than as NaN
+            output_df = pandas.read_csv(output_files[0], sep="\t", keep_default_na=False)
+            expected_output_df = pandas.DataFrame({
+                SAMPLE_NAME_KEY: ["sample1", "sample2"],
+                "body_product": ["UBERON:feces", "UBERON:feces"],
+                "body_site": ["gut", "gut"],
+                "description": ["human sample", "human sample"],
+                "host_common_name": ["human", "human"],
+                QIITA_SAMPLE_TYPE: ["stool", "stool"],
+                SAMPLE_TYPE_KEY: ["stool", "stool"],
+                "study_custom_field": ["custom_value", "custom_value"],
+                "study_stool_field": ["stool_custom", "stool_custom"],
+                HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
+                SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
+                QC_NOTE_KEY: ["", ""]
+            })
+            assert_frame_equal(expected_output_df, output_df)
+            # Verify no fails file was created (since remove_internals=False)
+            fails_files = glob.glob(os.path.join(tmpdir, "*_test_output_fails.csv"))
+            self.assertEqual(0, len(fails_files))
+    def test_write_extended_metadata_suppress_empty_fails(self):
+        """Test writing extended metadata with suppress_empty_fails=True.
+
+        Checks the returned DataFrame and the main "*_test_output.txt" file,
+        and checks that neither a "*_test_output_fails.csv" file nor a
+        "*_test_output_validation_errors.csv" file exists afterwards.
+        """
         with tempfile.TemporaryDirectory() as tmpdir:
-            _output_metadata_df_to_files(
-                input_df, tmpdir, "test_output", INTERNAL_COL_KEYS,
-                sep="\t", remove_internals_and_fails=True)
+            result_df = write_extended_metadata(
+                self.TEST_METADATA_CSV_FP, self.TEST_STUDY_CONFIG_FP,
+                tmpdir, "test_output", suppress_empty_fails=True,
+                stds_fp=self.TEST_STDS_FP)
+            # Verify returned DataFrame
+            # (expected to end with the HOSTTYPE_SHORTHAND_KEY,
+            # SAMPLETYPE_SHORTHAND_KEY, and QC_NOTE_KEY columns)
+            expected_result_df = pandas.DataFrame({
+                SAMPLE_NAME_KEY: ["sample1", "sample2"],
+                "body_product": ["UBERON:feces", "UBERON:feces"],
+                "body_site": ["gut", "gut"],
+                "description": ["human sample", "human sample"],
+                "host_common_name": ["human", "human"],
+                QIITA_SAMPLE_TYPE: ["stool", "stool"],
+                SAMPLE_TYPE_KEY: ["stool", "stool"],
+                "study_custom_field": ["custom_value", "custom_value"],
+                "study_stool_field": ["stool_custom", "stool_custom"],
+                HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
+                SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
+                QC_NOTE_KEY: ["", ""]
+            })
+            assert_frame_equal(expected_result_df, result_df)
-            # Main output file should have only headers (empty data)
+            # Verify main output file was created
             output_files = glob.glob(os.path.join(tmpdir, "*_test_output.txt"))
             self.assertEqual(1, len(output_files))
-            result_df = pandas.read_csv(output_files[0], sep="\t")
-            self.assertTrue(result_df.empty)
-            self.assertEqual([SAMPLE_NAME_KEY, "field_a"], list(result_df.columns))
+            output_df = pandas.read_csv(output_files[0], sep="\t")
+            # The written file is expected to hold the same columns as the
+            # returned frame minus the last three listed above
+            expected_output_df = pandas.DataFrame({
+                SAMPLE_NAME_KEY: ["sample1", "sample2"],
+                "body_product": ["UBERON:feces", "UBERON:feces"],
+                "body_site": ["gut", "gut"],
+                "description": ["human sample", "human sample"],
+                "host_common_name": ["human", "human"],
+                QIITA_SAMPLE_TYPE: ["stool", "stool"],
+                SAMPLE_TYPE_KEY: ["stool", "stool"],
+                "study_custom_field": ["custom_value", "custom_value"],
+                "study_stool_field": ["stool_custom", "stool_custom"]
+            })
+            assert_frame_equal(expected_output_df, output_df)
-            # Fails file should have both rows
+            # Verify no empty fails file was created (since suppress_empty_fails=True)
+            fails_files = glob.glob(os.path.join(tmpdir, "*_test_output_fails.csv"))
+            self.assertEqual(0, len(fails_files))
+            # Verify no empty validation errors file was created
+            validation_files = glob.glob(
+                os.path.join(tmpdir, "*_test_output_validation_errors.csv"))
+            self.assertEqual(0, len(validation_files))
+    # Integration tests
+    # Input metadata and study config files read by the project1 test
+    TEST_PROJECT1_METADATA_FP = path.join(TEST_DIR, "data/test_project1_input_metadata.csv")
+    TEST_PROJECT1_CONFIG_FP = path.join(TEST_DIR, "data/test_project1_config.yml")
+    # Expected main output and fails files that the project1 test compares
+    # its actual output against
+    TEST_PROJECT1_EXPECTED_OUTPUT_FP = path.join(
+        TEST_DIR, "data/test_project1_output_metadata.txt")
+    TEST_PROJECT1_EXPECTED_FAILS_FP = path.join(
+        TEST_DIR, "data/test_project1_output_fails.csv")
+    def test_write_extended_metadata_from_df_project1_integration(self):
+        """Integration test using project1 test data files.
+
+        Runs write_extended_metadata_from_df on the project1 input metadata
+        and study config, then compares the main output file and the fails
+        file verbatim against the expected files and checks that the
+        validation errors file is empty. On a content mismatch, the expected
+        and actual contents are dumped to the user's Desktop (when that
+        folder exists) before the assertion failure is re-raised.
+        """
+        def write_mismatched_debug_files(expected_content, actual_content, file_name):
+            """Write debug files to Desktop for unmatched content.
+
+            Does nothing when the Desktop folder does not exist.
+            """
+            debug_dir = path.join(path.expanduser("~"), "Desktop")
+            if not path.isdir(debug_dir):
+                # No Desktop folder (e.g., on CI or a headless machine):
+                # skip the dump so the caller re-raises the original
+                # AssertionError instead of a FileNotFoundError from open()
+                return
+            with open(path.join(debug_dir, f"UNMATCHED_1_{file_name}"), 'w') as debug_expected_file:
+                debug_expected_file.write(expected_content)
+            with open(path.join(debug_dir, f"UNMATCHED_2_{file_name}"), 'w') as debug_actual_file:
+                debug_actual_file.write(actual_content)
+        # Load input metadata CSV
+        input_df = pandas.read_csv(self.TEST_PROJECT1_METADATA_FP, dtype=str)
+        # for the columns "plating_notes" and "notes", fill NaN with empty string
+        input_df["plating_notes"] = input_df["plating_notes"].fillna("")
+        input_df["notes"] = input_df["notes"].fillna("")
+        # Load study config
+        study_config = _get_study_specific_config(self.TEST_PROJECT1_CONFIG_FP)
+        with tempfile.TemporaryDirectory() as tmpdir:
+            write_extended_metadata_from_df(
+                input_df, study_config, tmpdir, "test_output",
+                remove_internals=True)
+            # Compare main output file directly to expected file
+            output_files = glob.glob(os.path.join(tmpdir, "*_test_output.txt"))
+            self.assertEqual(1, len(output_files))
+            with open(output_files[0], 'r') as actual_file:
+                actual_content = actual_file.read()
+            with open(self.TEST_PROJECT1_EXPECTED_OUTPUT_FP, 'r') as expected_file:
+                expected_content = expected_file.read()
+            try:
+                self.assertEqual(expected_content, actual_content)
+            except AssertionError:
+                # Dump both versions for manual diffing, then let the
+                # failure propagate
+                write_mismatched_debug_files(
+                    expected_content, actual_content,
+                    "project1_output.txt")
+                raise
+            # Compare fails file directly to expected file
             fails_files = glob.glob(os.path.join(tmpdir, "*_test_output_fails.csv"))
             self.assertEqual(1, len(fails_files))
-            fails_df = pandas.read_csv(fails_files[0], sep=",")
-            self.assertEqual(2, len(fails_df))
+            with open(fails_files[0], 'r') as actual_file:
+                actual_fails_content = actual_file.read()
+            with open(self.TEST_PROJECT1_EXPECTED_FAILS_FP, 'r') as expected_file:
+                expected_fails_content = expected_file.read()
+            try:
+                self.assertEqual(expected_fails_content, actual_fails_content)
+            except AssertionError:
+                # Dump both versions for manual diffing, then let the
+                # failure propagate
+                write_mismatched_debug_files(
+                    expected_fails_content, actual_fails_content,
+                    "project1_fails.csv")
+                raise
+            # Verify validation errors file is empty
+            validation_files = glob.glob(
+                os.path.join(tmpdir, "*_test_output_validation_errors.csv"))
+            self.assertEqual(1, len(validation_files))
+            self.assertEqual(0, os.path.getsize(validation_files[0]))
+    # Tests for _get_specified_column_name
+    def test__get_specified_column_name_finds_column(self):
+        """Check that a configured option present in the frame is returned."""
+        col_options_config = {
+            HOSTTYPE_COL_OPTIONS_KEY: ["host_type", "host_common_name"]
+        }
+        metadata_df = pandas.DataFrame(
+            {"sample_name": ["s1"], "host_type": ["human"]})
+        found_col = _get_specified_column_name(
+            HOSTTYPE_COL_OPTIONS_KEY, metadata_df, col_options_config)
+        self.assertEqual("host_type", found_col)
+    def test__get_specified_column_name_returns_first_match(self):
+        """Check the result when several configured options are columns.
+
+        Both "host_type" and "host_common_name" are present in the frame;
+        "host_type", the first option listed in the config, is expected.
+        """
+        col_options_config = {
+            HOSTTYPE_COL_OPTIONS_KEY: ["host_type", "host_common_name"]
+        }
+        metadata_df = pandas.DataFrame({
+            "sample_name": ["s1"],
+            "host_type": ["human"],
+            "host_common_name": ["human"],
+        })
+        found_col = _get_specified_column_name(
+            HOSTTYPE_COL_OPTIONS_KEY, metadata_df, col_options_config)
+        self.assertEqual("host_type", found_col)
+    def test__get_specified_column_name_returns_none_when_no_match(self):
+        """Check that None is returned if no configured option is a column."""
+        metadata_df = pandas.DataFrame(
+            {"sample_name": ["s1"], "other_column": ["value"]})
+        col_options_config = {
+            HOSTTYPE_COL_OPTIONS_KEY: ["host_type", "host_common_name"]
+        }
+        found_col = _get_specified_column_name(
+            HOSTTYPE_COL_OPTIONS_KEY, metadata_df, col_options_config)
+        self.assertIsNone(found_col)
+    def test__get_specified_column_name_returns_none_when_key_missing(self):
+        """Check that None is returned if the config lacks the options key."""
+        empty_config = {}
+        metadata_df = pandas.DataFrame(
+            {"sample_name": ["s1"], "host_type": ["human"]})
+        found_col = _get_specified_column_name(
+            HOSTTYPE_COL_OPTIONS_KEY, metadata_df, empty_config)
+        self.assertIsNone(found_col)
+    def test__get_specified_column_name_returns_none_when_options_empty(self):
+        """Check that None is returned if the configured options list is empty."""
+        no_options_config = {HOSTTYPE_COL_OPTIONS_KEY: []}
+        metadata_df = pandas.DataFrame(
+            {"sample_name": ["s1"], "host_type": ["human"]})
+        found_col = _get_specified_column_name(
+            HOSTTYPE_COL_OPTIONS_KEY, metadata_df, no_options_config)
+        self.assertIsNone(found_col)
+    def test__get_specified_column_name_with_sampletype_key(self):
+        """Check the lookup when keyed by SAMPLETYPE_COL_OPTIONS_KEY."""
+        col_options_config = {
+            SAMPLETYPE_COL_OPTIONS_KEY: ["sample_type", "sampletype"]
+        }
+        metadata_df = pandas.DataFrame(
+            {"sample_name": ["s1"], "sample_type": ["stool"]})
+        found_col = _get_specified_column_name(
+            SAMPLETYPE_COL_OPTIONS_KEY, metadata_df, col_options_config)
+        self.assertEqual("sample_type", found_col)
+    # endregion _get_specified_column_name tests

metameq 2026.1.1__py3-none-any.whl → 2026.2.1__py3-none-any.whl

metameq 2026.1.1__py3-none-any.whl → 2026.2.1__py3-none-any.whl