metameq 2026.2.2.tar.gz → 2026.2.3.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41)
  1. {metameq-2026.2.2/metameq.egg-info → metameq-2026.2.3}/PKG-INFO +2 -1
  2. {metameq-2026.2.2 → metameq-2026.2.3}/README.md +1 -1
  3. {metameq-2026.2.2 → metameq-2026.2.3}/environment.yml +1 -0
  4. {metameq-2026.2.2 → metameq-2026.2.3}/metameq/__init__.py +3 -2
  5. {metameq-2026.2.2 → metameq-2026.2.3}/metameq/_version.py +3 -3
  6. {metameq-2026.2.2 → metameq-2026.2.3}/metameq/src/metadata_extender.py +3 -3
  7. {metameq-2026.2.2 → metameq-2026.2.3}/metameq/tests/test_metadata_extender.py +200 -26
  8. {metameq-2026.2.2 → metameq-2026.2.3}/metameq/tests/test_metadata_validator.py +2 -2
  9. {metameq-2026.2.2 → metameq-2026.2.3/metameq.egg-info}/PKG-INFO +2 -1
  10. {metameq-2026.2.2 → metameq-2026.2.3}/metameq.egg-info/requires.txt +1 -0
  11. {metameq-2026.2.2 → metameq-2026.2.3}/setup.py +1 -0
  12. {metameq-2026.2.2 → metameq-2026.2.3}/.gitattributes +0 -0
  13. {metameq-2026.2.2 → metameq-2026.2.3}/.github/workflows/main.yaml +0 -0
  14. {metameq-2026.2.2 → metameq-2026.2.3}/.gitignore +0 -0
  15. {metameq-2026.2.2 → metameq-2026.2.3}/assets/metameq.png +0 -0
  16. {metameq-2026.2.2 → metameq-2026.2.3}/assets/metameq_dark.svg +0 -0
  17. {metameq-2026.2.2 → metameq-2026.2.3}/assets/metameq_light.svg +0 -0
  18. {metameq-2026.2.2 → metameq-2026.2.3}/assets/metameq_medium.png +0 -0
  19. {metameq-2026.2.2 → metameq-2026.2.3}/metameq/config/__init__.py +0 -0
  20. {metameq-2026.2.2 → metameq-2026.2.3}/metameq/config/config.yml +0 -0
  21. {metameq-2026.2.2 → metameq-2026.2.3}/metameq/config/standards.yml +0 -0
  22. {metameq-2026.2.2 → metameq-2026.2.3}/metameq/src/__init__.py +0 -0
  23. {metameq-2026.2.2 → metameq-2026.2.3}/metameq/src/__main__.py +0 -0
  24. {metameq-2026.2.2 → metameq-2026.2.3}/metameq/src/metadata_configurator.py +0 -0
  25. {metameq-2026.2.2 → metameq-2026.2.3}/metameq/src/metadata_merger.py +0 -0
  26. {metameq-2026.2.2 → metameq-2026.2.3}/metameq/src/metadata_transformers.py +0 -0
  27. {metameq-2026.2.2 → metameq-2026.2.3}/metameq/src/metadata_validator.py +0 -0
  28. {metameq-2026.2.2 → metameq-2026.2.3}/metameq/src/util.py +0 -0
  29. {metameq-2026.2.2 → metameq-2026.2.3}/metameq/tests/__init__.py +0 -0
  30. {metameq-2026.2.2 → metameq-2026.2.3}/metameq/tests/data/invalid.yml +0 -0
  31. {metameq-2026.2.2 → metameq-2026.2.3}/metameq/tests/data/test_config.yml +0 -0
  32. {metameq-2026.2.2 → metameq-2026.2.3}/metameq/tests/test_metadata_configurator.py +0 -0
  33. {metameq-2026.2.2 → metameq-2026.2.3}/metameq/tests/test_metadata_merger.py +0 -0
  34. {metameq-2026.2.2 → metameq-2026.2.3}/metameq/tests/test_metadata_transformers.py +0 -0
  35. {metameq-2026.2.2 → metameq-2026.2.3}/metameq/tests/test_util.py +0 -0
  36. {metameq-2026.2.2 → metameq-2026.2.3}/metameq.egg-info/SOURCES.txt +0 -0
  37. {metameq-2026.2.2 → metameq-2026.2.3}/metameq.egg-info/dependency_links.txt +0 -0
  38. {metameq-2026.2.2 → metameq-2026.2.3}/metameq.egg-info/entry_points.txt +0 -0
  39. {metameq-2026.2.2 → metameq-2026.2.3}/metameq.egg-info/top_level.txt +0 -0
  40. {metameq-2026.2.2 → metameq-2026.2.3}/setup.cfg +0 -0
  41. {metameq-2026.2.2 → metameq-2026.2.3}/versioneer.py +0 -0
@@ -1,12 +1,13 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: metameq
3
- Version: 2026.2.2
3
+ Version: 2026.2.3
4
4
  Summary: Qiita-compliant metadata generation and validation tool
5
5
  Home-page: https://github.com/AmandaBirmingham/metameq
6
6
  Author: Amanda Birmingham
7
7
  Author-email: abirmingham@ucsd.edu
8
8
  License: BSD-3-Clause
9
9
  Requires-Dist: click>=8.0.0
10
+ Requires-Dist: openpyxl>=3.0.0
10
11
  Requires-Dist: pandas>=1.3.0
11
12
  Requires-Dist: PyYAML>=5.4.0
12
13
  Requires-Dist: Cerberus>=1.3.4
@@ -106,7 +106,7 @@ from metameq import (
106
106
  )
107
107
 
108
108
  # Load your raw metadata into a DataFrame
109
- raw_metadata_df = pd.read_csv("my_samples.csv")
109
+ raw_metadata_df = pd.read_csv("my_samples.csv", dtype=str)
110
110
 
111
111
  # Ensure required columns exist
112
112
  raw_metadata_df[HOSTTYPE_SHORTHAND_KEY] = "human"
@@ -6,6 +6,7 @@ dependencies:
6
6
  - python
7
7
  - click
8
8
  - pandas
9
+ - openpyxl
9
10
  - pip
10
11
  - pyyaml
11
12
  - flake8
@@ -9,7 +9,7 @@ from metameq.src.metadata_extender import \
9
9
  write_extended_metadata, write_extended_metadata_from_df, \
10
10
  get_reserved_cols, get_extended_metadata_from_df_and_yaml, \
11
11
  write_metadata_results, id_missing_cols, find_standard_cols, \
12
- find_nonstandard_cols, get_qc_failures
12
+ find_nonstandard_cols, get_qc_failures, extend_metadata_df
13
13
  from metameq.src.metadata_merger import merge_sample_and_subject_metadata, \
14
14
  merge_many_to_one_metadata, merge_one_to_one_metadata, \
15
15
  find_common_col_names, find_common_df_cols
@@ -36,7 +36,8 @@ __all__ = ["HOSTTYPE_SHORTHAND_KEY", "SAMPLETYPE_SHORTHAND_KEY",
36
36
  "find_nonstandard_cols", "get_qc_failures",
37
37
  "format_a_datetime", "standardize_input_sex",
38
38
  "set_life_stage_from_age_yrs", "transform_input_sex_to_std_sex",
39
- "transform_age_to_life_stage", "transform_date_to_formatted_date"]
39
+ "transform_age_to_life_stage", "transform_date_to_formatted_date",
40
+ "extend_metadata_df"]
40
41
 
41
42
  from . import _version
42
43
  __version__ = _version.get_versions()['version']
@@ -8,11 +8,11 @@ import json
8
8
 
9
9
  version_json = '''
10
10
  {
11
- "date": "2026-02-02T16:43:52-0800",
11
+ "date": "2026-02-03T15:03:32-0800",
12
12
  "dirty": false,
13
13
  "error": null,
14
- "full-revisionid": "4fe1396e1007820dc7a4bdb58708fff0df6b9a57",
15
- "version": "2026.02.2"
14
+ "full-revisionid": "89687d23015566a7583179a69f92c2e1d1adcf61",
15
+ "version": "2026.02.3"
16
16
  }
17
17
  ''' # END VERSION_JSON
18
18
 
@@ -301,13 +301,13 @@ def write_extended_metadata(
301
301
  # extract the extension from the raw_metadata_fp file path
302
302
  extension = os.path.splitext(raw_metadata_fp)[1]
303
303
  if extension == ".csv":
304
- raw_metadata_df = load_df_with_best_fit_encoding(raw_metadata_fp, ",")
304
+ raw_metadata_df = load_df_with_best_fit_encoding(raw_metadata_fp, ",", str)
305
305
  elif extension == ".txt":
306
- raw_metadata_df = load_df_with_best_fit_encoding(raw_metadata_fp, "\t")
306
+ raw_metadata_df = load_df_with_best_fit_encoding(raw_metadata_fp, "\t", str)
307
307
  elif extension == ".xlsx":
308
308
  # NB: this loads (only) the first sheet of the input excel file.
309
309
  # If needed, can expand with pandas.read_excel sheet_name parameter.
310
- raw_metadata_df = pandas.read_excel(raw_metadata_fp)
310
+ raw_metadata_df = pandas.read_excel(raw_metadata_fp, dtype=str)
311
311
  else:
312
312
  raise ValueError("Unrecognized input file extension; "
313
313
  "must be .csv, .txt, or .xlsx")
@@ -475,7 +475,7 @@ class TestMetadataExtender(TestCase):
475
475
 
476
476
  # Verify metadata file contents - includes failed row when remove_internals=False
477
477
  result_df = pandas.read_csv(
478
- metadata_files[0], sep="\t", keep_default_na=False)
478
+ metadata_files[0], sep="\t", dtype=str, keep_default_na=False)
479
479
  assert_frame_equal(metadata_df, result_df)
480
480
 
481
481
  # Find the validation errors file (uses comma separator)
@@ -484,7 +484,7 @@ class TestMetadataExtender(TestCase):
484
484
  self.assertEqual(1, len(validation_files))
485
485
 
486
486
  # Verify validation errors file contents
487
- result_validation_df = pandas.read_csv(validation_files[0], sep=",")
487
+ result_validation_df = pandas.read_csv(validation_files[0], sep=",", dtype=str, keep_default_na=False)
488
488
  assert_frame_equal(validation_msgs_df, result_validation_df)
489
489
 
490
490
  # No fails file should be created when remove_internals=False
@@ -513,7 +513,7 @@ class TestMetadataExtender(TestCase):
513
513
  self.assertEqual(1, len(metadata_files))
514
514
 
515
515
  # Verify metadata has internal cols removed and no failures
516
- result_df = pandas.read_csv(metadata_files[0], sep="\t")
516
+ result_df = pandas.read_csv(metadata_files[0], sep="\t", dtype=str, keep_default_na=False)
517
517
  expected_df = pandas.DataFrame({
518
518
  SAMPLE_NAME_KEY: ["sample1", "sample3"],
519
519
  "field_a": ["a1", "a3"]
@@ -526,7 +526,7 @@ class TestMetadataExtender(TestCase):
526
526
  self.assertEqual(1, len(fails_files))
527
527
 
528
528
  # Verify fails file contains the failed row
529
- fails_df = pandas.read_csv(fails_files[0], sep=",")
529
+ fails_df = pandas.read_csv(fails_files[0], sep=",", dtype=str, keep_default_na=False)
530
530
  expected_fails_df = pandas.DataFrame({
531
531
  SAMPLE_NAME_KEY: ["sample2"],
532
532
  "field_a": ["a2"],
@@ -593,7 +593,7 @@ class TestMetadataExtender(TestCase):
593
593
  self.assertEqual(1, len(metadata_files))
594
594
 
595
595
  # Verify custom internal cols are removed
596
- result_df = pandas.read_csv(metadata_files[0], sep="\t")
596
+ result_df = pandas.read_csv(metadata_files[0], sep="\t", dtype=str, keep_default_na=False)
597
597
  expected_df = pandas.DataFrame({
598
598
  SAMPLE_NAME_KEY: ["sample1", "sample2"],
599
599
  "field_a": ["a1", "a2"]
@@ -3334,7 +3334,7 @@ class TestMetadataExtender(TestCase):
3334
3334
  self.assertEqual(1, len(output_files))
3335
3335
 
3336
3336
  # Read and verify contents (keep_default_na=False preserves empty strings)
3337
- result_df = pandas.read_csv(output_files[0], sep="\t", keep_default_na=False)
3337
+ result_df = pandas.read_csv(output_files[0], sep="\t", dtype=str, keep_default_na=False)
3338
3338
  expected_df = input_df
3339
3339
  assert_frame_equal(expected_df, result_df)
3340
3340
 
@@ -3358,7 +3358,7 @@ class TestMetadataExtender(TestCase):
3358
3358
  self.assertEqual(1, len(output_files))
3359
3359
 
3360
3360
  # Verify main output has internal cols removed and no failures
3361
- result_df = pandas.read_csv(output_files[0], sep="\t")
3361
+ result_df = pandas.read_csv(output_files[0], sep="\t", dtype=str, keep_default_na=False)
3362
3362
  expected_df = pandas.DataFrame({
3363
3363
  SAMPLE_NAME_KEY: ["sample1", "sample3"],
3364
3364
  "field_a": ["a1", "a3"]
@@ -3370,7 +3370,7 @@ class TestMetadataExtender(TestCase):
3370
3370
  self.assertEqual(1, len(fails_files))
3371
3371
 
3372
3372
  # Verify fails file contains the failed row
3373
- fails_df = pandas.read_csv(fails_files[0], sep=",")
3373
+ fails_df = pandas.read_csv(fails_files[0], sep=",", dtype=str, keep_default_na=False)
3374
3374
  expected_fails_df = pandas.DataFrame({
3375
3375
  SAMPLE_NAME_KEY: ["sample2"],
3376
3376
  "field_a": ["a2"],
@@ -3447,7 +3447,7 @@ class TestMetadataExtender(TestCase):
3447
3447
  self.assertEqual(1, len(output_files))
3448
3448
 
3449
3449
  # Read and verify contents (keep_default_na=False preserves empty strings)
3450
- result_df = pandas.read_csv(output_files[0], sep=",", keep_default_na=False)
3450
+ result_df = pandas.read_csv(output_files[0], sep=",", dtype=str, keep_default_na=False)
3451
3451
  expected_df = input_df
3452
3452
  assert_frame_equal(expected_df, result_df)
3453
3453
 
@@ -3469,14 +3469,14 @@ class TestMetadataExtender(TestCase):
3469
3469
  # Main output file should have only headers (empty data)
3470
3470
  output_files = glob.glob(os.path.join(tmpdir, "*_test_output.txt"))
3471
3471
  self.assertEqual(1, len(output_files))
3472
- result_df = pandas.read_csv(output_files[0], sep="\t")
3472
+ result_df = pandas.read_csv(output_files[0], sep="\t", dtype=str, keep_default_na=False)
3473
3473
  self.assertTrue(result_df.empty)
3474
3474
  self.assertEqual([SAMPLE_NAME_KEY, "field_a"], list(result_df.columns))
3475
3475
 
3476
3476
  # Fails file should have both rows
3477
3477
  fails_files = glob.glob(os.path.join(tmpdir, "*_test_output_fails.csv"))
3478
3478
  self.assertEqual(1, len(fails_files))
3479
- fails_df = pandas.read_csv(fails_files[0], sep=",")
3479
+ fails_df = pandas.read_csv(fails_files[0], sep=",", dtype=str, keep_default_na=False)
3480
3480
  self.assertEqual(2, len(fails_df))
3481
3481
 
3482
3482
  # Tests for get_extended_metadata_from_df_and_yaml
@@ -3621,7 +3621,7 @@ class TestMetadataExtender(TestCase):
3621
3621
  # Verify main output file was created (internal cols removed by default)
3622
3622
  output_files = glob.glob(os.path.join(tmpdir, "*_test_output.txt"))
3623
3623
  self.assertEqual(1, len(output_files))
3624
- output_df = pandas.read_csv(output_files[0], sep="\t")
3624
+ output_df = pandas.read_csv(output_files[0], sep="\t", dtype=str, keep_default_na=False)
3625
3625
  expected_output_df = pandas.DataFrame({
3626
3626
  SAMPLE_NAME_KEY: ["sample1", "sample2"],
3627
3627
  "body_product": ["UBERON:feces", "UBERON:feces"],
@@ -3694,7 +3694,7 @@ class TestMetadataExtender(TestCase):
3694
3694
  # Verify main output file excludes failure rows
3695
3695
  output_files = glob.glob(os.path.join(tmpdir, "*_test_output.txt"))
3696
3696
  self.assertEqual(1, len(output_files))
3697
- output_df = pandas.read_csv(output_files[0], sep="\t")
3697
+ output_df = pandas.read_csv(output_files[0], sep="\t", dtype=str, keep_default_na=False)
3698
3698
  expected_output_df = pandas.DataFrame({
3699
3699
  SAMPLE_NAME_KEY: ["sample1", "sample3"],
3700
3700
  "body_product": ["UBERON:feces", "UBERON:feces"],
@@ -3709,7 +3709,7 @@ class TestMetadataExtender(TestCase):
3709
3709
  # Verify fails file contains the failed row
3710
3710
  fails_files = glob.glob(os.path.join(tmpdir, "*_test_output_fails.csv"))
3711
3711
  self.assertEqual(1, len(fails_files))
3712
- fails_df = pandas.read_csv(fails_files[0], sep=",")
3712
+ fails_df = pandas.read_csv(fails_files[0], sep=",", dtype=str, keep_default_na=False)
3713
3713
  expected_fails_df = pandas.DataFrame({
3714
3714
  SAMPLE_NAME_KEY: ["sample2"],
3715
3715
  "body_product": ["not provided"],
@@ -3780,7 +3780,7 @@ class TestMetadataExtender(TestCase):
3780
3780
  validation_files = glob.glob(
3781
3781
  os.path.join(tmpdir, "*_test_output_validation_errors.csv"))
3782
3782
  self.assertEqual(1, len(validation_files))
3783
- validation_df = pandas.read_csv(validation_files[0], sep=",")
3783
+ validation_df = pandas.read_csv(validation_files[0], sep=",", dtype=str, keep_default_na=False)
3784
3784
  expected_validation_df = pandas.DataFrame({
3785
3785
  "sample_name": ["sample1"],
3786
3786
  "field_name": ["restricted_field"],
@@ -3821,7 +3821,7 @@ class TestMetadataExtender(TestCase):
3821
3821
  # Verify main output file includes internal columns
3822
3822
  output_files = glob.glob(os.path.join(tmpdir, "*_test_output.txt"))
3823
3823
  self.assertEqual(1, len(output_files))
3824
- output_df = pandas.read_csv(output_files[0], sep="\t", keep_default_na=False)
3824
+ output_df = pandas.read_csv(output_files[0], sep="\t", dtype=str, keep_default_na=False)
3825
3825
  expected_output_df = pandas.DataFrame({
3826
3826
  SAMPLE_NAME_KEY: ["sample1"],
3827
3827
  "body_product": ["UBERON:feces"],
@@ -3844,6 +3844,7 @@ class TestMetadataExtender(TestCase):
3844
3844
 
3845
3845
  TEST_METADATA_CSV_FP = path.join(TEST_DIR, "data/test_metadata.csv")
3846
3846
  TEST_METADATA_TXT_FP = path.join(TEST_DIR, "data/test_metadata.txt")
3847
+ TEST_METADATA_XLSX_FP = path.join(TEST_DIR, "data/test_metadata.xlsx")
3847
3848
  TEST_METADATA_WITH_ERRORS_FP = path.join(
3848
3849
  TEST_DIR, "data/test_metadata_with_errors.csv")
3849
3850
  TEST_STUDY_CONFIG_WITH_VALIDATION_FP = path.join(
@@ -3862,6 +3863,7 @@ class TestMetadataExtender(TestCase):
3862
3863
  "body_product": ["UBERON:feces", "UBERON:feces"],
3863
3864
  "body_site": ["gut", "gut"],
3864
3865
  "description": ["human sample", "human sample"],
3866
+ "dna_extracted": ["TRUE", "FALSE"],
3865
3867
  "host_common_name": ["human", "human"],
3866
3868
  QIITA_SAMPLE_TYPE: ["stool", "stool"],
3867
3869
  SAMPLE_TYPE_KEY: ["stool", "stool"],
@@ -3876,12 +3878,13 @@ class TestMetadataExtender(TestCase):
3876
3878
  # Verify main output file was created (internal cols removed by default)
3877
3879
  output_files = glob.glob(os.path.join(tmpdir, "*_test_output.txt"))
3878
3880
  self.assertEqual(1, len(output_files))
3879
- output_df = pandas.read_csv(output_files[0], sep="\t")
3881
+ output_df = pandas.read_csv(output_files[0], sep="\t", dtype=str, keep_default_na=False)
3880
3882
  expected_output_df = pandas.DataFrame({
3881
3883
  SAMPLE_NAME_KEY: ["sample1", "sample2"],
3882
3884
  "body_product": ["UBERON:feces", "UBERON:feces"],
3883
3885
  "body_site": ["gut", "gut"],
3884
3886
  "description": ["human sample", "human sample"],
3887
+ "dna_extracted": ["TRUE", "FALSE"],
3885
3888
  "host_common_name": ["human", "human"],
3886
3889
  QIITA_SAMPLE_TYPE: ["stool", "stool"],
3887
3890
  SAMPLE_TYPE_KEY: ["stool", "stool"],
@@ -3914,6 +3917,7 @@ class TestMetadataExtender(TestCase):
3914
3917
  "body_product": ["UBERON:feces", "UBERON:feces"],
3915
3918
  "body_site": ["gut", "gut"],
3916
3919
  "description": ["human sample", "human sample"],
3920
+ "dna_extracted": ["TRUE", "FALSE"],
3917
3921
  "host_common_name": ["human", "human"],
3918
3922
  QIITA_SAMPLE_TYPE: ["stool", "stool"],
3919
3923
  SAMPLE_TYPE_KEY: ["stool", "stool"],
@@ -3928,12 +3932,13 @@ class TestMetadataExtender(TestCase):
3928
3932
  # Verify main output file was created
3929
3933
  output_files = glob.glob(os.path.join(tmpdir, "*_test_output.txt"))
3930
3934
  self.assertEqual(1, len(output_files))
3931
- output_df = pandas.read_csv(output_files[0], sep="\t")
3935
+ output_df = pandas.read_csv(output_files[0], sep="\t", dtype=str, keep_default_na=False)
3932
3936
  expected_output_df = pandas.DataFrame({
3933
3937
  SAMPLE_NAME_KEY: ["sample1", "sample2"],
3934
3938
  "body_product": ["UBERON:feces", "UBERON:feces"],
3935
3939
  "body_site": ["gut", "gut"],
3936
3940
  "description": ["human sample", "human sample"],
3941
+ "dna_extracted": ["TRUE", "FALSE"],
3937
3942
  "host_common_name": ["human", "human"],
3938
3943
  QIITA_SAMPLE_TYPE: ["stool", "stool"],
3939
3944
  SAMPLE_TYPE_KEY: ["stool", "stool"],
@@ -3942,6 +3947,60 @@ class TestMetadataExtender(TestCase):
3942
3947
  })
3943
3948
  assert_frame_equal(expected_output_df, output_df)
3944
3949
 
3950
+ def test_write_extended_metadata_xlsx_input(self):
3951
+ """Test writing extended metadata from an Excel XLSX input file."""
3952
+ with tempfile.TemporaryDirectory() as tmpdir:
3953
+ result_df = write_extended_metadata(
3954
+ self.TEST_METADATA_XLSX_FP, self.TEST_STUDY_CONFIG_FP,
3955
+ tmpdir, "test_output", stds_fp=self.TEST_STDS_FP)
3956
+
3957
+ # Verify returned DataFrame
3958
+ expected_result_df = pandas.DataFrame({
3959
+ SAMPLE_NAME_KEY: ["sample1", "sample2"],
3960
+ "body_product": ["UBERON:feces", "UBERON:feces"],
3961
+ "body_site": ["gut", "gut"],
3962
+ "description": ["human sample", "human sample"],
3963
+ "dna_extracted": ["TRUE", "FALSE"],
3964
+ "host_common_name": ["human", "human"],
3965
+ QIITA_SAMPLE_TYPE: ["stool", "stool"],
3966
+ SAMPLE_TYPE_KEY: ["stool", "stool"],
3967
+ "study_custom_field": ["custom_value", "custom_value"],
3968
+ "study_stool_field": ["stool_custom", "stool_custom"],
3969
+ HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
3970
+ SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
3971
+ QC_NOTE_KEY: ["", ""]
3972
+ })
3973
+ assert_frame_equal(expected_result_df, result_df)
3974
+
3975
+ # Verify main output file was created
3976
+ output_files = glob.glob(os.path.join(tmpdir, "*_test_output.txt"))
3977
+ self.assertEqual(1, len(output_files))
3978
+ output_df = pandas.read_csv(output_files[0], sep="\t", dtype=str, keep_default_na=False)
3979
+ expected_output_df = pandas.DataFrame({
3980
+ SAMPLE_NAME_KEY: ["sample1", "sample2"],
3981
+ "body_product": ["UBERON:feces", "UBERON:feces"],
3982
+ "body_site": ["gut", "gut"],
3983
+ "description": ["human sample", "human sample"],
3984
+ "dna_extracted": ["TRUE", "FALSE"],
3985
+ "host_common_name": ["human", "human"],
3986
+ QIITA_SAMPLE_TYPE: ["stool", "stool"],
3987
+ SAMPLE_TYPE_KEY: ["stool", "stool"],
3988
+ "study_custom_field": ["custom_value", "custom_value"],
3989
+ "study_stool_field": ["stool_custom", "stool_custom"]
3990
+ })
3991
+ assert_frame_equal(expected_output_df, output_df)
3992
+
3993
+ # Verify empty fails file was created
3994
+ fails_files = glob.glob(os.path.join(tmpdir, "*_test_output_fails.csv"))
3995
+ self.assertEqual(1, len(fails_files))
3996
+ self.assertEqual(0, os.path.getsize(fails_files[0]))
3997
+
3998
+ # Verify empty validation errors file was created
3999
+ validation_files = glob.glob(
4000
+ os.path.join(tmpdir, "*_test_output_validation_errors.csv"))
4001
+ self.assertEqual(1, len(validation_files))
4002
+ self.assertEqual(0, os.path.getsize(validation_files[0]))
4003
+
3945
4004
  def test_write_extended_metadata_with_validation_errors(self):
3946
4005
  """Test writing extended metadata when validation errors occur."""
3947
4006
  with tempfile.TemporaryDirectory() as tmpdir:
@@ -3956,6 +4015,7 @@ class TestMetadataExtender(TestCase):
3956
4015
  "body_product": ["UBERON:feces", "UBERON:feces"],
3957
4016
  "body_site": ["gut", "gut"],
3958
4017
  "description": ["human sample", "human sample"],
4018
+ "dna_extracted": ["TRUE", "FALSE"],
3959
4019
  "host_common_name": ["human", "human"],
3960
4020
  QIITA_SAMPLE_TYPE: ["stool", "stool"],
3961
4021
  "restricted_field": ["invalid_value", "allowed_value"],
@@ -3969,12 +4029,13 @@ class TestMetadataExtender(TestCase):
3969
4029
  # Verify main output file was created
3970
4030
  output_files = glob.glob(os.path.join(tmpdir, "*_test_output.txt"))
3971
4031
  self.assertEqual(1, len(output_files))
3972
- output_df = pandas.read_csv(output_files[0], sep="\t")
4032
+ output_df = pandas.read_csv(output_files[0], sep="\t", dtype=str, keep_default_na=False)
3973
4033
  expected_output_df = pandas.DataFrame({
3974
4034
  SAMPLE_NAME_KEY: ["sample1", "sample2"],
3975
4035
  "body_product": ["UBERON:feces", "UBERON:feces"],
3976
4036
  "body_site": ["gut", "gut"],
3977
4037
  "description": ["human sample", "human sample"],
4038
+ "dna_extracted": ["TRUE", "FALSE"],
3978
4039
  "host_common_name": ["human", "human"],
3979
4040
  QIITA_SAMPLE_TYPE: ["stool", "stool"],
3980
4041
  "restricted_field": ["invalid_value", "allowed_value"],
@@ -3986,7 +4047,7 @@ class TestMetadataExtender(TestCase):
3986
4047
  validation_files = glob.glob(
3987
4048
  os.path.join(tmpdir, "*_test_output_validation_errors.csv"))
3988
4049
  self.assertEqual(1, len(validation_files))
3989
- validation_df = pandas.read_csv(validation_files[0], sep=",")
4050
+ validation_df = pandas.read_csv(validation_files[0], sep=",", dtype=str, keep_default_na=False)
3990
4051
  expected_validation_df = pandas.DataFrame({
3991
4052
  "sample_name": ["sample1"],
3992
4053
  "field_name": ["restricted_field"],
@@ -4021,6 +4082,7 @@ class TestMetadataExtender(TestCase):
4021
4082
  "body_product": ["UBERON:feces", "UBERON:feces"],
4022
4083
  "body_site": ["gut", "gut"],
4023
4084
  "description": ["human sample", "human sample"],
4085
+ "dna_extracted": ["TRUE", "FALSE"],
4024
4086
  "host_common_name": ["human", "human"],
4025
4087
  QIITA_SAMPLE_TYPE: ["stool", "stool"],
4026
4088
  SAMPLE_TYPE_KEY: ["stool", "stool"],
@@ -4035,12 +4097,13 @@ class TestMetadataExtender(TestCase):
4035
4097
  # Verify output file has .csv extension
4036
4098
  output_files = glob.glob(os.path.join(tmpdir, "*_test_output.csv"))
4037
4099
  self.assertEqual(1, len(output_files))
4038
- output_df = pandas.read_csv(output_files[0], sep=",")
4100
+ output_df = pandas.read_csv(output_files[0], sep=",", dtype=str, keep_default_na=False)
4039
4101
  expected_output_df = pandas.DataFrame({
4040
4102
  SAMPLE_NAME_KEY: ["sample1", "sample2"],
4041
4103
  "body_product": ["UBERON:feces", "UBERON:feces"],
4042
4104
  "body_site": ["gut", "gut"],
4043
4105
  "description": ["human sample", "human sample"],
4106
+ "dna_extracted": ["TRUE", "FALSE"],
4044
4107
  "host_common_name": ["human", "human"],
4045
4108
  QIITA_SAMPLE_TYPE: ["stool", "stool"],
4046
4109
  SAMPLE_TYPE_KEY: ["stool", "stool"],
@@ -4063,6 +4126,7 @@ class TestMetadataExtender(TestCase):
4063
4126
  "body_product": ["UBERON:feces", "UBERON:feces"],
4064
4127
  "body_site": ["gut", "gut"],
4065
4128
  "description": ["human sample", "human sample"],
4129
+ "dna_extracted": ["TRUE", "FALSE"],
4066
4130
  "host_common_name": ["human", "human"],
4067
4131
  QIITA_SAMPLE_TYPE: ["stool", "stool"],
4068
4132
  SAMPLE_TYPE_KEY: ["stool", "stool"],
@@ -4077,12 +4141,13 @@ class TestMetadataExtender(TestCase):
4077
4141
  # Verify main output file includes internal columns
4078
4142
  output_files = glob.glob(os.path.join(tmpdir, "*_test_output.txt"))
4079
4143
  self.assertEqual(1, len(output_files))
4080
- output_df = pandas.read_csv(output_files[0], sep="\t", keep_default_na=False)
4144
+ output_df = pandas.read_csv(output_files[0], sep="\t", dtype=str, keep_default_na=False)
4081
4145
  expected_output_df = pandas.DataFrame({
4082
4146
  SAMPLE_NAME_KEY: ["sample1", "sample2"],
4083
4147
  "body_product": ["UBERON:feces", "UBERON:feces"],
4084
4148
  "body_site": ["gut", "gut"],
4085
4149
  "description": ["human sample", "human sample"],
4150
+ "dna_extracted": ["TRUE", "FALSE"],
4086
4151
  "host_common_name": ["human", "human"],
4087
4152
  QIITA_SAMPLE_TYPE: ["stool", "stool"],
4088
4153
  SAMPLE_TYPE_KEY: ["stool", "stool"],
@@ -4112,6 +4177,7 @@ class TestMetadataExtender(TestCase):
4112
4177
  "body_product": ["UBERON:feces", "UBERON:feces"],
4113
4178
  "body_site": ["gut", "gut"],
4114
4179
  "description": ["human sample", "human sample"],
4180
+ "dna_extracted": ["TRUE", "FALSE"],
4115
4181
  "host_common_name": ["human", "human"],
4116
4182
  QIITA_SAMPLE_TYPE: ["stool", "stool"],
4117
4183
  SAMPLE_TYPE_KEY: ["stool", "stool"],
@@ -4126,12 +4192,13 @@ class TestMetadataExtender(TestCase):
4126
4192
  # Verify main output file was created
4127
4193
  output_files = glob.glob(os.path.join(tmpdir, "*_test_output.txt"))
4128
4194
  self.assertEqual(1, len(output_files))
4129
- output_df = pandas.read_csv(output_files[0], sep="\t")
4195
+ output_df = pandas.read_csv(output_files[0], sep="\t", dtype=str, keep_default_na=False)
4130
4196
  expected_output_df = pandas.DataFrame({
4131
4197
  SAMPLE_NAME_KEY: ["sample1", "sample2"],
4132
4198
  "body_product": ["UBERON:feces", "UBERON:feces"],
4133
4199
  "body_site": ["gut", "gut"],
4134
4200
  "description": ["human sample", "human sample"],
4201
+ "dna_extracted": ["TRUE", "FALSE"],
4135
4202
  "host_common_name": ["human", "human"],
4136
4203
  QIITA_SAMPLE_TYPE: ["stool", "stool"],
4137
4204
  SAMPLE_TYPE_KEY: ["stool", "stool"],
@@ -4149,6 +4216,64 @@ class TestMetadataExtender(TestCase):
4149
4216
  os.path.join(tmpdir, "*_test_output_validation_errors.csv"))
4150
4217
  self.assertEqual(0, len(validation_files))
4151
4218
 
4219
+ def test_write_extended_metadata_preserves_string_booleans(self):
4220
+ """Test that TRUE/FALSE string values are not converted to booleans.
4221
+
4222
+ This tests for a bug where loading a CSV without dtype=str causes
4223
+ pandas to convert 'TRUE'/'FALSE' strings to boolean True/False,
4224
+ which then fail validation against allowed string values.
4225
+ """
4226
+ with tempfile.TemporaryDirectory() as tmpdir:
4227
+ # Create a CSV file with TRUE/FALSE string values
4228
+ csv_content = (
4229
+ "sample_name,hosttype_shorthand,sampletype_shorthand,dna_extracted\n"
4230
+ "sample1,human,stool,TRUE\n"
4231
+ "sample2,human,stool,FALSE\n"
4232
+ )
4233
+ csv_fp = path.join(tmpdir, "test_bool_strings.csv")
4234
+ with open(csv_fp, "w") as f:
4235
+ f.write(csv_content)
4236
+
4237
+ # Create a config that defines TRUE/FALSE as allowed string values
4238
+ config_content = """
4239
+ default: "not provided"
4240
+ leave_requireds_blank: false
4241
+ overwrite_non_nans: false
4242
+ study_specific_metadata:
4243
+ host_type_specific_metadata:
4244
+ human:
4245
+ default: "not provided"
4246
+ leave_requireds_blank: false
4247
+ overwrite_non_nans: false
4248
+ sample_type_specific_metadata:
4249
+ stool:
4250
+ metadata_fields:
4251
+ dna_extracted:
4252
+ type: string
4253
+ allowed:
4254
+ - "TRUE"
4255
+ - "FALSE"
4256
+ """
4257
+ config_fp = path.join(tmpdir, "test_bool_config.yml")
4258
+ with open(config_fp, "w") as f:
4259
+ f.write(config_content)
4260
+
4261
+ # Call write_extended_metadata
4262
+ result_df = write_extended_metadata(
4263
+ csv_fp, config_fp, tmpdir, "test_output",
4264
+ stds_fp=self.TEST_STDS_FP)
4265
+
4266
+ # Verify the dna_extracted values are preserved as strings
4267
+ self.assertEqual("TRUE", result_df.loc[0, "dna_extracted"])
4268
+ self.assertEqual("FALSE", result_df.loc[1, "dna_extracted"])
4269
+
4270
+ # Verify no validation errors occurred
4271
+ validation_files = glob.glob(
4272
+ os.path.join(tmpdir, "*_test_output_validation_errors.csv"))
4273
+ self.assertEqual(1, len(validation_files))
4274
+ # The validation errors file should be empty (0 bytes)
4275
+ self.assertEqual(0, os.path.getsize(validation_files[0]))
4276
+
4152
4277
  # Integration tests
4153
4278
 
4154
4279
  TEST_PROJECT1_METADATA_FP = path.join(TEST_DIR, "data/test_project1_input_metadata.csv")
@@ -4171,9 +4296,6 @@ class TestMetadataExtender(TestCase):
4171
4296
 
4172
4297
  # Load input metadata CSV
4173
4298
  input_df = pandas.read_csv(self.TEST_PROJECT1_METADATA_FP, dtype=str)
4174
- # for the columns "plating_notes" and "notes", fill NaN with empty string
4175
- input_df["plating_notes"] = input_df["plating_notes"].fillna("")
4176
- input_df["notes"] = input_df["notes"].fillna("")
4177
4299
 
4178
4300
  # Load study config
4179
4301
  study_config = _get_study_specific_config(self.TEST_PROJECT1_CONFIG_FP)
@@ -4219,6 +4341,58 @@ class TestMetadataExtender(TestCase):
4219
4341
  self.assertEqual(1, len(validation_files))
4220
4342
  self.assertEqual(0, os.path.getsize(validation_files[0]))
4221
4343
 
4344
+ def test_write_extended_metadata_project1_integration(self):
4345
+ """Integration test for write_extended_metadata using project1 test data files."""
4346
+
4347
+ def write_mismatched_debug_files(expected_content, actual_content, file_name):
4348
+ """Write debug files to Desktop for unmatched content."""
4349
+ debug_dir = path.join(path.expanduser("~"), "Desktop")
4350
+ with open(path.join(debug_dir, f"UNMATCHED_1_{file_name}"), 'w') as debug_expected_file:
4351
+ debug_expected_file.write(expected_content)
4352
+ with open(path.join(debug_dir, f"UNMATCHED_2_{file_name}"), 'w') as debug_actual_file:
4353
+ debug_actual_file.write(actual_content)
4354
+
4355
+ with tempfile.TemporaryDirectory() as tmpdir:
4356
+ write_extended_metadata(
4357
+ self.TEST_PROJECT1_METADATA_FP, self.TEST_PROJECT1_CONFIG_FP,
4358
+ tmpdir, "test_output", remove_internals=True)
4359
+
4360
+ # Compare main output file directly to expected file
4361
+ output_files = glob.glob(os.path.join(tmpdir, "*_test_output.txt"))
4362
+ self.assertEqual(1, len(output_files))
4363
+ with open(output_files[0], 'r') as actual_file:
4364
+ actual_content = actual_file.read()
4365
+ with open(self.TEST_PROJECT1_EXPECTED_OUTPUT_FP, 'r') as expected_file:
4366
+ expected_content = expected_file.read()
4367
+ try:
4368
+ self.assertEqual(expected_content, actual_content)
4369
+ except AssertionError:
4370
+ write_mismatched_debug_files(
4371
+ expected_content, actual_content,
4372
+ "project1_output.txt")
4373
+ raise
4374
+
4375
+ # Compare fails file directly to expected file
4376
+ fails_files = glob.glob(os.path.join(tmpdir, "*_test_output_fails.csv"))
4377
+ self.assertEqual(1, len(fails_files))
4378
+ with open(fails_files[0], 'r') as actual_file:
4379
+ actual_fails_content = actual_file.read()
4380
+ with open(self.TEST_PROJECT1_EXPECTED_FAILS_FP, 'r') as expected_file:
4381
+ expected_fails_content = expected_file.read()
4382
+ try:
4383
+ self.assertEqual(expected_fails_content, actual_fails_content)
4384
+ except AssertionError:
4385
+ write_mismatched_debug_files(
4386
+ expected_fails_content, actual_fails_content,
4387
+ "project1_fails.csv")
4388
+ raise
4389
+
4390
+ # Verify validation errors file is empty
4391
+ validation_files = glob.glob(
4392
+ os.path.join(tmpdir, "*_test_output_validation_errors.csv"))
4393
+ self.assertEqual(1, len(validation_files))
4394
+ self.assertEqual(0, os.path.getsize(validation_files[0]))
4395
+
4222
4396
  # Tests for _get_specified_column_name
4223
4397
 
4224
4398
  def test__get_specified_column_name_finds_column(self):
@@ -547,7 +547,7 @@ class TestOutputValidationMsgs(TestCase):
547
547
  output_files = glob.glob(os.path.join(tmp_dir, "*_test_validation_errors.txt"))
548
548
  self.assertEqual(1, len(output_files))
549
549
 
550
- result_df = pd.read_csv(output_files[0], sep="\t")
550
+ result_df = pd.read_csv(output_files[0], sep="\t", dtype=str, keep_default_na=False)
551
551
  pd.testing.assert_frame_equal(validation_msgs_df, result_df)
552
552
 
553
553
  def test_output_validation_msgs_non_empty_df_comma_separator(self):
@@ -564,7 +564,7 @@ class TestOutputValidationMsgs(TestCase):
564
564
  output_files = glob.glob(os.path.join(tmp_dir, "*_test_validation_errors.csv"))
565
565
  self.assertEqual(1, len(output_files))
566
566
 
567
- result_df = pd.read_csv(output_files[0], sep=",")
567
+ result_df = pd.read_csv(output_files[0], sep=",", dtype=str, keep_default_na=False)
568
568
  pd.testing.assert_frame_equal(validation_msgs_df, result_df)
569
569
 
570
570
  def test_output_validation_msgs_empty_df_creates_empty_file(self):
@@ -1,12 +1,13 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: metameq
3
- Version: 2026.2.2
3
+ Version: 2026.2.3
4
4
  Summary: Qiita-compliant metadata generation and validation tool
5
5
  Home-page: https://github.com/AmandaBirmingham/metameq
6
6
  Author: Amanda Birmingham
7
7
  Author-email: abirmingham@ucsd.edu
8
8
  License: BSD-3-Clause
9
9
  Requires-Dist: click>=8.0.0
10
+ Requires-Dist: openpyxl>=3.0.0
10
11
  Requires-Dist: pandas>=1.3.0
11
12
  Requires-Dist: PyYAML>=5.4.0
12
13
  Requires-Dist: Cerberus>=1.3.4
@@ -1,4 +1,5 @@
1
1
  click>=8.0.0
2
+ openpyxl>=3.0.0
2
3
  pandas>=1.3.0
3
4
  PyYAML>=5.4.0
4
5
  Cerberus>=1.3.4
@@ -25,6 +25,7 @@ setup(name='metameq',
25
25
  # NB: if changing here, also change the environment.yml
26
26
  install_requires=[
27
27
  'click>=8.0.0',
28
+ 'openpyxl>=3.0.0',
28
29
  'pandas>=1.3.0',
29
30
  'PyYAML>=5.4.0',
30
31
  'Cerberus>=1.3.4',
File without changes
File without changes
File without changes
File without changes