metameq 2026.2.1__py3-none-any.whl → 2026.2.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
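Most of the changes in this release add dtype=str and keep_default_na=False to the test suite's pandas.read_csv calls, so values read back from the written metadata files stay plain strings (for example "TRUE"/"FALSE" and empty cells). The snippet below is only an illustrative sketch of the pandas behavior involved, not code from the package; the file content and column names are hypothetical:

    import io
    import pandas

    # Hypothetical metadata snippet: "dna_extracted" holds the strings
    # TRUE/FALSE and the last field of the row is intentionally empty.
    raw = "sample_name\tdna_extracted\tqc_note\nsample1\tTRUE\t\n"

    # Default parsing: the TRUE/FALSE column is inferred as boolean and the
    # empty field becomes NaN, so comparisons against all-string frames fail.
    loose = pandas.read_csv(io.StringIO(raw), sep="\t")
    print(loose.dtypes["dna_extracted"])    # bool
    print(loose["qc_note"].isna().iloc[0])  # True

    # With dtype=str and keep_default_na=False everything stays a string,
    # which is what assert_frame_equal expects in these tests.
    strict = pandas.read_csv(io.StringIO(raw), sep="\t",
                             dtype=str, keep_default_na=False)
    print(strict.dtypes["dna_extracted"])   # object
    print(repr(strict["qc_note"].iloc[0]))  # ''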
@@ -475,7 +475,7 @@ class TestMetadataExtender(TestCase):
 
  # Verify metadata file contents - includes failed row when remove_internals=False
  result_df = pandas.read_csv(
- metadata_files[0], sep="\t", keep_default_na=False)
+ metadata_files[0], sep="\t", dtype=str, keep_default_na=False)
  assert_frame_equal(metadata_df, result_df)
 
  # Find the validation errors file (uses comma separator)
@@ -484,7 +484,7 @@ class TestMetadataExtender(TestCase):
  self.assertEqual(1, len(validation_files))
 
  # Verify validation errors file contents
- result_validation_df = pandas.read_csv(validation_files[0], sep=",")
+ result_validation_df = pandas.read_csv(validation_files[0], sep=",", dtype=str, keep_default_na=False)
  assert_frame_equal(validation_msgs_df, result_validation_df)
 
  # No fails file should be created when remove_internals=False
@@ -513,7 +513,7 @@ class TestMetadataExtender(TestCase):
  self.assertEqual(1, len(metadata_files))
 
  # Verify metadata has internal cols removed and no failures
- result_df = pandas.read_csv(metadata_files[0], sep="\t")
+ result_df = pandas.read_csv(metadata_files[0], sep="\t", dtype=str, keep_default_na=False)
  expected_df = pandas.DataFrame({
  SAMPLE_NAME_KEY: ["sample1", "sample3"],
  "field_a": ["a1", "a3"]
@@ -526,7 +526,7 @@ class TestMetadataExtender(TestCase):
  self.assertEqual(1, len(fails_files))
 
  # Verify fails file contains the failed row
- fails_df = pandas.read_csv(fails_files[0], sep=",")
+ fails_df = pandas.read_csv(fails_files[0], sep=",", dtype=str, keep_default_na=False)
  expected_fails_df = pandas.DataFrame({
  SAMPLE_NAME_KEY: ["sample2"],
  "field_a": ["a2"],
@@ -593,7 +593,7 @@ class TestMetadataExtender(TestCase):
  self.assertEqual(1, len(metadata_files))
 
  # Verify custom internal cols are removed
- result_df = pandas.read_csv(metadata_files[0], sep="\t")
+ result_df = pandas.read_csv(metadata_files[0], sep="\t", dtype=str, keep_default_na=False)
  expected_df = pandas.DataFrame({
  SAMPLE_NAME_KEY: ["sample1", "sample2"],
  "field_a": ["a1", "a2"]
@@ -748,16 +748,15 @@ class TestMetadataExtender(TestCase):
 
  # Tests for _fill_na_if_default
 
- def test__fill_na_if_default_specific_overrides_settings(self):
+ def test__fill_na_if_default_has_default_in_settings(self):
  """Test that specific_dict default takes precedence over settings_dict."""
  input_df = pandas.DataFrame({
  "field1": ["value1", np.nan, "value3"],
  "field2": [np.nan, "value2", np.nan]
  })
- specific_dict = {DEFAULT_KEY: "filled"}
- settings_dict = {DEFAULT_KEY: "unused"}
+ settings_dict = {DEFAULT_KEY: "filled"}
 
- result = _fill_na_if_default(input_df, specific_dict, settings_dict)
+ result = _fill_na_if_default(input_df, settings_dict)
 
  expected = pandas.DataFrame({
  "field1": ["value1", "filled", "value3"],
@@ -765,18 +764,19 @@ class TestMetadataExtender(TestCase):
  })
  assert_frame_equal(expected, result)
 
- def test__fill_na_if_default_uses_settings_when_specific_missing(self):
- """Test that settings_dict default is used when specific_dict has no default."""
+ def test__fill_na_if_default_no_default_in_settings(self):
+ """Test that NaN values are unchanged when no default is in settings."""
  input_df = pandas.DataFrame({
- "field1": [np.nan]
+ "field1": ["value1", np.nan, "value3"],
+ "field2": [np.nan, "value2", np.nan]
  })
- specific_dict = {}
- settings_dict = {DEFAULT_KEY: "settings_default"}
+ settings_dict = {}
 
- result = _fill_na_if_default(input_df, specific_dict, settings_dict)
+ result = _fill_na_if_default(input_df, settings_dict)
 
  expected = pandas.DataFrame({
- "field1": ["settings_default"]
+ "field1": ["value1", np.nan, "value3"],
+ "field2": [np.nan, "value2", np.nan]
  })
  assert_frame_equal(expected, result)
 
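The two hunks above track a signature change: _fill_na_if_default no longer takes a separate specific_dict and now reads its default straight from the single settings dict it is given. A hedged sketch of the behavior those renamed tests exercise (assumed reconstruction, not the package's implementation; the default_key parameter stands in for the DEFAULT_KEY constant the tests import):

    import pandas

    def _fill_na_if_default_sketch(df: pandas.DataFrame, settings_dict: dict,
                                   default_key: str = "default") -> pandas.DataFrame:
        # Fill NaNs only when the settings carry a default; otherwise leave
        # the frame untouched, as the no-default test above expects.
        if default_key in settings_dict:
            return df.fillna(settings_dict[default_key])
        return df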
@@ -1273,14 +1273,13 @@ class TestMetadataExtender(TestCase):
  SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
  QC_NOTE_KEY: ["", ""]
  })
- global_plus_host_settings_dict = {
- OVERWRITE_NON_NANS_KEY: False,
- LEAVE_REQUIREDS_BLANK_KEY: False,
- DEFAULT_KEY: "not provided"
- }
+
  # Config is pre-resolved: sample type's metadata_fields already includes
  # host fields merged in, plus sample_type and qiita_sample_type
  host_type_config_dict = {
+ OVERWRITE_NON_NANS_KEY: False,
+ LEAVE_REQUIREDS_BLANK_KEY: False,
+ DEFAULT_KEY: "not provided",
  METADATA_FIELDS_KEY: {
  "host_field": {
  DEFAULT_KEY: "host_default",
@@ -1314,7 +1313,7 @@ class TestMetadataExtender(TestCase):
  }
 
  result_df, validation_msgs = _generate_metadata_for_a_sample_type_in_a_host_type(
- input_df, "stool", global_plus_host_settings_dict, host_type_config_dict)
+ input_df, "stool", host_type_config_dict)
 
  expected_df = pandas.DataFrame({
  SAMPLE_NAME_KEY: ["sample1", "sample2"],
@@ -1337,12 +1336,11 @@ class TestMetadataExtender(TestCase):
  SAMPLETYPE_SHORTHAND_KEY: ["unknown_type"],
  QC_NOTE_KEY: [""]
  })
- global_plus_host_settings_dict = {
+
+ host_type_config_dict = {
  OVERWRITE_NON_NANS_KEY: False,
  LEAVE_REQUIREDS_BLANK_KEY: False,
- DEFAULT_KEY: "not provided"
- }
- host_type_config_dict = {
+ DEFAULT_KEY: "not provided",
  METADATA_FIELDS_KEY: {},
  SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
  "stool": {
@@ -1352,7 +1350,7 @@ class TestMetadataExtender(TestCase):
  }
 
  result_df, validation_msgs = _generate_metadata_for_a_sample_type_in_a_host_type(
- input_df, "unknown_type", global_plus_host_settings_dict, host_type_config_dict)
+ input_df, "unknown_type", host_type_config_dict)
 
  expected_df = pandas.DataFrame({
  SAMPLE_NAME_KEY: ["sample1"],
@@ -1371,12 +1369,11 @@ class TestMetadataExtender(TestCase):
  SAMPLETYPE_SHORTHAND_KEY: ["stool", "blood", "stool"],
  QC_NOTE_KEY: ["", "", ""]
  })
- global_plus_host_settings_dict = {
+
+ host_type_config_dict = {
  OVERWRITE_NON_NANS_KEY: False,
  LEAVE_REQUIREDS_BLANK_KEY: False,
- DEFAULT_KEY: "not provided"
- }
- host_type_config_dict = {
+ DEFAULT_KEY: "not provided",
  METADATA_FIELDS_KEY: {},
  SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
  "stool": {
@@ -1394,7 +1391,7 @@ class TestMetadataExtender(TestCase):
  }
 
  result_df, validation_msgs = _generate_metadata_for_a_sample_type_in_a_host_type(
- input_df, "stool", global_plus_host_settings_dict, host_type_config_dict)
+ input_df, "stool", host_type_config_dict)
 
  # Should only have the two stool samples
  self.assertEqual(2, len(result_df))
@@ -1409,12 +1406,11 @@ class TestMetadataExtender(TestCase):
  SAMPLETYPE_SHORTHAND_KEY: ["stool"],
  QC_NOTE_KEY: [""]
  })
- global_plus_host_settings_dict = {
+
+ host_type_config_dict = {
  OVERWRITE_NON_NANS_KEY: False,
  LEAVE_REQUIREDS_BLANK_KEY: True,
- DEFAULT_KEY: "not provided"
- }
- host_type_config_dict = {
+ DEFAULT_KEY: "not provided",
  METADATA_FIELDS_KEY: {},
  SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
  "stool": {
@@ -1429,7 +1425,7 @@ class TestMetadataExtender(TestCase):
  }
 
  result_df, validation_msgs = _generate_metadata_for_a_sample_type_in_a_host_type(
- input_df, "stool", global_plus_host_settings_dict, host_type_config_dict)
+ input_df, "stool", host_type_config_dict)
 
  self.assertEqual(LEAVE_BLANK_VAL, result_df["required_field"].iloc[0])
 
@@ -1441,12 +1437,11 @@ class TestMetadataExtender(TestCase):
  SAMPLETYPE_SHORTHAND_KEY: ["stool"],
  QC_NOTE_KEY: [""]
  })
- global_plus_host_settings_dict = {
+
+ host_type_config_dict = {
  OVERWRITE_NON_NANS_KEY: False,
  LEAVE_REQUIREDS_BLANK_KEY: False,
- DEFAULT_KEY: "global_default"
- }
- host_type_config_dict = {
+ DEFAULT_KEY: "global_default",
  METADATA_FIELDS_KEY: {},
  SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
  "stool": {
@@ -1461,7 +1456,7 @@ class TestMetadataExtender(TestCase):
  }
 
  result_df, validation_msgs = _generate_metadata_for_a_sample_type_in_a_host_type(
- input_df, "stool", global_plus_host_settings_dict, host_type_config_dict)
+ input_df, "stool", host_type_config_dict)
 
  # When leave_requireds_blank is False, NaN values get filled with global default
  self.assertEqual("global_default", result_df["required_field"].iloc[0])
@@ -1475,12 +1470,11 @@ class TestMetadataExtender(TestCase):
  QC_NOTE_KEY: [""],
  "existing_field": ["original_value"]
  })
- global_plus_host_settings_dict = {
+
+ host_type_config_dict = {
  OVERWRITE_NON_NANS_KEY: True,
  LEAVE_REQUIREDS_BLANK_KEY: False,
- DEFAULT_KEY: "not provided"
- }
- host_type_config_dict = {
+ DEFAULT_KEY: "not provided",
  METADATA_FIELDS_KEY: {},
  SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
  "stool": {
@@ -1495,7 +1489,7 @@ class TestMetadataExtender(TestCase):
  }
 
  result_df, validation_msgs = _generate_metadata_for_a_sample_type_in_a_host_type(
- input_df, "stool", global_plus_host_settings_dict, host_type_config_dict)
+ input_df, "stool", host_type_config_dict)
 
  self.assertEqual("new_value", result_df["existing_field"].iloc[0])
 
@@ -1508,12 +1502,11 @@ class TestMetadataExtender(TestCase):
  QC_NOTE_KEY: [""],
  "existing_field": ["original_value"]
  })
- global_plus_host_settings_dict = {
+
+ host_type_config_dict = {
  OVERWRITE_NON_NANS_KEY: False,
  LEAVE_REQUIREDS_BLANK_KEY: False,
- DEFAULT_KEY: "not provided"
- }
- host_type_config_dict = {
+ DEFAULT_KEY: "not provided",
  METADATA_FIELDS_KEY: {},
  SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
  "stool": {
@@ -1528,7 +1521,7 @@ class TestMetadataExtender(TestCase):
  }
 
  result_df, validation_msgs = _generate_metadata_for_a_sample_type_in_a_host_type(
- input_df, "stool", global_plus_host_settings_dict, host_type_config_dict)
+ input_df, "stool", host_type_config_dict)
 
  self.assertEqual("original_value", result_df["existing_field"].iloc[0])
 
@@ -1540,14 +1533,13 @@ class TestMetadataExtender(TestCase):
  SAMPLETYPE_SHORTHAND_KEY: ["feces"],
  QC_NOTE_KEY: [""]
  })
- global_plus_host_settings_dict = {
- OVERWRITE_NON_NANS_KEY: False,
- LEAVE_REQUIREDS_BLANK_KEY: False,
- DEFAULT_KEY: "not provided"
- }
+
  # Config is pre-resolved: alias "feces" has its own metadata_fields
  # that is a copy of "stool"'s resolved fields with sample_type="stool"
  host_type_config_dict = {
+ OVERWRITE_NON_NANS_KEY: False,
+ LEAVE_REQUIREDS_BLANK_KEY: False,
+ DEFAULT_KEY: "not provided",
  METADATA_FIELDS_KEY: {},
  SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
  "feces": {
@@ -1590,7 +1582,7 @@ class TestMetadataExtender(TestCase):
  }
 
  result_df, validation_msgs = _generate_metadata_for_a_sample_type_in_a_host_type(
- input_df, "feces", global_plus_host_settings_dict, host_type_config_dict)
+ input_df, "feces", host_type_config_dict)
 
  self.assertEqual("stool_value", result_df["stool_field"].iloc[0])
  # sample_type should be set to the resolved type "stool"
@@ -1606,17 +1598,15 @@ class TestMetadataExtender(TestCase):
  SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
  QC_NOTE_KEY: ["", ""]
  })
- settings_dict = {
- OVERWRITE_NON_NANS_KEY: False,
- LEAVE_REQUIREDS_BLANK_KEY: False,
- DEFAULT_KEY: "global_default"
- }
+
  # Config is pre-resolved: sample type's metadata_fields includes
  # host fields merged in, plus sample_type and qiita_sample_type
  full_flat_config_dict = {
  HOST_TYPE_SPECIFIC_METADATA_KEY: {
  "human": {
  DEFAULT_KEY: "human_default",
+ OVERWRITE_NON_NANS_KEY: False,
+ LEAVE_REQUIREDS_BLANK_KEY: False,
  METADATA_FIELDS_KEY: {
  "host_field": {
  DEFAULT_KEY: "host_value",
@@ -1652,7 +1642,7 @@ class TestMetadataExtender(TestCase):
  }
 
  result_df, validation_msgs = _generate_metadata_for_a_host_type(
- input_df, "human", settings_dict, full_flat_config_dict)
+ input_df, "human", full_flat_config_dict)
 
  expected_df = pandas.DataFrame({
  SAMPLE_NAME_KEY: ["sample1", "sample2"],
@@ -1675,14 +1665,13 @@ class TestMetadataExtender(TestCase):
  SAMPLETYPE_SHORTHAND_KEY: ["stool"],
  QC_NOTE_KEY: [""]
  })
- settings_dict = {
- OVERWRITE_NON_NANS_KEY: False,
- LEAVE_REQUIREDS_BLANK_KEY: False,
- DEFAULT_KEY: "global_default"
- }
+
  full_flat_config_dict = {
  HOST_TYPE_SPECIFIC_METADATA_KEY: {
  "human": {
+ OVERWRITE_NON_NANS_KEY: False,
+ LEAVE_REQUIREDS_BLANK_KEY: False,
+ DEFAULT_KEY: "global_default",
  METADATA_FIELDS_KEY: {},
  SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {}
  }
@@ -1690,7 +1679,7 @@ class TestMetadataExtender(TestCase):
  }
 
  result_df, validation_msgs = _generate_metadata_for_a_host_type(
- input_df, "unknown_host", settings_dict, full_flat_config_dict)
+ input_df, "unknown_host", full_flat_config_dict)
 
  expected_df = pandas.DataFrame({
  SAMPLE_NAME_KEY: ["sample1"],
@@ -1709,14 +1698,13 @@ class TestMetadataExtender(TestCase):
  SAMPLETYPE_SHORTHAND_KEY: ["unknown_sample"],
  QC_NOTE_KEY: [""]
  })
- settings_dict = {
- OVERWRITE_NON_NANS_KEY: False,
- LEAVE_REQUIREDS_BLANK_KEY: False,
- DEFAULT_KEY: "global_default"
- }
+
  full_flat_config_dict = {
  HOST_TYPE_SPECIFIC_METADATA_KEY: {
  "human": {
+ OVERWRITE_NON_NANS_KEY: False,
+ LEAVE_REQUIREDS_BLANK_KEY: False,
+ DEFAULT_KEY: "global_default",
  METADATA_FIELDS_KEY: {},
  SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
  "stool": {
@@ -1728,7 +1716,7 @@ class TestMetadataExtender(TestCase):
  }
 
  result_df, validation_msgs = _generate_metadata_for_a_host_type(
- input_df, "human", settings_dict, full_flat_config_dict)
+ input_df, "human", full_flat_config_dict)
 
  expected_df = pandas.DataFrame({
  SAMPLE_NAME_KEY: ["sample1"],
@@ -1747,16 +1735,15 @@ class TestMetadataExtender(TestCase):
  SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool", "stool"],
  QC_NOTE_KEY: ["", "", ""]
  })
- settings_dict = {
- OVERWRITE_NON_NANS_KEY: False,
- LEAVE_REQUIREDS_BLANK_KEY: False,
- DEFAULT_KEY: "global_default"
- }
+
  # Config is pre-resolved: sample type's metadata_fields includes
  # host fields merged in, plus sample_type and qiita_sample_type
  full_flat_config_dict = {
  HOST_TYPE_SPECIFIC_METADATA_KEY: {
  "human": {
+ OVERWRITE_NON_NANS_KEY: False,
+ LEAVE_REQUIREDS_BLANK_KEY: False,
+ DEFAULT_KEY: "global_default",
  METADATA_FIELDS_KEY: {
  "human_field": {
  DEFAULT_KEY: "human_value",
@@ -1785,6 +1772,9 @@ class TestMetadataExtender(TestCase):
  }
  },
  "mouse": {
+ OVERWRITE_NON_NANS_KEY: False,
+ LEAVE_REQUIREDS_BLANK_KEY: False,
+ DEFAULT_KEY: "global_default",
  METADATA_FIELDS_KEY: {},
  SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {}
  }
@@ -1792,7 +1782,7 @@ class TestMetadataExtender(TestCase):
  }
 
  result_df, validation_msgs = _generate_metadata_for_a_host_type(
- input_df, "human", settings_dict, full_flat_config_dict)
+ input_df, "human", full_flat_config_dict)
 
  expected_df = pandas.DataFrame({
  SAMPLE_NAME_KEY: ["sample1", "sample3"],
@@ -1813,17 +1803,15 @@ class TestMetadataExtender(TestCase):
  SAMPLETYPE_SHORTHAND_KEY: ["stool"],
  QC_NOTE_KEY: [""]
  })
- settings_dict = {
- OVERWRITE_NON_NANS_KEY: False,
- LEAVE_REQUIREDS_BLANK_KEY: False,
- DEFAULT_KEY: "global_default"
- }
+
  # Config is pre-resolved: sample type's metadata_fields includes
  # host fields merged in, plus sample_type and qiita_sample_type
  full_flat_config_dict = {
  HOST_TYPE_SPECIFIC_METADATA_KEY: {
  "human": {
  DEFAULT_KEY: "human_specific_default",
+ OVERWRITE_NON_NANS_KEY: False,
+ LEAVE_REQUIREDS_BLANK_KEY: False,
  METADATA_FIELDS_KEY: {},
  SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
  "stool": {
@@ -1850,7 +1838,7 @@ class TestMetadataExtender(TestCase):
  }
 
  result_df, validation_msgs = _generate_metadata_for_a_host_type(
- input_df, "human", settings_dict, full_flat_config_dict)
+ input_df, "human", full_flat_config_dict)
 
  expected_df = pandas.DataFrame({
  SAMPLE_NAME_KEY: ["sample1"],
@@ -1871,17 +1859,14 @@ class TestMetadataExtender(TestCase):
  SAMPLETYPE_SHORTHAND_KEY: ["stool"],
  QC_NOTE_KEY: [""]
  })
- settings_dict = {
- OVERWRITE_NON_NANS_KEY: False,
- LEAVE_REQUIREDS_BLANK_KEY: False,
- DEFAULT_KEY: "global_default"
- }
  # Config is pre-resolved: sample type's metadata_fields includes
  # host fields merged in, plus sample_type and qiita_sample_type
  full_flat_config_dict = {
  HOST_TYPE_SPECIFIC_METADATA_KEY: {
  "human": {
- # No DEFAULT_KEY here
+ OVERWRITE_NON_NANS_KEY: False,
+ LEAVE_REQUIREDS_BLANK_KEY: False,
+ DEFAULT_KEY: "global_default",
  METADATA_FIELDS_KEY: {},
  SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
  "stool": {
@@ -1908,7 +1893,7 @@ class TestMetadataExtender(TestCase):
  }
 
  result_df, validation_msgs = _generate_metadata_for_a_host_type(
- input_df, "human", settings_dict, full_flat_config_dict)
+ input_df, "human", full_flat_config_dict)
 
  expected_df = pandas.DataFrame({
  SAMPLE_NAME_KEY: ["sample1"],
@@ -1939,6 +1924,9 @@ class TestMetadataExtender(TestCase):
  OVERWRITE_NON_NANS_KEY: False,
  HOST_TYPE_SPECIFIC_METADATA_KEY: {
  "human": {
+ DEFAULT_KEY: "global_default",
+ LEAVE_REQUIREDS_BLANK_KEY: False,
+ OVERWRITE_NON_NANS_KEY: False,
  METADATA_FIELDS_KEY: {
  "host_field": {
  DEFAULT_KEY: "host_value",
@@ -2005,6 +1993,9 @@ class TestMetadataExtender(TestCase):
  OVERWRITE_NON_NANS_KEY: False,
  HOST_TYPE_SPECIFIC_METADATA_KEY: {
  "human": {
+ DEFAULT_KEY: "global_default",
+ LEAVE_REQUIREDS_BLANK_KEY: False,
+ OVERWRITE_NON_NANS_KEY: False,
  METADATA_FIELDS_KEY: {
  "human_field": {
  DEFAULT_KEY: "human_value",
@@ -2051,6 +2042,9 @@ class TestMetadataExtender(TestCase):
  }
  },
  "mouse": {
+ DEFAULT_KEY: "global_default",
+ LEAVE_REQUIREDS_BLANK_KEY: False,
+ OVERWRITE_NON_NANS_KEY: False,
  METADATA_FIELDS_KEY: {
  "mouse_field": {
  DEFAULT_KEY: "mouse_value",
@@ -2182,6 +2176,9 @@ class TestMetadataExtender(TestCase):
  OVERWRITE_NON_NANS_KEY: False,
  HOST_TYPE_SPECIFIC_METADATA_KEY: {
  "human": {
+ DEFAULT_KEY: "global_default",
+ LEAVE_REQUIREDS_BLANK_KEY: True, # This causes required fields to get LEAVE_BLANK_VAL
+ OVERWRITE_NON_NANS_KEY: False,
  METADATA_FIELDS_KEY: {},
  SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
  "stool": {
@@ -2506,6 +2503,9 @@ class TestMetadataExtender(TestCase):
  OVERWRITE_NON_NANS_KEY: False,
  HOST_TYPE_SPECIFIC_METADATA_KEY: {
  "human": {
+ DEFAULT_KEY: "not provided",
+ LEAVE_REQUIREDS_BLANK_KEY: False,
+ OVERWRITE_NON_NANS_KEY: False,
  METADATA_FIELDS_KEY: {
  "host_field": {
  DEFAULT_KEY: "host_value",
@@ -2580,6 +2580,9 @@ class TestMetadataExtender(TestCase):
  },
  HOST_TYPE_SPECIFIC_METADATA_KEY: {
  "human": {
+ DEFAULT_KEY: "not provided",
+ LEAVE_REQUIREDS_BLANK_KEY: False,
+ OVERWRITE_NON_NANS_KEY: False,
  METADATA_FIELDS_KEY: {},
  SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
  "stool": {
@@ -2639,6 +2642,9 @@ class TestMetadataExtender(TestCase):
  },
  HOST_TYPE_SPECIFIC_METADATA_KEY: {
  "human": {
+ DEFAULT_KEY: "not provided",
+ LEAVE_REQUIREDS_BLANK_KEY: False,
+ OVERWRITE_NON_NANS_KEY: False,
  METADATA_FIELDS_KEY: {},
  SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
  "stool": {
@@ -2687,6 +2693,9 @@ class TestMetadataExtender(TestCase):
  OVERWRITE_NON_NANS_KEY: False,
  HOST_TYPE_SPECIFIC_METADATA_KEY: {
  "human": {
+ DEFAULT_KEY: "not provided",
+ LEAVE_REQUIREDS_BLANK_KEY: False,
+ OVERWRITE_NON_NANS_KEY: False,
  METADATA_FIELDS_KEY: {},
  SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {}
  }
@@ -2721,6 +2730,9 @@ class TestMetadataExtender(TestCase):
  OVERWRITE_NON_NANS_KEY: False,
  HOST_TYPE_SPECIFIC_METADATA_KEY: {
  "human": {
+ DEFAULT_KEY: "not provided",
+ LEAVE_REQUIREDS_BLANK_KEY: False,
+ OVERWRITE_NON_NANS_KEY: False,
  METADATA_FIELDS_KEY: {},
  SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
  "stool": {
@@ -2781,6 +2793,9 @@ class TestMetadataExtender(TestCase):
  },
  HOST_TYPE_SPECIFIC_METADATA_KEY: {
  "human": {
+ DEFAULT_KEY: "not provided",
+ LEAVE_REQUIREDS_BLANK_KEY: False,
+ OVERWRITE_NON_NANS_KEY: False,
  METADATA_FIELDS_KEY: {},
  SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
  "stool": {
@@ -3319,7 +3334,7 @@ class TestMetadataExtender(TestCase):
  self.assertEqual(1, len(output_files))
 
  # Read and verify contents (keep_default_na=False preserves empty strings)
- result_df = pandas.read_csv(output_files[0], sep="\t", keep_default_na=False)
+ result_df = pandas.read_csv(output_files[0], sep="\t", dtype=str, keep_default_na=False)
  expected_df = input_df
  assert_frame_equal(expected_df, result_df)
 
@@ -3343,7 +3358,7 @@ class TestMetadataExtender(TestCase):
  self.assertEqual(1, len(output_files))
 
  # Verify main output has internal cols removed and no failures
- result_df = pandas.read_csv(output_files[0], sep="\t")
+ result_df = pandas.read_csv(output_files[0], sep="\t", dtype=str, keep_default_na=False)
  expected_df = pandas.DataFrame({
  SAMPLE_NAME_KEY: ["sample1", "sample3"],
  "field_a": ["a1", "a3"]
@@ -3355,7 +3370,7 @@ class TestMetadataExtender(TestCase):
  self.assertEqual(1, len(fails_files))
 
  # Verify fails file contains the failed row
- fails_df = pandas.read_csv(fails_files[0], sep=",")
+ fails_df = pandas.read_csv(fails_files[0], sep=",", dtype=str, keep_default_na=False)
  expected_fails_df = pandas.DataFrame({
  SAMPLE_NAME_KEY: ["sample2"],
  "field_a": ["a2"],
@@ -3432,7 +3447,7 @@ class TestMetadataExtender(TestCase):
  self.assertEqual(1, len(output_files))
 
  # Read and verify contents (keep_default_na=False preserves empty strings)
- result_df = pandas.read_csv(output_files[0], sep=",", keep_default_na=False)
+ result_df = pandas.read_csv(output_files[0], sep=",", dtype=str, keep_default_na=False)
  expected_df = input_df
  assert_frame_equal(expected_df, result_df)
 
@@ -3454,14 +3469,14 @@ class TestMetadataExtender(TestCase):
  # Main output file should have only headers (empty data)
  output_files = glob.glob(os.path.join(tmpdir, "*_test_output.txt"))
  self.assertEqual(1, len(output_files))
- result_df = pandas.read_csv(output_files[0], sep="\t")
+ result_df = pandas.read_csv(output_files[0], sep="\t", dtype=str, keep_default_na=False)
  self.assertTrue(result_df.empty)
  self.assertEqual([SAMPLE_NAME_KEY, "field_a"], list(result_df.columns))
 
  # Fails file should have both rows
  fails_files = glob.glob(os.path.join(tmpdir, "*_test_output_fails.csv"))
  self.assertEqual(1, len(fails_files))
- fails_df = pandas.read_csv(fails_files[0], sep=",")
+ fails_df = pandas.read_csv(fails_files[0], sep=",", dtype=str, keep_default_na=False)
  self.assertEqual(2, len(fails_df))
 
  # Tests for get_extended_metadata_from_df_and_yaml
@@ -3606,7 +3621,7 @@ class TestMetadataExtender(TestCase):
  # Verify main output file was created (internal cols removed by default)
  output_files = glob.glob(os.path.join(tmpdir, "*_test_output.txt"))
  self.assertEqual(1, len(output_files))
- output_df = pandas.read_csv(output_files[0], sep="\t")
+ output_df = pandas.read_csv(output_files[0], sep="\t", dtype=str, keep_default_na=False)
  expected_output_df = pandas.DataFrame({
  SAMPLE_NAME_KEY: ["sample1", "sample2"],
  "body_product": ["UBERON:feces", "UBERON:feces"],
@@ -3679,7 +3694,7 @@ class TestMetadataExtender(TestCase):
  # Verify main output file excludes failure rows
  output_files = glob.glob(os.path.join(tmpdir, "*_test_output.txt"))
  self.assertEqual(1, len(output_files))
- output_df = pandas.read_csv(output_files[0], sep="\t")
+ output_df = pandas.read_csv(output_files[0], sep="\t", dtype=str, keep_default_na=False)
  expected_output_df = pandas.DataFrame({
  SAMPLE_NAME_KEY: ["sample1", "sample3"],
  "body_product": ["UBERON:feces", "UBERON:feces"],
@@ -3694,7 +3709,7 @@ class TestMetadataExtender(TestCase):
  # Verify fails file contains the failed row
  fails_files = glob.glob(os.path.join(tmpdir, "*_test_output_fails.csv"))
  self.assertEqual(1, len(fails_files))
- fails_df = pandas.read_csv(fails_files[0], sep=",")
+ fails_df = pandas.read_csv(fails_files[0], sep=",", dtype=str, keep_default_na=False)
  expected_fails_df = pandas.DataFrame({
  SAMPLE_NAME_KEY: ["sample2"],
  "body_product": ["not provided"],
@@ -3765,7 +3780,7 @@ class TestMetadataExtender(TestCase):
  validation_files = glob.glob(
  os.path.join(tmpdir, "*_test_output_validation_errors.csv"))
  self.assertEqual(1, len(validation_files))
- validation_df = pandas.read_csv(validation_files[0], sep=",")
+ validation_df = pandas.read_csv(validation_files[0], sep=",", dtype=str, keep_default_na=False)
  expected_validation_df = pandas.DataFrame({
  "sample_name": ["sample1"],
  "field_name": ["restricted_field"],
@@ -3806,7 +3821,7 @@ class TestMetadataExtender(TestCase):
  # Verify main output file includes internal columns
  output_files = glob.glob(os.path.join(tmpdir, "*_test_output.txt"))
  self.assertEqual(1, len(output_files))
- output_df = pandas.read_csv(output_files[0], sep="\t", keep_default_na=False)
+ output_df = pandas.read_csv(output_files[0], sep="\t", dtype=str, keep_default_na=False)
  expected_output_df = pandas.DataFrame({
  SAMPLE_NAME_KEY: ["sample1"],
  "body_product": ["UBERON:feces"],
@@ -3829,6 +3844,7 @@ class TestMetadataExtender(TestCase):
 
  TEST_METADATA_CSV_FP = path.join(TEST_DIR, "data/test_metadata.csv")
  TEST_METADATA_TXT_FP = path.join(TEST_DIR, "data/test_metadata.txt")
+ TEST_METADATA_XLSX_FP = path.join(TEST_DIR, "data/test_metadata.xlsx")
  TEST_METADATA_WITH_ERRORS_FP = path.join(
  TEST_DIR, "data/test_metadata_with_errors.csv")
  TEST_STUDY_CONFIG_WITH_VALIDATION_FP = path.join(
@@ -3847,6 +3863,7 @@ class TestMetadataExtender(TestCase):
  "body_product": ["UBERON:feces", "UBERON:feces"],
  "body_site": ["gut", "gut"],
  "description": ["human sample", "human sample"],
+ "dna_extracted": ["TRUE", "FALSE"],
  "host_common_name": ["human", "human"],
  QIITA_SAMPLE_TYPE: ["stool", "stool"],
  SAMPLE_TYPE_KEY: ["stool", "stool"],
@@ -3861,12 +3878,13 @@ class TestMetadataExtender(TestCase):
  # Verify main output file was created (internal cols removed by default)
  output_files = glob.glob(os.path.join(tmpdir, "*_test_output.txt"))
  self.assertEqual(1, len(output_files))
- output_df = pandas.read_csv(output_files[0], sep="\t")
+ output_df = pandas.read_csv(output_files[0], sep="\t", dtype=str, keep_default_na=False)
  expected_output_df = pandas.DataFrame({
  SAMPLE_NAME_KEY: ["sample1", "sample2"],
  "body_product": ["UBERON:feces", "UBERON:feces"],
  "body_site": ["gut", "gut"],
  "description": ["human sample", "human sample"],
+ "dna_extracted": ["TRUE", "FALSE"],
  "host_common_name": ["human", "human"],
  QIITA_SAMPLE_TYPE: ["stool", "stool"],
  SAMPLE_TYPE_KEY: ["stool", "stool"],
@@ -3899,6 +3917,7 @@ class TestMetadataExtender(TestCase):
  "body_product": ["UBERON:feces", "UBERON:feces"],
  "body_site": ["gut", "gut"],
  "description": ["human sample", "human sample"],
+ "dna_extracted": ["TRUE", "FALSE"],
  "host_common_name": ["human", "human"],
  QIITA_SAMPLE_TYPE: ["stool", "stool"],
  SAMPLE_TYPE_KEY: ["stool", "stool"],
@@ -3913,12 +3932,13 @@ class TestMetadataExtender(TestCase):
  # Verify main output file was created
  output_files = glob.glob(os.path.join(tmpdir, "*_test_output.txt"))
  self.assertEqual(1, len(output_files))
- output_df = pandas.read_csv(output_files[0], sep="\t")
+ output_df = pandas.read_csv(output_files[0], sep="\t", dtype=str, keep_default_na=False)
  expected_output_df = pandas.DataFrame({
  SAMPLE_NAME_KEY: ["sample1", "sample2"],
  "body_product": ["UBERON:feces", "UBERON:feces"],
  "body_site": ["gut", "gut"],
  "description": ["human sample", "human sample"],
+ "dna_extracted": ["TRUE", "FALSE"],
  "host_common_name": ["human", "human"],
  QIITA_SAMPLE_TYPE: ["stool", "stool"],
  SAMPLE_TYPE_KEY: ["stool", "stool"],
@@ -3927,6 +3947,60 @@ class TestMetadataExtender(TestCase):
  })
  assert_frame_equal(expected_output_df, output_df)
 
+ def test_write_extended_metadata_xlsx_input(self):
+ """Test writing extended metadata from an Excel XLSX input file."""
+ with tempfile.TemporaryDirectory() as tmpdir:
+ result_df = write_extended_metadata(
+ self.TEST_METADATA_XLSX_FP, self.TEST_STUDY_CONFIG_FP,
+ tmpdir, "test_output", stds_fp=self.TEST_STDS_FP)
+
+ # Verify returned DataFrame
+ expected_result_df = pandas.DataFrame({
+ SAMPLE_NAME_KEY: ["sample1", "sample2"],
+ "body_product": ["UBERON:feces", "UBERON:feces"],
+ "body_site": ["gut", "gut"],
+ "description": ["human sample", "human sample"],
+ "dna_extracted": ["TRUE", "FALSE"],
+ "host_common_name": ["human", "human"],
+ QIITA_SAMPLE_TYPE: ["stool", "stool"],
+ SAMPLE_TYPE_KEY: ["stool", "stool"],
+ "study_custom_field": ["custom_value", "custom_value"],
+ "study_stool_field": ["stool_custom", "stool_custom"],
+ HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
+ SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
+ QC_NOTE_KEY: ["", ""]
+ })
+ assert_frame_equal(expected_result_df, result_df)
+
+ # Verify main output file was created
+ output_files = glob.glob(os.path.join(tmpdir, "*_test_output.txt"))
+ self.assertEqual(1, len(output_files))
+ output_df = pandas.read_csv(output_files[0], sep="\t", dtype=str, keep_default_na=False)
+ expected_output_df = pandas.DataFrame({
+ SAMPLE_NAME_KEY: ["sample1", "sample2"],
+ "body_product": ["UBERON:feces", "UBERON:feces"],
+ "body_site": ["gut", "gut"],
+ "description": ["human sample", "human sample"],
+ "dna_extracted": ["TRUE", "FALSE"],
+ "host_common_name": ["human", "human"],
+ QIITA_SAMPLE_TYPE: ["stool", "stool"],
+ SAMPLE_TYPE_KEY: ["stool", "stool"],
+ "study_custom_field": ["custom_value", "custom_value"],
+ "study_stool_field": ["stool_custom", "stool_custom"]
+ })
+ assert_frame_equal(expected_output_df, output_df)
+
+ # Verify empty fails file was created
+ fails_files = glob.glob(os.path.join(tmpdir, "*_test_output_fails.csv"))
+ self.assertEqual(1, len(fails_files))
+ self.assertEqual(0, os.path.getsize(fails_files[0]))
+
+ # Verify empty validation errors file was created
+ validation_files = glob.glob(
+ os.path.join(tmpdir, "*_test_output_validation_errors.csv"))
+ self.assertEqual(1, len(validation_files))
+ self.assertEqual(0, os.path.getsize(validation_files[0]))
+
  def test_write_extended_metadata_with_validation_errors(self):
  """Test writing extended metadata when validation errors occur."""
  with tempfile.TemporaryDirectory() as tmpdir:
@@ -3941,6 +4015,7 @@ class TestMetadataExtender(TestCase):
  "body_product": ["UBERON:feces", "UBERON:feces"],
  "body_site": ["gut", "gut"],
  "description": ["human sample", "human sample"],
+ "dna_extracted": ["TRUE", "FALSE"],
  "host_common_name": ["human", "human"],
  QIITA_SAMPLE_TYPE: ["stool", "stool"],
  "restricted_field": ["invalid_value", "allowed_value"],
@@ -3954,12 +4029,13 @@ class TestMetadataExtender(TestCase):
  # Verify main output file was created
  output_files = glob.glob(os.path.join(tmpdir, "*_test_output.txt"))
  self.assertEqual(1, len(output_files))
- output_df = pandas.read_csv(output_files[0], sep="\t")
+ output_df = pandas.read_csv(output_files[0], sep="\t", dtype=str, keep_default_na=False)
  expected_output_df = pandas.DataFrame({
  SAMPLE_NAME_KEY: ["sample1", "sample2"],
  "body_product": ["UBERON:feces", "UBERON:feces"],
  "body_site": ["gut", "gut"],
  "description": ["human sample", "human sample"],
+ "dna_extracted": ["TRUE", "FALSE"],
  "host_common_name": ["human", "human"],
  QIITA_SAMPLE_TYPE: ["stool", "stool"],
  "restricted_field": ["invalid_value", "allowed_value"],
@@ -3971,7 +4047,7 @@ class TestMetadataExtender(TestCase):
  validation_files = glob.glob(
  os.path.join(tmpdir, "*_test_output_validation_errors.csv"))
  self.assertEqual(1, len(validation_files))
- validation_df = pandas.read_csv(validation_files[0], sep=",")
+ validation_df = pandas.read_csv(validation_files[0], sep=",", dtype=str, keep_default_na=False)
  expected_validation_df = pandas.DataFrame({
  "sample_name": ["sample1"],
  "field_name": ["restricted_field"],
@@ -4006,6 +4082,7 @@ class TestMetadataExtender(TestCase):
  "body_product": ["UBERON:feces", "UBERON:feces"],
  "body_site": ["gut", "gut"],
  "description": ["human sample", "human sample"],
+ "dna_extracted": ["TRUE", "FALSE"],
  "host_common_name": ["human", "human"],
  QIITA_SAMPLE_TYPE: ["stool", "stool"],
  SAMPLE_TYPE_KEY: ["stool", "stool"],
@@ -4020,12 +4097,13 @@ class TestMetadataExtender(TestCase):
  # Verify output file has .csv extension
  output_files = glob.glob(os.path.join(tmpdir, "*_test_output.csv"))
  self.assertEqual(1, len(output_files))
- output_df = pandas.read_csv(output_files[0], sep=",")
+ output_df = pandas.read_csv(output_files[0], sep=",", dtype=str, keep_default_na=False)
  expected_output_df = pandas.DataFrame({
  SAMPLE_NAME_KEY: ["sample1", "sample2"],
  "body_product": ["UBERON:feces", "UBERON:feces"],
  "body_site": ["gut", "gut"],
  "description": ["human sample", "human sample"],
+ "dna_extracted": ["TRUE", "FALSE"],
  "host_common_name": ["human", "human"],
  QIITA_SAMPLE_TYPE: ["stool", "stool"],
  SAMPLE_TYPE_KEY: ["stool", "stool"],
@@ -4048,6 +4126,7 @@ class TestMetadataExtender(TestCase):
  "body_product": ["UBERON:feces", "UBERON:feces"],
  "body_site": ["gut", "gut"],
  "description": ["human sample", "human sample"],
+ "dna_extracted": ["TRUE", "FALSE"],
  "host_common_name": ["human", "human"],
  QIITA_SAMPLE_TYPE: ["stool", "stool"],
  SAMPLE_TYPE_KEY: ["stool", "stool"],
@@ -4062,12 +4141,13 @@ class TestMetadataExtender(TestCase):
  # Verify main output file includes internal columns
  output_files = glob.glob(os.path.join(tmpdir, "*_test_output.txt"))
  self.assertEqual(1, len(output_files))
- output_df = pandas.read_csv(output_files[0], sep="\t", keep_default_na=False)
+ output_df = pandas.read_csv(output_files[0], sep="\t", dtype=str, keep_default_na=False)
  expected_output_df = pandas.DataFrame({
  SAMPLE_NAME_KEY: ["sample1", "sample2"],
  "body_product": ["UBERON:feces", "UBERON:feces"],
  "body_site": ["gut", "gut"],
  "description": ["human sample", "human sample"],
+ "dna_extracted": ["TRUE", "FALSE"],
  "host_common_name": ["human", "human"],
  QIITA_SAMPLE_TYPE: ["stool", "stool"],
  SAMPLE_TYPE_KEY: ["stool", "stool"],
@@ -4097,6 +4177,7 @@ class TestMetadataExtender(TestCase):
  "body_product": ["UBERON:feces", "UBERON:feces"],
  "body_site": ["gut", "gut"],
  "description": ["human sample", "human sample"],
+ "dna_extracted": ["TRUE", "FALSE"],
  "host_common_name": ["human", "human"],
  QIITA_SAMPLE_TYPE: ["stool", "stool"],
  SAMPLE_TYPE_KEY: ["stool", "stool"],
@@ -4111,12 +4192,13 @@ class TestMetadataExtender(TestCase):
  # Verify main output file was created
  output_files = glob.glob(os.path.join(tmpdir, "*_test_output.txt"))
  self.assertEqual(1, len(output_files))
- output_df = pandas.read_csv(output_files[0], sep="\t")
+ output_df = pandas.read_csv(output_files[0], sep="\t", dtype=str, keep_default_na=False)
  expected_output_df = pandas.DataFrame({
  SAMPLE_NAME_KEY: ["sample1", "sample2"],
  "body_product": ["UBERON:feces", "UBERON:feces"],
  "body_site": ["gut", "gut"],
  "description": ["human sample", "human sample"],
+ "dna_extracted": ["TRUE", "FALSE"],
  "host_common_name": ["human", "human"],
  QIITA_SAMPLE_TYPE: ["stool", "stool"],
  SAMPLE_TYPE_KEY: ["stool", "stool"],
@@ -4134,6 +4216,64 @@ class TestMetadataExtender(TestCase):
  os.path.join(tmpdir, "*_test_output_validation_errors.csv"))
  self.assertEqual(0, len(validation_files))
 
+ def test_write_extended_metadata_preserves_string_booleans(self):
+ """Test that TRUE/FALSE string values are not converted to booleans.
+
+ This tests for a bug where loading a CSV without dtype=str causes
+ pandas to convert 'TRUE'/'FALSE' strings to boolean True/False,
+ which then fail validation against allowed string values.
+ """
+ with tempfile.TemporaryDirectory() as tmpdir:
+ # Create a CSV file with TRUE/FALSE string values
+ csv_content = (
+ "sample_name,hosttype_shorthand,sampletype_shorthand,dna_extracted\n"
+ "sample1,human,stool,TRUE\n"
+ "sample2,human,stool,FALSE\n"
+ )
+ csv_fp = path.join(tmpdir, "test_bool_strings.csv")
+ with open(csv_fp, "w") as f:
+ f.write(csv_content)
+
+ # Create a config that defines TRUE/FALSE as allowed string values
+ config_content = """
+ default: "not provided"
+ leave_requireds_blank: false
+ overwrite_non_nans: false
+ study_specific_metadata:
+ host_type_specific_metadata:
+ human:
+ default: "not provided"
+ leave_requireds_blank: false
+ overwrite_non_nans: false
+ sample_type_specific_metadata:
+ stool:
+ metadata_fields:
+ dna_extracted:
+ type: string
+ allowed:
+ - "TRUE"
+ - "FALSE"
+ """
+ config_fp = path.join(tmpdir, "test_bool_config.yml")
+ with open(config_fp, "w") as f:
+ f.write(config_content)
+
+ # Call write_extended_metadata
+ result_df = write_extended_metadata(
+ csv_fp, config_fp, tmpdir, "test_output",
+ stds_fp=self.TEST_STDS_FP)
+
+ # Verify the dna_extracted values are preserved as strings
+ self.assertEqual("TRUE", result_df.loc[0, "dna_extracted"])
+ self.assertEqual("FALSE", result_df.loc[1, "dna_extracted"])
+
+ # Verify no validation errors occurred
+ validation_files = glob.glob(
+ os.path.join(tmpdir, "*_test_output_validation_errors.csv"))
+ self.assertEqual(1, len(validation_files))
+ # The validation errors file should be empty (0 bytes)
+ self.assertEqual(0, os.path.getsize(validation_files[0]))
+
  # Integration tests
 
  TEST_PROJECT1_METADATA_FP = path.join(TEST_DIR, "data/test_project1_input_metadata.csv")
@@ -4142,6 +4282,7 @@ class TestMetadataExtender(TestCase):
  TEST_DIR, "data/test_project1_output_metadata.txt")
  TEST_PROJECT1_EXPECTED_FAILS_FP = path.join(
  TEST_DIR, "data/test_project1_output_fails.csv")
+
  def test_write_extended_metadata_from_df_project1_integration(self):
  """Integration test using project1 test data files."""
 
@@ -4153,12 +4294,8 @@ class TestMetadataExtender(TestCase):
  with open(path.join(debug_dir, f"UNMATCHED_2_{file_name}"), 'w') as debug_actual_file:
  debug_actual_file.write(actual_content)
 
-
  # Load input metadata CSV
  input_df = pandas.read_csv(self.TEST_PROJECT1_METADATA_FP, dtype=str)
- # for the columns "plating_notes" and "notes", fill NaN with empty string
- input_df["plating_notes"] = input_df["plating_notes"].fillna("")
- input_df["notes"] = input_df["notes"].fillna("")
 
  # Load study config
  study_config = _get_study_specific_config(self.TEST_PROJECT1_CONFIG_FP)
@@ -4204,6 +4341,58 @@ class TestMetadataExtender(TestCase):
  self.assertEqual(1, len(validation_files))
  self.assertEqual(0, os.path.getsize(validation_files[0]))
 
+ def test_write_extended_metadata_project1_integration(self):
+ """Integration test for write_extended_metadata using project1 test data files."""
+
+ def write_mismatched_debug_files(expected_content, actual_content, file_name):
+ """Write debug files to Desktop for unmatched content."""
+ debug_dir = path.join(path.expanduser("~"), "Desktop")
+ with open(path.join(debug_dir, f"UNMATCHED_1_{file_name}"), 'w') as debug_expected_file:
+ debug_expected_file.write(expected_content)
+ with open(path.join(debug_dir, f"UNMATCHED_2_{file_name}"), 'w') as debug_actual_file:
+ debug_actual_file.write(actual_content)
+
+ with tempfile.TemporaryDirectory() as tmpdir:
+ write_extended_metadata(
+ self.TEST_PROJECT1_METADATA_FP, self.TEST_PROJECT1_CONFIG_FP,
+ tmpdir, "test_output", remove_internals=True)
+
+ # Compare main output file directly to expected file
+ output_files = glob.glob(os.path.join(tmpdir, "*_test_output.txt"))
+ self.assertEqual(1, len(output_files))
+ with open(output_files[0], 'r') as actual_file:
+ actual_content = actual_file.read()
+ with open(self.TEST_PROJECT1_EXPECTED_OUTPUT_FP, 'r') as expected_file:
+ expected_content = expected_file.read()
+ try:
+ self.assertEqual(expected_content, actual_content)
+ except AssertionError:
+ write_mismatched_debug_files(
+ expected_content, actual_content,
+ "project1_output.txt")
+ raise
+
+ # Compare fails file directly to expected file
+ fails_files = glob.glob(os.path.join(tmpdir, "*_test_output_fails.csv"))
+ self.assertEqual(1, len(fails_files))
+ with open(fails_files[0], 'r') as actual_file:
+ actual_fails_content = actual_file.read()
+ with open(self.TEST_PROJECT1_EXPECTED_FAILS_FP, 'r') as expected_file:
+ expected_fails_content = expected_file.read()
+ try:
+ self.assertEqual(expected_fails_content, actual_fails_content)
+ except AssertionError:
+ write_mismatched_debug_files(
+ expected_fails_content, actual_fails_content,
+ "project1_fails.csv")
+ raise
+
+ # Verify validation errors file is empty
+ validation_files = glob.glob(
+ os.path.join(tmpdir, "*_test_output_validation_errors.csv"))
+ self.assertEqual(1, len(validation_files))
+ self.assertEqual(0, os.path.getsize(validation_files[0]))
+
  # Tests for _get_specified_column_name
 
  def test__get_specified_column_name_finds_column(self):