metameq 2026.1.2__py3-none-any.whl → 2026.2.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -14,7 +14,8 @@ from metameq.src.util import \
14
14
  OVERWRITE_NON_NANS_KEY, LEAVE_REQUIREDS_BLANK_KEY, LEAVE_BLANK_VAL, \
15
15
  HOST_TYPE_SPECIFIC_METADATA_KEY, METADATA_TRANSFORMERS_KEY, \
16
16
  SOURCES_KEY, FUNCTION_KEY, PRE_TRANSFORMERS_KEY, POST_TRANSFORMERS_KEY, \
17
- STUDY_SPECIFIC_METADATA_KEY
17
+ STUDY_SPECIFIC_METADATA_KEY, HOSTTYPE_COL_OPTIONS_KEY, \
18
+ SAMPLETYPE_COL_OPTIONS_KEY
18
19
  from metameq.src.metadata_extender import \
19
20
  id_missing_cols, get_qc_failures, get_reserved_cols, find_standard_cols, \
20
21
  find_nonstandard_cols, write_metadata_results, \
@@ -26,7 +27,7 @@ from metameq.src.metadata_extender import \
26
27
  _generate_metadata_for_a_host_type, _generate_metadata_for_host_types, \
27
28
  _transform_metadata, _populate_metadata_df, extend_metadata_df, \
28
29
  _get_study_specific_config, _output_metadata_df_to_files, \
29
- INTERNAL_COL_KEYS, REQ_PLACEHOLDER
30
+ _get_specified_column_name, INTERNAL_COL_KEYS, REQ_PLACEHOLDER
30
31
 
31
32
 
32
33
  class TestMetadataExtender(TestCase):
@@ -747,16 +748,15 @@ class TestMetadataExtender(TestCase):
747
748
 
748
749
  # Tests for _fill_na_if_default
749
750
 
750
- def test__fill_na_if_default_specific_overrides_settings(self):
751
+ def test__fill_na_if_default_has_default_in_settings(self):
751
752
  """Test that specific_dict default takes precedence over settings_dict."""
752
753
  input_df = pandas.DataFrame({
753
754
  "field1": ["value1", np.nan, "value3"],
754
755
  "field2": [np.nan, "value2", np.nan]
755
756
  })
756
- specific_dict = {DEFAULT_KEY: "filled"}
757
- settings_dict = {DEFAULT_KEY: "unused"}
757
+ settings_dict = {DEFAULT_KEY: "filled"}
758
758
 
759
- result = _fill_na_if_default(input_df, specific_dict, settings_dict)
759
+ result = _fill_na_if_default(input_df, settings_dict)
760
760
 
761
761
  expected = pandas.DataFrame({
762
762
  "field1": ["value1", "filled", "value3"],
@@ -764,18 +764,19 @@ class TestMetadataExtender(TestCase):
764
764
  })
765
765
  assert_frame_equal(expected, result)
766
766
 
767
- def test__fill_na_if_default_uses_settings_when_specific_missing(self):
768
- """Test that settings_dict default is used when specific_dict has no default."""
767
+ def test__fill_na_if_default_no_default_in_settings(self):
768
+ """Test that NaN values are unchanged when no default is in settings."""
769
769
  input_df = pandas.DataFrame({
770
- "field1": [np.nan]
770
+ "field1": ["value1", np.nan, "value3"],
771
+ "field2": [np.nan, "value2", np.nan]
771
772
  })
772
- specific_dict = {}
773
- settings_dict = {DEFAULT_KEY: "settings_default"}
773
+ settings_dict = {}
774
774
 
775
- result = _fill_na_if_default(input_df, specific_dict, settings_dict)
775
+ result = _fill_na_if_default(input_df, settings_dict)
776
776
 
777
777
  expected = pandas.DataFrame({
778
- "field1": ["settings_default"]
778
+ "field1": ["value1", np.nan, "value3"],
779
+ "field2": [np.nan, "value2", np.nan]
779
780
  })
780
781
  assert_frame_equal(expected, result)
781
782
 
@@ -1272,14 +1273,13 @@ class TestMetadataExtender(TestCase):
1272
1273
  SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
1273
1274
  QC_NOTE_KEY: ["", ""]
1274
1275
  })
1275
- global_plus_host_settings_dict = {
1276
- OVERWRITE_NON_NANS_KEY: False,
1277
- LEAVE_REQUIREDS_BLANK_KEY: False,
1278
- DEFAULT_KEY: "not provided"
1279
- }
1276
+
1280
1277
  # Config is pre-resolved: sample type's metadata_fields already includes
1281
1278
  # host fields merged in, plus sample_type and qiita_sample_type
1282
1279
  host_type_config_dict = {
1280
+ OVERWRITE_NON_NANS_KEY: False,
1281
+ LEAVE_REQUIREDS_BLANK_KEY: False,
1282
+ DEFAULT_KEY: "not provided",
1283
1283
  METADATA_FIELDS_KEY: {
1284
1284
  "host_field": {
1285
1285
  DEFAULT_KEY: "host_default",
@@ -1313,7 +1313,7 @@ class TestMetadataExtender(TestCase):
1313
1313
  }
1314
1314
 
1315
1315
  result_df, validation_msgs = _generate_metadata_for_a_sample_type_in_a_host_type(
1316
- input_df, "stool", global_plus_host_settings_dict, host_type_config_dict)
1316
+ input_df, "stool", host_type_config_dict)
1317
1317
 
1318
1318
  expected_df = pandas.DataFrame({
1319
1319
  SAMPLE_NAME_KEY: ["sample1", "sample2"],
@@ -1336,12 +1336,11 @@ class TestMetadataExtender(TestCase):
1336
1336
  SAMPLETYPE_SHORTHAND_KEY: ["unknown_type"],
1337
1337
  QC_NOTE_KEY: [""]
1338
1338
  })
1339
- global_plus_host_settings_dict = {
1339
+
1340
+ host_type_config_dict = {
1340
1341
  OVERWRITE_NON_NANS_KEY: False,
1341
1342
  LEAVE_REQUIREDS_BLANK_KEY: False,
1342
- DEFAULT_KEY: "not provided"
1343
- }
1344
- host_type_config_dict = {
1343
+ DEFAULT_KEY: "not provided",
1345
1344
  METADATA_FIELDS_KEY: {},
1346
1345
  SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
1347
1346
  "stool": {
@@ -1351,7 +1350,7 @@ class TestMetadataExtender(TestCase):
1351
1350
  }
1352
1351
 
1353
1352
  result_df, validation_msgs = _generate_metadata_for_a_sample_type_in_a_host_type(
1354
- input_df, "unknown_type", global_plus_host_settings_dict, host_type_config_dict)
1353
+ input_df, "unknown_type", host_type_config_dict)
1355
1354
 
1356
1355
  expected_df = pandas.DataFrame({
1357
1356
  SAMPLE_NAME_KEY: ["sample1"],
@@ -1370,12 +1369,11 @@ class TestMetadataExtender(TestCase):
1370
1369
  SAMPLETYPE_SHORTHAND_KEY: ["stool", "blood", "stool"],
1371
1370
  QC_NOTE_KEY: ["", "", ""]
1372
1371
  })
1373
- global_plus_host_settings_dict = {
1372
+
1373
+ host_type_config_dict = {
1374
1374
  OVERWRITE_NON_NANS_KEY: False,
1375
1375
  LEAVE_REQUIREDS_BLANK_KEY: False,
1376
- DEFAULT_KEY: "not provided"
1377
- }
1378
- host_type_config_dict = {
1376
+ DEFAULT_KEY: "not provided",
1379
1377
  METADATA_FIELDS_KEY: {},
1380
1378
  SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
1381
1379
  "stool": {
@@ -1393,7 +1391,7 @@ class TestMetadataExtender(TestCase):
1393
1391
  }
1394
1392
 
1395
1393
  result_df, validation_msgs = _generate_metadata_for_a_sample_type_in_a_host_type(
1396
- input_df, "stool", global_plus_host_settings_dict, host_type_config_dict)
1394
+ input_df, "stool", host_type_config_dict)
1397
1395
 
1398
1396
  # Should only have the two stool samples
1399
1397
  self.assertEqual(2, len(result_df))
@@ -1408,12 +1406,11 @@ class TestMetadataExtender(TestCase):
1408
1406
  SAMPLETYPE_SHORTHAND_KEY: ["stool"],
1409
1407
  QC_NOTE_KEY: [""]
1410
1408
  })
1411
- global_plus_host_settings_dict = {
1409
+
1410
+ host_type_config_dict = {
1412
1411
  OVERWRITE_NON_NANS_KEY: False,
1413
1412
  LEAVE_REQUIREDS_BLANK_KEY: True,
1414
- DEFAULT_KEY: "not provided"
1415
- }
1416
- host_type_config_dict = {
1413
+ DEFAULT_KEY: "not provided",
1417
1414
  METADATA_FIELDS_KEY: {},
1418
1415
  SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
1419
1416
  "stool": {
@@ -1428,7 +1425,7 @@ class TestMetadataExtender(TestCase):
1428
1425
  }
1429
1426
 
1430
1427
  result_df, validation_msgs = _generate_metadata_for_a_sample_type_in_a_host_type(
1431
- input_df, "stool", global_plus_host_settings_dict, host_type_config_dict)
1428
+ input_df, "stool", host_type_config_dict)
1432
1429
 
1433
1430
  self.assertEqual(LEAVE_BLANK_VAL, result_df["required_field"].iloc[0])
1434
1431
 
@@ -1440,12 +1437,11 @@ class TestMetadataExtender(TestCase):
1440
1437
  SAMPLETYPE_SHORTHAND_KEY: ["stool"],
1441
1438
  QC_NOTE_KEY: [""]
1442
1439
  })
1443
- global_plus_host_settings_dict = {
1440
+
1441
+ host_type_config_dict = {
1444
1442
  OVERWRITE_NON_NANS_KEY: False,
1445
1443
  LEAVE_REQUIREDS_BLANK_KEY: False,
1446
- DEFAULT_KEY: "global_default"
1447
- }
1448
- host_type_config_dict = {
1444
+ DEFAULT_KEY: "global_default",
1449
1445
  METADATA_FIELDS_KEY: {},
1450
1446
  SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
1451
1447
  "stool": {
@@ -1460,7 +1456,7 @@ class TestMetadataExtender(TestCase):
1460
1456
  }
1461
1457
 
1462
1458
  result_df, validation_msgs = _generate_metadata_for_a_sample_type_in_a_host_type(
1463
- input_df, "stool", global_plus_host_settings_dict, host_type_config_dict)
1459
+ input_df, "stool", host_type_config_dict)
1464
1460
 
1465
1461
  # When leave_requireds_blank is False, NaN values get filled with global default
1466
1462
  self.assertEqual("global_default", result_df["required_field"].iloc[0])
@@ -1474,12 +1470,11 @@ class TestMetadataExtender(TestCase):
1474
1470
  QC_NOTE_KEY: [""],
1475
1471
  "existing_field": ["original_value"]
1476
1472
  })
1477
- global_plus_host_settings_dict = {
1473
+
1474
+ host_type_config_dict = {
1478
1475
  OVERWRITE_NON_NANS_KEY: True,
1479
1476
  LEAVE_REQUIREDS_BLANK_KEY: False,
1480
- DEFAULT_KEY: "not provided"
1481
- }
1482
- host_type_config_dict = {
1477
+ DEFAULT_KEY: "not provided",
1483
1478
  METADATA_FIELDS_KEY: {},
1484
1479
  SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
1485
1480
  "stool": {
@@ -1494,7 +1489,7 @@ class TestMetadataExtender(TestCase):
1494
1489
  }
1495
1490
 
1496
1491
  result_df, validation_msgs = _generate_metadata_for_a_sample_type_in_a_host_type(
1497
- input_df, "stool", global_plus_host_settings_dict, host_type_config_dict)
1492
+ input_df, "stool", host_type_config_dict)
1498
1493
 
1499
1494
  self.assertEqual("new_value", result_df["existing_field"].iloc[0])
1500
1495
 
@@ -1507,12 +1502,11 @@ class TestMetadataExtender(TestCase):
1507
1502
  QC_NOTE_KEY: [""],
1508
1503
  "existing_field": ["original_value"]
1509
1504
  })
1510
- global_plus_host_settings_dict = {
1505
+
1506
+ host_type_config_dict = {
1511
1507
  OVERWRITE_NON_NANS_KEY: False,
1512
1508
  LEAVE_REQUIREDS_BLANK_KEY: False,
1513
- DEFAULT_KEY: "not provided"
1514
- }
1515
- host_type_config_dict = {
1509
+ DEFAULT_KEY: "not provided",
1516
1510
  METADATA_FIELDS_KEY: {},
1517
1511
  SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
1518
1512
  "stool": {
@@ -1527,7 +1521,7 @@ class TestMetadataExtender(TestCase):
1527
1521
  }
1528
1522
 
1529
1523
  result_df, validation_msgs = _generate_metadata_for_a_sample_type_in_a_host_type(
1530
- input_df, "stool", global_plus_host_settings_dict, host_type_config_dict)
1524
+ input_df, "stool", host_type_config_dict)
1531
1525
 
1532
1526
  self.assertEqual("original_value", result_df["existing_field"].iloc[0])
1533
1527
 
@@ -1539,14 +1533,13 @@ class TestMetadataExtender(TestCase):
1539
1533
  SAMPLETYPE_SHORTHAND_KEY: ["feces"],
1540
1534
  QC_NOTE_KEY: [""]
1541
1535
  })
1542
- global_plus_host_settings_dict = {
1543
- OVERWRITE_NON_NANS_KEY: False,
1544
- LEAVE_REQUIREDS_BLANK_KEY: False,
1545
- DEFAULT_KEY: "not provided"
1546
- }
1536
+
1547
1537
  # Config is pre-resolved: alias "feces" has its own metadata_fields
1548
1538
  # that is a copy of "stool"'s resolved fields with sample_type="stool"
1549
1539
  host_type_config_dict = {
1540
+ OVERWRITE_NON_NANS_KEY: False,
1541
+ LEAVE_REQUIREDS_BLANK_KEY: False,
1542
+ DEFAULT_KEY: "not provided",
1550
1543
  METADATA_FIELDS_KEY: {},
1551
1544
  SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
1552
1545
  "feces": {
@@ -1589,7 +1582,7 @@ class TestMetadataExtender(TestCase):
1589
1582
  }
1590
1583
 
1591
1584
  result_df, validation_msgs = _generate_metadata_for_a_sample_type_in_a_host_type(
1592
- input_df, "feces", global_plus_host_settings_dict, host_type_config_dict)
1585
+ input_df, "feces", host_type_config_dict)
1593
1586
 
1594
1587
  self.assertEqual("stool_value", result_df["stool_field"].iloc[0])
1595
1588
  # sample_type should be set to the resolved type "stool"
@@ -1605,17 +1598,15 @@ class TestMetadataExtender(TestCase):
1605
1598
  SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
1606
1599
  QC_NOTE_KEY: ["", ""]
1607
1600
  })
1608
- settings_dict = {
1609
- OVERWRITE_NON_NANS_KEY: False,
1610
- LEAVE_REQUIREDS_BLANK_KEY: False,
1611
- DEFAULT_KEY: "global_default"
1612
- }
1601
+
1613
1602
  # Config is pre-resolved: sample type's metadata_fields includes
1614
1603
  # host fields merged in, plus sample_type and qiita_sample_type
1615
1604
  full_flat_config_dict = {
1616
1605
  HOST_TYPE_SPECIFIC_METADATA_KEY: {
1617
1606
  "human": {
1618
1607
  DEFAULT_KEY: "human_default",
1608
+ OVERWRITE_NON_NANS_KEY: False,
1609
+ LEAVE_REQUIREDS_BLANK_KEY: False,
1619
1610
  METADATA_FIELDS_KEY: {
1620
1611
  "host_field": {
1621
1612
  DEFAULT_KEY: "host_value",
@@ -1651,7 +1642,7 @@ class TestMetadataExtender(TestCase):
1651
1642
  }
1652
1643
 
1653
1644
  result_df, validation_msgs = _generate_metadata_for_a_host_type(
1654
- input_df, "human", settings_dict, full_flat_config_dict)
1645
+ input_df, "human", full_flat_config_dict)
1655
1646
 
1656
1647
  expected_df = pandas.DataFrame({
1657
1648
  SAMPLE_NAME_KEY: ["sample1", "sample2"],
@@ -1674,14 +1665,13 @@ class TestMetadataExtender(TestCase):
1674
1665
  SAMPLETYPE_SHORTHAND_KEY: ["stool"],
1675
1666
  QC_NOTE_KEY: [""]
1676
1667
  })
1677
- settings_dict = {
1678
- OVERWRITE_NON_NANS_KEY: False,
1679
- LEAVE_REQUIREDS_BLANK_KEY: False,
1680
- DEFAULT_KEY: "global_default"
1681
- }
1668
+
1682
1669
  full_flat_config_dict = {
1683
1670
  HOST_TYPE_SPECIFIC_METADATA_KEY: {
1684
1671
  "human": {
1672
+ OVERWRITE_NON_NANS_KEY: False,
1673
+ LEAVE_REQUIREDS_BLANK_KEY: False,
1674
+ DEFAULT_KEY: "global_default",
1685
1675
  METADATA_FIELDS_KEY: {},
1686
1676
  SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {}
1687
1677
  }
@@ -1689,7 +1679,7 @@ class TestMetadataExtender(TestCase):
1689
1679
  }
1690
1680
 
1691
1681
  result_df, validation_msgs = _generate_metadata_for_a_host_type(
1692
- input_df, "unknown_host", settings_dict, full_flat_config_dict)
1682
+ input_df, "unknown_host", full_flat_config_dict)
1693
1683
 
1694
1684
  expected_df = pandas.DataFrame({
1695
1685
  SAMPLE_NAME_KEY: ["sample1"],
@@ -1708,14 +1698,13 @@ class TestMetadataExtender(TestCase):
1708
1698
  SAMPLETYPE_SHORTHAND_KEY: ["unknown_sample"],
1709
1699
  QC_NOTE_KEY: [""]
1710
1700
  })
1711
- settings_dict = {
1712
- OVERWRITE_NON_NANS_KEY: False,
1713
- LEAVE_REQUIREDS_BLANK_KEY: False,
1714
- DEFAULT_KEY: "global_default"
1715
- }
1701
+
1716
1702
  full_flat_config_dict = {
1717
1703
  HOST_TYPE_SPECIFIC_METADATA_KEY: {
1718
1704
  "human": {
1705
+ OVERWRITE_NON_NANS_KEY: False,
1706
+ LEAVE_REQUIREDS_BLANK_KEY: False,
1707
+ DEFAULT_KEY: "global_default",
1719
1708
  METADATA_FIELDS_KEY: {},
1720
1709
  SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
1721
1710
  "stool": {
@@ -1727,7 +1716,7 @@ class TestMetadataExtender(TestCase):
1727
1716
  }
1728
1717
 
1729
1718
  result_df, validation_msgs = _generate_metadata_for_a_host_type(
1730
- input_df, "human", settings_dict, full_flat_config_dict)
1719
+ input_df, "human", full_flat_config_dict)
1731
1720
 
1732
1721
  expected_df = pandas.DataFrame({
1733
1722
  SAMPLE_NAME_KEY: ["sample1"],
@@ -1746,16 +1735,15 @@ class TestMetadataExtender(TestCase):
1746
1735
  SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool", "stool"],
1747
1736
  QC_NOTE_KEY: ["", "", ""]
1748
1737
  })
1749
- settings_dict = {
1750
- OVERWRITE_NON_NANS_KEY: False,
1751
- LEAVE_REQUIREDS_BLANK_KEY: False,
1752
- DEFAULT_KEY: "global_default"
1753
- }
1738
+
1754
1739
  # Config is pre-resolved: sample type's metadata_fields includes
1755
1740
  # host fields merged in, plus sample_type and qiita_sample_type
1756
1741
  full_flat_config_dict = {
1757
1742
  HOST_TYPE_SPECIFIC_METADATA_KEY: {
1758
1743
  "human": {
1744
+ OVERWRITE_NON_NANS_KEY: False,
1745
+ LEAVE_REQUIREDS_BLANK_KEY: False,
1746
+ DEFAULT_KEY: "global_default",
1759
1747
  METADATA_FIELDS_KEY: {
1760
1748
  "human_field": {
1761
1749
  DEFAULT_KEY: "human_value",
@@ -1784,6 +1772,9 @@ class TestMetadataExtender(TestCase):
1784
1772
  }
1785
1773
  },
1786
1774
  "mouse": {
1775
+ OVERWRITE_NON_NANS_KEY: False,
1776
+ LEAVE_REQUIREDS_BLANK_KEY: False,
1777
+ DEFAULT_KEY: "global_default",
1787
1778
  METADATA_FIELDS_KEY: {},
1788
1779
  SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {}
1789
1780
  }
@@ -1791,7 +1782,7 @@ class TestMetadataExtender(TestCase):
1791
1782
  }
1792
1783
 
1793
1784
  result_df, validation_msgs = _generate_metadata_for_a_host_type(
1794
- input_df, "human", settings_dict, full_flat_config_dict)
1785
+ input_df, "human", full_flat_config_dict)
1795
1786
 
1796
1787
  expected_df = pandas.DataFrame({
1797
1788
  SAMPLE_NAME_KEY: ["sample1", "sample3"],
@@ -1812,17 +1803,15 @@ class TestMetadataExtender(TestCase):
1812
1803
  SAMPLETYPE_SHORTHAND_KEY: ["stool"],
1813
1804
  QC_NOTE_KEY: [""]
1814
1805
  })
1815
- settings_dict = {
1816
- OVERWRITE_NON_NANS_KEY: False,
1817
- LEAVE_REQUIREDS_BLANK_KEY: False,
1818
- DEFAULT_KEY: "global_default"
1819
- }
1806
+
1820
1807
  # Config is pre-resolved: sample type's metadata_fields includes
1821
1808
  # host fields merged in, plus sample_type and qiita_sample_type
1822
1809
  full_flat_config_dict = {
1823
1810
  HOST_TYPE_SPECIFIC_METADATA_KEY: {
1824
1811
  "human": {
1825
1812
  DEFAULT_KEY: "human_specific_default",
1813
+ OVERWRITE_NON_NANS_KEY: False,
1814
+ LEAVE_REQUIREDS_BLANK_KEY: False,
1826
1815
  METADATA_FIELDS_KEY: {},
1827
1816
  SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
1828
1817
  "stool": {
@@ -1849,7 +1838,7 @@ class TestMetadataExtender(TestCase):
1849
1838
  }
1850
1839
 
1851
1840
  result_df, validation_msgs = _generate_metadata_for_a_host_type(
1852
- input_df, "human", settings_dict, full_flat_config_dict)
1841
+ input_df, "human", full_flat_config_dict)
1853
1842
 
1854
1843
  expected_df = pandas.DataFrame({
1855
1844
  SAMPLE_NAME_KEY: ["sample1"],
@@ -1870,17 +1859,14 @@ class TestMetadataExtender(TestCase):
1870
1859
  SAMPLETYPE_SHORTHAND_KEY: ["stool"],
1871
1860
  QC_NOTE_KEY: [""]
1872
1861
  })
1873
- settings_dict = {
1874
- OVERWRITE_NON_NANS_KEY: False,
1875
- LEAVE_REQUIREDS_BLANK_KEY: False,
1876
- DEFAULT_KEY: "global_default"
1877
- }
1878
1862
  # Config is pre-resolved: sample type's metadata_fields includes
1879
1863
  # host fields merged in, plus sample_type and qiita_sample_type
1880
1864
  full_flat_config_dict = {
1881
1865
  HOST_TYPE_SPECIFIC_METADATA_KEY: {
1882
1866
  "human": {
1883
- # No DEFAULT_KEY here
1867
+ OVERWRITE_NON_NANS_KEY: False,
1868
+ LEAVE_REQUIREDS_BLANK_KEY: False,
1869
+ DEFAULT_KEY: "global_default",
1884
1870
  METADATA_FIELDS_KEY: {},
1885
1871
  SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
1886
1872
  "stool": {
@@ -1907,7 +1893,7 @@ class TestMetadataExtender(TestCase):
1907
1893
  }
1908
1894
 
1909
1895
  result_df, validation_msgs = _generate_metadata_for_a_host_type(
1910
- input_df, "human", settings_dict, full_flat_config_dict)
1896
+ input_df, "human", full_flat_config_dict)
1911
1897
 
1912
1898
  expected_df = pandas.DataFrame({
1913
1899
  SAMPLE_NAME_KEY: ["sample1"],
@@ -1938,6 +1924,9 @@ class TestMetadataExtender(TestCase):
1938
1924
  OVERWRITE_NON_NANS_KEY: False,
1939
1925
  HOST_TYPE_SPECIFIC_METADATA_KEY: {
1940
1926
  "human": {
1927
+ DEFAULT_KEY: "global_default",
1928
+ LEAVE_REQUIREDS_BLANK_KEY: False,
1929
+ OVERWRITE_NON_NANS_KEY: False,
1941
1930
  METADATA_FIELDS_KEY: {
1942
1931
  "host_field": {
1943
1932
  DEFAULT_KEY: "host_value",
@@ -2004,6 +1993,9 @@ class TestMetadataExtender(TestCase):
2004
1993
  OVERWRITE_NON_NANS_KEY: False,
2005
1994
  HOST_TYPE_SPECIFIC_METADATA_KEY: {
2006
1995
  "human": {
1996
+ DEFAULT_KEY: "global_default",
1997
+ LEAVE_REQUIREDS_BLANK_KEY: False,
1998
+ OVERWRITE_NON_NANS_KEY: False,
2007
1999
  METADATA_FIELDS_KEY: {
2008
2000
  "human_field": {
2009
2001
  DEFAULT_KEY: "human_value",
@@ -2050,6 +2042,9 @@ class TestMetadataExtender(TestCase):
2050
2042
  }
2051
2043
  },
2052
2044
  "mouse": {
2045
+ DEFAULT_KEY: "global_default",
2046
+ LEAVE_REQUIREDS_BLANK_KEY: False,
2047
+ OVERWRITE_NON_NANS_KEY: False,
2053
2048
  METADATA_FIELDS_KEY: {
2054
2049
  "mouse_field": {
2055
2050
  DEFAULT_KEY: "mouse_value",
@@ -2181,6 +2176,9 @@ class TestMetadataExtender(TestCase):
2181
2176
  OVERWRITE_NON_NANS_KEY: False,
2182
2177
  HOST_TYPE_SPECIFIC_METADATA_KEY: {
2183
2178
  "human": {
2179
+ DEFAULT_KEY: "global_default",
2180
+ LEAVE_REQUIREDS_BLANK_KEY: True, # This causes required fields to get LEAVE_BLANK_VAL
2181
+ OVERWRITE_NON_NANS_KEY: False,
2184
2182
  METADATA_FIELDS_KEY: {},
2185
2183
  SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
2186
2184
  "stool": {
@@ -2505,6 +2503,9 @@ class TestMetadataExtender(TestCase):
2505
2503
  OVERWRITE_NON_NANS_KEY: False,
2506
2504
  HOST_TYPE_SPECIFIC_METADATA_KEY: {
2507
2505
  "human": {
2506
+ DEFAULT_KEY: "not provided",
2507
+ LEAVE_REQUIREDS_BLANK_KEY: False,
2508
+ OVERWRITE_NON_NANS_KEY: False,
2508
2509
  METADATA_FIELDS_KEY: {
2509
2510
  "host_field": {
2510
2511
  DEFAULT_KEY: "host_value",
@@ -2579,6 +2580,9 @@ class TestMetadataExtender(TestCase):
2579
2580
  },
2580
2581
  HOST_TYPE_SPECIFIC_METADATA_KEY: {
2581
2582
  "human": {
2583
+ DEFAULT_KEY: "not provided",
2584
+ LEAVE_REQUIREDS_BLANK_KEY: False,
2585
+ OVERWRITE_NON_NANS_KEY: False,
2582
2586
  METADATA_FIELDS_KEY: {},
2583
2587
  SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
2584
2588
  "stool": {
@@ -2638,6 +2642,9 @@ class TestMetadataExtender(TestCase):
2638
2642
  },
2639
2643
  HOST_TYPE_SPECIFIC_METADATA_KEY: {
2640
2644
  "human": {
2645
+ DEFAULT_KEY: "not provided",
2646
+ LEAVE_REQUIREDS_BLANK_KEY: False,
2647
+ OVERWRITE_NON_NANS_KEY: False,
2641
2648
  METADATA_FIELDS_KEY: {},
2642
2649
  SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
2643
2650
  "stool": {
@@ -2686,6 +2693,9 @@ class TestMetadataExtender(TestCase):
2686
2693
  OVERWRITE_NON_NANS_KEY: False,
2687
2694
  HOST_TYPE_SPECIFIC_METADATA_KEY: {
2688
2695
  "human": {
2696
+ DEFAULT_KEY: "not provided",
2697
+ LEAVE_REQUIREDS_BLANK_KEY: False,
2698
+ OVERWRITE_NON_NANS_KEY: False,
2689
2699
  METADATA_FIELDS_KEY: {},
2690
2700
  SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {}
2691
2701
  }
@@ -2720,6 +2730,9 @@ class TestMetadataExtender(TestCase):
2720
2730
  OVERWRITE_NON_NANS_KEY: False,
2721
2731
  HOST_TYPE_SPECIFIC_METADATA_KEY: {
2722
2732
  "human": {
2733
+ DEFAULT_KEY: "not provided",
2734
+ LEAVE_REQUIREDS_BLANK_KEY: False,
2735
+ OVERWRITE_NON_NANS_KEY: False,
2723
2736
  METADATA_FIELDS_KEY: {},
2724
2737
  SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
2725
2738
  "stool": {
@@ -2780,6 +2793,9 @@ class TestMetadataExtender(TestCase):
2780
2793
  },
2781
2794
  HOST_TYPE_SPECIFIC_METADATA_KEY: {
2782
2795
  "human": {
2796
+ DEFAULT_KEY: "not provided",
2797
+ LEAVE_REQUIREDS_BLANK_KEY: False,
2798
+ OVERWRITE_NON_NANS_KEY: False,
2783
2799
  METADATA_FIELDS_KEY: {},
2784
2800
  SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
2785
2801
  "stool": {
@@ -3189,6 +3205,63 @@ class TestMetadataExtender(TestCase):
3189
3205
  })
3190
3206
  assert_frame_equal(expected_df, result_df)
3191
3207
 
3208
+ def test_extend_metadata_df_with_alternate_column_names(self):
3209
+ """Test metadata extension with alternate hosttype and sampletype column names."""
3210
+ # Use alternate column names instead of hosttype_shorthand and sampletype_shorthand
3211
+ input_df = pandas.DataFrame({
3212
+ SAMPLE_NAME_KEY: ["sample1", "sample2"],
3213
+ "host_type": ["human", "human"],
3214
+ "sample": ["stool", "stool"]
3215
+ })
3216
+ study_config = {
3217
+ DEFAULT_KEY: "not provided",
3218
+ LEAVE_REQUIREDS_BLANK_KEY: True,
3219
+ OVERWRITE_NON_NANS_KEY: False,
3220
+ STUDY_SPECIFIC_METADATA_KEY: {
3221
+ HOST_TYPE_SPECIFIC_METADATA_KEY: {
3222
+ "human": {
3223
+ METADATA_FIELDS_KEY: {},
3224
+ SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
3225
+ "stool": {
3226
+ METADATA_FIELDS_KEY: {}
3227
+ }
3228
+ }
3229
+ }
3230
+ }
3231
+ }
3232
+ }
3233
+ # Software config specifies alternate column names
3234
+ software_config = {
3235
+ DEFAULT_KEY: "not provided",
3236
+ LEAVE_REQUIREDS_BLANK_KEY: True,
3237
+ OVERWRITE_NON_NANS_KEY: False,
3238
+ HOSTTYPE_COL_OPTIONS_KEY: ["host_type"],
3239
+ SAMPLETYPE_COL_OPTIONS_KEY: ["sample"]
3240
+ }
3241
+
3242
+ result_df, validation_msgs_df = extend_metadata_df(
3243
+ input_df, study_config, None, software_config, self.TEST_STDS_FP)
3244
+
3245
+ expected_df = pandas.DataFrame({
3246
+ SAMPLE_NAME_KEY: ["sample1", "sample2"],
3247
+ "body_product": ["UBERON:feces", "UBERON:feces"],
3248
+ "body_site": ["gut", "gut"],
3249
+ "description": ["human sample", "human sample"],
3250
+ "host_common_name": ["human", "human"],
3251
+ # Alternate column names from input are preserved
3252
+ "host_type": ["human", "human"],
3253
+ QIITA_SAMPLE_TYPE: ["stool", "stool"],
3254
+ # Alternate column names from input are preserved
3255
+ "sample": ["stool", "stool"],
3256
+ SAMPLE_TYPE_KEY: ["stool", "stool"],
3257
+ # Standard internal columns added at end (in order of INTERNAL_COL_KEYS)
3258
+ HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
3259
+ SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
3260
+ QC_NOTE_KEY: ["", ""]
3261
+ })
3262
+ assert_frame_equal(expected_df, result_df)
3263
+ self.assertTrue(validation_msgs_df.empty)
3264
+
3192
3265
  # Tests for _get_study_specific_config
3193
3266
 
3194
3267
  def test__get_study_specific_config_with_valid_file(self):
@@ -4084,6 +4157,7 @@ class TestMetadataExtender(TestCase):
4084
4157
  TEST_DIR, "data/test_project1_output_metadata.txt")
4085
4158
  TEST_PROJECT1_EXPECTED_FAILS_FP = path.join(
4086
4159
  TEST_DIR, "data/test_project1_output_fails.csv")
4160
+
4087
4161
  def test_write_extended_metadata_from_df_project1_integration(self):
4088
4162
  """Integration test using project1 test data files."""
4089
4163
 
@@ -4095,7 +4169,6 @@ class TestMetadataExtender(TestCase):
4095
4169
  with open(path.join(debug_dir, f"UNMATCHED_2_{file_name}"), 'w') as debug_actual_file:
4096
4170
  debug_actual_file.write(actual_content)
4097
4171
 
4098
-
4099
4172
  # Load input metadata CSV
4100
4173
  input_df = pandas.read_csv(self.TEST_PROJECT1_METADATA_FP, dtype=str)
4101
4174
  # for the columns "plating_notes" and "notes", fill NaN with empty string
@@ -4145,3 +4218,83 @@ class TestMetadataExtender(TestCase):
4145
4218
  os.path.join(tmpdir, "*_test_output_validation_errors.csv"))
4146
4219
  self.assertEqual(1, len(validation_files))
4147
4220
  self.assertEqual(0, os.path.getsize(validation_files[0]))
4221
+
4222
+ # Tests for _get_specified_column_name
4223
+
4224
+ def test__get_specified_column_name_finds_column(self):
4225
+ """Test that _get_specified_column_name finds a column that exists."""
4226
+ input_df = pandas.DataFrame({
4227
+ "sample_name": ["s1"],
4228
+ "host_type": ["human"]
4229
+ })
4230
+ config_dict = {
4231
+ HOSTTYPE_COL_OPTIONS_KEY: ["host_type", "host_common_name"]
4232
+ }
4233
+ result = _get_specified_column_name(
4234
+ HOSTTYPE_COL_OPTIONS_KEY, input_df, config_dict)
4235
+ self.assertEqual("host_type", result)
4236
+
4237
+ def test__get_specified_column_name_returns_first_match(self):
4238
+ """Test that _get_specified_column_name returns the first match when multiple options exist."""
4239
+ input_df = pandas.DataFrame({
4240
+ "sample_name": ["s1"],
4241
+ "host_type": ["human"],
4242
+ "host_common_name": ["human"]
4243
+ })
4244
+ config_dict = {
4245
+ HOSTTYPE_COL_OPTIONS_KEY: ["host_type", "host_common_name"]
4246
+ }
4247
+ result = _get_specified_column_name(
4248
+ HOSTTYPE_COL_OPTIONS_KEY, input_df, config_dict)
4249
+ self.assertEqual("host_type", result)
4250
+
4251
+ def test__get_specified_column_name_returns_none_when_no_match(self):
4252
+ """Test that _get_specified_column_name returns None when no options match."""
4253
+ input_df = pandas.DataFrame({
4254
+ "sample_name": ["s1"],
4255
+ "other_column": ["value"]
4256
+ })
4257
+ config_dict = {
4258
+ HOSTTYPE_COL_OPTIONS_KEY: ["host_type", "host_common_name"]
4259
+ }
4260
+ result = _get_specified_column_name(
4261
+ HOSTTYPE_COL_OPTIONS_KEY, input_df, config_dict)
4262
+ self.assertIsNone(result)
4263
+
4264
+ def test__get_specified_column_name_returns_none_when_key_missing(self):
4265
+ """Test that _get_specified_column_name returns None when col_options_key is not in config."""
4266
+ input_df = pandas.DataFrame({
4267
+ "sample_name": ["s1"],
4268
+ "host_type": ["human"]
4269
+ })
4270
+ config_dict = {}
4271
+ result = _get_specified_column_name(
4272
+ HOSTTYPE_COL_OPTIONS_KEY, input_df, config_dict)
4273
+ self.assertIsNone(result)
4274
+
4275
+ def test__get_specified_column_name_returns_none_when_options_empty(self):
4276
+ """Test that _get_specified_column_name returns None when col_options is empty list."""
4277
+ input_df = pandas.DataFrame({
4278
+ "sample_name": ["s1"],
4279
+ "host_type": ["human"]
4280
+ })
4281
+ config_dict = {
4282
+ HOSTTYPE_COL_OPTIONS_KEY: []
4283
+ }
4284
+ result = _get_specified_column_name(
4285
+ HOSTTYPE_COL_OPTIONS_KEY, input_df, config_dict)
4286
+ self.assertIsNone(result)
4287
+
4288
+ def test__get_specified_column_name_with_sampletype_key(self):
4289
+ """Test that _get_specified_column_name works with sampletype column options."""
4290
+ input_df = pandas.DataFrame({
4291
+ "sample_name": ["s1"],
4292
+ "sample_type": ["stool"]
4293
+ })
4294
+ config_dict = {
4295
+ SAMPLETYPE_COL_OPTIONS_KEY: ["sample_type", "sampletype"]
4296
+ }
4297
+ result = _get_specified_column_name(
4298
+ SAMPLETYPE_COL_OPTIONS_KEY, input_df, config_dict)
4299
+ self.assertEqual("sample_type", result)
4300
+ # endregion _get_specified_column_name tests