metameq 2026.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,2610 @@
1
+ import glob
2
+ import numpy as np
3
+ import os
4
+ import os.path as path
5
+ import pandas
6
+ import tempfile
7
+ from pandas.testing import assert_frame_equal
8
+ from unittest import TestCase
9
+ from metameq.src.util import \
10
+ SAMPLE_NAME_KEY, HOSTTYPE_SHORTHAND_KEY, SAMPLETYPE_SHORTHAND_KEY, \
11
+ QC_NOTE_KEY, DEFAULT_KEY, REQUIRED_RAW_METADATA_FIELDS, REQUIRED_KEY, \
12
+ METADATA_FIELDS_KEY, ALIAS_KEY, BASE_TYPE_KEY, ALLOWED_KEY, TYPE_KEY, \
13
+ SAMPLE_TYPE_KEY, QIITA_SAMPLE_TYPE, SAMPLE_TYPE_SPECIFIC_METADATA_KEY, \
14
+ OVERWRITE_NON_NANS_KEY, LEAVE_REQUIREDS_BLANK_KEY, LEAVE_BLANK_VAL, \
15
+ HOST_TYPE_SPECIFIC_METADATA_KEY, METADATA_TRANSFORMERS_KEY, \
16
+ SOURCES_KEY, FUNCTION_KEY, PRE_TRANSFORMERS_KEY, POST_TRANSFORMERS_KEY, \
17
+ STUDY_SPECIFIC_METADATA_KEY
18
+ from metameq.src.metadata_extender import \
19
+ id_missing_cols, get_qc_failures, _reorder_df, \
20
+ _catch_nan_required_fields, _fill_na_if_default, \
21
+ _update_metadata_from_metadata_fields_dict, _update_metadata_from_dict, \
22
+ _construct_sample_type_metadata_fields_dict, \
23
+ _generate_metadata_for_a_sample_type_in_a_host_type, \
24
+ _generate_metadata_for_a_host_type, _generate_metadata_for_host_types, \
25
+ _transform_metadata, _populate_metadata_df, extend_metadata_df, \
26
+ _get_study_specific_config, _output_metadata_df_to_files, \
27
+ INTERNAL_COL_KEYS, REQ_PLACEHOLDER
28
+
29
+
30
+ class TestMetadataExtender(TestCase):
31
+ """Test suite for metadata_extender module."""
32
+
33
+ # Tests for id_missing_cols
34
+
35
def test_id_missing_cols_all_present(self):
    """A frame with sample name and both shorthand columns yields an empty list."""
    complete_df = pandas.DataFrame({
        SAMPLE_NAME_KEY: ["sample1"],
        HOSTTYPE_SHORTHAND_KEY: ["human"],
        SAMPLETYPE_SHORTHAND_KEY: ["stool"]
    })

    missing_cols = id_missing_cols(complete_df)

    # Nothing is absent, so the function should hand back an empty list.
    self.assertEqual([], missing_cols)
47
+
48
def test_id_missing_cols_some_missing(self):
    """With only the sample name column present, both shorthand columns are reported, sorted."""
    partial_df = pandas.DataFrame({SAMPLE_NAME_KEY: ["sample1"]})

    missing_cols = id_missing_cols(partial_df)

    # Sort the two absent column names so the comparison is order-exact.
    absent_cols = [HOSTTYPE_SHORTHAND_KEY, SAMPLETYPE_SHORTHAND_KEY]
    absent_cols.sort()
    self.assertEqual(absent_cols, missing_cols)
58
+
59
def test_id_missing_cols_all_missing(self):
    """A frame holding only an unrelated column reports every required raw field, sorted."""
    unrelated_df = pandas.DataFrame({"other_col": ["value1"]})

    missing_cols = id_missing_cols(unrelated_df)

    self.assertEqual(sorted(REQUIRED_RAW_METADATA_FIELDS), missing_cols)
69
+
70
+ # Tests for get_qc_failures
71
+
72
def test_get_qc_failures_no_failures(self):
    """When every QC note is an empty string, the returned frame is empty."""
    clean_df = pandas.DataFrame({
        SAMPLE_NAME_KEY: ["sample1", "sample2"],
        QC_NOTE_KEY: ["", ""]
    })

    failures_df = get_qc_failures(clean_df)

    self.assertTrue(failures_df.empty)
82
+
83
def test_get_qc_failures_some_failures(self):
    """Only the row with a non-empty QC note comes back, keeping its original index label."""
    qc_notes = ["", "invalid host_type", ""]
    mixed_df = pandas.DataFrame({
        SAMPLE_NAME_KEY: ["sample1", "sample2", "sample3"],
        QC_NOTE_KEY: qc_notes
    })

    failures_df = get_qc_failures(mixed_df)

    # The failing row sat at position 1 in the input, so index label 1 is expected.
    wanted_df = pandas.DataFrame(
        {
            SAMPLE_NAME_KEY: ["sample2"],
            QC_NOTE_KEY: ["invalid host_type"]
        },
        index=[1])
    assert_frame_equal(wanted_df, failures_df)
97
+
98
def test_get_qc_failures_all_failures(self):
    """When every row has a QC note, the result equals the input frame."""
    all_failed_df = pandas.DataFrame({
        SAMPLE_NAME_KEY: ["sample1", "sample2"],
        QC_NOTE_KEY: ["invalid host_type", "invalid sample_type"]
    })

    failures_df = get_qc_failures(all_failed_df)

    assert_frame_equal(all_failed_df, failures_df)
108
+
109
+ # Tests for _reorder_df
110
+
111
def test__reorder_df_sample_name_first(self):
    """After reordering, the first column is the sample name column."""
    # The sample name is deliberately not first in the input.
    shuffled_cols = {
        "zebra": ["z"],
        SAMPLE_NAME_KEY: ["sample1"],
        "apple": ["a"],
        QC_NOTE_KEY: [""],
        HOSTTYPE_SHORTHAND_KEY: ["human"],
        SAMPLETYPE_SHORTHAND_KEY: ["stool"]
    }
    shuffled_df = pandas.DataFrame(shuffled_cols)

    reordered_df = _reorder_df(shuffled_df, INTERNAL_COL_KEYS)

    leading_col = reordered_df.columns[0]
    self.assertEqual(SAMPLE_NAME_KEY, leading_col)
125
+
126
def test__reorder_df_alphabetical_order(self):
    """Non-internal columns follow the sample name in alphabetical order."""
    shuffled_cols = {
        "zebra": ["z"],
        SAMPLE_NAME_KEY: ["sample1"],
        "apple": ["a"],
        QC_NOTE_KEY: [""],
        HOSTTYPE_SHORTHAND_KEY: ["human"],
        SAMPLETYPE_SHORTHAND_KEY: ["stool"]
    }
    shuffled_df = pandas.DataFrame(shuffled_cols)

    reordered_df = _reorder_df(shuffled_df, INTERNAL_COL_KEYS)

    wanted_cols = [SAMPLE_NAME_KEY, "apple", "zebra"] + INTERNAL_COL_KEYS
    actual_cols = list(reordered_df.columns)
    self.assertEqual(wanted_cols, actual_cols)
141
+
142
def test__reorder_df_internals_at_end(self):
    """Internal columns land at the end, in the order of the list passed in."""
    shuffled_cols = {
        "field1": ["value1"],
        SAMPLE_NAME_KEY: ["sample1"],
        QC_NOTE_KEY: [""],
        HOSTTYPE_SHORTHAND_KEY: ["human"],
        SAMPLETYPE_SHORTHAND_KEY: ["stool"]
    }
    shuffled_df = pandas.DataFrame(shuffled_cols)

    reordered_df = _reorder_df(shuffled_df, INTERNAL_COL_KEYS)

    wanted_cols = [SAMPLE_NAME_KEY, "field1"] + INTERNAL_COL_KEYS
    actual_cols = list(reordered_df.columns)
    self.assertEqual(wanted_cols, actual_cols)
156
+
157
def test__reorder_df_full_ordering(self):
    """Whole ordering check: sample name, then alphabetical columns, then internal columns."""
    shuffled_cols = {
        "zebra": ["z"],
        SAMPLE_NAME_KEY: ["sample1"],
        "apple": ["a"],
        QC_NOTE_KEY: [""],
        HOSTTYPE_SHORTHAND_KEY: ["human"],
        SAMPLETYPE_SHORTHAND_KEY: ["stool"],
        "banana": ["b"]
    }
    shuffled_df = pandas.DataFrame(shuffled_cols)

    reordered_df = _reorder_df(shuffled_df, INTERNAL_COL_KEYS)

    wanted_cols = \
        [SAMPLE_NAME_KEY, "apple", "banana", "zebra"] + INTERNAL_COL_KEYS
    actual_cols = list(reordered_df.columns)
    self.assertEqual(wanted_cols, actual_cols)
173
+
174
+ # Tests for _catch_nan_required_fields
175
+
176
def test__catch_nan_required_fields_no_nans(self):
    """A frame with no NaNs in the required columns comes back equal to the input."""
    nan_free_df = pandas.DataFrame({
        SAMPLE_NAME_KEY: ["sample1", "sample2"],
        HOSTTYPE_SHORTHAND_KEY: ["human", "control"],
        SAMPLETYPE_SHORTHAND_KEY: ["stool", "blank"]
    })

    checked_df = _catch_nan_required_fields(nan_free_df)

    assert_frame_equal(nan_free_df, checked_df)
187
+
188
def test__catch_nan_required_fields_nan_sample_name_raises(self):
    """A NaN in the sample name column triggers a ValueError with the expected message."""
    sample_names = ["sample1", np.nan]
    nan_name_df = pandas.DataFrame({
        SAMPLE_NAME_KEY: sample_names,
        HOSTTYPE_SHORTHAND_KEY: ["human", "control"],
        SAMPLETYPE_SHORTHAND_KEY: ["stool", "blank"]
    })

    expected_msg = "Metadata contains NaN sample names"
    with self.assertRaisesRegex(ValueError, expected_msg):
        _catch_nan_required_fields(nan_name_df)
198
+
199
def test__catch_nan_required_fields_nan_shorthand_fields_become_empty(self):
    """NaNs in either shorthand column are replaced by the string 'empty'."""
    sample_names = ["sample1", "sample2"]
    nan_shorthand_df = pandas.DataFrame({
        SAMPLE_NAME_KEY: sample_names,
        HOSTTYPE_SHORTHAND_KEY: ["human", np.nan],
        SAMPLETYPE_SHORTHAND_KEY: [np.nan, "blank"]
    })

    checked_df = _catch_nan_required_fields(nan_shorthand_df)

    wanted_df = pandas.DataFrame({
        SAMPLE_NAME_KEY: ["sample1", "sample2"],
        HOSTTYPE_SHORTHAND_KEY: ["human", "empty"],
        SAMPLETYPE_SHORTHAND_KEY: ["empty", "blank"]
    })
    assert_frame_equal(wanted_df, checked_df)
215
+
216
+ # Tests for _fill_na_if_default
217
+
218
def test__fill_na_if_default_specific_overrides_settings(self):
    """When both dicts carry a default, NaNs are filled with the specific dict's value."""
    sparse_df = pandas.DataFrame({
        "field1": ["value1", np.nan, "value3"],
        "field2": [np.nan, "value2", np.nan]
    })
    specific_defaults = {DEFAULT_KEY: "filled"}
    settings_defaults = {DEFAULT_KEY: "unused"}

    filled_df = _fill_na_if_default(
        sparse_df, specific_defaults, settings_defaults)

    # Every NaN should now read "filled"; "unused" must not appear anywhere.
    wanted_df = pandas.DataFrame({
        "field1": ["value1", "filled", "value3"],
        "field2": ["filled", "value2", "filled"]
    })
    assert_frame_equal(wanted_df, filled_df)
234
+
235
def test__fill_na_if_default_uses_settings_when_specific_missing(self):
    """With an empty specific dict, the settings dict's default fills the NaN."""
    sparse_df = pandas.DataFrame({"field1": [np.nan]})
    settings_defaults = {DEFAULT_KEY: "settings_default"}

    filled_df = _fill_na_if_default(sparse_df, {}, settings_defaults)

    wanted_df = pandas.DataFrame({"field1": ["settings_default"]})
    assert_frame_equal(wanted_df, filled_df)
249
+
250
+ # Tests for _update_metadata_from_metadata_fields_dict
251
+
252
def test__update_metadata_from_metadata_fields_dict_adds_new_column_with_default(self):
    """A field with a default that is absent from the frame is added, filled with that default."""
    sample_names = ["sample1", "sample2"]
    base_df = pandas.DataFrame({SAMPLE_NAME_KEY: sample_names})
    fields_config = {"new_field": {DEFAULT_KEY: "default_value"}}

    updated_df = _update_metadata_from_metadata_fields_dict(
        base_df, fields_config, overwrite_non_nans=False)

    wanted_df = pandas.DataFrame({
        SAMPLE_NAME_KEY: ["sample1", "sample2"],
        "new_field": ["default_value", "default_value"]
    })
    assert_frame_equal(wanted_df, updated_df)
271
+
272
def test__update_metadata_from_metadata_fields_dict_fills_nans_with_default(self):
    """A NaN in an existing column is replaced by the field's default."""
    base_df = pandas.DataFrame({
        SAMPLE_NAME_KEY: ["sample1", "sample2"],
        "existing_field": ["value1", np.nan]
    })
    fields_config = {"existing_field": {DEFAULT_KEY: "default_value"}}

    updated_df = _update_metadata_from_metadata_fields_dict(
        base_df, fields_config, overwrite_non_nans=False)

    wanted_df = pandas.DataFrame({
        SAMPLE_NAME_KEY: ["sample1", "sample2"],
        "existing_field": ["value1", "default_value"]
    })
    assert_frame_equal(wanted_df, updated_df)
292
+
293
def test__update_metadata_from_metadata_fields_dict_overwrite_non_nans_false(self):
    """With overwrite_non_nans=False, the non-NaN value stays and only the NaN is filled."""
    base_df = pandas.DataFrame({
        SAMPLE_NAME_KEY: ["sample1", "sample2"],
        "existing_field": ["original", np.nan]
    })
    fields_config = {"existing_field": {DEFAULT_KEY: "default_value"}}

    updated_df = _update_metadata_from_metadata_fields_dict(
        base_df, fields_config, overwrite_non_nans=False)

    # "original" must survive; the NaN beside it takes the default.
    wanted_df = pandas.DataFrame({
        SAMPLE_NAME_KEY: ["sample1", "sample2"],
        "existing_field": ["original", "default_value"]
    })
    assert_frame_equal(wanted_df, updated_df)
313
+
314
def test__update_metadata_from_metadata_fields_dict_overwrite_non_nans_true(self):
    """With overwrite_non_nans=True, existing non-NaN values are replaced by the default."""
    base_df = pandas.DataFrame({
        SAMPLE_NAME_KEY: ["sample1", "sample2"],
        "existing_field": ["original", "also_original"]
    })
    fields_config = {"existing_field": {DEFAULT_KEY: "default_value"}}

    updated_df = _update_metadata_from_metadata_fields_dict(
        base_df, fields_config, overwrite_non_nans=True)

    wanted_df = pandas.DataFrame({
        SAMPLE_NAME_KEY: ["sample1", "sample2"],
        "existing_field": ["default_value", "default_value"]
    })
    assert_frame_equal(wanted_df, updated_df)
334
+
335
def test__update_metadata_from_metadata_fields_dict_adds_required_placeholder(self):
    """A required field with no default and no existing column is added holding REQ_PLACEHOLDER."""
    base_df = pandas.DataFrame({SAMPLE_NAME_KEY: ["sample1", "sample2"]})
    fields_config = {"required_field": {REQUIRED_KEY: True}}

    updated_df = _update_metadata_from_metadata_fields_dict(
        base_df, fields_config, overwrite_non_nans=False)

    wanted_df = pandas.DataFrame({
        SAMPLE_NAME_KEY: ["sample1", "sample2"],
        "required_field": [REQ_PLACEHOLDER, REQ_PLACEHOLDER]
    })
    assert_frame_equal(wanted_df, updated_df)
354
+
355
def test__update_metadata_from_metadata_fields_dict_preserves_existing_required(self):
    """Values already present in a required, default-less field are left as they are."""
    base_df = pandas.DataFrame({
        SAMPLE_NAME_KEY: ["sample1", "sample2"],
        "required_field": ["existing1", "existing2"]
    })
    fields_config = {"required_field": {REQUIRED_KEY: True}}

    updated_df = _update_metadata_from_metadata_fields_dict(
        base_df, fields_config, overwrite_non_nans=False)

    # No placeholder should be written over the existing values.
    wanted_df = pandas.DataFrame({
        SAMPLE_NAME_KEY: ["sample1", "sample2"],
        "required_field": ["existing1", "existing2"]
    })
    assert_frame_equal(wanted_df, updated_df)
375
+
376
def test__update_metadata_from_metadata_fields_dict_required_false_no_placeholder(self):
    """A field marked required=False with no default does not produce a new column."""
    base_df = pandas.DataFrame({SAMPLE_NAME_KEY: ["sample1", "sample2"]})
    fields_config = {"optional_field": {REQUIRED_KEY: False}}

    updated_df = _update_metadata_from_metadata_fields_dict(
        base_df, fields_config, overwrite_non_nans=False)

    wanted_df = pandas.DataFrame({SAMPLE_NAME_KEY: ["sample1", "sample2"]})
    assert_frame_equal(wanted_df, updated_df)
394
+
395
def test__update_metadata_from_metadata_fields_dict_default_takes_precedence(self):
    """A field that is both required and has a default is filled with the default, not the placeholder."""
    base_df = pandas.DataFrame({SAMPLE_NAME_KEY: ["sample1", "sample2"]})
    fields_config = {
        "field_with_both": {
            DEFAULT_KEY: "the_default",
            REQUIRED_KEY: True
        }
    }

    updated_df = _update_metadata_from_metadata_fields_dict(
        base_df, fields_config, overwrite_non_nans=False)

    wanted_df = pandas.DataFrame({
        SAMPLE_NAME_KEY: ["sample1", "sample2"],
        "field_with_both": ["the_default", "the_default"]
    })
    assert_frame_equal(wanted_df, updated_df)
415
+
416
def test__update_metadata_from_metadata_fields_dict_multiple_fields(self):
    """Several fields in one call: a NaN fill, a new defaulted column, and a new required column."""
    base_df = pandas.DataFrame({
        SAMPLE_NAME_KEY: ["sample1", "sample2"],
        "existing": ["val1", np.nan]
    })
    fields_config = {
        "existing": {DEFAULT_KEY: "filled"},
        "new_default": {DEFAULT_KEY: "new_val"},
        "new_required": {REQUIRED_KEY: True}
    }

    updated_df = _update_metadata_from_metadata_fields_dict(
        base_df, fields_config, overwrite_non_nans=False)

    wanted_df = pandas.DataFrame({
        SAMPLE_NAME_KEY: ["sample1", "sample2"],
        "existing": ["val1", "filled"],
        "new_default": ["new_val", "new_val"],
        "new_required": [REQ_PLACEHOLDER, REQ_PLACEHOLDER]
    })
    assert_frame_equal(wanted_df, updated_df)
444
+
445
+ # Tests for _update_metadata_from_dict
446
+
447
def test__update_metadata_from_dict_extracts_metadata_fields(self):
    """With dict_is_metadata_fields=False, fields are read from under METADATA_FIELDS_KEY."""
    base_df = pandas.DataFrame({SAMPLE_NAME_KEY: ["sample1", "sample2"]})
    nested_fields = {"new_field": {DEFAULT_KEY: "default_value"}}
    # The sibling "other_key" entry should have no effect on the output.
    section_config = {
        METADATA_FIELDS_KEY: nested_fields,
        "other_key": "ignored"
    }

    updated_df = _update_metadata_from_dict(
        base_df, section_config,
        dict_is_metadata_fields=False, overwrite_non_nans=False)

    wanted_df = pandas.DataFrame({
        SAMPLE_NAME_KEY: ["sample1", "sample2"],
        "new_field": ["default_value", "default_value"]
    })
    assert_frame_equal(wanted_df, updated_df)
470
+
471
def test__update_metadata_from_dict_uses_dict_directly(self):
    """With dict_is_metadata_fields=True, the passed dict itself is treated as the fields dict."""
    base_df = pandas.DataFrame({SAMPLE_NAME_KEY: ["sample1", "sample2"]})
    fields_config = {"new_field": {DEFAULT_KEY: "default_value"}}

    updated_df = _update_metadata_from_dict(
        base_df, fields_config,
        dict_is_metadata_fields=True, overwrite_non_nans=False)

    wanted_df = pandas.DataFrame({
        SAMPLE_NAME_KEY: ["sample1", "sample2"],
        "new_field": ["default_value", "default_value"]
    })
    assert_frame_equal(wanted_df, updated_df)
491
+
492
def test__update_metadata_from_dict_passes_overwrite_non_nans(self):
    """overwrite_non_nans=True takes effect: existing non-NaN values are replaced."""
    base_df = pandas.DataFrame({
        SAMPLE_NAME_KEY: ["sample1", "sample2"],
        "existing_field": ["original", "also_original"]
    })
    fields_config = {"existing_field": {DEFAULT_KEY: "new_value"}}

    updated_df = _update_metadata_from_dict(
        base_df, fields_config,
        dict_is_metadata_fields=True, overwrite_non_nans=True)

    wanted_df = pandas.DataFrame({
        SAMPLE_NAME_KEY: ["sample1", "sample2"],
        "existing_field": ["new_value", "new_value"]
    })
    assert_frame_equal(wanted_df, updated_df)
513
+
514
+ # Tests for _construct_sample_type_metadata_fields_dict
515
+
516
def test__construct_sample_type_metadata_fields_dict_simple(self):
    """Host fields and sample-type fields are merged, plus sample_type and qiita_sample_type entries."""
    sample_types_config = {
        "stool": {
            METADATA_FIELDS_KEY: {
                "sample_field": {DEFAULT_KEY: "sample_default"}
            }
        }
    }
    host_fields = {"host_field": {DEFAULT_KEY: "host_default"}}

    combined_fields = _construct_sample_type_metadata_fields_dict(
        "stool", sample_types_config, host_fields)

    # Start from the two configured fields, then add the two type entries,
    # each with its own freshly built definition dict.
    wanted_fields = {
        "host_field": {DEFAULT_KEY: "host_default"},
        "sample_field": {DEFAULT_KEY: "sample_default"}
    }
    for type_key in (SAMPLE_TYPE_KEY, QIITA_SAMPLE_TYPE):
        wanted_fields[type_key] = {
            ALLOWED_KEY: ["stool"],
            DEFAULT_KEY: "stool",
            TYPE_KEY: "string"
        }
    self.assertDictEqual(wanted_fields, combined_fields)
555
+
556
def test__construct_sample_type_metadata_fields_dict_with_alias(self):
    """Requesting an aliased sample type yields the alias target's fields and type values."""
    sample_types_config = {
        "feces": {ALIAS_KEY: "stool"},
        "stool": {
            METADATA_FIELDS_KEY: {
                "stool_field": {DEFAULT_KEY: "stool_value"}
            }
        }
    }

    combined_fields = _construct_sample_type_metadata_fields_dict(
        "feces", sample_types_config, {})

    # The type entries name "stool" (the alias target), not "feces".
    wanted_fields = {"stool_field": {DEFAULT_KEY: "stool_value"}}
    for type_key in (SAMPLE_TYPE_KEY, QIITA_SAMPLE_TYPE):
        wanted_fields[type_key] = {
            ALLOWED_KEY: ["stool"],
            DEFAULT_KEY: "stool",
            TYPE_KEY: "string"
        }
    self.assertDictEqual(wanted_fields, combined_fields)
591
+
592
def test__construct_sample_type_metadata_fields_dict_chained_alias_raises(self):
    """An alias that points at another alias is rejected with a ValueError."""
    # feces -> stool -> poop: the second hop makes this a chained alias.
    sample_types_config = {
        "feces": {ALIAS_KEY: "stool"},
        "stool": {ALIAS_KEY: "poop"},
        "poop": {METADATA_FIELDS_KEY: {}}
    }
    host_fields = {}

    expected_msg = "May not chain aliases"
    with self.assertRaisesRegex(ValueError, expected_msg):
        _construct_sample_type_metadata_fields_dict(
            "feces", sample_types_config, host_fields)
610
+
611
def test__construct_sample_type_metadata_fields_dict_with_base_type(self):
    """A sample type with a base type ends up with both the base's and its own fields."""
    sample_types_config = {
        "base_sample": {
            METADATA_FIELDS_KEY: {
                "base_field": {DEFAULT_KEY: "base_value"}
            }
        },
        "derived_sample": {
            BASE_TYPE_KEY: "base_sample",
            METADATA_FIELDS_KEY: {
                "derived_field": {DEFAULT_KEY: "derived_value"}
            }
        }
    }

    combined_fields = _construct_sample_type_metadata_fields_dict(
        "derived_sample", sample_types_config, {})

    wanted_fields = {
        "base_field": {DEFAULT_KEY: "base_value"},
        "derived_field": {DEFAULT_KEY: "derived_value"}
    }
    # The type entries carry the derived type's name, not the base's.
    for type_key in (SAMPLE_TYPE_KEY, QIITA_SAMPLE_TYPE):
        wanted_fields[type_key] = {
            ALLOWED_KEY: ["derived_sample"],
            DEFAULT_KEY: "derived_sample",
            TYPE_KEY: "string"
        }
    self.assertDictEqual(wanted_fields, combined_fields)
654
+
655
def test__construct_sample_type_metadata_fields_dict_base_type_invalid_raises(self):
    """A base type whose config has a key besides metadata fields causes a ValueError."""
    invalid_base_config = {
        METADATA_FIELDS_KEY: {
            "base_field": {DEFAULT_KEY: "value"}
        },
        "extra_key": "not_allowed"
    }
    sample_types_config = {
        "base_sample": invalid_base_config,
        "derived_sample": {
            BASE_TYPE_KEY: "base_sample",
            METADATA_FIELDS_KEY: {}
        }
    }
    host_fields = {}

    expected_msg = "must only have metadata fields"
    with self.assertRaisesRegex(ValueError, expected_msg):
        _construct_sample_type_metadata_fields_dict(
            "derived_sample", sample_types_config, host_fields)
674
+
675
def test__construct_sample_type_metadata_fields_dict_sets_sample_type(self):
    """With no configured fields, the result holds only the sample_type and qiita_sample_type entries."""
    sample_types_config = {"blood": {METADATA_FIELDS_KEY: {}}}

    combined_fields = _construct_sample_type_metadata_fields_dict(
        "blood", sample_types_config, {})

    # Both entries allow and default to the requested sample type name.
    wanted_fields = {
        type_key: {
            ALLOWED_KEY: ["blood"],
            DEFAULT_KEY: "blood",
            TYPE_KEY: "string"
        }
        for type_key in (SAMPLE_TYPE_KEY, QIITA_SAMPLE_TYPE)
    }
    self.assertDictEqual(wanted_fields, combined_fields)
700
+
701
def test__construct_sample_type_metadata_fields_dict_preserves_existing_qiita_sample_type(self):
    """A qiita_sample_type entry already in the config is kept rather than replaced."""
    sample_types_config = {
        "stool": {
            METADATA_FIELDS_KEY: {
                QIITA_SAMPLE_TYPE: {
                    ALLOWED_KEY: ["custom_type"],
                    DEFAULT_KEY: "custom_type",
                    TYPE_KEY: "string"
                }
            }
        }
    }

    combined_fields = _construct_sample_type_metadata_fields_dict(
        "stool", sample_types_config, {})

    # sample_type is generated from the type name; qiita_sample_type keeps
    # the custom definition supplied in the config.
    wanted_sample_type = {
        ALLOWED_KEY: ["stool"],
        DEFAULT_KEY: "stool",
        TYPE_KEY: "string"
    }
    wanted_qiita_type = {
        ALLOWED_KEY: ["custom_type"],
        DEFAULT_KEY: "custom_type",
        TYPE_KEY: "string"
    }
    wanted_fields = {
        SAMPLE_TYPE_KEY: wanted_sample_type,
        QIITA_SAMPLE_TYPE: wanted_qiita_type
    }
    self.assertDictEqual(wanted_fields, combined_fields)
732
+
733
+ # Tests for _generate_metadata_for_a_sample_type_in_a_host_type
734
+
735
def test__generate_metadata_for_a_sample_type_in_a_host_type_basic(self):
    """For a configured sample type, host and sample-type defaults are added and no messages are returned."""
    raw_df = pandas.DataFrame({
        SAMPLE_NAME_KEY: ["sample1", "sample2"],
        HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
        SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
        QC_NOTE_KEY: ["", ""]
    })
    settings = {
        OVERWRITE_NON_NANS_KEY: False,
        LEAVE_REQUIREDS_BLANK_KEY: False,
        DEFAULT_KEY: "not provided"
    }
    stool_config = {
        METADATA_FIELDS_KEY: {
            "stool_field": {
                DEFAULT_KEY: "stool_default",
                TYPE_KEY: "string"
            }
        }
    }
    host_config = {
        METADATA_FIELDS_KEY: {
            "host_field": {
                DEFAULT_KEY: "host_default",
                TYPE_KEY: "string"
            }
        },
        SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {"stool": stool_config}
    }

    generated_df, messages = \
        _generate_metadata_for_a_sample_type_in_a_host_type(
            raw_df, "stool", settings, host_config)

    # Column order matters to assert_frame_equal: the input columns come
    # first, followed by the generated ones.
    wanted_df = pandas.DataFrame({
        SAMPLE_NAME_KEY: ["sample1", "sample2"],
        HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
        SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
        QC_NOTE_KEY: ["", ""],
        "host_field": ["host_default", "host_default"],
        "stool_field": ["stool_default", "stool_default"],
        SAMPLE_TYPE_KEY: ["stool", "stool"],
        QIITA_SAMPLE_TYPE: ["stool", "stool"]
    })
    assert_frame_equal(wanted_df, generated_df)
    self.assertEqual([], messages)
782
+
783
def test__generate_metadata_for_a_sample_type_in_a_host_type_unknown_sample_type(self):
    """A sample type missing from the host config gets an 'invalid sample_type' QC note."""
    raw_df = pandas.DataFrame({
        SAMPLE_NAME_KEY: ["sample1"],
        HOSTTYPE_SHORTHAND_KEY: ["human"],
        SAMPLETYPE_SHORTHAND_KEY: ["unknown_type"],
        QC_NOTE_KEY: [""]
    })
    settings = {
        OVERWRITE_NON_NANS_KEY: False,
        LEAVE_REQUIREDS_BLANK_KEY: False,
        DEFAULT_KEY: "not provided"
    }
    # Only "stool" is configured, so "unknown_type" has no matching entry.
    host_config = {
        METADATA_FIELDS_KEY: {},
        SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
            "stool": {METADATA_FIELDS_KEY: {}}
        }
    }

    generated_df, messages = \
        _generate_metadata_for_a_sample_type_in_a_host_type(
            raw_df, "unknown_type", settings, host_config)

    wanted_df = pandas.DataFrame({
        SAMPLE_NAME_KEY: ["sample1"],
        HOSTTYPE_SHORTHAND_KEY: ["human"],
        SAMPLETYPE_SHORTHAND_KEY: ["unknown_type"],
        QC_NOTE_KEY: ["invalid sample_type"]
    })
    assert_frame_equal(wanted_df, generated_df)
    self.assertEqual([], messages)
816
+
817
def test__generate_metadata_for_a_sample_type_in_a_host_type_filters_by_sample_type(self):
    """Only the rows whose sample type matches the requested one appear in the result."""
    raw_df = pandas.DataFrame({
        SAMPLE_NAME_KEY: ["sample1", "sample2", "sample3"],
        HOSTTYPE_SHORTHAND_KEY: ["human", "human", "human"],
        SAMPLETYPE_SHORTHAND_KEY: ["stool", "blood", "stool"],
        QC_NOTE_KEY: ["", "", ""]
    })
    settings = {
        OVERWRITE_NON_NANS_KEY: False,
        LEAVE_REQUIREDS_BLANK_KEY: False,
        DEFAULT_KEY: "not provided"
    }
    stool_config = {
        METADATA_FIELDS_KEY: {
            "stool_field": {
                DEFAULT_KEY: "stool_value",
                TYPE_KEY: "string"
            }
        }
    }
    host_config = {
        METADATA_FIELDS_KEY: {},
        SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
            "stool": stool_config,
            "blood": {METADATA_FIELDS_KEY: {}}
        }
    }

    generated_df, _ = _generate_metadata_for_a_sample_type_in_a_host_type(
        raw_df, "stool", settings, host_config)

    # The blood row (sample2) must be absent; both stool rows remain.
    self.assertEqual(2, len(generated_df))
    kept_names = generated_df[SAMPLE_NAME_KEY].tolist()
    self.assertEqual(["sample1", "sample3"], kept_names)
    stool_values = generated_df["stool_field"].tolist()
    self.assertEqual(["stool_value", "stool_value"], stool_values)
854
+
855
def test__generate_metadata_for_a_sample_type_in_a_host_type_leave_requireds_blank_true(self):
    """With leave-requireds-blank enabled, a required default-less field holds LEAVE_BLANK_VAL."""
    raw_df = pandas.DataFrame({
        SAMPLE_NAME_KEY: ["sample1"],
        HOSTTYPE_SHORTHAND_KEY: ["human"],
        SAMPLETYPE_SHORTHAND_KEY: ["stool"],
        QC_NOTE_KEY: [""]
    })
    settings = {
        OVERWRITE_NON_NANS_KEY: False,
        LEAVE_REQUIREDS_BLANK_KEY: True,
        DEFAULT_KEY: "not provided"
    }
    stool_config = {
        METADATA_FIELDS_KEY: {
            "required_field": {
                REQUIRED_KEY: True,
                TYPE_KEY: "string"
            }
        }
    }
    host_config = {
        METADATA_FIELDS_KEY: {},
        SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {"stool": stool_config}
    }

    generated_df, _ = _generate_metadata_for_a_sample_type_in_a_host_type(
        raw_df, "stool", settings, host_config)

    first_value = generated_df["required_field"].iloc[0]
    self.assertEqual(LEAVE_BLANK_VAL, first_value)
886
+
887
def test__generate_metadata_for_a_sample_type_in_a_host_type_leave_requireds_blank_false(self):
    """With leave-requireds-blank disabled, a required default-less field holds the settings default."""
    raw_df = pandas.DataFrame({
        SAMPLE_NAME_KEY: ["sample1"],
        HOSTTYPE_SHORTHAND_KEY: ["human"],
        SAMPLETYPE_SHORTHAND_KEY: ["stool"],
        QC_NOTE_KEY: [""]
    })
    settings = {
        OVERWRITE_NON_NANS_KEY: False,
        LEAVE_REQUIREDS_BLANK_KEY: False,
        DEFAULT_KEY: "global_default"
    }
    stool_config = {
        METADATA_FIELDS_KEY: {
            "required_field": {
                REQUIRED_KEY: True,
                TYPE_KEY: "string"
            }
        }
    }
    host_config = {
        METADATA_FIELDS_KEY: {},
        SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {"stool": stool_config}
    }

    generated_df, _ = _generate_metadata_for_a_sample_type_in_a_host_type(
        raw_df, "stool", settings, host_config)

    # The settings-level default is expected in place of a blank marker.
    first_value = generated_df["required_field"].iloc[0]
    self.assertEqual("global_default", first_value)
919
+
920
def test__generate_metadata_for_a_sample_type_in_a_host_type_overwrite_non_nans_true(self):
    """With overwrite-non-NaNs enabled, an existing value is replaced by the field default."""
    raw_df = pandas.DataFrame({
        SAMPLE_NAME_KEY: ["sample1"],
        HOSTTYPE_SHORTHAND_KEY: ["human"],
        SAMPLETYPE_SHORTHAND_KEY: ["stool"],
        QC_NOTE_KEY: [""],
        "existing_field": ["original_value"]
    })
    settings = {
        OVERWRITE_NON_NANS_KEY: True,
        LEAVE_REQUIREDS_BLANK_KEY: False,
        DEFAULT_KEY: "not provided"
    }
    stool_config = {
        METADATA_FIELDS_KEY: {
            "existing_field": {
                DEFAULT_KEY: "new_value",
                TYPE_KEY: "string"
            }
        }
    }
    host_config = {
        METADATA_FIELDS_KEY: {},
        SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {"stool": stool_config}
    }

    generated_df, _ = _generate_metadata_for_a_sample_type_in_a_host_type(
        raw_df, "stool", settings, host_config)

    first_value = generated_df["existing_field"].iloc[0]
    self.assertEqual("new_value", first_value)
952
+
953
def test__generate_metadata_for_a_sample_type_in_a_host_type_overwrite_non_nans_false(self):
    """Check a pre-existing value survives when overwriting is disabled."""
    # The config offers a default for a column the input already populates.
    stool_fields = {
        "existing_field": {DEFAULT_KEY: "new_value", TYPE_KEY: "string"}
    }
    host_config = {
        METADATA_FIELDS_KEY: {},
        SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
            "stool": {METADATA_FIELDS_KEY: stool_fields}
        }
    }
    settings = {
        OVERWRITE_NON_NANS_KEY: False,
        LEAVE_REQUIREDS_BLANK_KEY: False,
        DEFAULT_KEY: "not provided"
    }
    samples_df = pandas.DataFrame({
        SAMPLE_NAME_KEY: ["sample1"],
        HOSTTYPE_SHORTHAND_KEY: ["human"],
        SAMPLETYPE_SHORTHAND_KEY: ["stool"],
        QC_NOTE_KEY: [""],
        "existing_field": ["original_value"]
    })

    observed_df, _ = _generate_metadata_for_a_sample_type_in_a_host_type(
        samples_df, "stool", settings, host_config)

    observed_value = observed_df["existing_field"].iloc[0]
    self.assertEqual("original_value", observed_value)
985
+
986
def test__generate_metadata_for_a_sample_type_in_a_host_type_with_alias(self):
    """Check an aliased sample type receives the fields and name of its target type."""
    sample_types_config = {
        # "feces" defines no fields of its own; it only points at "stool".
        "feces": {ALIAS_KEY: "stool"},
        "stool": {
            METADATA_FIELDS_KEY: {
                "stool_field": {DEFAULT_KEY: "stool_value", TYPE_KEY: "string"}
            }
        }
    }
    host_config = {
        METADATA_FIELDS_KEY: {},
        SAMPLE_TYPE_SPECIFIC_METADATA_KEY: sample_types_config
    }
    settings = {
        OVERWRITE_NON_NANS_KEY: False,
        LEAVE_REQUIREDS_BLANK_KEY: False,
        DEFAULT_KEY: "not provided"
    }
    samples_df = pandas.DataFrame({
        SAMPLE_NAME_KEY: ["sample1"],
        HOSTTYPE_SHORTHAND_KEY: ["human"],
        SAMPLETYPE_SHORTHAND_KEY: ["feces"],
        QC_NOTE_KEY: [""]
    })

    observed_df, _ = _generate_metadata_for_a_sample_type_in_a_host_type(
        samples_df, "feces", settings, host_config)

    # The field comes from the alias target ...
    self.assertEqual("stool_value", observed_df["stool_field"].iloc[0])
    # ... and the reported sample type is the target name, not the alias.
    self.assertEqual("stool", observed_df[SAMPLE_TYPE_KEY].iloc[0])
1022
+
1023
+ # Tests for _generate_metadata_for_a_host_type
1024
+
1025
def test__generate_metadata_for_a_host_type_basic(self):
    """Check host-level and sample-type-level defaults are both added for a known host type."""
    host_level_fields = {
        "host_field": {DEFAULT_KEY: "host_value", TYPE_KEY: "string"}
    }
    stool_level_fields = {
        "stool_field": {DEFAULT_KEY: "stool_value", TYPE_KEY: "string"}
    }
    config = {
        HOST_TYPE_SPECIFIC_METADATA_KEY: {
            "human": {
                DEFAULT_KEY: "human_default",
                METADATA_FIELDS_KEY: host_level_fields,
                SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
                    "stool": {METADATA_FIELDS_KEY: stool_level_fields}
                }
            }
        }
    }
    settings = {
        OVERWRITE_NON_NANS_KEY: False,
        LEAVE_REQUIREDS_BLANK_KEY: False,
        DEFAULT_KEY: "global_default"
    }
    samples_df = pandas.DataFrame({
        SAMPLE_NAME_KEY: ["sample1", "sample2"],
        HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
        SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
        QC_NOTE_KEY: ["", ""]
    })

    observed_df, msgs = _generate_metadata_for_a_host_type(
        samples_df, "human", settings, config)

    expected_columns = {
        SAMPLE_NAME_KEY: ["sample1", "sample2"],
        HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
        SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
        QC_NOTE_KEY: ["", ""],
        "host_field": ["host_value", "host_value"],
        "stool_field": ["stool_value", "stool_value"],
        SAMPLE_TYPE_KEY: ["stool", "stool"],
        QIITA_SAMPLE_TYPE: ["stool", "stool"]
    }
    assert_frame_equal(pandas.DataFrame(expected_columns), observed_df)
    self.assertEqual([], msgs)
1077
+
1078
def test__generate_metadata_for_a_host_type_unknown_host_type(self):
    """Check a host type absent from the config is flagged in the QC note column."""
    # Only "human" is configured, so "unknown_host" has no entry to match.
    config = {
        HOST_TYPE_SPECIFIC_METADATA_KEY: {
            "human": {
                METADATA_FIELDS_KEY: {},
                SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {}
            }
        }
    }
    settings = {
        OVERWRITE_NON_NANS_KEY: False,
        LEAVE_REQUIREDS_BLANK_KEY: False,
        DEFAULT_KEY: "global_default"
    }
    samples_df = pandas.DataFrame({
        SAMPLE_NAME_KEY: ["sample1"],
        HOSTTYPE_SHORTHAND_KEY: ["unknown_host"],
        SAMPLETYPE_SHORTHAND_KEY: ["stool"],
        QC_NOTE_KEY: [""]
    })

    observed_df, msgs = _generate_metadata_for_a_host_type(
        samples_df, "unknown_host", settings, config)

    expected_columns = {
        SAMPLE_NAME_KEY: ["sample1"],
        HOSTTYPE_SHORTHAND_KEY: ["unknown_host"],
        SAMPLETYPE_SHORTHAND_KEY: ["stool"],
        QC_NOTE_KEY: ["invalid host_type"]
    }
    assert_frame_equal(pandas.DataFrame(expected_columns), observed_df)
    self.assertEqual([], msgs)
1111
+
1112
def test__generate_metadata_for_a_host_type_unknown_sample_type(self):
    """Check an unconfigured sample type under a known host type is flagged in the QC note."""
    # "human" only knows about "stool"; the input asks for "unknown_sample".
    human_config = {
        METADATA_FIELDS_KEY: {},
        SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
            "stool": {METADATA_FIELDS_KEY: {}}
        }
    }
    config = {HOST_TYPE_SPECIFIC_METADATA_KEY: {"human": human_config}}
    settings = {
        OVERWRITE_NON_NANS_KEY: False,
        LEAVE_REQUIREDS_BLANK_KEY: False,
        DEFAULT_KEY: "global_default"
    }
    samples_df = pandas.DataFrame({
        SAMPLE_NAME_KEY: ["sample1"],
        HOSTTYPE_SHORTHAND_KEY: ["human"],
        SAMPLETYPE_SHORTHAND_KEY: ["unknown_sample"],
        QC_NOTE_KEY: [""]
    })

    observed_df, msgs = _generate_metadata_for_a_host_type(
        samples_df, "human", settings, config)

    expected_columns = {
        SAMPLE_NAME_KEY: ["sample1"],
        HOSTTYPE_SHORTHAND_KEY: ["human"],
        SAMPLETYPE_SHORTHAND_KEY: ["unknown_sample"],
        QC_NOTE_KEY: ["invalid sample_type"]
    }
    assert_frame_equal(pandas.DataFrame(expected_columns), observed_df)
    self.assertEqual([], msgs)
1149
+
1150
def test__generate_metadata_for_a_host_type_filters_by_host_type(self):
    """Check only the rows of the requested host type appear in the output."""
    human_config = {
        METADATA_FIELDS_KEY: {
            "human_field": {DEFAULT_KEY: "human_value", TYPE_KEY: "string"}
        },
        SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
            "stool": {METADATA_FIELDS_KEY: {}}
        }
    }
    mouse_config = {
        METADATA_FIELDS_KEY: {},
        SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {}
    }
    config = {
        HOST_TYPE_SPECIFIC_METADATA_KEY: {
            "human": human_config,
            "mouse": mouse_config
        }
    }
    settings = {
        OVERWRITE_NON_NANS_KEY: False,
        LEAVE_REQUIREDS_BLANK_KEY: False,
        DEFAULT_KEY: "global_default"
    }
    # The middle row belongs to "mouse" and should be absent from the result.
    samples_df = pandas.DataFrame({
        SAMPLE_NAME_KEY: ["sample1", "sample2", "sample3"],
        HOSTTYPE_SHORTHAND_KEY: ["human", "mouse", "human"],
        SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool", "stool"],
        QC_NOTE_KEY: ["", "", ""]
    })

    observed_df, _ = _generate_metadata_for_a_host_type(
        samples_df, "human", settings, config)

    expected_columns = {
        SAMPLE_NAME_KEY: ["sample1", "sample3"],
        HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
        SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
        QC_NOTE_KEY: ["", ""],
        "human_field": ["human_value", "human_value"],
        SAMPLE_TYPE_KEY: ["stool", "stool"],
        QIITA_SAMPLE_TYPE: ["stool", "stool"]
    }
    assert_frame_equal(pandas.DataFrame(expected_columns), observed_df)
1198
+
1199
def test__generate_metadata_for_a_host_type_uses_host_default(self):
    """Check a default set on the host type wins over the global default."""
    # A required field with no default of its own, so a fallback must be used.
    stool_fields = {
        "required_field": {REQUIRED_KEY: True, TYPE_KEY: "string"}
    }
    human_config = {
        DEFAULT_KEY: "human_specific_default",
        METADATA_FIELDS_KEY: {},
        SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
            "stool": {METADATA_FIELDS_KEY: stool_fields}
        }
    }
    config = {HOST_TYPE_SPECIFIC_METADATA_KEY: {"human": human_config}}
    settings = {
        OVERWRITE_NON_NANS_KEY: False,
        LEAVE_REQUIREDS_BLANK_KEY: False,
        DEFAULT_KEY: "global_default"
    }
    samples_df = pandas.DataFrame({
        SAMPLE_NAME_KEY: ["sample1"],
        HOSTTYPE_SHORTHAND_KEY: ["human"],
        SAMPLETYPE_SHORTHAND_KEY: ["stool"],
        QC_NOTE_KEY: [""]
    })

    observed_df, _ = _generate_metadata_for_a_host_type(
        samples_df, "human", settings, config)

    expected_columns = {
        SAMPLE_NAME_KEY: ["sample1"],
        HOSTTYPE_SHORTHAND_KEY: ["human"],
        SAMPLETYPE_SHORTHAND_KEY: ["stool"],
        QC_NOTE_KEY: [""],
        "required_field": ["human_specific_default"],
        SAMPLE_TYPE_KEY: ["stool"],
        QIITA_SAMPLE_TYPE: ["stool"]
    }
    assert_frame_equal(pandas.DataFrame(expected_columns), observed_df)
1244
+
1245
def test__generate_metadata_for_a_host_type_uses_global_default_when_no_host_default(self):
    """Check the global default is applied when the host type defines none."""
    stool_fields = {
        "required_field": {REQUIRED_KEY: True, TYPE_KEY: "string"}
    }
    # Deliberately omits DEFAULT_KEY at the host-type level.
    human_config = {
        METADATA_FIELDS_KEY: {},
        SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
            "stool": {METADATA_FIELDS_KEY: stool_fields}
        }
    }
    config = {HOST_TYPE_SPECIFIC_METADATA_KEY: {"human": human_config}}
    settings = {
        OVERWRITE_NON_NANS_KEY: False,
        LEAVE_REQUIREDS_BLANK_KEY: False,
        DEFAULT_KEY: "global_default"
    }
    samples_df = pandas.DataFrame({
        SAMPLE_NAME_KEY: ["sample1"],
        HOSTTYPE_SHORTHAND_KEY: ["human"],
        SAMPLETYPE_SHORTHAND_KEY: ["stool"],
        QC_NOTE_KEY: [""]
    })

    observed_df, _ = _generate_metadata_for_a_host_type(
        samples_df, "human", settings, config)

    expected_columns = {
        SAMPLE_NAME_KEY: ["sample1"],
        HOSTTYPE_SHORTHAND_KEY: ["human"],
        SAMPLETYPE_SHORTHAND_KEY: ["stool"],
        QC_NOTE_KEY: [""],
        "required_field": ["global_default"],
        SAMPLE_TYPE_KEY: ["stool"],
        QIITA_SAMPLE_TYPE: ["stool"]
    }
    assert_frame_equal(pandas.DataFrame(expected_columns), observed_df)
1290
+
1291
+ # Tests for _generate_metadata_for_host_types
1292
+
1293
def test__generate_metadata_for_host_types_single_host_type(self):
    """Check metadata generation when every sample shares one host type."""
    human_config = {
        METADATA_FIELDS_KEY: {
            "host_field": {DEFAULT_KEY: "host_value", TYPE_KEY: "string"}
        },
        SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
            "stool": {
                METADATA_FIELDS_KEY: {
                    "stool_field": {DEFAULT_KEY: "stool_value", TYPE_KEY: "string"}
                }
            }
        }
    }
    # Global settings live alongside the per-host-type section in this config.
    config = {
        DEFAULT_KEY: "global_default",
        LEAVE_REQUIREDS_BLANK_KEY: False,
        OVERWRITE_NON_NANS_KEY: False,
        HOST_TYPE_SPECIFIC_METADATA_KEY: {"human": human_config}
    }
    samples_df = pandas.DataFrame({
        SAMPLE_NAME_KEY: ["sample1", "sample2"],
        HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
        SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
        QC_NOTE_KEY: ["", ""]
    })

    observed_df, msgs = _generate_metadata_for_host_types(samples_df, config)

    expected_columns = {
        SAMPLE_NAME_KEY: ["sample1", "sample2"],
        HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
        SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
        QC_NOTE_KEY: ["", ""],
        "host_field": ["host_value", "host_value"],
        "stool_field": ["stool_value", "stool_value"],
        SAMPLE_TYPE_KEY: ["stool", "stool"],
        QIITA_SAMPLE_TYPE: ["stool", "stool"]
    }
    assert_frame_equal(pandas.DataFrame(expected_columns), observed_df)
    self.assertEqual([], msgs)
1342
+
1343
def test__generate_metadata_for_host_types_multiple_host_types(self):
    """Check generation across two host types, including filling of columns the other type lacks."""
    human_config = {
        METADATA_FIELDS_KEY: {
            "human_field": {DEFAULT_KEY: "human_value", TYPE_KEY: "string"}
        },
        SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
            "stool": {METADATA_FIELDS_KEY: {}},
            "blood": {METADATA_FIELDS_KEY: {}}
        }
    }
    mouse_config = {
        METADATA_FIELDS_KEY: {
            "mouse_field": {DEFAULT_KEY: "mouse_value", TYPE_KEY: "string"}
        },
        SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
            "stool": {METADATA_FIELDS_KEY: {}}
        }
    }
    config = {
        DEFAULT_KEY: "global_default",
        LEAVE_REQUIREDS_BLANK_KEY: False,
        OVERWRITE_NON_NANS_KEY: False,
        HOST_TYPE_SPECIFIC_METADATA_KEY: {
            "human": human_config,
            "mouse": mouse_config
        }
    }
    samples_df = pandas.DataFrame({
        SAMPLE_NAME_KEY: ["sample1", "sample2", "sample3"],
        HOSTTYPE_SHORTHAND_KEY: ["human", "mouse", "human"],
        SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool", "blood"],
        QC_NOTE_KEY: ["", "", ""]
    })

    observed_df, msgs = _generate_metadata_for_host_types(samples_df, config)

    # Expected rows are grouped by host type (both humans, then the mouse).
    # A row with no value for the other host type's column is expected to
    # carry the global default there.
    expected_columns = {
        SAMPLE_NAME_KEY: ["sample1", "sample3", "sample2"],
        HOSTTYPE_SHORTHAND_KEY: ["human", "human", "mouse"],
        SAMPLETYPE_SHORTHAND_KEY: ["stool", "blood", "stool"],
        QC_NOTE_KEY: ["", "", ""],
        "human_field": ["human_value", "human_value", "global_default"],
        SAMPLE_TYPE_KEY: ["stool", "blood", "stool"],
        QIITA_SAMPLE_TYPE: ["stool", "blood", "stool"],
        "mouse_field": ["global_default", "global_default", "mouse_value"]
    }
    assert_frame_equal(pandas.DataFrame(expected_columns), observed_df)
    self.assertEqual([], msgs)
1404
+
1405
def test__generate_metadata_for_host_types_unknown_host_type(self):
    """Check a host type missing from the config yields an 'invalid host_type' QC note."""
    # Only "human" is configured; the sample claims "unknown_host".
    config = {
        DEFAULT_KEY: "global_default",
        LEAVE_REQUIREDS_BLANK_KEY: False,
        OVERWRITE_NON_NANS_KEY: False,
        HOST_TYPE_SPECIFIC_METADATA_KEY: {
            "human": {
                METADATA_FIELDS_KEY: {},
                SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {}
            }
        }
    }
    samples_df = pandas.DataFrame({
        SAMPLE_NAME_KEY: ["sample1"],
        HOSTTYPE_SHORTHAND_KEY: ["unknown_host"],
        SAMPLETYPE_SHORTHAND_KEY: ["stool"],
        QC_NOTE_KEY: [""]
    })

    observed_df, msgs = _generate_metadata_for_host_types(samples_df, config)

    expected_columns = {
        SAMPLE_NAME_KEY: ["sample1"],
        HOSTTYPE_SHORTHAND_KEY: ["unknown_host"],
        SAMPLETYPE_SHORTHAND_KEY: ["stool"],
        QC_NOTE_KEY: ["invalid host_type"]
    }
    assert_frame_equal(pandas.DataFrame(expected_columns), observed_df)
    self.assertEqual([], msgs)
1436
+
1437
def test__generate_metadata_for_host_types_unknown_sample_type(self):
    """Check an unconfigured sample type yields an 'invalid sample_type' QC note."""
    # "human" is configured, but only with a "stool" sample type.
    human_config = {
        METADATA_FIELDS_KEY: {},
        SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
            "stool": {METADATA_FIELDS_KEY: {}}
        }
    }
    config = {
        DEFAULT_KEY: "global_default",
        LEAVE_REQUIREDS_BLANK_KEY: False,
        OVERWRITE_NON_NANS_KEY: False,
        HOST_TYPE_SPECIFIC_METADATA_KEY: {"human": human_config}
    }
    samples_df = pandas.DataFrame({
        SAMPLE_NAME_KEY: ["sample1"],
        HOSTTYPE_SHORTHAND_KEY: ["human"],
        SAMPLETYPE_SHORTHAND_KEY: ["unknown_sample"],
        QC_NOTE_KEY: [""]
    })

    observed_df, msgs = _generate_metadata_for_host_types(samples_df, config)

    expected_columns = {
        SAMPLE_NAME_KEY: ["sample1"],
        HOSTTYPE_SHORTHAND_KEY: ["human"],
        SAMPLETYPE_SHORTHAND_KEY: ["unknown_sample"],
        QC_NOTE_KEY: ["invalid sample_type"]
    }
    assert_frame_equal(pandas.DataFrame(expected_columns), observed_df)
    self.assertEqual([], msgs)
1472
+
1473
def test__generate_metadata_for_host_types_replaces_leave_blank_val(self):
    """Check a required field left blank ends up as an empty string in the output."""
    stool_fields = {
        "required_field": {REQUIRED_KEY: True, TYPE_KEY: "string"}
    }
    human_config = {
        METADATA_FIELDS_KEY: {},
        SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
            "stool": {METADATA_FIELDS_KEY: stool_fields}
        }
    }
    config = {
        DEFAULT_KEY: "global_default",
        # Enabled so the required field is left blank rather than defaulted.
        LEAVE_REQUIREDS_BLANK_KEY: True,
        OVERWRITE_NON_NANS_KEY: False,
        HOST_TYPE_SPECIFIC_METADATA_KEY: {"human": human_config}
    }
    samples_df = pandas.DataFrame({
        SAMPLE_NAME_KEY: ["sample1"],
        HOSTTYPE_SHORTHAND_KEY: ["human"],
        SAMPLETYPE_SHORTHAND_KEY: ["stool"],
        QC_NOTE_KEY: [""]
    })

    observed_df, _ = _generate_metadata_for_host_types(samples_df, config)

    expected_columns = {
        SAMPLE_NAME_KEY: ["sample1"],
        HOSTTYPE_SHORTHAND_KEY: ["human"],
        SAMPLETYPE_SHORTHAND_KEY: ["stool"],
        QC_NOTE_KEY: [""],
        # The blank marker must not leak through; an empty string is expected.
        "required_field": [""],
        SAMPLE_TYPE_KEY: ["stool"],
        QIITA_SAMPLE_TYPE: ["stool"]
    }
    assert_frame_equal(pandas.DataFrame(expected_columns), observed_df)
1515
+
1516
+ # Tests for _transform_metadata
1517
+
1518
def test__transform_metadata_no_transformers(self):
    """Test that df is returned unchanged when no transformers are configured."""
    input_df = pandas.DataFrame({
        SAMPLE_NAME_KEY: ["sample1", "sample2"],
        "field1": ["value1", "value2"]
    })
    # Snapshot the expected content BEFORE the call. Binding expected_df to
    # input_df itself would make the assertion compare an object with itself
    # if the function under test modified its argument in place and returned
    # it, so an unwanted mutation could never be detected.
    expected_df = input_df.copy()
    full_flat_config_dict = {}

    result_df = _transform_metadata(
        input_df, full_flat_config_dict, "pre", None)

    assert_frame_equal(expected_df, result_df)
1532
+
1533
def test__transform_metadata_no_stage_transformers(self):
    """Test that df is returned unchanged when stage has no transformers."""
    input_df = pandas.DataFrame({
        SAMPLE_NAME_KEY: ["sample1", "sample2"],
        "field1": ["value1", "value2"]
    })
    # Snapshot the expected content BEFORE the call. Binding expected_df to
    # input_df itself would make the assertion compare an object with itself
    # if the function under test modified its argument in place and returned
    # it, so a wrongly applied "post" transformer could go unnoticed.
    expected_df = input_df.copy()
    # Transformers exist only for the "post" stage; the call below asks for "pre".
    full_flat_config_dict = {
        METADATA_TRANSFORMERS_KEY: {
            "post": {
                "target_field": {
                    SOURCES_KEY: ["field1"],
                    FUNCTION_KEY: "pass_through"
                }
            }
        }
    }

    result_df = _transform_metadata(
        input_df, full_flat_config_dict, "pre", None)

    assert_frame_equal(expected_df, result_df)
1556
+
1557
def test__transform_metadata_builtin_pass_through(self):
    """Check the built-in pass_through transformer copies a source column to a new column."""
    pass_through_spec = {
        SOURCES_KEY: ["source_field"],
        FUNCTION_KEY: "pass_through"
    }
    config = {
        METADATA_TRANSFORMERS_KEY: {
            "pre": {"target_field": pass_through_spec}
        }
    }
    samples_df = pandas.DataFrame({
        SAMPLE_NAME_KEY: ["sample1", "sample2"],
        "source_field": ["value1", "value2"]
    })

    observed_df = _transform_metadata(samples_df, config, "pre", None)

    expected_columns = {
        SAMPLE_NAME_KEY: ["sample1", "sample2"],
        "source_field": ["value1", "value2"],
        "target_field": ["value1", "value2"]
    }
    assert_frame_equal(pandas.DataFrame(expected_columns), observed_df)
1583
+
1584
def test__transform_metadata_builtin_sex_transformer(self):
    """Check the built-in transform_input_sex_to_std_sex transformer normalizes sex values."""
    sex_spec = {
        SOURCES_KEY: ["input_sex"],
        FUNCTION_KEY: "transform_input_sex_to_std_sex"
    }
    config = {
        METADATA_TRANSFORMERS_KEY: {
            "pre": {"sex": sex_spec}
        }
    }
    # Mixed abbreviations and capitalizations of the raw values.
    samples_df = pandas.DataFrame({
        SAMPLE_NAME_KEY: ["sample1", "sample2", "sample3"],
        "input_sex": ["F", "Male", "female"]
    })

    observed_df = _transform_metadata(samples_df, config, "pre", None)

    expected_columns = {
        SAMPLE_NAME_KEY: ["sample1", "sample2", "sample3"],
        "input_sex": ["F", "Male", "female"],
        "sex": ["female", "male", "female"]
    }
    assert_frame_equal(pandas.DataFrame(expected_columns), observed_df)
1610
+
1611
def test__transform_metadata_builtin_age_to_life_stage(self):
    """Check the built-in transform_age_to_life_stage transformer maps ages to life stages."""
    life_stage_spec = {
        SOURCES_KEY: ["age_years"],
        FUNCTION_KEY: "transform_age_to_life_stage"
    }
    config = {
        METADATA_TRANSFORMERS_KEY: {
            "pre": {"life_stage": life_stage_spec}
        }
    }
    samples_df = pandas.DataFrame({
        SAMPLE_NAME_KEY: ["sample1", "sample2", "sample3"],
        "age_years": [10, 17, 45]
    })

    observed_df = _transform_metadata(samples_df, config, "pre", None)

    # Per the expectation below, 10 maps to "child" while 17 and 45 map to "adult".
    expected_columns = {
        SAMPLE_NAME_KEY: ["sample1", "sample2", "sample3"],
        "age_years": [10, 17, 45],
        "life_stage": ["child", "adult", "adult"]
    }
    assert_frame_equal(pandas.DataFrame(expected_columns), observed_df)
1637
+
1638
def test__transform_metadata_custom_transformer(self):
    """Check a caller-supplied transformer function is looked up by name and applied."""

    def custom_upper(row, source_fields):
        # Upper-case the value held in the first listed source column.
        first_source = source_fields[0]
        return row[first_source].upper()

    custom_funcs = {"custom_upper": custom_upper}
    config = {
        METADATA_TRANSFORMERS_KEY: {
            "pre": {
                "target_field": {
                    SOURCES_KEY: ["source_field"],
                    FUNCTION_KEY: "custom_upper"
                }
            }
        }
    }
    samples_df = pandas.DataFrame({
        SAMPLE_NAME_KEY: ["sample1", "sample2"],
        "source_field": ["hello", "world"]
    })

    observed_df = _transform_metadata(samples_df, config, "pre", custom_funcs)

    expected_columns = {
        SAMPLE_NAME_KEY: ["sample1", "sample2"],
        "source_field": ["hello", "world"],
        "target_field": ["HELLO", "WORLD"]
    }
    assert_frame_equal(pandas.DataFrame(expected_columns), observed_df)
1671
+
1672
def test__transform_metadata_unknown_transformer_raises(self):
    """Check a transformer name that cannot be resolved raises ValueError."""
    config = {
        METADATA_TRANSFORMERS_KEY: {
            "pre": {
                "target_field": {
                    SOURCES_KEY: ["source_field"],
                    FUNCTION_KEY: "nonexistent_function"
                }
            }
        }
    }
    samples_df = pandas.DataFrame({
        SAMPLE_NAME_KEY: ["sample1"],
        "source_field": ["value1"]
    })

    # Callable form of assertRaisesRegex: the remaining arguments are
    # forwarded to _transform_metadata.
    self.assertRaisesRegex(
        ValueError, "Unable to find transformer 'nonexistent_function'",
        _transform_metadata, samples_df, config, "pre", None)
1691
+
1692
def test__transform_metadata_overwrite_non_nans_false(self):
    """Check a transformer only fills missing target values when overwriting is disabled."""
    config = {
        OVERWRITE_NON_NANS_KEY: False,
        METADATA_TRANSFORMERS_KEY: {
            "pre": {
                "target_field": {
                    SOURCES_KEY: ["source_field"],
                    FUNCTION_KEY: "pass_through"
                }
            }
        }
    }
    # The first row already has a target value; the second is missing one.
    samples_df = pandas.DataFrame({
        SAMPLE_NAME_KEY: ["sample1", "sample2"],
        "source_field": ["value1", "value2"],
        "target_field": ["existing", np.nan]
    })

    observed_df = _transform_metadata(samples_df, config, "pre", None)

    expected_columns = {
        SAMPLE_NAME_KEY: ["sample1", "sample2"],
        "source_field": ["value1", "value2"],
        "target_field": ["existing", "value2"]
    }
    assert_frame_equal(pandas.DataFrame(expected_columns), observed_df)
1720
+
1721
def test__transform_metadata_overwrite_non_nans_true(self):
    """Check a transformer replaces existing target values when overwriting is enabled."""
    config = {
        OVERWRITE_NON_NANS_KEY: True,
        METADATA_TRANSFORMERS_KEY: {
            "pre": {
                "target_field": {
                    SOURCES_KEY: ["source_field"],
                    FUNCTION_KEY: "pass_through"
                }
            }
        }
    }
    # Every row already carries a target value that should be replaced.
    samples_df = pandas.DataFrame({
        SAMPLE_NAME_KEY: ["sample1", "sample2"],
        "source_field": ["value1", "value2"],
        "target_field": ["existing", "also_existing"]
    })

    observed_df = _transform_metadata(samples_df, config, "pre", None)

    expected_columns = {
        SAMPLE_NAME_KEY: ["sample1", "sample2"],
        "source_field": ["value1", "value2"],
        "target_field": ["value1", "value2"]
    }
    assert_frame_equal(pandas.DataFrame(expected_columns), observed_df)
1749
+
1750
def test__transform_metadata_multiple_transformers(self):
    """Check several transformers configured for one stage are all applied."""
    # Two independent pass-through transformers, one per source column.
    stage_transformers = {
        "target_a": {
            SOURCES_KEY: ["field_a"],
            FUNCTION_KEY: "pass_through"
        },
        "target_b": {
            SOURCES_KEY: ["field_b"],
            FUNCTION_KEY: "pass_through"
        }
    }
    config = {METADATA_TRANSFORMERS_KEY: {"pre": stage_transformers}}
    samples_df = pandas.DataFrame({
        SAMPLE_NAME_KEY: ["sample1", "sample2"],
        "field_a": ["a1", "a2"],
        "field_b": ["b1", "b2"]
    })

    observed_df = _transform_metadata(samples_df, config, "pre", None)

    expected_columns = {
        SAMPLE_NAME_KEY: ["sample1", "sample2"],
        "field_a": ["a1", "a2"],
        "field_b": ["b1", "b2"],
        "target_a": ["a1", "a2"],
        "target_b": ["b1", "b2"]
    }
    assert_frame_equal(pandas.DataFrame(expected_columns), observed_df)
1783
+
1784
+ # Tests for _populate_metadata_df
1785
+
1786
def test__populate_metadata_df_basic(self):
    """Check end-to-end population with a minimal host-type and sample-type config."""
    human_config = {
        METADATA_FIELDS_KEY: {
            "host_field": {DEFAULT_KEY: "host_value", TYPE_KEY: "string"}
        },
        SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
            "stool": {
                METADATA_FIELDS_KEY: {
                    "stool_field": {DEFAULT_KEY: "stool_value", TYPE_KEY: "string"}
                }
            }
        }
    }
    config = {
        DEFAULT_KEY: "not provided",
        LEAVE_REQUIREDS_BLANK_KEY: False,
        OVERWRITE_NON_NANS_KEY: False,
        HOST_TYPE_SPECIFIC_METADATA_KEY: {"human": human_config}
    }
    # Note the input has no QC note column; the expected output does.
    samples_df = pandas.DataFrame({
        SAMPLE_NAME_KEY: ["sample1", "sample2"],
        HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
        SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"]
    })

    observed_df, msgs_df = _populate_metadata_df(samples_df, config, None)

    expected_columns = {
        SAMPLE_NAME_KEY: ["sample1", "sample2"],
        "host_field": ["host_value", "host_value"],
        QIITA_SAMPLE_TYPE: ["stool", "stool"],
        SAMPLE_TYPE_KEY: ["stool", "stool"],
        "stool_field": ["stool_value", "stool_value"],
        HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
        SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
        QC_NOTE_KEY: ["", ""]
    }
    assert_frame_equal(pandas.DataFrame(expected_columns), observed_df)
    self.assertTrue(msgs_df.empty)
1834
+
1835
def test__populate_metadata_df_with_pre_transformer(self):
    """Check end-to-end population applies a configured pre-transformer."""
    pre_stage = {
        "sex": {
            SOURCES_KEY: ["input_sex"],
            FUNCTION_KEY: "transform_input_sex_to_std_sex"
        }
    }
    human_config = {
        METADATA_FIELDS_KEY: {},
        SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
            "stool": {METADATA_FIELDS_KEY: {}}
        }
    }
    config = {
        DEFAULT_KEY: "not provided",
        LEAVE_REQUIREDS_BLANK_KEY: False,
        OVERWRITE_NON_NANS_KEY: False,
        METADATA_TRANSFORMERS_KEY: {PRE_TRANSFORMERS_KEY: pre_stage},
        HOST_TYPE_SPECIFIC_METADATA_KEY: {"human": human_config}
    }
    samples_df = pandas.DataFrame({
        SAMPLE_NAME_KEY: ["sample1", "sample2"],
        HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
        SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
        "input_sex": ["F", "Male"]
    })

    observed_df, _ = _populate_metadata_df(samples_df, config, None)

    expected_columns = {
        SAMPLE_NAME_KEY: ["sample1", "sample2"],
        "input_sex": ["F", "Male"],
        QIITA_SAMPLE_TYPE: ["stool", "stool"],
        SAMPLE_TYPE_KEY: ["stool", "stool"],
        "sex": ["female", "male"],
        HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
        SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
        QC_NOTE_KEY: ["", ""]
    }
    assert_frame_equal(pandas.DataFrame(expected_columns), observed_df)
1881
+
1882
def test__populate_metadata_df_with_post_transformer(self):
    """Check end-to-end population applies a configured post-transformer."""
    # The source column is not in the input; it must exist by the time the
    # post stage runs for the expected copy to succeed.
    post_stage = {
        "copied_sample_type": {
            SOURCES_KEY: [SAMPLE_TYPE_KEY],
            FUNCTION_KEY: "pass_through"
        }
    }
    human_config = {
        METADATA_FIELDS_KEY: {},
        SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
            "stool": {METADATA_FIELDS_KEY: {}}
        }
    }
    config = {
        DEFAULT_KEY: "not provided",
        LEAVE_REQUIREDS_BLANK_KEY: False,
        OVERWRITE_NON_NANS_KEY: False,
        METADATA_TRANSFORMERS_KEY: {POST_TRANSFORMERS_KEY: post_stage},
        HOST_TYPE_SPECIFIC_METADATA_KEY: {"human": human_config}
    }
    samples_df = pandas.DataFrame({
        SAMPLE_NAME_KEY: ["sample1", "sample2"],
        HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
        SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"]
    })

    observed_df, _ = _populate_metadata_df(samples_df, config, None)

    expected_columns = {
        SAMPLE_NAME_KEY: ["sample1", "sample2"],
        "copied_sample_type": ["stool", "stool"],
        QIITA_SAMPLE_TYPE: ["stool", "stool"],
        SAMPLE_TYPE_KEY: ["stool", "stool"],
        HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
        SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
        QC_NOTE_KEY: ["", ""]
    }
    assert_frame_equal(pandas.DataFrame(expected_columns), observed_df)
1926
+
1927
def test__populate_metadata_df_unknown_host_type(self):
    """Check end-to-end population flags an unconfigured host type in the QC note."""
    # Only "human" is configured; the sample claims "unknown_host".
    config = {
        DEFAULT_KEY: "not provided",
        LEAVE_REQUIREDS_BLANK_KEY: False,
        OVERWRITE_NON_NANS_KEY: False,
        HOST_TYPE_SPECIFIC_METADATA_KEY: {
            "human": {
                METADATA_FIELDS_KEY: {},
                SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {}
            }
        }
    }
    samples_df = pandas.DataFrame({
        SAMPLE_NAME_KEY: ["sample1"],
        HOSTTYPE_SHORTHAND_KEY: ["unknown_host"],
        SAMPLETYPE_SHORTHAND_KEY: ["stool"]
    })

    observed_df, _ = _populate_metadata_df(samples_df, config, None)

    expected_columns = {
        SAMPLE_NAME_KEY: ["sample1"],
        HOSTTYPE_SHORTHAND_KEY: ["unknown_host"],
        SAMPLETYPE_SHORTHAND_KEY: ["stool"],
        QC_NOTE_KEY: ["invalid host_type"]
    }
    assert_frame_equal(pandas.DataFrame(expected_columns), observed_df)
1956
+
1957
def test__populate_metadata_df_columns_reordered(self):
    """Check the column order of the populated frame.

    The input deliberately puts "zebra_field" first and "apple_field"
    after the sample name. The expected frame has the sample name
    first, the remaining non-internal columns in alphabetical order,
    and the shorthand/QC columns last.
    """
    config = {
        DEFAULT_KEY: "not provided",
        LEAVE_REQUIREDS_BLANK_KEY: False,
        OVERWRITE_NON_NANS_KEY: False,
        HOST_TYPE_SPECIFIC_METADATA_KEY: {
            "human": {
                METADATA_FIELDS_KEY: {},
                SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
                    "stool": {
                        METADATA_FIELDS_KEY: {}
                    }
                }
            }
        }
    }
    scrambled_df = pandas.DataFrame({
        "zebra_field": ["z1", "z2"],
        SAMPLE_NAME_KEY: ["sample1", "sample2"],
        "apple_field": ["a1", "a2"],
        HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
        SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"]
    })

    populated_df, _ = _populate_metadata_df(scrambled_df, config, None)

    wanted_df = pandas.DataFrame({
        SAMPLE_NAME_KEY: ["sample1", "sample2"],
        "apple_field": ["a1", "a2"],
        QIITA_SAMPLE_TYPE: ["stool", "stool"],
        SAMPLE_TYPE_KEY: ["stool", "stool"],
        "zebra_field": ["z1", "z2"],
        HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
        SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
        QC_NOTE_KEY: ["", ""]
    })
    assert_frame_equal(wanted_df, populated_df)
1996
+
1997
def test__populate_metadata_df_with_custom_transformer(self):
    """Check that a caller-supplied transformer function is applied.

    A pre-transformer named "custom_upper" is registered through the
    transformer functions dict and configured to fill "upper_field"
    from "source_field".
    """
    def _uppercase_first_source(row, source_fields):
        # Upper-case the value found in the first listed source column.
        return row[source_fields[0]].upper()

    custom_funcs = {"custom_upper": _uppercase_first_source}

    pre_transformers = {
        "upper_field": {
            SOURCES_KEY: ["source_field"],
            FUNCTION_KEY: "custom_upper"
        }
    }
    config = {
        DEFAULT_KEY: "not provided",
        LEAVE_REQUIREDS_BLANK_KEY: False,
        OVERWRITE_NON_NANS_KEY: False,
        METADATA_TRANSFORMERS_KEY: {
            PRE_TRANSFORMERS_KEY: pre_transformers
        },
        HOST_TYPE_SPECIFIC_METADATA_KEY: {
            "human": {
                METADATA_FIELDS_KEY: {},
                SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
                    "stool": {
                        METADATA_FIELDS_KEY: {}
                    }
                }
            }
        }
    }
    raw_df = pandas.DataFrame({
        SAMPLE_NAME_KEY: ["sample1", "sample2"],
        HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
        SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
        "source_field": ["hello", "world"]
    })

    populated_df, _ = _populate_metadata_df(raw_df, config, custom_funcs)

    wanted_df = pandas.DataFrame({
        SAMPLE_NAME_KEY: ["sample1", "sample2"],
        QIITA_SAMPLE_TYPE: ["stool", "stool"],
        SAMPLE_TYPE_KEY: ["stool", "stool"],
        "source_field": ["hello", "world"],
        "upper_field": ["HELLO", "WORLD"],
        HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
        SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
        QC_NOTE_KEY: ["", ""]
    })
    assert_frame_equal(wanted_df, populated_df)
2048
+
2049
def test__populate_metadata_df_nan_sample_name_raises(self):
    """Check that a NaN in the sample name column raises ValueError."""
    raw_columns = {
        SAMPLE_NAME_KEY: ["sample1", np.nan],
        HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
        SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"]
    }
    raw_df = pandas.DataFrame(raw_columns)
    config = {HOST_TYPE_SPECIFIC_METADATA_KEY: {}}
    expected_msg = "Metadata contains NaN sample names"

    with self.assertRaisesRegex(ValueError, expected_msg):
        _populate_metadata_df(raw_df, config, None)
2062
+
2063
+ # Tests for extend_metadata_df
2064
+
2065
# Directory holding this test module; fixture paths are built from it.
TEST_DIR = path.dirname(__file__)
# Standards YAML fixture that the extend_metadata_df tests pass as the
# final argument.
TEST_STDS_FP = path.join(TEST_DIR, "data/test_standards.yml")
2067
+
2068
def test_extend_metadata_df_basic(self):
    """Check extension with a study config that adds one custom field.

    The study config gives the "human" host type a "custom_field" with
    a default of "custom_value". The expected frame also contains
    "body_site" and "host_common_name" values, which do not appear in
    the study config, and no validation messages are expected.
    """
    human_fields = {
        "custom_field": {
            DEFAULT_KEY: "custom_value",
            TYPE_KEY: "string"
        }
    }
    study_config = {
        DEFAULT_KEY: "not provided",
        LEAVE_REQUIREDS_BLANK_KEY: True,
        OVERWRITE_NON_NANS_KEY: False,
        STUDY_SPECIFIC_METADATA_KEY: {
            HOST_TYPE_SPECIFIC_METADATA_KEY: {
                "human": {
                    METADATA_FIELDS_KEY: human_fields,
                    SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
                        "stool": {
                            METADATA_FIELDS_KEY: {}
                        }
                    }
                }
            }
        }
    }
    raw_df = pandas.DataFrame({
        SAMPLE_NAME_KEY: ["sample1", "sample2"],
        HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
        SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"]
    })

    extended_df, msgs_df = extend_metadata_df(
        raw_df, study_config, None, None, self.TEST_STDS_FP)

    wanted_df = pandas.DataFrame({
        SAMPLE_NAME_KEY: ["sample1", "sample2"],
        "body_site": ["gut", "gut"],
        "custom_field": ["custom_value", "custom_value"],
        "host_common_name": ["human", "human"],
        QIITA_SAMPLE_TYPE: ["stool", "stool"],
        SAMPLE_TYPE_KEY: ["stool", "stool"],
        HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
        SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
        QC_NOTE_KEY: ["", ""]
    })
    assert_frame_equal(wanted_df, extended_df)
    self.assertTrue(msgs_df.empty)
2114
+
2115
def test_extend_metadata_df_with_pre_transformer(self):
    """Check extension when the study config declares a pre-transformer.

    "sex" is configured to be derived from "input_sex" with the
    "transform_input_sex_to_std_sex" function; the expected frame maps
    "F" to "female" and "Male" to "male". No custom transformer
    functions are supplied.
    """
    sex_transformer = {
        SOURCES_KEY: ["input_sex"],
        FUNCTION_KEY: "transform_input_sex_to_std_sex"
    }
    study_config = {
        DEFAULT_KEY: "not provided",
        LEAVE_REQUIREDS_BLANK_KEY: True,
        OVERWRITE_NON_NANS_KEY: False,
        METADATA_TRANSFORMERS_KEY: {
            PRE_TRANSFORMERS_KEY: {
                "sex": sex_transformer
            }
        },
        STUDY_SPECIFIC_METADATA_KEY: {
            HOST_TYPE_SPECIFIC_METADATA_KEY: {
                "human": {
                    METADATA_FIELDS_KEY: {},
                    SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
                        "stool": {
                            METADATA_FIELDS_KEY: {}
                        }
                    }
                }
            }
        }
    }
    raw_df = pandas.DataFrame({
        SAMPLE_NAME_KEY: ["sample1", "sample2"],
        HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
        SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
        "input_sex": ["F", "Male"]
    })

    extended_df, _ = extend_metadata_df(
        raw_df, study_config, None, None, self.TEST_STDS_FP)

    wanted_df = pandas.DataFrame({
        SAMPLE_NAME_KEY: ["sample1", "sample2"],
        "body_site": ["gut", "gut"],
        "host_common_name": ["human", "human"],
        "input_sex": ["F", "Male"],
        QIITA_SAMPLE_TYPE: ["stool", "stool"],
        SAMPLE_TYPE_KEY: ["stool", "stool"],
        "sex": ["female", "male"],
        HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
        SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
        QC_NOTE_KEY: ["", ""]
    })
    assert_frame_equal(wanted_df, extended_df)
2165
+
2166
def test_extend_metadata_df_with_custom_transformer(self):
    """Check extension with a caller-supplied transformer function.

    The "custom_upper" function is passed in through the transformer
    functions dict and referenced by name from the study config's
    pre-transformers to populate "upper_field" from "source_field".
    """
    def _uppercase_first_source(row, source_fields):
        # Upper-case the value found in the first listed source column.
        return row[source_fields[0]].upper()

    custom_funcs = {"custom_upper": _uppercase_first_source}

    upper_transformer = {
        SOURCES_KEY: ["source_field"],
        FUNCTION_KEY: "custom_upper"
    }
    study_config = {
        DEFAULT_KEY: "not provided",
        LEAVE_REQUIREDS_BLANK_KEY: True,
        OVERWRITE_NON_NANS_KEY: False,
        METADATA_TRANSFORMERS_KEY: {
            PRE_TRANSFORMERS_KEY: {
                "upper_field": upper_transformer
            }
        },
        STUDY_SPECIFIC_METADATA_KEY: {
            HOST_TYPE_SPECIFIC_METADATA_KEY: {
                "human": {
                    METADATA_FIELDS_KEY: {},
                    SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
                        "stool": {
                            METADATA_FIELDS_KEY: {}
                        }
                    }
                }
            }
        }
    }
    raw_df = pandas.DataFrame({
        SAMPLE_NAME_KEY: ["sample1", "sample2"],
        HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
        SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
        "source_field": ["hello", "world"]
    })

    extended_df, _ = extend_metadata_df(
        raw_df, study_config, custom_funcs, None, self.TEST_STDS_FP)

    wanted_df = pandas.DataFrame({
        SAMPLE_NAME_KEY: ["sample1", "sample2"],
        "body_site": ["gut", "gut"],
        "host_common_name": ["human", "human"],
        QIITA_SAMPLE_TYPE: ["stool", "stool"],
        SAMPLE_TYPE_KEY: ["stool", "stool"],
        "source_field": ["hello", "world"],
        "upper_field": ["HELLO", "WORLD"],
        HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
        SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
        QC_NOTE_KEY: ["", ""]
    })
    assert_frame_equal(wanted_df, extended_df)
2221
+
2222
def test_extend_metadata_df_missing_required_columns_raises(self):
    """Check that input lacking required columns raises ValueError."""
    # Only the sample name is present; HOSTTYPE_SHORTHAND_KEY and
    # SAMPLETYPE_SHORTHAND_KEY are intentionally left out.
    incomplete_df = pandas.DataFrame(
        {SAMPLE_NAME_KEY: ["sample1", "sample2"]})
    empty_study_config = {}
    expected_msg = "metadata missing required columns"

    with self.assertRaisesRegex(ValueError, expected_msg):
        extend_metadata_df(
            incomplete_df, empty_study_config, None, None,
            self.TEST_STDS_FP)
2232
+
2233
def test_extend_metadata_df_none_study_config(self):
    """Check extension when no study config is given (None).

    With only the standards file supplied, the expected frame still
    gains "body_site", "host_common_name" and the sample type columns.
    """
    raw_columns = {
        SAMPLE_NAME_KEY: ["sample1"],
        HOSTTYPE_SHORTHAND_KEY: ["human"],
        SAMPLETYPE_SHORTHAND_KEY: ["stool"]
    }
    raw_df = pandas.DataFrame(raw_columns)

    extended_df, _ = extend_metadata_df(
        raw_df, None, None, None, self.TEST_STDS_FP)

    wanted_df = pandas.DataFrame({
        SAMPLE_NAME_KEY: ["sample1"],
        "body_site": ["gut"],
        "host_common_name": ["human"],
        QIITA_SAMPLE_TYPE: ["stool"],
        SAMPLE_TYPE_KEY: ["stool"],
        HOSTTYPE_SHORTHAND_KEY: ["human"],
        SAMPLETYPE_SHORTHAND_KEY: ["stool"],
        QC_NOTE_KEY: [""]
    })
    assert_frame_equal(wanted_df, extended_df)
2255
+
2256
def test_extend_metadata_df_unknown_host_type(self):
    """Check that an unrecognized host type is flagged in the QC note.

    The input row uses "unknown_host" while the study config only
    describes "human"; the expected frame keeps the input columns and
    carries a QC note of "invalid host_type".
    """
    study_config = {
        DEFAULT_KEY: "not provided",
        LEAVE_REQUIREDS_BLANK_KEY: True,
        OVERWRITE_NON_NANS_KEY: False,
        STUDY_SPECIFIC_METADATA_KEY: {
            HOST_TYPE_SPECIFIC_METADATA_KEY: {
                "human": {
                    METADATA_FIELDS_KEY: {},
                    SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
                        "stool": {
                            METADATA_FIELDS_KEY: {}
                        }
                    }
                }
            }
        }
    }
    raw_columns = {
        SAMPLE_NAME_KEY: ["sample1"],
        HOSTTYPE_SHORTHAND_KEY: ["unknown_host"],
        SAMPLETYPE_SHORTHAND_KEY: ["stool"]
    }
    raw_df = pandas.DataFrame(raw_columns)

    extended_df, _ = extend_metadata_df(
        raw_df, study_config, None, None, self.TEST_STDS_FP)

    wanted_df = pandas.DataFrame({
        SAMPLE_NAME_KEY: ["sample1"],
        HOSTTYPE_SHORTHAND_KEY: ["unknown_host"],
        SAMPLETYPE_SHORTHAND_KEY: ["stool"],
        QC_NOTE_KEY: ["invalid host_type"]
    })
    assert_frame_equal(wanted_df, extended_df)
2291
+
2292
def test_extend_metadata_df_multiple_host_types(self):
    """Check extension of input that mixes human and mouse samples.

    The input order is sample1 (human), sample2 (mouse), sample3
    (human); the expected output groups the two human rows ahead of
    the mouse row.
    """
    no_fields = {METADATA_FIELDS_KEY: {}}
    human_config = {
        METADATA_FIELDS_KEY: {},
        SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
            "stool": dict(no_fields),
            "blood": dict(no_fields)
        }
    }
    mouse_config = {
        METADATA_FIELDS_KEY: {},
        SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
            "stool": {
                METADATA_FIELDS_KEY: {}
            }
        }
    }
    study_config = {
        DEFAULT_KEY: "not provided",
        LEAVE_REQUIREDS_BLANK_KEY: True,
        OVERWRITE_NON_NANS_KEY: False,
        STUDY_SPECIFIC_METADATA_KEY: {
            HOST_TYPE_SPECIFIC_METADATA_KEY: {
                "human": human_config,
                "mouse": mouse_config
            }
        }
    }
    raw_df = pandas.DataFrame({
        SAMPLE_NAME_KEY: ["sample1", "sample2", "sample3"],
        HOSTTYPE_SHORTHAND_KEY: ["human", "mouse", "human"],
        SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool", "blood"]
    })

    extended_df, _ = extend_metadata_df(
        raw_df, study_config, None, None, self.TEST_STDS_FP)

    # Rows come back grouped by host type rather than in input order:
    # both human samples first, then the mouse sample.
    wanted_df = pandas.DataFrame({
        SAMPLE_NAME_KEY: ["sample1", "sample3", "sample2"],
        "body_site": ["gut", "blood", "gut"],
        "host_common_name": ["human", "human", "mouse"],
        QIITA_SAMPLE_TYPE: ["stool", "blood", "stool"],
        SAMPLE_TYPE_KEY: ["stool", "blood", "stool"],
        HOSTTYPE_SHORTHAND_KEY: ["human", "human", "mouse"],
        SAMPLETYPE_SHORTHAND_KEY: ["stool", "blood", "stool"],
        QC_NOTE_KEY: ["", "", ""]
    })
    assert_frame_equal(wanted_df, extended_df)
2344
+
2345
def test_extend_metadata_df_with_software_config(self):
    """Check extension when an explicit software config is supplied.

    The software config carries the top-level settings (default value,
    leave-requireds-blank, overwrite-non-NaNs) while the study config
    only contributes a "study_field" for the human host type.

    NOTE(review): none of the expected values equals
    "custom_software_default", so this test does not by itself show
    that the software default is applied -- confirm whether a stronger
    assertion is wanted.
    """
    # Top-level settings come from the software config only.
    software_config = {
        DEFAULT_KEY: "custom_software_default",
        LEAVE_REQUIREDS_BLANK_KEY: True,
        OVERWRITE_NON_NANS_KEY: False
    }
    # The study config does not set DEFAULT_KEY at its top level.
    study_field_spec = {
        DEFAULT_KEY: "study_value",
        TYPE_KEY: "string"
    }
    study_config = {
        STUDY_SPECIFIC_METADATA_KEY: {
            HOST_TYPE_SPECIFIC_METADATA_KEY: {
                "human": {
                    METADATA_FIELDS_KEY: {
                        "study_field": study_field_spec
                    },
                    SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
                        "stool": {
                            METADATA_FIELDS_KEY: {}
                        }
                    }
                }
            }
        }
    }
    raw_df = pandas.DataFrame({
        SAMPLE_NAME_KEY: ["sample1", "sample2"],
        HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
        SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"]
    })

    extended_df, _ = extend_metadata_df(
        raw_df, study_config, None, software_config, self.TEST_STDS_FP)

    wanted_df = pandas.DataFrame({
        SAMPLE_NAME_KEY: ["sample1", "sample2"],
        "body_site": ["gut", "gut"],
        "host_common_name": ["human", "human"],
        QIITA_SAMPLE_TYPE: ["stool", "stool"],
        SAMPLE_TYPE_KEY: ["stool", "stool"],
        "study_field": ["study_value", "study_value"],
        HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
        SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
        QC_NOTE_KEY: ["", ""]
    })
    assert_frame_equal(wanted_df, extended_df)
2394
+
2395
+ # Tests for _get_study_specific_config
2396
+
2397
def test__get_study_specific_config_with_valid_file(self):
    """Check that a valid YAML config file is loaded into a dict."""
    base_fields = {
        "sample_name": {
            TYPE_KEY: "string",
            "unique": True
        },
        "sample_type": {
            "empty": False,
            "is_phi": False
        }
    }
    wanted = {
        HOST_TYPE_SPECIFIC_METADATA_KEY: {
            "base": {
                METADATA_FIELDS_KEY: base_fields
            }
        }
    }

    loaded = _get_study_specific_config(
        path.join(self.TEST_DIR, "data/test_config.yml"))

    self.assertDictEqual(wanted, loaded)
2420
+
2421
def test__get_study_specific_config_with_none(self):
    """Check that a file path of None produces a None config."""
    self.assertIsNone(_get_study_specific_config(None))
2426
+
2427
def test__get_study_specific_config_with_empty_string(self):
    """Check that an empty-string file path produces a None config."""
    self.assertIsNone(_get_study_specific_config(""))
2432
+
2433
def test__get_study_specific_config_nonexistent_file_raises(self):
    """Check that a path to a missing file raises FileNotFoundError."""
    missing_fp = "/nonexistent/path/config.yml"
    self.assertRaises(
        FileNotFoundError, _get_study_specific_config, missing_fp)
2437
+
2438
def test__get_study_specific_config_invalid_yaml_raises(self):
    """Test that a file containing invalid YAML raises an error.

    The fixture's existence is asserted first so that the broad
    ``assertRaises(Exception)`` check cannot be satisfied by a
    FileNotFoundError from a missing or misnamed fixture.
    """
    invalid_fp = path.join(self.TEST_DIR, "data/invalid.yml")

    # Without this precondition the test would pass vacuously if the
    # fixture were absent, never exercising the YAML parsing at all.
    self.assertTrue(
        path.isfile(invalid_fp),
        "missing test fixture: " + invalid_fp)

    with self.assertRaises(Exception):
        _get_study_specific_config(invalid_fp)
2444
+
2445
+ # Tests for _output_metadata_df_to_files
2446
+
2447
def test__output_metadata_df_to_files_basic(self):
    """Check that a frame written with a tab separator round-trips.

    With remove_internals_and_fails=False the single ".txt" output is
    expected to read back equal to the input frame.
    """
    metadata_df = pandas.DataFrame({
        SAMPLE_NAME_KEY: ["sample1", "sample2"],
        "field_a": ["a1", "a2"],
        HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
        SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
        QC_NOTE_KEY: ["", ""]
    })

    with tempfile.TemporaryDirectory() as out_dir:
        _output_metadata_df_to_files(
            metadata_df, out_dir, "test_output", INTERNAL_COL_KEYS,
            sep="\t", remove_internals_and_fails=False)

        # The written file name carries a prefix, hence the wildcard.
        written_fps = glob.glob(
            os.path.join(out_dir, "*_test_output.txt"))
        self.assertEqual(1, len(written_fps))

        # keep_default_na=False keeps the empty QC notes as "" rather
        # than letting them be read back as NaN.
        reloaded_df = pandas.read_csv(
            written_fps[0], sep="\t", keep_default_na=False)
        assert_frame_equal(metadata_df, reloaded_df)
2470
+
2471
def test__output_metadata_df_to_files_remove_internals_and_fails(self):
    """Check the split into a main file and a fails file.

    With remove_internals_and_fails=True, the row carrying a QC note
    (sample2) is expected in the "_fails.csv" file with all columns,
    while the main file holds the remaining rows without the
    shorthand/QC columns.
    """
    metadata_df = pandas.DataFrame({
        SAMPLE_NAME_KEY: ["sample1", "sample2", "sample3"],
        "field_a": ["a1", "a2", "a3"],
        HOSTTYPE_SHORTHAND_KEY: ["human", "human", "human"],
        SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool", "stool"],
        QC_NOTE_KEY: ["", "invalid host_type", ""]
    })
    wanted_main_df = pandas.DataFrame({
        SAMPLE_NAME_KEY: ["sample1", "sample3"],
        "field_a": ["a1", "a3"]
    })
    wanted_fails_df = pandas.DataFrame({
        SAMPLE_NAME_KEY: ["sample2"],
        "field_a": ["a2"],
        HOSTTYPE_SHORTHAND_KEY: ["human"],
        SAMPLETYPE_SHORTHAND_KEY: ["stool"],
        QC_NOTE_KEY: ["invalid host_type"]
    })

    with tempfile.TemporaryDirectory() as out_dir:
        _output_metadata_df_to_files(
            metadata_df, out_dir, "test_output", INTERNAL_COL_KEYS,
            sep="\t", remove_internals_and_fails=True)

        # Main output: passing rows only, internal columns dropped.
        main_fps = glob.glob(os.path.join(out_dir, "*_test_output.txt"))
        self.assertEqual(1, len(main_fps))
        main_df = pandas.read_csv(main_fps[0], sep="\t")
        assert_frame_equal(wanted_main_df, main_df)

        # Fails output: comma-separated, holds the failed row in full.
        fails_fps = glob.glob(
            os.path.join(out_dir, "*_test_output_fails.csv"))
        self.assertEqual(1, len(fails_fps))
        fails_df = pandas.read_csv(fails_fps[0], sep=",")
        assert_frame_equal(wanted_fails_df, fails_df)
2512
+
2513
def test__output_metadata_df_to_files_no_failures_creates_empty_file(self):
    """Check that a zero-byte fails file is written when nothing failed.

    Applies when suppress_empty_fails=False and no row has a QC note.
    """
    metadata_df = pandas.DataFrame({
        SAMPLE_NAME_KEY: ["sample1", "sample2"],
        "field_a": ["a1", "a2"],
        HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
        SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
        QC_NOTE_KEY: ["", ""]
    })

    with tempfile.TemporaryDirectory() as out_dir:
        _output_metadata_df_to_files(
            metadata_df, out_dir, "test_output", INTERNAL_COL_KEYS,
            sep="\t", remove_internals_and_fails=True,
            suppress_empty_fails=False)

        fails_pattern = os.path.join(out_dir, "*_test_output_fails.csv")
        fails_fps = glob.glob(fails_pattern)
        self.assertEqual(1, len(fails_fps))

        # The file must exist but contain nothing at all.
        fails_size = os.path.getsize(fails_fps[0])
        self.assertEqual(0, fails_size)
2535
+
2536
def test__output_metadata_df_to_files_suppress_empty_fails(self):
    """Check that suppress_empty_fails=True skips the empty fails file.

    The main output file is still expected to be written.
    """
    metadata_df = pandas.DataFrame({
        SAMPLE_NAME_KEY: ["sample1", "sample2"],
        "field_a": ["a1", "a2"],
        HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
        SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
        QC_NOTE_KEY: ["", ""]
    })

    with tempfile.TemporaryDirectory() as out_dir:
        _output_metadata_df_to_files(
            metadata_df, out_dir, "test_output", INTERNAL_COL_KEYS,
            sep="\t", remove_internals_and_fails=True,
            suppress_empty_fails=True)

        # No failures and suppression on: no fails file should appear.
        fails_pattern = os.path.join(out_dir, "*_test_output_fails.csv")
        self.assertEqual(0, len(glob.glob(fails_pattern)))

        # The main output is unaffected by the suppression flag.
        main_pattern = os.path.join(out_dir, "*_test_output.txt")
        self.assertEqual(1, len(glob.glob(main_pattern)))
2559
+
2560
def test__output_metadata_df_to_files_csv_separator(self):
    """Check that a comma separator yields a ".csv" file that round-trips."""
    metadata_df = pandas.DataFrame({
        SAMPLE_NAME_KEY: ["sample1", "sample2"],
        "field_a": ["a1", "a2"],
        HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
        SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
        QC_NOTE_KEY: ["", ""]
    })

    with tempfile.TemporaryDirectory() as out_dir:
        _output_metadata_df_to_files(
            metadata_df, out_dir, "test_output", INTERNAL_COL_KEYS,
            sep=",", remove_internals_and_fails=False)

        # A comma separator is expected to switch the extension to .csv.
        written_fps = glob.glob(
            os.path.join(out_dir, "*_test_output.csv"))
        self.assertEqual(1, len(written_fps))

        # keep_default_na=False keeps the empty QC notes as "" rather
        # than letting them be read back as NaN.
        reloaded_df = pandas.read_csv(
            written_fps[0], sep=",", keep_default_na=False)
        assert_frame_equal(metadata_df, reloaded_df)
2583
+
2584
def test__output_metadata_df_to_files_all_failures(self):
    """Check the outputs when every row carries a QC note.

    The main file is expected to contain only the header for the
    non-internal columns, and the fails file both input rows.
    """
    metadata_df = pandas.DataFrame({
        SAMPLE_NAME_KEY: ["sample1", "sample2"],
        "field_a": ["a1", "a2"],
        HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
        SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
        QC_NOTE_KEY: ["invalid host_type", "invalid sample_type"]
    })
    wanted_main_columns = [SAMPLE_NAME_KEY, "field_a"]

    with tempfile.TemporaryDirectory() as out_dir:
        _output_metadata_df_to_files(
            metadata_df, out_dir, "test_output", INTERNAL_COL_KEYS,
            sep="\t", remove_internals_and_fails=True)

        # Main output: header row only, no data rows.
        main_fps = glob.glob(os.path.join(out_dir, "*_test_output.txt"))
        self.assertEqual(1, len(main_fps))
        main_df = pandas.read_csv(main_fps[0], sep="\t")
        self.assertTrue(main_df.empty)
        self.assertEqual(wanted_main_columns, list(main_df.columns))

        # Fails output: every input row ends up here.
        fails_fps = glob.glob(
            os.path.join(out_dir, "*_test_output_fails.csv"))
        self.assertEqual(1, len(fails_fps))
        fails_df = pandas.read_csv(fails_fps[0], sep=",")
        self.assertEqual(2, len(fails_df))