metameq 2026.1.1__py3-none-any.whl → 2026.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -14,17 +14,20 @@ from metameq.src.util import \
14
14
  OVERWRITE_NON_NANS_KEY, LEAVE_REQUIREDS_BLANK_KEY, LEAVE_BLANK_VAL, \
15
15
  HOST_TYPE_SPECIFIC_METADATA_KEY, METADATA_TRANSFORMERS_KEY, \
16
16
  SOURCES_KEY, FUNCTION_KEY, PRE_TRANSFORMERS_KEY, POST_TRANSFORMERS_KEY, \
17
- STUDY_SPECIFIC_METADATA_KEY
17
+ STUDY_SPECIFIC_METADATA_KEY, HOSTTYPE_COL_OPTIONS_KEY, \
18
+ SAMPLETYPE_COL_OPTIONS_KEY
18
19
  from metameq.src.metadata_extender import \
19
- id_missing_cols, get_qc_failures, _reorder_df, \
20
- _catch_nan_required_fields, _fill_na_if_default, \
21
- _update_metadata_from_metadata_fields_dict, _update_metadata_from_dict, \
22
- _construct_sample_type_metadata_fields_dict, \
20
+ id_missing_cols, get_qc_failures, get_reserved_cols, find_standard_cols, \
21
+ find_nonstandard_cols, write_metadata_results, \
22
+ get_extended_metadata_from_df_and_yaml, write_extended_metadata_from_df, \
23
+ write_extended_metadata, _reorder_df, _catch_nan_required_fields, \
24
+ _fill_na_if_default, _update_metadata_from_metadata_fields_dict, \
25
+ _update_metadata_from_dict, _construct_sample_type_metadata_fields_dict, \
23
26
  _generate_metadata_for_a_sample_type_in_a_host_type, \
24
27
  _generate_metadata_for_a_host_type, _generate_metadata_for_host_types, \
25
28
  _transform_metadata, _populate_metadata_df, extend_metadata_df, \
26
29
  _get_study_specific_config, _output_metadata_df_to_files, \
27
- INTERNAL_COL_KEYS, REQ_PLACEHOLDER
30
+ _get_specified_column_name, INTERNAL_COL_KEYS, REQ_PLACEHOLDER
28
31
 
29
32
 
30
33
  class TestMetadataExtender(TestCase):
@@ -67,6 +70,536 @@ class TestMetadataExtender(TestCase):
67
70
  expected = sorted(REQUIRED_RAW_METADATA_FIELDS)
68
71
  self.assertEqual(expected, result)
69
72
 
73
+ # Tests for get_reserved_cols
74
+
75
+ def test_get_reserved_cols_single_host_sample_type(self):
76
+ """Test returns sorted list of reserved column names for a single host/sample type."""
77
+ input_df = pandas.DataFrame({
78
+ SAMPLE_NAME_KEY: ["sample1"],
79
+ HOSTTYPE_SHORTHAND_KEY: ["human"],
80
+ SAMPLETYPE_SHORTHAND_KEY: ["stool"]
81
+ })
82
+ study_config = {
83
+ DEFAULT_KEY: "not provided",
84
+ LEAVE_REQUIREDS_BLANK_KEY: True,
85
+ OVERWRITE_NON_NANS_KEY: False,
86
+ STUDY_SPECIFIC_METADATA_KEY: {
87
+ HOST_TYPE_SPECIFIC_METADATA_KEY: {
88
+ "human": {
89
+ METADATA_FIELDS_KEY: {
90
+ "host_common_name": {
91
+ DEFAULT_KEY: "human",
92
+ TYPE_KEY: "string"
93
+ }
94
+ },
95
+ SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
96
+ "stool": {
97
+ METADATA_FIELDS_KEY: {
98
+ "body_site": {
99
+ DEFAULT_KEY: "gut",
100
+ TYPE_KEY: "string"
101
+ },
102
+ "stool_consistency": {
103
+ DEFAULT_KEY: "normal",
104
+ TYPE_KEY: "string"
105
+ }
106
+ }
107
+ }
108
+ }
109
+ }
110
+ }
111
+ }
112
+ }
113
+
114
+ result = get_reserved_cols(input_df, study_config, self.TEST_STDS_FP)
115
+
116
+ # Expected columns are union of study_config fields and test_standards.yml fields
117
+ # From standards: sample_name, sample_type (base), description (human overrides host_associated),
118
+ # body_site (host_associated stool), body_product (human stool), host_common_name (human)
119
+ expected = [
120
+ "body_product", # from human stool in test_standards.yml
121
+ "body_site",
122
+ "description", # from human in test_standards.yml (overrides host_associated)
123
+ "host_common_name",
124
+ HOSTTYPE_SHORTHAND_KEY,
125
+ QC_NOTE_KEY,
126
+ QIITA_SAMPLE_TYPE,
127
+ SAMPLE_NAME_KEY,
128
+ SAMPLE_TYPE_KEY,
129
+ SAMPLETYPE_SHORTHAND_KEY,
130
+ "stool_consistency"
131
+ ]
132
+ self.assertEqual(expected, result)
133
+
134
+ def test_get_reserved_cols_missing_hosttype_shorthand_raises(self):
135
+ """Test raises ValueError when hosttype_shorthand column is missing."""
136
+ input_df = pandas.DataFrame({
137
+ SAMPLE_NAME_KEY: ["sample1"],
138
+ SAMPLETYPE_SHORTHAND_KEY: ["stool"]
139
+ })
140
+ study_config = {}
141
+
142
+ with self.assertRaisesRegex(ValueError, HOSTTYPE_SHORTHAND_KEY):
143
+ get_reserved_cols(input_df, study_config)
144
+
145
+ def test_get_reserved_cols_missing_sampletype_shorthand_raises(self):
146
+ """Test raises ValueError when sampletype_shorthand column is missing."""
147
+ input_df = pandas.DataFrame({
148
+ SAMPLE_NAME_KEY: ["sample1"],
149
+ HOSTTYPE_SHORTHAND_KEY: ["human"]
150
+ })
151
+ study_config = {}
152
+
153
+ with self.assertRaisesRegex(ValueError, SAMPLETYPE_SHORTHAND_KEY):
154
+ get_reserved_cols(input_df, study_config)
155
+
156
+ def test_get_reserved_cols_multiple_host_sample_types(self):
157
+ """Test returns deduped union of reserved columns for multiple host/sample type combinations."""
158
+ input_df = pandas.DataFrame({
159
+ SAMPLE_NAME_KEY: ["sample1", "sample2", "sample3"],
160
+ HOSTTYPE_SHORTHAND_KEY: ["human", "human", "mouse"],
161
+ SAMPLETYPE_SHORTHAND_KEY: ["stool", "blood", "stool"]
162
+ })
163
+ # Both human and mouse define host_common_name and body_site - should appear only once each
164
+ study_config = {
165
+ DEFAULT_KEY: "not provided",
166
+ LEAVE_REQUIREDS_BLANK_KEY: True,
167
+ OVERWRITE_NON_NANS_KEY: False,
168
+ STUDY_SPECIFIC_METADATA_KEY: {
169
+ HOST_TYPE_SPECIFIC_METADATA_KEY: {
170
+ "human": {
171
+ METADATA_FIELDS_KEY: {
172
+ "host_common_name": {
173
+ DEFAULT_KEY: "human",
174
+ TYPE_KEY: "string"
175
+ },
176
+ "human_field": {
177
+ DEFAULT_KEY: "human_value",
178
+ TYPE_KEY: "string"
179
+ }
180
+ },
181
+ SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
182
+ "stool": {
183
+ METADATA_FIELDS_KEY: {
184
+ "body_site": {
185
+ DEFAULT_KEY: "gut",
186
+ TYPE_KEY: "string"
187
+ },
188
+ "stool_consistency": {
189
+ DEFAULT_KEY: "normal",
190
+ TYPE_KEY: "string"
191
+ }
192
+ }
193
+ },
194
+ "blood": {
195
+ METADATA_FIELDS_KEY: {
196
+ "body_site": {
197
+ DEFAULT_KEY: "blood",
198
+ TYPE_KEY: "string"
199
+ },
200
+ "blood_type": {
201
+ DEFAULT_KEY: "unknown",
202
+ TYPE_KEY: "string"
203
+ }
204
+ }
205
+ }
206
+ }
207
+ },
208
+ "mouse": {
209
+ METADATA_FIELDS_KEY: {
210
+ "host_common_name": {
211
+ DEFAULT_KEY: "mouse",
212
+ TYPE_KEY: "string"
213
+ },
214
+ "mouse_field": {
215
+ DEFAULT_KEY: "mouse_value",
216
+ TYPE_KEY: "string"
217
+ }
218
+ },
219
+ SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
220
+ "stool": {
221
+ METADATA_FIELDS_KEY: {
222
+ "body_site": {
223
+ DEFAULT_KEY: "gut",
224
+ TYPE_KEY: "string"
225
+ },
226
+ "mouse_stool_field": {
227
+ DEFAULT_KEY: "mouse_stool_value",
228
+ TYPE_KEY: "string"
229
+ }
230
+ }
231
+ }
232
+ }
233
+ }
234
+ }
235
+ }
236
+ }
237
+
238
+ result = get_reserved_cols(input_df, study_config, self.TEST_STDS_FP)
239
+
240
+ # Expected columns are union of study_config fields and test_standards.yml fields
241
+ # From standards for human/stool: sample_name, sample_type (base), description (human),
242
+ # body_site (host_associated stool), body_product (human stool), host_common_name (human)
243
+ # From standards for human/blood: body_site (human blood), body_product (human blood),
244
+ # description (human), host_common_name (human)
245
+ # From standards for mouse/stool: sample_name, sample_type (base), description (host_associated),
246
+ # body_site (host_associated stool), host_common_name (mouse)
247
+ # TODO: cage_id from mouse stool in test_standards.yml SHOULD be included here
248
+ # but is currently excluded because it has required: false and no default.
249
+ # The function under test needs to be changed to include fields even when
250
+ # they have required: false and no default.
251
+ expected = [
252
+ "blood_type",
253
+ "body_product", # from human stool and human blood in test_standards.yml
254
+ "body_site",
255
+ "description", # from human (overrides host_associated) and host_associated (mouse inherits)
256
+ "host_common_name",
257
+ HOSTTYPE_SHORTHAND_KEY,
258
+ "human_field",
259
+ "mouse_field",
260
+ "mouse_stool_field",
261
+ QC_NOTE_KEY,
262
+ QIITA_SAMPLE_TYPE,
263
+ SAMPLE_NAME_KEY,
264
+ SAMPLE_TYPE_KEY,
265
+ SAMPLETYPE_SHORTHAND_KEY,
266
+ "stool_consistency"
267
+ ]
268
+ self.assertEqual(expected, result)
269
+
270
+ # Tests for find_standard_cols
271
+
272
+ def test_find_standard_cols_returns_standard_cols_in_df(self):
273
+ """Test returns standard columns that exist in the input DataFrame, excluding internals."""
274
+ input_df = pandas.DataFrame({
275
+ SAMPLE_NAME_KEY: ["sample1"],
276
+ HOSTTYPE_SHORTHAND_KEY: ["human"],
277
+ SAMPLETYPE_SHORTHAND_KEY: ["stool"],
278
+ "body_site": ["gut"],
279
+ "host_common_name": ["human"],
280
+ "my_custom_column": ["custom_value"]
281
+ })
282
+ study_config = {
283
+ DEFAULT_KEY: "not provided",
284
+ LEAVE_REQUIREDS_BLANK_KEY: True,
285
+ OVERWRITE_NON_NANS_KEY: False,
286
+ STUDY_SPECIFIC_METADATA_KEY: {
287
+ HOST_TYPE_SPECIFIC_METADATA_KEY: {
288
+ "human": {
289
+ METADATA_FIELDS_KEY: {
290
+ "host_common_name": {
291
+ DEFAULT_KEY: "human",
292
+ TYPE_KEY: "string"
293
+ }
294
+ },
295
+ SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
296
+ "stool": {
297
+ METADATA_FIELDS_KEY: {
298
+ "body_site": {
299
+ DEFAULT_KEY: "gut",
300
+ TYPE_KEY: "string"
301
+ }
302
+ }
303
+ }
304
+ }
305
+ }
306
+ }
307
+ }
308
+ }
309
+
310
+ result = find_standard_cols(input_df, study_config, self.TEST_STDS_FP)
311
+
312
+ # Returns intersection of reserved cols (minus internals) with df columns.
313
+ # body_site, host_common_name, sample_name are standard and in df
314
+ # hosttype_shorthand, sampletype_shorthand are internal (excluded)
315
+ # my_custom_column is nonstandard (excluded)
316
+ expected = ["body_site", "host_common_name", SAMPLE_NAME_KEY]
317
+ self.assertEqual(sorted(expected), sorted(result))
318
+
319
+ def test_find_standard_cols_missing_hosttype_shorthand_raises(self):
320
+ """Test raises ValueError when hosttype_shorthand column is missing."""
321
+ input_df = pandas.DataFrame({
322
+ SAMPLE_NAME_KEY: ["sample1"],
323
+ SAMPLETYPE_SHORTHAND_KEY: ["stool"]
324
+ })
325
+ study_config = {}
326
+
327
+ with self.assertRaisesRegex(ValueError, HOSTTYPE_SHORTHAND_KEY):
328
+ find_standard_cols(input_df, study_config, self.TEST_STDS_FP)
329
+
330
+ def test_find_standard_cols_missing_sampletype_shorthand_raises(self):
331
+ """Test raises ValueError when sampletype_shorthand column is missing."""
332
+ input_df = pandas.DataFrame({
333
+ SAMPLE_NAME_KEY: ["sample1"],
334
+ HOSTTYPE_SHORTHAND_KEY: ["human"]
335
+ })
336
+ study_config = {}
337
+
338
+ with self.assertRaisesRegex(ValueError, SAMPLETYPE_SHORTHAND_KEY):
339
+ find_standard_cols(input_df, study_config, self.TEST_STDS_FP)
340
+
341
+ def test_find_standard_cols_missing_sample_name_raises(self):
342
+ """Test raises ValueError when sample_name column is missing."""
343
+ input_df = pandas.DataFrame({
344
+ HOSTTYPE_SHORTHAND_KEY: ["human"],
345
+ SAMPLETYPE_SHORTHAND_KEY: ["stool"]
346
+ })
347
+ study_config = {}
348
+
349
+ with self.assertRaisesRegex(ValueError, SAMPLE_NAME_KEY):
350
+ find_standard_cols(input_df, study_config, self.TEST_STDS_FP)
351
+
352
+ def test_find_standard_cols_suppress_missing_name_err(self):
353
+ """Test that suppress_missing_name_err=True allows missing sample_name."""
354
+ input_df = pandas.DataFrame({
355
+ HOSTTYPE_SHORTHAND_KEY: ["human"],
356
+ SAMPLETYPE_SHORTHAND_KEY: ["stool"],
357
+ "body_site": ["gut"]
358
+ })
359
+ study_config = {
360
+ DEFAULT_KEY: "not provided",
361
+ LEAVE_REQUIREDS_BLANK_KEY: True,
362
+ OVERWRITE_NON_NANS_KEY: False,
363
+ STUDY_SPECIFIC_METADATA_KEY: {
364
+ HOST_TYPE_SPECIFIC_METADATA_KEY: {
365
+ "human": {
366
+ METADATA_FIELDS_KEY: {},
367
+ SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
368
+ "stool": {
369
+ METADATA_FIELDS_KEY: {
370
+ "body_site": {
371
+ DEFAULT_KEY: "gut",
372
+ TYPE_KEY: "string"
373
+ }
374
+ }
375
+ }
376
+ }
377
+ }
378
+ }
379
+ }
380
+ }
381
+
382
+ result = find_standard_cols(
383
+ input_df, study_config, self.TEST_STDS_FP,
384
+ suppress_missing_name_err=True)
385
+
386
+ # Only body_site is a standard col in df (sample_name is missing but allowed)
387
+ expected = ["body_site"]
388
+ self.assertEqual(expected, sorted(result))
389
+
390
+ # Tests for find_nonstandard_cols
391
+
392
+ def test_find_nonstandard_cols_returns_nonstandard_cols(self):
393
+ """Test returns columns in df that are not in the reserved columns list."""
394
+ input_df = pandas.DataFrame({
395
+ SAMPLE_NAME_KEY: ["sample1"],
396
+ HOSTTYPE_SHORTHAND_KEY: ["human"],
397
+ SAMPLETYPE_SHORTHAND_KEY: ["stool"],
398
+ "body_site": ["gut"],
399
+ "host_common_name": ["human"],
400
+ "my_custom_column": ["custom_value"],
401
+ "another_nonstandard": ["value"]
402
+ })
403
+ study_config = {
404
+ DEFAULT_KEY: "not provided",
405
+ LEAVE_REQUIREDS_BLANK_KEY: True,
406
+ OVERWRITE_NON_NANS_KEY: False,
407
+ STUDY_SPECIFIC_METADATA_KEY: {
408
+ HOST_TYPE_SPECIFIC_METADATA_KEY: {
409
+ "human": {
410
+ METADATA_FIELDS_KEY: {
411
+ "host_common_name": {
412
+ DEFAULT_KEY: "human",
413
+ TYPE_KEY: "string"
414
+ }
415
+ },
416
+ SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
417
+ "stool": {
418
+ METADATA_FIELDS_KEY: {
419
+ "body_site": {
420
+ DEFAULT_KEY: "gut",
421
+ TYPE_KEY: "string"
422
+ }
423
+ }
424
+ }
425
+ }
426
+ }
427
+ }
428
+ }
429
+ }
430
+
431
+ result = find_nonstandard_cols(input_df, study_config, self.TEST_STDS_FP)
432
+
433
+ # Only my_custom_column and another_nonstandard are not in the reserved list
434
+ # sample_name, body_site, host_common_name, hosttype_shorthand,
435
+ # sampletype_shorthand are all reserved
436
+ expected = ["another_nonstandard", "my_custom_column"]
437
+ self.assertEqual(sorted(expected), sorted(result))
438
+
439
+ def test_find_nonstandard_cols_missing_required_col_raises(self):
440
+ """Test raises ValueError when a required column is missing."""
441
+ input_df = pandas.DataFrame({
442
+ SAMPLE_NAME_KEY: ["sample1"],
443
+ SAMPLETYPE_SHORTHAND_KEY: ["stool"]
444
+ # missing HOSTTYPE_SHORTHAND_KEY
445
+ })
446
+ study_config = {}
447
+
448
+ with self.assertRaisesRegex(ValueError, HOSTTYPE_SHORTHAND_KEY):
449
+ find_nonstandard_cols(input_df, study_config, self.TEST_STDS_FP)
450
+
451
+ # Tests for write_metadata_results
452
+
453
+ def test_write_metadata_results_creates_all_files(self):
454
+ """Test creates metadata file and validation errors file, includes failed rows."""
455
+ metadata_df = pandas.DataFrame({
456
+ SAMPLE_NAME_KEY: ["sample1", "sample2", "sample3"],
457
+ "field_a": ["a1", "a2", "a3"],
458
+ HOSTTYPE_SHORTHAND_KEY: ["human", "human", "human"],
459
+ SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool", "stool"],
460
+ QC_NOTE_KEY: ["", "invalid host_type", ""]
461
+ })
462
+ validation_msgs_df = pandas.DataFrame({
463
+ "field": ["field_a"],
464
+ "error": ["some validation error"]
465
+ })
466
+
467
+ with tempfile.TemporaryDirectory() as tmpdir:
468
+ write_metadata_results(
469
+ metadata_df, validation_msgs_df, tmpdir, "test_output",
470
+ sep="\t", remove_internals=False)
471
+
472
+ # Find the main metadata file
473
+ metadata_files = glob.glob(os.path.join(tmpdir, "*_test_output.txt"))
474
+ self.assertEqual(1, len(metadata_files))
475
+
476
+ # Verify metadata file contents - includes failed row when remove_internals=False
477
+ result_df = pandas.read_csv(
478
+ metadata_files[0], sep="\t", keep_default_na=False)
479
+ assert_frame_equal(metadata_df, result_df)
480
+
481
+ # Find the validation errors file (uses comma separator)
482
+ validation_files = glob.glob(
483
+ os.path.join(tmpdir, "*_test_output_validation_errors.csv"))
484
+ self.assertEqual(1, len(validation_files))
485
+
486
+ # Verify validation errors file contents
487
+ result_validation_df = pandas.read_csv(validation_files[0], sep=",")
488
+ assert_frame_equal(validation_msgs_df, result_validation_df)
489
+
490
+ # No fails file should be created when remove_internals=False
491
+ fails_files = glob.glob(
492
+ os.path.join(tmpdir, "*_test_output_fails.csv"))
493
+ self.assertEqual(0, len(fails_files))
494
+
495
+ def test_write_metadata_results_remove_internals_creates_fails_file(self):
496
+ """Test with remove_internals=True creates fails file and removes internal cols."""
497
+ metadata_df = pandas.DataFrame({
498
+ SAMPLE_NAME_KEY: ["sample1", "sample2", "sample3"],
499
+ "field_a": ["a1", "a2", "a3"],
500
+ HOSTTYPE_SHORTHAND_KEY: ["human", "human", "human"],
501
+ SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool", "stool"],
502
+ QC_NOTE_KEY: ["", "invalid host_type", ""]
503
+ })
504
+ validation_msgs_df = pandas.DataFrame()
505
+
506
+ with tempfile.TemporaryDirectory() as tmpdir:
507
+ write_metadata_results(
508
+ metadata_df, validation_msgs_df, tmpdir, "test_output",
509
+ sep="\t", remove_internals=True)
510
+
511
+ # Find the main metadata file
512
+ metadata_files = glob.glob(os.path.join(tmpdir, "*_test_output.txt"))
513
+ self.assertEqual(1, len(metadata_files))
514
+
515
+ # Verify metadata has internal cols removed and no failures
516
+ result_df = pandas.read_csv(metadata_files[0], sep="\t")
517
+ expected_df = pandas.DataFrame({
518
+ SAMPLE_NAME_KEY: ["sample1", "sample3"],
519
+ "field_a": ["a1", "a3"]
520
+ })
521
+ assert_frame_equal(expected_df, result_df)
522
+
523
+ # Find the fails file
524
+ fails_files = glob.glob(
525
+ os.path.join(tmpdir, "*_test_output_fails.csv"))
526
+ self.assertEqual(1, len(fails_files))
527
+
528
+ # Verify fails file contains the failed row
529
+ fails_df = pandas.read_csv(fails_files[0], sep=",")
530
+ expected_fails_df = pandas.DataFrame({
531
+ SAMPLE_NAME_KEY: ["sample2"],
532
+ "field_a": ["a2"],
533
+ HOSTTYPE_SHORTHAND_KEY: ["human"],
534
+ SAMPLETYPE_SHORTHAND_KEY: ["stool"],
535
+ QC_NOTE_KEY: ["invalid host_type"]
536
+ })
537
+ assert_frame_equal(expected_fails_df, fails_df)
538
+
539
+ # Validation errors file should be empty (touched)
540
+ validation_files = glob.glob(
541
+ os.path.join(tmpdir, "*_test_output_validation_errors.csv"))
542
+ self.assertEqual(1, len(validation_files))
543
+ self.assertEqual(0, os.path.getsize(validation_files[0]))
544
+
545
+ def test_write_metadata_results_suppress_empty_fails(self):
546
+ """Test with suppress_empty_fails=True does not create empty files."""
547
+ metadata_df = pandas.DataFrame({
548
+ SAMPLE_NAME_KEY: ["sample1", "sample2"],
549
+ "field_a": ["a1", "a2"],
550
+ HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
551
+ SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
552
+ QC_NOTE_KEY: ["", ""]
553
+ })
554
+ validation_msgs_df = pandas.DataFrame()
555
+
556
+ with tempfile.TemporaryDirectory() as tmpdir:
557
+ write_metadata_results(
558
+ metadata_df, validation_msgs_df, tmpdir, "test_output",
559
+ sep="\t", remove_internals=True, suppress_empty_fails=True)
560
+
561
+ # Main metadata file should exist
562
+ metadata_files = glob.glob(os.path.join(tmpdir, "*_test_output.txt"))
563
+ self.assertEqual(1, len(metadata_files))
564
+
565
+ # Fails file should NOT exist (no failures, suppressed)
566
+ fails_files = glob.glob(
567
+ os.path.join(tmpdir, "*_test_output_fails.csv"))
568
+ self.assertEqual(0, len(fails_files))
569
+
570
+ # Validation errors file should NOT exist (empty, suppressed)
571
+ validation_files = glob.glob(
572
+ os.path.join(tmpdir, "*_test_output_validation_errors.csv"))
573
+ self.assertEqual(0, len(validation_files))
574
+
575
+ def test_write_metadata_results_custom_internal_col_names(self):
576
+ """Test with custom internal_col_names parameter."""
577
+ metadata_df = pandas.DataFrame({
578
+ SAMPLE_NAME_KEY: ["sample1", "sample2"],
579
+ "field_a": ["a1", "a2"],
580
+ "custom_internal": ["x", "y"],
581
+ QC_NOTE_KEY: ["", ""]
582
+ })
583
+ validation_msgs_df = pandas.DataFrame()
584
+
585
+ with tempfile.TemporaryDirectory() as tmpdir:
586
+ write_metadata_results(
587
+ metadata_df, validation_msgs_df, tmpdir, "test_output",
588
+ sep="\t", remove_internals=True, suppress_empty_fails=True,
589
+ internal_col_names=["custom_internal", QC_NOTE_KEY])
590
+
591
+ # Find the main metadata file
592
+ metadata_files = glob.glob(os.path.join(tmpdir, "*_test_output.txt"))
593
+ self.assertEqual(1, len(metadata_files))
594
+
595
+ # Verify custom internal cols are removed
596
+ result_df = pandas.read_csv(metadata_files[0], sep="\t")
597
+ expected_df = pandas.DataFrame({
598
+ SAMPLE_NAME_KEY: ["sample1", "sample2"],
599
+ "field_a": ["a1", "a2"]
600
+ })
601
+ assert_frame_equal(expected_df, result_df)
602
+
70
603
  # Tests for get_qc_failures
71
604
 
72
605
  def test_get_qc_failures_no_failures(self):
@@ -745,6 +1278,8 @@ class TestMetadataExtender(TestCase):
745
1278
  LEAVE_REQUIREDS_BLANK_KEY: False,
746
1279
  DEFAULT_KEY: "not provided"
747
1280
  }
1281
+ # Config is pre-resolved: sample type's metadata_fields already includes
1282
+ # host fields merged in, plus sample_type and qiita_sample_type
748
1283
  host_type_config_dict = {
749
1284
  METADATA_FIELDS_KEY: {
750
1285
  "host_field": {
@@ -755,9 +1290,23 @@ class TestMetadataExtender(TestCase):
755
1290
  SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
756
1291
  "stool": {
757
1292
  METADATA_FIELDS_KEY: {
1293
+ "host_field": {
1294
+ DEFAULT_KEY: "host_default",
1295
+ TYPE_KEY: "string"
1296
+ },
758
1297
  "stool_field": {
759
1298
  DEFAULT_KEY: "stool_default",
760
1299
  TYPE_KEY: "string"
1300
+ },
1301
+ SAMPLE_TYPE_KEY: {
1302
+ ALLOWED_KEY: ["stool"],
1303
+ DEFAULT_KEY: "stool",
1304
+ TYPE_KEY: "string"
1305
+ },
1306
+ QIITA_SAMPLE_TYPE: {
1307
+ ALLOWED_KEY: ["stool"],
1308
+ DEFAULT_KEY: "stool",
1309
+ TYPE_KEY: "string"
761
1310
  }
762
1311
  }
763
1312
  }
@@ -996,17 +1545,44 @@ class TestMetadataExtender(TestCase):
996
1545
  LEAVE_REQUIREDS_BLANK_KEY: False,
997
1546
  DEFAULT_KEY: "not provided"
998
1547
  }
1548
+ # Config is pre-resolved: alias "feces" has its own metadata_fields
1549
+ # that is a copy of "stool"'s resolved fields with sample_type="stool"
999
1550
  host_type_config_dict = {
1000
1551
  METADATA_FIELDS_KEY: {},
1001
1552
  SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
1002
1553
  "feces": {
1003
- ALIAS_KEY: "stool"
1554
+ METADATA_FIELDS_KEY: {
1555
+ "stool_field": {
1556
+ DEFAULT_KEY: "stool_value",
1557
+ TYPE_KEY: "string"
1558
+ },
1559
+ SAMPLE_TYPE_KEY: {
1560
+ ALLOWED_KEY: ["stool"],
1561
+ DEFAULT_KEY: "stool",
1562
+ TYPE_KEY: "string"
1563
+ },
1564
+ QIITA_SAMPLE_TYPE: {
1565
+ ALLOWED_KEY: ["stool"],
1566
+ DEFAULT_KEY: "stool",
1567
+ TYPE_KEY: "string"
1568
+ }
1569
+ }
1004
1570
  },
1005
1571
  "stool": {
1006
1572
  METADATA_FIELDS_KEY: {
1007
1573
  "stool_field": {
1008
1574
  DEFAULT_KEY: "stool_value",
1009
1575
  TYPE_KEY: "string"
1576
+ },
1577
+ SAMPLE_TYPE_KEY: {
1578
+ ALLOWED_KEY: ["stool"],
1579
+ DEFAULT_KEY: "stool",
1580
+ TYPE_KEY: "string"
1581
+ },
1582
+ QIITA_SAMPLE_TYPE: {
1583
+ ALLOWED_KEY: ["stool"],
1584
+ DEFAULT_KEY: "stool",
1585
+ TYPE_KEY: "string"
1010
1586
  }
1011
1587
  }
1012
1588
  }
@@ -1035,6 +1611,8 @@ class TestMetadataExtender(TestCase):
1035
1611
  LEAVE_REQUIREDS_BLANK_KEY: False,
1036
1612
  DEFAULT_KEY: "global_default"
1037
1613
  }
1614
+ # Config is pre-resolved: sample type's metadata_fields includes
1615
+ # host fields merged in, plus sample_type and qiita_sample_type
1038
1616
  full_flat_config_dict = {
1039
1617
  HOST_TYPE_SPECIFIC_METADATA_KEY: {
1040
1618
  "human": {
@@ -1048,9 +1626,23 @@ class TestMetadataExtender(TestCase):
1048
1626
  SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
1049
1627
  "stool": {
1050
1628
  METADATA_FIELDS_KEY: {
1629
+ "host_field": {
1630
+ DEFAULT_KEY: "host_value",
1631
+ TYPE_KEY: "string"
1632
+ },
1051
1633
  "stool_field": {
1052
1634
  DEFAULT_KEY: "stool_value",
1053
1635
  TYPE_KEY: "string"
1636
+ },
1637
+ SAMPLE_TYPE_KEY: {
1638
+ ALLOWED_KEY: ["stool"],
1639
+ DEFAULT_KEY: "stool",
1640
+ TYPE_KEY: "string"
1641
+ },
1642
+ QIITA_SAMPLE_TYPE: {
1643
+ ALLOWED_KEY: ["stool"],
1644
+ DEFAULT_KEY: "stool",
1645
+ TYPE_KEY: "string"
1054
1646
  }
1055
1647
  }
1056
1648
  }
@@ -1160,6 +1752,8 @@ class TestMetadataExtender(TestCase):
1160
1752
  LEAVE_REQUIREDS_BLANK_KEY: False,
1161
1753
  DEFAULT_KEY: "global_default"
1162
1754
  }
1755
+ # Config is pre-resolved: sample type's metadata_fields includes
1756
+ # host fields merged in, plus sample_type and qiita_sample_type
1163
1757
  full_flat_config_dict = {
1164
1758
  HOST_TYPE_SPECIFIC_METADATA_KEY: {
1165
1759
  "human": {
@@ -1171,7 +1765,22 @@ class TestMetadataExtender(TestCase):
1171
1765
  },
1172
1766
  SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
1173
1767
  "stool": {
1174
- METADATA_FIELDS_KEY: {}
1768
+ METADATA_FIELDS_KEY: {
1769
+ "human_field": {
1770
+ DEFAULT_KEY: "human_value",
1771
+ TYPE_KEY: "string"
1772
+ },
1773
+ SAMPLE_TYPE_KEY: {
1774
+ ALLOWED_KEY: ["stool"],
1775
+ DEFAULT_KEY: "stool",
1776
+ TYPE_KEY: "string"
1777
+ },
1778
+ QIITA_SAMPLE_TYPE: {
1779
+ ALLOWED_KEY: ["stool"],
1780
+ DEFAULT_KEY: "stool",
1781
+ TYPE_KEY: "string"
1782
+ }
1783
+ }
1175
1784
  }
1176
1785
  }
1177
1786
  },
@@ -1209,6 +1818,8 @@ class TestMetadataExtender(TestCase):
1209
1818
  LEAVE_REQUIREDS_BLANK_KEY: False,
1210
1819
  DEFAULT_KEY: "global_default"
1211
1820
  }
1821
+ # Config is pre-resolved: sample type's metadata_fields includes
1822
+ # host fields merged in, plus sample_type and qiita_sample_type
1212
1823
  full_flat_config_dict = {
1213
1824
  HOST_TYPE_SPECIFIC_METADATA_KEY: {
1214
1825
  "human": {
@@ -1220,6 +1831,16 @@ class TestMetadataExtender(TestCase):
1220
1831
  "required_field": {
1221
1832
  REQUIRED_KEY: True,
1222
1833
  TYPE_KEY: "string"
1834
+ },
1835
+ SAMPLE_TYPE_KEY: {
1836
+ ALLOWED_KEY: ["stool"],
1837
+ DEFAULT_KEY: "stool",
1838
+ TYPE_KEY: "string"
1839
+ },
1840
+ QIITA_SAMPLE_TYPE: {
1841
+ ALLOWED_KEY: ["stool"],
1842
+ DEFAULT_KEY: "stool",
1843
+ TYPE_KEY: "string"
1223
1844
  }
1224
1845
  }
1225
1846
  }
@@ -1255,6 +1876,8 @@ class TestMetadataExtender(TestCase):
1255
1876
  LEAVE_REQUIREDS_BLANK_KEY: False,
1256
1877
  DEFAULT_KEY: "global_default"
1257
1878
  }
1879
+ # Config is pre-resolved: sample type's metadata_fields includes
1880
+ # host fields merged in, plus sample_type and qiita_sample_type
1258
1881
  full_flat_config_dict = {
1259
1882
  HOST_TYPE_SPECIFIC_METADATA_KEY: {
1260
1883
  "human": {
@@ -1266,6 +1889,16 @@ class TestMetadataExtender(TestCase):
1266
1889
  "required_field": {
1267
1890
  REQUIRED_KEY: True,
1268
1891
  TYPE_KEY: "string"
1892
+ },
1893
+ SAMPLE_TYPE_KEY: {
1894
+ ALLOWED_KEY: ["stool"],
1895
+ DEFAULT_KEY: "stool",
1896
+ TYPE_KEY: "string"
1897
+ },
1898
+ QIITA_SAMPLE_TYPE: {
1899
+ ALLOWED_KEY: ["stool"],
1900
+ DEFAULT_KEY: "stool",
1901
+ TYPE_KEY: "string"
1269
1902
  }
1270
1903
  }
1271
1904
  }
@@ -1298,6 +1931,8 @@ class TestMetadataExtender(TestCase):
1298
1931
  SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
1299
1932
  QC_NOTE_KEY: ["", ""]
1300
1933
  })
1934
+ # Config is pre-resolved: sample type's metadata_fields includes
1935
+ # host fields merged in, plus sample_type and qiita_sample_type
1301
1936
  full_flat_config_dict = {
1302
1937
  DEFAULT_KEY: "global_default",
1303
1938
  LEAVE_REQUIREDS_BLANK_KEY: False,
@@ -1313,9 +1948,23 @@ class TestMetadataExtender(TestCase):
1313
1948
  SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
1314
1949
  "stool": {
1315
1950
  METADATA_FIELDS_KEY: {
1951
+ "host_field": {
1952
+ DEFAULT_KEY: "host_value",
1953
+ TYPE_KEY: "string"
1954
+ },
1316
1955
  "stool_field": {
1317
1956
  DEFAULT_KEY: "stool_value",
1318
1957
  TYPE_KEY: "string"
1958
+ },
1959
+ SAMPLE_TYPE_KEY: {
1960
+ ALLOWED_KEY: ["stool"],
1961
+ DEFAULT_KEY: "stool",
1962
+ TYPE_KEY: "string"
1963
+ },
1964
+ QIITA_SAMPLE_TYPE: {
1965
+ ALLOWED_KEY: ["stool"],
1966
+ DEFAULT_KEY: "stool",
1967
+ TYPE_KEY: "string"
1319
1968
  }
1320
1969
  }
1321
1970
  }
@@ -1348,6 +1997,8 @@ class TestMetadataExtender(TestCase):
1348
1997
  SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool", "blood"],
1349
1998
  QC_NOTE_KEY: ["", "", ""]
1350
1999
  })
2000
+ # Config is pre-resolved: sample type's metadata_fields includes
2001
+ # host fields merged in, plus sample_type and qiita_sample_type
1351
2002
  full_flat_config_dict = {
1352
2003
  DEFAULT_KEY: "global_default",
1353
2004
  LEAVE_REQUIREDS_BLANK_KEY: False,
@@ -1362,10 +2013,40 @@ class TestMetadataExtender(TestCase):
1362
2013
  },
1363
2014
  SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
1364
2015
  "stool": {
1365
- METADATA_FIELDS_KEY: {}
2016
+ METADATA_FIELDS_KEY: {
2017
+ "human_field": {
2018
+ DEFAULT_KEY: "human_value",
2019
+ TYPE_KEY: "string"
2020
+ },
2021
+ SAMPLE_TYPE_KEY: {
2022
+ ALLOWED_KEY: ["stool"],
2023
+ DEFAULT_KEY: "stool",
2024
+ TYPE_KEY: "string"
2025
+ },
2026
+ QIITA_SAMPLE_TYPE: {
2027
+ ALLOWED_KEY: ["stool"],
2028
+ DEFAULT_KEY: "stool",
2029
+ TYPE_KEY: "string"
2030
+ }
2031
+ }
1366
2032
  },
1367
2033
  "blood": {
1368
- METADATA_FIELDS_KEY: {}
2034
+ METADATA_FIELDS_KEY: {
2035
+ "human_field": {
2036
+ DEFAULT_KEY: "human_value",
2037
+ TYPE_KEY: "string"
2038
+ },
2039
+ SAMPLE_TYPE_KEY: {
2040
+ ALLOWED_KEY: ["blood"],
2041
+ DEFAULT_KEY: "blood",
2042
+ TYPE_KEY: "string"
2043
+ },
2044
+ QIITA_SAMPLE_TYPE: {
2045
+ ALLOWED_KEY: ["blood"],
2046
+ DEFAULT_KEY: "blood",
2047
+ TYPE_KEY: "string"
2048
+ }
2049
+ }
1369
2050
  }
1370
2051
  }
1371
2052
  },
@@ -1378,7 +2059,22 @@ class TestMetadataExtender(TestCase):
1378
2059
  },
1379
2060
  SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
1380
2061
  "stool": {
1381
- METADATA_FIELDS_KEY: {}
2062
+ METADATA_FIELDS_KEY: {
2063
+ "mouse_field": {
2064
+ DEFAULT_KEY: "mouse_value",
2065
+ TYPE_KEY: "string"
2066
+ },
2067
+ SAMPLE_TYPE_KEY: {
2068
+ ALLOWED_KEY: ["stool"],
2069
+ DEFAULT_KEY: "stool",
2070
+ TYPE_KEY: "string"
2071
+ },
2072
+ QIITA_SAMPLE_TYPE: {
2073
+ ALLOWED_KEY: ["stool"],
2074
+ DEFAULT_KEY: "stool",
2075
+ TYPE_KEY: "string"
2076
+ }
2077
+ }
1382
2078
  }
1383
2079
  }
1384
2080
  }
@@ -1478,6 +2174,8 @@ class TestMetadataExtender(TestCase):
1478
2174
  SAMPLETYPE_SHORTHAND_KEY: ["stool"],
1479
2175
  QC_NOTE_KEY: [""]
1480
2176
  })
2177
+ # Config is pre-resolved: sample type's metadata_fields includes
2178
+ # host fields merged in, plus sample_type and qiita_sample_type
1481
2179
  full_flat_config_dict = {
1482
2180
  DEFAULT_KEY: "global_default",
1483
2181
  LEAVE_REQUIREDS_BLANK_KEY: True, # This causes required fields to get LEAVE_BLANK_VAL
@@ -1491,6 +2189,16 @@ class TestMetadataExtender(TestCase):
1491
2189
  "required_field": {
1492
2190
  REQUIRED_KEY: True,
1493
2191
  TYPE_KEY: "string"
2192
+ },
2193
+ SAMPLE_TYPE_KEY: {
2194
+ ALLOWED_KEY: ["stool"],
2195
+ DEFAULT_KEY: "stool",
2196
+ TYPE_KEY: "string"
2197
+ },
2198
+ QIITA_SAMPLE_TYPE: {
2199
+ ALLOWED_KEY: ["stool"],
2200
+ DEFAULT_KEY: "stool",
2201
+ TYPE_KEY: "string"
1494
2202
  }
1495
2203
  }
1496
2204
  }
@@ -1790,6 +2498,8 @@ class TestMetadataExtender(TestCase):
1790
2498
  HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
1791
2499
  SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"]
1792
2500
  })
2501
+ # Config is pre-resolved: sample type's metadata_fields includes
2502
+ # host fields merged in, plus sample_type and qiita_sample_type
1793
2503
  full_flat_config_dict = {
1794
2504
  DEFAULT_KEY: "not provided",
1795
2505
  LEAVE_REQUIREDS_BLANK_KEY: False,
@@ -1805,9 +2515,23 @@ class TestMetadataExtender(TestCase):
1805
2515
  SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
1806
2516
  "stool": {
1807
2517
  METADATA_FIELDS_KEY: {
2518
+ "host_field": {
2519
+ DEFAULT_KEY: "host_value",
2520
+ TYPE_KEY: "string"
2521
+ },
1808
2522
  "stool_field": {
1809
2523
  DEFAULT_KEY: "stool_value",
1810
2524
  TYPE_KEY: "string"
2525
+ },
2526
+ SAMPLE_TYPE_KEY: {
2527
+ ALLOWED_KEY: ["stool"],
2528
+ DEFAULT_KEY: "stool",
2529
+ TYPE_KEY: "string"
2530
+ },
2531
+ QIITA_SAMPLE_TYPE: {
2532
+ ALLOWED_KEY: ["stool"],
2533
+ DEFAULT_KEY: "stool",
2534
+ TYPE_KEY: "string"
1811
2535
  }
1812
2536
  }
1813
2537
  }
@@ -1840,6 +2564,8 @@ class TestMetadataExtender(TestCase):
1840
2564
  SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
1841
2565
  "input_sex": ["F", "Male"]
1842
2566
  })
2567
+ # Config is pre-resolved: sample type's metadata_fields includes
2568
+ # host fields merged in, plus sample_type and qiita_sample_type
1843
2569
  full_flat_config_dict = {
1844
2570
  DEFAULT_KEY: "not provided",
1845
2571
  LEAVE_REQUIREDS_BLANK_KEY: False,
@@ -1857,7 +2583,18 @@ class TestMetadataExtender(TestCase):
1857
2583
  METADATA_FIELDS_KEY: {},
1858
2584
  SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
1859
2585
  "stool": {
1860
- METADATA_FIELDS_KEY: {}
2586
+ METADATA_FIELDS_KEY: {
2587
+ SAMPLE_TYPE_KEY: {
2588
+ ALLOWED_KEY: ["stool"],
2589
+ DEFAULT_KEY: "stool",
2590
+ TYPE_KEY: "string"
2591
+ },
2592
+ QIITA_SAMPLE_TYPE: {
2593
+ ALLOWED_KEY: ["stool"],
2594
+ DEFAULT_KEY: "stool",
2595
+ TYPE_KEY: "string"
2596
+ }
2597
+ }
1861
2598
  }
1862
2599
  }
1863
2600
  }
@@ -1886,6 +2623,8 @@ class TestMetadataExtender(TestCase):
1886
2623
  HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
1887
2624
  SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"]
1888
2625
  })
2626
+ # Config is pre-resolved: sample type's metadata_fields includes
2627
+ # host fields merged in, plus sample_type and qiita_sample_type
1889
2628
  full_flat_config_dict = {
1890
2629
  DEFAULT_KEY: "not provided",
1891
2630
  LEAVE_REQUIREDS_BLANK_KEY: False,
@@ -1903,7 +2642,18 @@ class TestMetadataExtender(TestCase):
1903
2642
  METADATA_FIELDS_KEY: {},
1904
2643
  SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
1905
2644
  "stool": {
1906
- METADATA_FIELDS_KEY: {}
2645
+ METADATA_FIELDS_KEY: {
2646
+ SAMPLE_TYPE_KEY: {
2647
+ ALLOWED_KEY: ["stool"],
2648
+ DEFAULT_KEY: "stool",
2649
+ TYPE_KEY: "string"
2650
+ },
2651
+ QIITA_SAMPLE_TYPE: {
2652
+ ALLOWED_KEY: ["stool"],
2653
+ DEFAULT_KEY: "stool",
2654
+ TYPE_KEY: "string"
2655
+ }
2656
+ }
1907
2657
  }
1908
2658
  }
1909
2659
  }
@@ -1963,6 +2713,8 @@ class TestMetadataExtender(TestCase):
1963
2713
  HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
1964
2714
  SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"]
1965
2715
  })
2716
+ # Config is pre-resolved: sample type's metadata_fields includes
2717
+ # host fields merged in, plus sample_type and qiita_sample_type
1966
2718
  full_flat_config_dict = {
1967
2719
  DEFAULT_KEY: "not provided",
1968
2720
  LEAVE_REQUIREDS_BLANK_KEY: False,
@@ -1972,7 +2724,18 @@ class TestMetadataExtender(TestCase):
1972
2724
  METADATA_FIELDS_KEY: {},
1973
2725
  SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
1974
2726
  "stool": {
1975
- METADATA_FIELDS_KEY: {}
2727
+ METADATA_FIELDS_KEY: {
2728
+ SAMPLE_TYPE_KEY: {
2729
+ ALLOWED_KEY: ["stool"],
2730
+ DEFAULT_KEY: "stool",
2731
+ TYPE_KEY: "string"
2732
+ },
2733
+ QIITA_SAMPLE_TYPE: {
2734
+ ALLOWED_KEY: ["stool"],
2735
+ DEFAULT_KEY: "stool",
2736
+ TYPE_KEY: "string"
2737
+ }
2738
+ }
1976
2739
  }
1977
2740
  }
1978
2741
  }
@@ -2002,6 +2765,8 @@ class TestMetadataExtender(TestCase):
2002
2765
  SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
2003
2766
  "source_field": ["hello", "world"]
2004
2767
  })
2768
+ # Config is pre-resolved: sample type's metadata_fields includes
2769
+ # host fields merged in, plus sample_type and qiita_sample_type
2005
2770
  full_flat_config_dict = {
2006
2771
  DEFAULT_KEY: "not provided",
2007
2772
  LEAVE_REQUIREDS_BLANK_KEY: False,
@@ -2019,7 +2784,18 @@ class TestMetadataExtender(TestCase):
2019
2784
  METADATA_FIELDS_KEY: {},
2020
2785
  SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
2021
2786
  "stool": {
2022
- METADATA_FIELDS_KEY: {}
2787
+ METADATA_FIELDS_KEY: {
2788
+ SAMPLE_TYPE_KEY: {
2789
+ ALLOWED_KEY: ["stool"],
2790
+ DEFAULT_KEY: "stool",
2791
+ TYPE_KEY: "string"
2792
+ },
2793
+ QIITA_SAMPLE_TYPE: {
2794
+ ALLOWED_KEY: ["stool"],
2795
+ DEFAULT_KEY: "stool",
2796
+ TYPE_KEY: "string"
2797
+ }
2798
+ }
2023
2799
  }
2024
2800
  }
2025
2801
  }
@@ -2100,8 +2876,15 @@ class TestMetadataExtender(TestCase):
2100
2876
 
2101
2877
  expected_df = pandas.DataFrame({
2102
2878
  SAMPLE_NAME_KEY: ["sample1", "sample2"],
2879
+ # body_product from human stool in test_standards.yml
2880
+ "body_product": ["UBERON:feces", "UBERON:feces"],
2881
+ # body_site inherited from host_associated stool
2103
2882
  "body_site": ["gut", "gut"],
2883
+ # custom_field from study_specific_metadata
2104
2884
  "custom_field": ["custom_value", "custom_value"],
2885
+ # description overridden at human level
2886
+ "description": ["human sample", "human sample"],
2887
+ # host_common_name from human level
2105
2888
  "host_common_name": ["human", "human"],
2106
2889
  QIITA_SAMPLE_TYPE: ["stool", "stool"],
2107
2890
  SAMPLE_TYPE_KEY: ["stool", "stool"],
@@ -2151,7 +2934,11 @@ class TestMetadataExtender(TestCase):
2151
2934
 
2152
2935
  expected_df = pandas.DataFrame({
2153
2936
  SAMPLE_NAME_KEY: ["sample1", "sample2"],
2937
+ # body_product from human stool in test_standards.yml
2938
+ "body_product": ["UBERON:feces", "UBERON:feces"],
2154
2939
  "body_site": ["gut", "gut"],
2940
+ # description overridden at human level
2941
+ "description": ["human sample", "human sample"],
2155
2942
  "host_common_name": ["human", "human"],
2156
2943
  "input_sex": ["F", "Male"],
2157
2944
  QIITA_SAMPLE_TYPE: ["stool", "stool"],
@@ -2207,7 +2994,9 @@ class TestMetadataExtender(TestCase):
2207
2994
 
2208
2995
  expected_df = pandas.DataFrame({
2209
2996
  SAMPLE_NAME_KEY: ["sample1", "sample2"],
2997
+ "body_product": ["UBERON:feces", "UBERON:feces"],
2210
2998
  "body_site": ["gut", "gut"],
2999
+ "description": ["human sample", "human sample"],
2211
3000
  "host_common_name": ["human", "human"],
2212
3001
  QIITA_SAMPLE_TYPE: ["stool", "stool"],
2213
3002
  SAMPLE_TYPE_KEY: ["stool", "stool"],
@@ -2243,7 +3032,9 @@ class TestMetadataExtender(TestCase):
2243
3032
 
2244
3033
  expected_df = pandas.DataFrame({
2245
3034
  SAMPLE_NAME_KEY: ["sample1"],
3035
+ "body_product": ["UBERON:feces"],
2246
3036
  "body_site": ["gut"],
3037
+ "description": ["human sample"],
2247
3038
  "host_common_name": ["human"],
2248
3039
  QIITA_SAMPLE_TYPE: ["stool"],
2249
3040
  SAMPLE_TYPE_KEY: ["stool"],
@@ -2332,7 +3123,12 @@ class TestMetadataExtender(TestCase):
2332
3123
  # Human samples are processed together, then mouse samples
2333
3124
  expected_df = pandas.DataFrame({
2334
3125
  SAMPLE_NAME_KEY: ["sample1", "sample3", "sample2"],
3126
+ # body_product: human stool/blood have it, mouse stool uses default
3127
+ "body_product": ["UBERON:feces", "UBERON:blood", "not provided"],
2335
3128
  "body_site": ["gut", "blood", "gut"],
3129
+ # description: human overrides to "human sample",
3130
+ # mouse inherits "host associated sample"
3131
+ "description": ["human sample", "human sample", "host associated sample"],
2336
3132
  "host_common_name": ["human", "human", "mouse"],
2337
3133
  QIITA_SAMPLE_TYPE: ["stool", "blood", "stool"],
2338
3134
  SAMPLE_TYPE_KEY: ["stool", "blood", "stool"],
@@ -2342,30 +3138,74 @@ class TestMetadataExtender(TestCase):
2342
3138
  })
2343
3139
  assert_frame_equal(expected_df, result_df)
2344
3140
 
2345
- def test_extend_metadata_df_with_software_config(self):
2346
- """Test metadata extension with custom software config overrides defaults."""
3141
+ def test_extend_metadata_df_with_software_config(self):
3142
+ """Test metadata extension with custom software config overrides defaults."""
3143
+ input_df = pandas.DataFrame({
3144
+ SAMPLE_NAME_KEY: ["sample1", "sample2"],
3145
+ HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
3146
+ SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"]
3147
+ })
3148
+ # Software config with custom default value
3149
+ software_config = {
3150
+ DEFAULT_KEY: "custom_software_default",
3151
+ LEAVE_REQUIREDS_BLANK_KEY: True,
3152
+ OVERWRITE_NON_NANS_KEY: False
3153
+ }
3154
+ # Study config that doesn't override DEFAULT_KEY
3155
+ study_config = {
3156
+ STUDY_SPECIFIC_METADATA_KEY: {
3157
+ HOST_TYPE_SPECIFIC_METADATA_KEY: {
3158
+ "human": {
3159
+ METADATA_FIELDS_KEY: {
3160
+ "study_field": {
3161
+ DEFAULT_KEY: "study_value",
3162
+ TYPE_KEY: "string"
3163
+ }
3164
+ },
3165
+ SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
3166
+ "stool": {
3167
+ METADATA_FIELDS_KEY: {}
3168
+ }
3169
+ }
3170
+ }
3171
+ }
3172
+ }
3173
+ }
3174
+
3175
+ result_df, validation_msgs_df = extend_metadata_df(
3176
+ input_df, study_config, None, software_config, self.TEST_STDS_FP)
3177
+
3178
+ expected_df = pandas.DataFrame({
3179
+ SAMPLE_NAME_KEY: ["sample1", "sample2"],
3180
+ "body_product": ["UBERON:feces", "UBERON:feces"],
3181
+ "body_site": ["gut", "gut"],
3182
+ "description": ["human sample", "human sample"],
3183
+ "host_common_name": ["human", "human"],
3184
+ QIITA_SAMPLE_TYPE: ["stool", "stool"],
3185
+ SAMPLE_TYPE_KEY: ["stool", "stool"],
3186
+ "study_field": ["study_value", "study_value"],
3187
+ HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
3188
+ SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
3189
+ QC_NOTE_KEY: ["", ""]
3190
+ })
3191
+ assert_frame_equal(expected_df, result_df)
3192
+
3193
+ def test_extend_metadata_df_with_alternate_column_names(self):
3194
+ """Test metadata extension with alternate hosttype and sampletype column names."""
3195
+ # Use alternate column names instead of hosttype_shorthand and sampletype_shorthand
2347
3196
  input_df = pandas.DataFrame({
2348
3197
  SAMPLE_NAME_KEY: ["sample1", "sample2"],
2349
- HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
2350
- SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"]
3198
+ "host_type": ["human", "human"],
3199
+ "sample": ["stool", "stool"]
2351
3200
  })
2352
- # Software config with custom default value
2353
- software_config = {
2354
- DEFAULT_KEY: "custom_software_default",
2355
- LEAVE_REQUIREDS_BLANK_KEY: True,
2356
- OVERWRITE_NON_NANS_KEY: False
2357
- }
2358
- # Study config that doesn't override DEFAULT_KEY
2359
3201
  study_config = {
3202
+ DEFAULT_KEY: "not provided",
3203
+ LEAVE_REQUIREDS_BLANK_KEY: True,
3204
+ OVERWRITE_NON_NANS_KEY: False,
2360
3205
  STUDY_SPECIFIC_METADATA_KEY: {
2361
3206
  HOST_TYPE_SPECIFIC_METADATA_KEY: {
2362
3207
  "human": {
2363
- METADATA_FIELDS_KEY: {
2364
- "study_field": {
2365
- DEFAULT_KEY: "study_value",
2366
- TYPE_KEY: "string"
2367
- }
2368
- },
3208
+ METADATA_FIELDS_KEY: {},
2369
3209
  SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
2370
3210
  "stool": {
2371
3211
  METADATA_FIELDS_KEY: {}
@@ -2375,22 +3215,37 @@ class TestMetadataExtender(TestCase):
2375
3215
  }
2376
3216
  }
2377
3217
  }
3218
+ # Software config specifies alternate column names
3219
+ software_config = {
3220
+ DEFAULT_KEY: "not provided",
3221
+ LEAVE_REQUIREDS_BLANK_KEY: True,
3222
+ OVERWRITE_NON_NANS_KEY: False,
3223
+ HOSTTYPE_COL_OPTIONS_KEY: ["host_type"],
3224
+ SAMPLETYPE_COL_OPTIONS_KEY: ["sample"]
3225
+ }
2378
3226
 
2379
3227
  result_df, validation_msgs_df = extend_metadata_df(
2380
3228
  input_df, study_config, None, software_config, self.TEST_STDS_FP)
2381
3229
 
2382
3230
  expected_df = pandas.DataFrame({
2383
3231
  SAMPLE_NAME_KEY: ["sample1", "sample2"],
3232
+ "body_product": ["UBERON:feces", "UBERON:feces"],
2384
3233
  "body_site": ["gut", "gut"],
3234
+ "description": ["human sample", "human sample"],
2385
3235
  "host_common_name": ["human", "human"],
3236
+ # Alternate column names from input are preserved
3237
+ "host_type": ["human", "human"],
2386
3238
  QIITA_SAMPLE_TYPE: ["stool", "stool"],
3239
+ # Alternate column names from input are preserved
3240
+ "sample": ["stool", "stool"],
2387
3241
  SAMPLE_TYPE_KEY: ["stool", "stool"],
2388
- "study_field": ["study_value", "study_value"],
3242
+ # Standard internal columns added at end (in order of INTERNAL_COL_KEYS)
2389
3243
  HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
2390
3244
  SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
2391
3245
  QC_NOTE_KEY: ["", ""]
2392
3246
  })
2393
3247
  assert_frame_equal(expected_df, result_df)
3248
+ self.assertTrue(validation_msgs_df.empty)
2394
3249
 
2395
3250
  # Tests for _get_study_specific_config
2396
3251
 
@@ -2455,156 +3310,976 @@ class TestMetadataExtender(TestCase):
2455
3310
  })
2456
3311
 
2457
3312
  with tempfile.TemporaryDirectory() as tmpdir:
2458
- _output_metadata_df_to_files(
2459
- input_df, tmpdir, "test_output", INTERNAL_COL_KEYS,
2460
- sep="\t", remove_internals_and_fails=False)
3313
+ _output_metadata_df_to_files(
3314
+ input_df, tmpdir, "test_output", INTERNAL_COL_KEYS,
3315
+ sep="\t", remove_internals_and_fails=False)
3316
+
3317
+ # Find the output file (has timestamp prefix)
3318
+ output_files = glob.glob(os.path.join(tmpdir, "*_test_output.txt"))
3319
+ self.assertEqual(1, len(output_files))
3320
+
3321
+ # Read and verify contents (keep_default_na=False preserves empty strings)
3322
+ result_df = pandas.read_csv(output_files[0], sep="\t", keep_default_na=False)
3323
+ expected_df = input_df
3324
+ assert_frame_equal(expected_df, result_df)
3325
+
3326
+ def test__output_metadata_df_to_files_remove_internals_and_fails(self):
3327
+ """Test output with internal columns and failures removed."""
3328
+ input_df = pandas.DataFrame({
3329
+ SAMPLE_NAME_KEY: ["sample1", "sample2", "sample3"],
3330
+ "field_a": ["a1", "a2", "a3"],
3331
+ HOSTTYPE_SHORTHAND_KEY: ["human", "human", "human"],
3332
+ SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool", "stool"],
3333
+ QC_NOTE_KEY: ["", "invalid host_type", ""]
3334
+ })
3335
+
3336
+ with tempfile.TemporaryDirectory() as tmpdir:
3337
+ _output_metadata_df_to_files(
3338
+ input_df, tmpdir, "test_output", INTERNAL_COL_KEYS,
3339
+ sep="\t", remove_internals_and_fails=True)
3340
+
3341
+ # Find the main output file
3342
+ output_files = glob.glob(os.path.join(tmpdir, "*_test_output.txt"))
3343
+ self.assertEqual(1, len(output_files))
3344
+
3345
+ # Verify main output has internal cols removed and no failures
3346
+ result_df = pandas.read_csv(output_files[0], sep="\t")
3347
+ expected_df = pandas.DataFrame({
3348
+ SAMPLE_NAME_KEY: ["sample1", "sample3"],
3349
+ "field_a": ["a1", "a3"]
3350
+ })
3351
+ assert_frame_equal(expected_df, result_df)
3352
+
3353
+ # Find the fails file
3354
+ fails_files = glob.glob(os.path.join(tmpdir, "*_test_output_fails.csv"))
3355
+ self.assertEqual(1, len(fails_files))
3356
+
3357
+ # Verify fails file contains the failed row
3358
+ fails_df = pandas.read_csv(fails_files[0], sep=",")
3359
+ expected_fails_df = pandas.DataFrame({
3360
+ SAMPLE_NAME_KEY: ["sample2"],
3361
+ "field_a": ["a2"],
3362
+ HOSTTYPE_SHORTHAND_KEY: ["human"],
3363
+ SAMPLETYPE_SHORTHAND_KEY: ["stool"],
3364
+ QC_NOTE_KEY: ["invalid host_type"]
3365
+ })
3366
+ assert_frame_equal(expected_fails_df, fails_df)
3367
+
3368
+ def test__output_metadata_df_to_files_no_failures_creates_empty_file(self):
3369
+ """Test that empty fails file is created when there are no failures."""
3370
+ input_df = pandas.DataFrame({
3371
+ SAMPLE_NAME_KEY: ["sample1", "sample2"],
3372
+ "field_a": ["a1", "a2"],
3373
+ HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
3374
+ SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
3375
+ QC_NOTE_KEY: ["", ""]
3376
+ })
3377
+
3378
+ with tempfile.TemporaryDirectory() as tmpdir:
3379
+ _output_metadata_df_to_files(
3380
+ input_df, tmpdir, "test_output", INTERNAL_COL_KEYS,
3381
+ sep="\t", remove_internals_and_fails=True,
3382
+ suppress_empty_fails=False)
3383
+
3384
+ # Find the fails file
3385
+ fails_files = glob.glob(os.path.join(tmpdir, "*_test_output_fails.csv"))
3386
+ self.assertEqual(1, len(fails_files))
3387
+
3388
+ # Verify fails file is empty (zero bytes)
3389
+ self.assertEqual(0, os.path.getsize(fails_files[0]))
3390
+
3391
+ def test__output_metadata_df_to_files_suppress_empty_fails(self):
3392
+ """Test that empty fails file is not created when suppress_empty_fails=True."""
3393
+ input_df = pandas.DataFrame({
3394
+ SAMPLE_NAME_KEY: ["sample1", "sample2"],
3395
+ "field_a": ["a1", "a2"],
3396
+ HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
3397
+ SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
3398
+ QC_NOTE_KEY: ["", ""]
3399
+ })
3400
+
3401
+ with tempfile.TemporaryDirectory() as tmpdir:
3402
+ _output_metadata_df_to_files(
3403
+ input_df, tmpdir, "test_output", INTERNAL_COL_KEYS,
3404
+ sep="\t", remove_internals_and_fails=True,
3405
+ suppress_empty_fails=True)
3406
+
3407
+ # Find the fails file - should not exist
3408
+ fails_files = glob.glob(os.path.join(tmpdir, "*_test_output_fails.csv"))
3409
+ self.assertEqual(0, len(fails_files))
3410
+
3411
+ # Main output file should still exist
3412
+ output_files = glob.glob(os.path.join(tmpdir, "*_test_output.txt"))
3413
+ self.assertEqual(1, len(output_files))
3414
+
3415
+ def test__output_metadata_df_to_files_csv_separator(self):
3416
+ """Test output with comma separator creates .csv file."""
3417
+ input_df = pandas.DataFrame({
3418
+ SAMPLE_NAME_KEY: ["sample1", "sample2"],
3419
+ "field_a": ["a1", "a2"],
3420
+ HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
3421
+ SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
3422
+ QC_NOTE_KEY: ["", ""]
3423
+ })
3424
+
3425
+ with tempfile.TemporaryDirectory() as tmpdir:
3426
+ _output_metadata_df_to_files(
3427
+ input_df, tmpdir, "test_output", INTERNAL_COL_KEYS,
3428
+ sep=",", remove_internals_and_fails=False)
3429
+
3430
+ # Find the output file with .csv extension
3431
+ output_files = glob.glob(os.path.join(tmpdir, "*_test_output.csv"))
3432
+ self.assertEqual(1, len(output_files))
3433
+
3434
+ # Read and verify contents (keep_default_na=False preserves empty strings)
3435
+ result_df = pandas.read_csv(output_files[0], sep=",", keep_default_na=False)
3436
+ expected_df = input_df
3437
+ assert_frame_equal(expected_df, result_df)
3438
+
3439
+ def test__output_metadata_df_to_files_all_failures(self):
3440
+ """Test output when all rows are failures."""
3441
+ input_df = pandas.DataFrame({
3442
+ SAMPLE_NAME_KEY: ["sample1", "sample2"],
3443
+ "field_a": ["a1", "a2"],
3444
+ HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
3445
+ SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
3446
+ QC_NOTE_KEY: ["invalid host_type", "invalid sample_type"]
3447
+ })
3448
+
3449
+ with tempfile.TemporaryDirectory() as tmpdir:
3450
+ _output_metadata_df_to_files(
3451
+ input_df, tmpdir, "test_output", INTERNAL_COL_KEYS,
3452
+ sep="\t", remove_internals_and_fails=True)
3453
+
3454
+ # Main output file should have only headers (empty data)
3455
+ output_files = glob.glob(os.path.join(tmpdir, "*_test_output.txt"))
3456
+ self.assertEqual(1, len(output_files))
3457
+ result_df = pandas.read_csv(output_files[0], sep="\t")
3458
+ self.assertTrue(result_df.empty)
3459
+ self.assertEqual([SAMPLE_NAME_KEY, "field_a"], list(result_df.columns))
3460
+
3461
+ # Fails file should have both rows
3462
+ fails_files = glob.glob(os.path.join(tmpdir, "*_test_output_fails.csv"))
3463
+ self.assertEqual(1, len(fails_files))
3464
+ fails_df = pandas.read_csv(fails_files[0], sep=",")
3465
+ self.assertEqual(2, len(fails_df))
3466
+
3467
+ # Tests for get_extended_metadata_from_df_and_yaml
3468
+
3469
+ TEST_STUDY_CONFIG_FP = path.join(TEST_DIR, "data/test_study_config.yml")
3470
+
3471
+ def test_get_extended_metadata_from_df_and_yaml_with_config(self):
3472
+ """Test extending metadata with a study-specific YAML config file."""
3473
+ input_df = pandas.DataFrame({
3474
+ SAMPLE_NAME_KEY: ["sample1", "sample2"],
3475
+ HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
3476
+ SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"]
3477
+ })
3478
+
3479
+ result_df, validation_msgs_df = get_extended_metadata_from_df_and_yaml(
3480
+ input_df, self.TEST_STUDY_CONFIG_FP, self.TEST_STDS_FP)
3481
+
3482
+ expected_df = pandas.DataFrame({
3483
+ SAMPLE_NAME_KEY: ["sample1", "sample2"],
3484
+ "body_product": ["UBERON:feces", "UBERON:feces"],
3485
+ "body_site": ["gut", "gut"],
3486
+ "description": ["human sample", "human sample"],
3487
+ "host_common_name": ["human", "human"],
3488
+ QIITA_SAMPLE_TYPE: ["stool", "stool"],
3489
+ SAMPLE_TYPE_KEY: ["stool", "stool"],
3490
+ "study_custom_field": ["custom_value", "custom_value"],
3491
+ "study_stool_field": ["stool_custom", "stool_custom"],
3492
+ HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
3493
+ SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
3494
+ QC_NOTE_KEY: ["", ""]
3495
+ })
3496
+ assert_frame_equal(expected_df, result_df)
3497
+ self.assertTrue(validation_msgs_df.empty)
3498
+
3499
+ def test_get_extended_metadata_from_df_and_yaml_none_config(self):
3500
+ """Test extending metadata with None for study_specific_config_fp."""
3501
+ input_df = pandas.DataFrame({
3502
+ SAMPLE_NAME_KEY: ["sample1", "sample2"],
3503
+ HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
3504
+ SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"]
3505
+ })
3506
+
3507
+ result_df, validation_msgs_df = get_extended_metadata_from_df_and_yaml(
3508
+ input_df, None, self.TEST_STDS_FP)
3509
+
3510
+ expected_df = pandas.DataFrame({
3511
+ SAMPLE_NAME_KEY: ["sample1", "sample2"],
3512
+ "body_product": ["UBERON:feces", "UBERON:feces"],
3513
+ "body_site": ["gut", "gut"],
3514
+ "description": ["human sample", "human sample"],
3515
+ "host_common_name": ["human", "human"],
3516
+ QIITA_SAMPLE_TYPE: ["stool", "stool"],
3517
+ SAMPLE_TYPE_KEY: ["stool", "stool"],
3518
+ HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
3519
+ SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
3520
+ QC_NOTE_KEY: ["", ""]
3521
+ })
3522
+ assert_frame_equal(expected_df, result_df)
3523
+ self.assertTrue(validation_msgs_df.empty)
3524
+
3525
+ def test_get_extended_metadata_from_df_and_yaml_invalid_host_type(self):
3526
+ """Test that invalid host types are flagged with QC note."""
3527
+ input_df = pandas.DataFrame({
3528
+ SAMPLE_NAME_KEY: ["sample1", "sample2"],
3529
+ HOSTTYPE_SHORTHAND_KEY: ["unknown_host", "human"],
3530
+ SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"]
3531
+ })
3532
+
3533
+ result_df, validation_msgs_df = get_extended_metadata_from_df_and_yaml(
3534
+ input_df, self.TEST_STUDY_CONFIG_FP, self.TEST_STDS_FP)
3535
+
3536
+ expected_df = pandas.DataFrame({
3537
+ SAMPLE_NAME_KEY: ["sample1", "sample2"],
3538
+ "body_product": ["not provided", "UBERON:feces"],
3539
+ "body_site": ["not provided", "gut"],
3540
+ "description": ["not provided", "human sample"],
3541
+ "host_common_name": ["not provided", "human"],
3542
+ QIITA_SAMPLE_TYPE: ["not provided", "stool"],
3543
+ SAMPLE_TYPE_KEY: ["not provided", "stool"],
3544
+ "study_custom_field": ["not provided", "custom_value"],
3545
+ "study_stool_field": ["not provided", "stool_custom"],
3546
+ HOSTTYPE_SHORTHAND_KEY: ["unknown_host", "human"],
3547
+ SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
3548
+ QC_NOTE_KEY: ["invalid host_type", ""]
3549
+ })
3550
+ assert_frame_equal(expected_df, result_df)
3551
+ self.assertTrue(validation_msgs_df.empty)
3552
+
3553
+ # Tests for write_extended_metadata_from_df
3554
+
3555
+ def test_write_extended_metadata_from_df_basic(self):
3556
+ """Test basic writing of extended metadata to files."""
3557
+ input_df = pandas.DataFrame({
3558
+ SAMPLE_NAME_KEY: ["sample1", "sample2"],
3559
+ HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
3560
+ SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"]
3561
+ })
3562
+ study_config = {
3563
+ DEFAULT_KEY: "not provided",
3564
+ LEAVE_REQUIREDS_BLANK_KEY: True,
3565
+ OVERWRITE_NON_NANS_KEY: False,
3566
+ STUDY_SPECIFIC_METADATA_KEY: {
3567
+ HOST_TYPE_SPECIFIC_METADATA_KEY: {
3568
+ "human": {
3569
+ METADATA_FIELDS_KEY: {
3570
+ "custom_field": {
3571
+ DEFAULT_KEY: "custom_value",
3572
+ TYPE_KEY: "string"
3573
+ }
3574
+ },
3575
+ SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
3576
+ "stool": {
3577
+ METADATA_FIELDS_KEY: {}
3578
+ }
3579
+ }
3580
+ }
3581
+ }
3582
+ }
3583
+ }
3584
+
3585
+ with tempfile.TemporaryDirectory() as tmpdir:
3586
+ result_df = write_extended_metadata_from_df(
3587
+ input_df, study_config, tmpdir, "test_output",
3588
+ stds_fp=self.TEST_STDS_FP)
2461
3589
 
2462
- # Find the output file (has timestamp prefix)
3590
+ # Verify returned DataFrame
3591
+ expected_df = pandas.DataFrame({
3592
+ SAMPLE_NAME_KEY: ["sample1", "sample2"],
3593
+ "body_product": ["UBERON:feces", "UBERON:feces"],
3594
+ "body_site": ["gut", "gut"],
3595
+ "custom_field": ["custom_value", "custom_value"],
3596
+ "description": ["human sample", "human sample"],
3597
+ "host_common_name": ["human", "human"],
3598
+ QIITA_SAMPLE_TYPE: ["stool", "stool"],
3599
+ SAMPLE_TYPE_KEY: ["stool", "stool"],
3600
+ HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
3601
+ SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
3602
+ QC_NOTE_KEY: ["", ""]
3603
+ })
3604
+ assert_frame_equal(expected_df, result_df)
3605
+
3606
+ # Verify main output file was created (internal cols removed by default)
2463
3607
  output_files = glob.glob(os.path.join(tmpdir, "*_test_output.txt"))
2464
3608
  self.assertEqual(1, len(output_files))
3609
+ output_df = pandas.read_csv(output_files[0], sep="\t")
3610
+ expected_output_df = pandas.DataFrame({
3611
+ SAMPLE_NAME_KEY: ["sample1", "sample2"],
3612
+ "body_product": ["UBERON:feces", "UBERON:feces"],
3613
+ "body_site": ["gut", "gut"],
3614
+ "custom_field": ["custom_value", "custom_value"],
3615
+ "description": ["human sample", "human sample"],
3616
+ "host_common_name": ["human", "human"],
3617
+ QIITA_SAMPLE_TYPE: ["stool", "stool"],
3618
+ SAMPLE_TYPE_KEY: ["stool", "stool"]
3619
+ })
3620
+ assert_frame_equal(expected_output_df, output_df)
2465
3621
 
2466
- # Read and verify contents (keep_default_na=False preserves empty strings)
2467
- result_df = pandas.read_csv(output_files[0], sep="\t", keep_default_na=False)
2468
- expected_df = input_df
2469
- assert_frame_equal(expected_df, result_df)
3622
+ # Verify empty fails file was created
3623
+ fails_files = glob.glob(os.path.join(tmpdir, "*_test_output_fails.csv"))
3624
+ self.assertEqual(1, len(fails_files))
3625
+ self.assertEqual(0, os.path.getsize(fails_files[0]))
2470
3626
 
2471
- def test__output_metadata_df_to_files_remove_internals_and_fails(self):
2472
- """Test output with internal columns and failures removed."""
3627
+ # Verify validation errors file was created (empty)
3628
+ validation_files = glob.glob(
3629
+ os.path.join(tmpdir, "*_test_output_validation_errors.csv"))
3630
+ self.assertEqual(1, len(validation_files))
3631
+ self.assertEqual(0, os.path.getsize(validation_files[0]))
3632
+
3633
+ def test_write_extended_metadata_from_df_with_qc_failures(self):
3634
+ """Test writing extended metadata when some rows have QC failures."""
2473
3635
  input_df = pandas.DataFrame({
2474
3636
  SAMPLE_NAME_KEY: ["sample1", "sample2", "sample3"],
2475
- "field_a": ["a1", "a2", "a3"],
2476
- HOSTTYPE_SHORTHAND_KEY: ["human", "human", "human"],
2477
- SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool", "stool"],
2478
- QC_NOTE_KEY: ["", "invalid host_type", ""]
3637
+ HOSTTYPE_SHORTHAND_KEY: ["human", "unknown_host", "human"],
3638
+ SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool", "stool"]
2479
3639
  })
3640
+ study_config = {
3641
+ DEFAULT_KEY: "not provided",
3642
+ LEAVE_REQUIREDS_BLANK_KEY: True,
3643
+ OVERWRITE_NON_NANS_KEY: False,
3644
+ STUDY_SPECIFIC_METADATA_KEY: {
3645
+ HOST_TYPE_SPECIFIC_METADATA_KEY: {
3646
+ "human": {
3647
+ METADATA_FIELDS_KEY: {},
3648
+ SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
3649
+ "stool": {
3650
+ METADATA_FIELDS_KEY: {}
3651
+ }
3652
+ }
3653
+ }
3654
+ }
3655
+ }
3656
+ }
2480
3657
 
2481
3658
  with tempfile.TemporaryDirectory() as tmpdir:
2482
- _output_metadata_df_to_files(
2483
- input_df, tmpdir, "test_output", INTERNAL_COL_KEYS,
2484
- sep="\t", remove_internals_and_fails=True)
3659
+ result_df = write_extended_metadata_from_df(
3660
+ input_df, study_config, tmpdir, "test_output",
3661
+ stds_fp=self.TEST_STDS_FP)
3662
+
3663
+ # Verify returned DataFrame includes all rows (including failures)
3664
+ # Note: rows are reordered by host type processing (valid hosts first)
3665
+ expected_result_df = pandas.DataFrame({
3666
+ SAMPLE_NAME_KEY: ["sample1", "sample3", "sample2"],
3667
+ "body_product": ["UBERON:feces", "UBERON:feces", "not provided"],
3668
+ "body_site": ["gut", "gut", "not provided"],
3669
+ "description": ["human sample", "human sample", "not provided"],
3670
+ "host_common_name": ["human", "human", "not provided"],
3671
+ QIITA_SAMPLE_TYPE: ["stool", "stool", "not provided"],
3672
+ SAMPLE_TYPE_KEY: ["stool", "stool", "not provided"],
3673
+ HOSTTYPE_SHORTHAND_KEY: ["human", "human", "unknown_host"],
3674
+ SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool", "stool"],
3675
+ QC_NOTE_KEY: ["", "", "invalid host_type"]
3676
+ })
3677
+ assert_frame_equal(expected_result_df, result_df)
2485
3678
 
2486
- # Find the main output file
3679
+ # Verify main output file excludes failure rows
2487
3680
  output_files = glob.glob(os.path.join(tmpdir, "*_test_output.txt"))
2488
3681
  self.assertEqual(1, len(output_files))
2489
-
2490
- # Verify main output has internal cols removed and no failures
2491
- result_df = pandas.read_csv(output_files[0], sep="\t")
2492
- expected_df = pandas.DataFrame({
3682
+ output_df = pandas.read_csv(output_files[0], sep="\t")
3683
+ expected_output_df = pandas.DataFrame({
2493
3684
  SAMPLE_NAME_KEY: ["sample1", "sample3"],
2494
- "field_a": ["a1", "a3"]
3685
+ "body_product": ["UBERON:feces", "UBERON:feces"],
3686
+ "body_site": ["gut", "gut"],
3687
+ "description": ["human sample", "human sample"],
3688
+ "host_common_name": ["human", "human"],
3689
+ QIITA_SAMPLE_TYPE: ["stool", "stool"],
3690
+ SAMPLE_TYPE_KEY: ["stool", "stool"]
2495
3691
  })
2496
- assert_frame_equal(expected_df, result_df)
3692
+ assert_frame_equal(expected_output_df, output_df)
2497
3693
 
2498
- # Find the fails file
3694
+ # Verify fails file contains the failed row
2499
3695
  fails_files = glob.glob(os.path.join(tmpdir, "*_test_output_fails.csv"))
2500
3696
  self.assertEqual(1, len(fails_files))
2501
-
2502
- # Verify fails file contains the failed row
2503
3697
  fails_df = pandas.read_csv(fails_files[0], sep=",")
2504
3698
  expected_fails_df = pandas.DataFrame({
2505
3699
  SAMPLE_NAME_KEY: ["sample2"],
2506
- "field_a": ["a2"],
2507
- HOSTTYPE_SHORTHAND_KEY: ["human"],
3700
+ "body_product": ["not provided"],
3701
+ "body_site": ["not provided"],
3702
+ "description": ["not provided"],
3703
+ "host_common_name": ["not provided"],
3704
+ QIITA_SAMPLE_TYPE: ["not provided"],
3705
+ SAMPLE_TYPE_KEY: ["not provided"],
3706
+ HOSTTYPE_SHORTHAND_KEY: ["unknown_host"],
2508
3707
  SAMPLETYPE_SHORTHAND_KEY: ["stool"],
2509
3708
  QC_NOTE_KEY: ["invalid host_type"]
2510
3709
  })
2511
3710
  assert_frame_equal(expected_fails_df, fails_df)
2512
3711
 
2513
- def test__output_metadata_df_to_files_no_failures_creates_empty_file(self):
2514
- """Test that empty fails file is created when there are no failures."""
3712
+ def test_write_extended_metadata_from_df_with_validation_errors(self):
3713
+ """Test writing extended metadata when validation errors occur."""
2515
3714
  input_df = pandas.DataFrame({
2516
3715
  SAMPLE_NAME_KEY: ["sample1", "sample2"],
2517
- "field_a": ["a1", "a2"],
2518
3716
  HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
2519
3717
  SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
2520
- QC_NOTE_KEY: ["", ""]
3718
+ "restricted_field": ["invalid_value", "allowed_value"]
2521
3719
  })
3720
+ study_config = {
3721
+ DEFAULT_KEY: "not provided",
3722
+ LEAVE_REQUIREDS_BLANK_KEY: True,
3723
+ OVERWRITE_NON_NANS_KEY: False,
3724
+ STUDY_SPECIFIC_METADATA_KEY: {
3725
+ HOST_TYPE_SPECIFIC_METADATA_KEY: {
3726
+ "human": {
3727
+ METADATA_FIELDS_KEY: {
3728
+ "restricted_field": {
3729
+ TYPE_KEY: "string",
3730
+ ALLOWED_KEY: ["allowed_value"]
3731
+ }
3732
+ },
3733
+ SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
3734
+ "stool": {
3735
+ METADATA_FIELDS_KEY: {}
3736
+ }
3737
+ }
3738
+ }
3739
+ }
3740
+ }
3741
+ }
2522
3742
 
2523
3743
  with tempfile.TemporaryDirectory() as tmpdir:
2524
- _output_metadata_df_to_files(
2525
- input_df, tmpdir, "test_output", INTERNAL_COL_KEYS,
2526
- sep="\t", remove_internals_and_fails=True,
2527
- suppress_empty_fails=False)
2528
-
2529
- # Find the fails file
2530
- fails_files = glob.glob(os.path.join(tmpdir, "*_test_output_fails.csv"))
2531
- self.assertEqual(1, len(fails_files))
2532
-
2533
- # Verify fails file is empty (zero bytes)
2534
- self.assertEqual(0, os.path.getsize(fails_files[0]))
3744
+ result_df = write_extended_metadata_from_df(
3745
+ input_df, study_config, tmpdir, "test_output",
3746
+ stds_fp=self.TEST_STDS_FP)
3747
+
3748
+ # Verify returned DataFrame
3749
+ expected_result_df = pandas.DataFrame({
3750
+ SAMPLE_NAME_KEY: ["sample1", "sample2"],
3751
+ "body_product": ["UBERON:feces", "UBERON:feces"],
3752
+ "body_site": ["gut", "gut"],
3753
+ "description": ["human sample", "human sample"],
3754
+ "host_common_name": ["human", "human"],
3755
+ QIITA_SAMPLE_TYPE: ["stool", "stool"],
3756
+ "restricted_field": ["invalid_value", "allowed_value"],
3757
+ SAMPLE_TYPE_KEY: ["stool", "stool"],
3758
+ HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
3759
+ SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
3760
+ QC_NOTE_KEY: ["", ""]
3761
+ })
3762
+ assert_frame_equal(expected_result_df, result_df)
3763
+
3764
+ # Verify validation errors file contains the error
3765
+ validation_files = glob.glob(
3766
+ os.path.join(tmpdir, "*_test_output_validation_errors.csv"))
3767
+ self.assertEqual(1, len(validation_files))
3768
+ validation_df = pandas.read_csv(validation_files[0], sep=",")
3769
+ expected_validation_df = pandas.DataFrame({
3770
+ "sample_name": ["sample1"],
3771
+ "field_name": ["restricted_field"],
3772
+ "error_message": ["['unallowed value invalid_value']"]
3773
+ })
3774
+ assert_frame_equal(expected_validation_df, validation_df)
2535
3775
 
2536
- def test__output_metadata_df_to_files_suppress_empty_fails(self):
2537
- """Test that empty fails file is not created when suppress_empty_fails=True."""
3776
+ def test_write_extended_metadata_from_df_remove_internals_false(self):
3777
+ """Test writing extended metadata with remove_internals=False."""
2538
3778
  input_df = pandas.DataFrame({
2539
- SAMPLE_NAME_KEY: ["sample1", "sample2"],
2540
- "field_a": ["a1", "a2"],
2541
- HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
2542
- SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
2543
- QC_NOTE_KEY: ["", ""]
3779
+ SAMPLE_NAME_KEY: ["sample1"],
3780
+ HOSTTYPE_SHORTHAND_KEY: ["human"],
3781
+ SAMPLETYPE_SHORTHAND_KEY: ["stool"]
2544
3782
  })
3783
+ study_config = {
3784
+ DEFAULT_KEY: "not provided",
3785
+ LEAVE_REQUIREDS_BLANK_KEY: True,
3786
+ OVERWRITE_NON_NANS_KEY: False,
3787
+ STUDY_SPECIFIC_METADATA_KEY: {
3788
+ HOST_TYPE_SPECIFIC_METADATA_KEY: {
3789
+ "human": {
3790
+ METADATA_FIELDS_KEY: {},
3791
+ SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
3792
+ "stool": {
3793
+ METADATA_FIELDS_KEY: {}
3794
+ }
3795
+ }
3796
+ }
3797
+ }
3798
+ }
3799
+ }
2545
3800
 
2546
3801
  with tempfile.TemporaryDirectory() as tmpdir:
2547
- _output_metadata_df_to_files(
2548
- input_df, tmpdir, "test_output", INTERNAL_COL_KEYS,
2549
- sep="\t", remove_internals_and_fails=True,
2550
- suppress_empty_fails=True)
3802
+ write_extended_metadata_from_df(
3803
+ input_df, study_config, tmpdir, "test_output",
3804
+ remove_internals=False, stds_fp=self.TEST_STDS_FP)
2551
3805
 
2552
- # Find the fails file - should not exist
3806
+ # Verify main output file includes internal columns
3807
+ output_files = glob.glob(os.path.join(tmpdir, "*_test_output.txt"))
3808
+ self.assertEqual(1, len(output_files))
3809
+ output_df = pandas.read_csv(output_files[0], sep="\t", keep_default_na=False)
3810
+ expected_output_df = pandas.DataFrame({
3811
+ SAMPLE_NAME_KEY: ["sample1"],
3812
+ "body_product": ["UBERON:feces"],
3813
+ "body_site": ["gut"],
3814
+ "description": ["human sample"],
3815
+ "host_common_name": ["human"],
3816
+ QIITA_SAMPLE_TYPE: ["stool"],
3817
+ SAMPLE_TYPE_KEY: ["stool"],
3818
+ HOSTTYPE_SHORTHAND_KEY: ["human"],
3819
+ SAMPLETYPE_SHORTHAND_KEY: ["stool"],
3820
+ QC_NOTE_KEY: [""]
3821
+ })
3822
+ assert_frame_equal(expected_output_df, output_df)
3823
+
3824
+ # Verify no fails file was created (since remove_internals=False)
2553
3825
  fails_files = glob.glob(os.path.join(tmpdir, "*_test_output_fails.csv"))
2554
3826
  self.assertEqual(0, len(fails_files))
2555
3827
 
2556
- # Main output file should still exist
3828
+ # Tests for write_extended_metadata
3829
+
3830
+ TEST_METADATA_CSV_FP = path.join(TEST_DIR, "data/test_metadata.csv")
3831
+ TEST_METADATA_TXT_FP = path.join(TEST_DIR, "data/test_metadata.txt")
3832
+ TEST_METADATA_WITH_ERRORS_FP = path.join(
3833
+ TEST_DIR, "data/test_metadata_with_errors.csv")
3834
+ TEST_STUDY_CONFIG_WITH_VALIDATION_FP = path.join(
3835
+ TEST_DIR, "data/test_study_config_with_validation.yml")
3836
+
3837
+ def test_write_extended_metadata_csv_input(self):
3838
+ """Test writing extended metadata from a CSV input file."""
3839
+ with tempfile.TemporaryDirectory() as tmpdir:
3840
+ result_df = write_extended_metadata(
3841
+ self.TEST_METADATA_CSV_FP, self.TEST_STUDY_CONFIG_FP,
3842
+ tmpdir, "test_output", stds_fp=self.TEST_STDS_FP)
3843
+
3844
+ # Verify returned DataFrame
3845
+ expected_result_df = pandas.DataFrame({
3846
+ SAMPLE_NAME_KEY: ["sample1", "sample2"],
3847
+ "body_product": ["UBERON:feces", "UBERON:feces"],
3848
+ "body_site": ["gut", "gut"],
3849
+ "description": ["human sample", "human sample"],
3850
+ "host_common_name": ["human", "human"],
3851
+ QIITA_SAMPLE_TYPE: ["stool", "stool"],
3852
+ SAMPLE_TYPE_KEY: ["stool", "stool"],
3853
+ "study_custom_field": ["custom_value", "custom_value"],
3854
+ "study_stool_field": ["stool_custom", "stool_custom"],
3855
+ HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
3856
+ SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
3857
+ QC_NOTE_KEY: ["", ""]
3858
+ })
3859
+ assert_frame_equal(expected_result_df, result_df)
3860
+
3861
+ # Verify main output file was created (internal cols removed by default)
2557
3862
  output_files = glob.glob(os.path.join(tmpdir, "*_test_output.txt"))
2558
3863
  self.assertEqual(1, len(output_files))
3864
+ output_df = pandas.read_csv(output_files[0], sep="\t")
3865
+ expected_output_df = pandas.DataFrame({
3866
+ SAMPLE_NAME_KEY: ["sample1", "sample2"],
3867
+ "body_product": ["UBERON:feces", "UBERON:feces"],
3868
+ "body_site": ["gut", "gut"],
3869
+ "description": ["human sample", "human sample"],
3870
+ "host_common_name": ["human", "human"],
3871
+ QIITA_SAMPLE_TYPE: ["stool", "stool"],
3872
+ SAMPLE_TYPE_KEY: ["stool", "stool"],
3873
+ "study_custom_field": ["custom_value", "custom_value"],
3874
+ "study_stool_field": ["stool_custom", "stool_custom"]
3875
+ })
3876
+ assert_frame_equal(expected_output_df, output_df)
2559
3877
 
2560
- def test__output_metadata_df_to_files_csv_separator(self):
2561
- """Test output with comma separator creates .csv file."""
2562
- input_df = pandas.DataFrame({
2563
- SAMPLE_NAME_KEY: ["sample1", "sample2"],
2564
- "field_a": ["a1", "a2"],
2565
- HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
2566
- SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
2567
- QC_NOTE_KEY: ["", ""]
2568
- })
3878
+ # Verify empty fails file was created
3879
+ fails_files = glob.glob(os.path.join(tmpdir, "*_test_output_fails.csv"))
3880
+ self.assertEqual(1, len(fails_files))
3881
+ self.assertEqual(0, os.path.getsize(fails_files[0]))
3882
+
3883
+ # Verify empty validation errors file was created
3884
+ validation_files = glob.glob(
3885
+ os.path.join(tmpdir, "*_test_output_validation_errors.csv"))
3886
+ self.assertEqual(1, len(validation_files))
3887
+ self.assertEqual(0, os.path.getsize(validation_files[0]))
2569
3888
 
3889
+ def test_write_extended_metadata_txt_input(self):
3890
+ """Test writing extended metadata from a tab-delimited TXT input file."""
2570
3891
  with tempfile.TemporaryDirectory() as tmpdir:
2571
- _output_metadata_df_to_files(
2572
- input_df, tmpdir, "test_output", INTERNAL_COL_KEYS,
2573
- sep=",", remove_internals_and_fails=False)
3892
+ result_df = write_extended_metadata(
3893
+ self.TEST_METADATA_TXT_FP, self.TEST_STUDY_CONFIG_FP,
3894
+ tmpdir, "test_output", stds_fp=self.TEST_STDS_FP)
3895
+
3896
+ # Verify returned DataFrame
3897
+ expected_result_df = pandas.DataFrame({
3898
+ SAMPLE_NAME_KEY: ["sample1", "sample2"],
3899
+ "body_product": ["UBERON:feces", "UBERON:feces"],
3900
+ "body_site": ["gut", "gut"],
3901
+ "description": ["human sample", "human sample"],
3902
+ "host_common_name": ["human", "human"],
3903
+ QIITA_SAMPLE_TYPE: ["stool", "stool"],
3904
+ SAMPLE_TYPE_KEY: ["stool", "stool"],
3905
+ "study_custom_field": ["custom_value", "custom_value"],
3906
+ "study_stool_field": ["stool_custom", "stool_custom"],
3907
+ HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
3908
+ SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
3909
+ QC_NOTE_KEY: ["", ""]
3910
+ })
3911
+ assert_frame_equal(expected_result_df, result_df)
2574
3912
 
2575
- # Find the output file with .csv extension
3913
+ # Verify main output file was created
3914
+ output_files = glob.glob(os.path.join(tmpdir, "*_test_output.txt"))
3915
+ self.assertEqual(1, len(output_files))
3916
+ output_df = pandas.read_csv(output_files[0], sep="\t")
3917
+ expected_output_df = pandas.DataFrame({
3918
+ SAMPLE_NAME_KEY: ["sample1", "sample2"],
3919
+ "body_product": ["UBERON:feces", "UBERON:feces"],
3920
+ "body_site": ["gut", "gut"],
3921
+ "description": ["human sample", "human sample"],
3922
+ "host_common_name": ["human", "human"],
3923
+ QIITA_SAMPLE_TYPE: ["stool", "stool"],
3924
+ SAMPLE_TYPE_KEY: ["stool", "stool"],
3925
+ "study_custom_field": ["custom_value", "custom_value"],
3926
+ "study_stool_field": ["stool_custom", "stool_custom"]
3927
+ })
3928
+ assert_frame_equal(expected_output_df, output_df)
3929
+
3930
+ def test_write_extended_metadata_with_validation_errors(self):
3931
+ """Test writing extended metadata when validation errors occur."""
3932
+ with tempfile.TemporaryDirectory() as tmpdir:
3933
+ result_df = write_extended_metadata(
3934
+ self.TEST_METADATA_WITH_ERRORS_FP,
3935
+ self.TEST_STUDY_CONFIG_WITH_VALIDATION_FP,
3936
+ tmpdir, "test_output", stds_fp=self.TEST_STDS_FP)
3937
+
3938
+ # Verify returned DataFrame
3939
+ expected_result_df = pandas.DataFrame({
3940
+ SAMPLE_NAME_KEY: ["sample1", "sample2"],
3941
+ "body_product": ["UBERON:feces", "UBERON:feces"],
3942
+ "body_site": ["gut", "gut"],
3943
+ "description": ["human sample", "human sample"],
3944
+ "host_common_name": ["human", "human"],
3945
+ QIITA_SAMPLE_TYPE: ["stool", "stool"],
3946
+ "restricted_field": ["invalid_value", "allowed_value"],
3947
+ SAMPLE_TYPE_KEY: ["stool", "stool"],
3948
+ HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
3949
+ SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
3950
+ QC_NOTE_KEY: ["", ""]
3951
+ })
3952
+ assert_frame_equal(expected_result_df, result_df)
3953
+
3954
+ # Verify main output file was created
3955
+ output_files = glob.glob(os.path.join(tmpdir, "*_test_output.txt"))
3956
+ self.assertEqual(1, len(output_files))
3957
+ output_df = pandas.read_csv(output_files[0], sep="\t")
3958
+ expected_output_df = pandas.DataFrame({
3959
+ SAMPLE_NAME_KEY: ["sample1", "sample2"],
3960
+ "body_product": ["UBERON:feces", "UBERON:feces"],
3961
+ "body_site": ["gut", "gut"],
3962
+ "description": ["human sample", "human sample"],
3963
+ "host_common_name": ["human", "human"],
3964
+ QIITA_SAMPLE_TYPE: ["stool", "stool"],
3965
+ "restricted_field": ["invalid_value", "allowed_value"],
3966
+ SAMPLE_TYPE_KEY: ["stool", "stool"]
3967
+ })
3968
+ assert_frame_equal(expected_output_df, output_df)
3969
+
3970
+ # Verify validation errors file contains the error
3971
+ validation_files = glob.glob(
3972
+ os.path.join(tmpdir, "*_test_output_validation_errors.csv"))
3973
+ self.assertEqual(1, len(validation_files))
3974
+ validation_df = pandas.read_csv(validation_files[0], sep=",")
3975
+ expected_validation_df = pandas.DataFrame({
3976
+ "sample_name": ["sample1"],
3977
+ "field_name": ["restricted_field"],
3978
+ "error_message": ["['unallowed value invalid_value']"]
3979
+ })
3980
+ assert_frame_equal(expected_validation_df, validation_df)
3981
+
3982
+ def test_write_extended_metadata_unrecognized_extension_raises(self):
3983
+ """Test that unrecognized file extension raises ValueError."""
3984
+ with tempfile.TemporaryDirectory() as tmpdir:
3985
+ fake_fp = path.join(tmpdir, "test.json")
3986
+ # Create a dummy file so the path exists
3987
+ with open(fake_fp, "w") as f:
3988
+ f.write("{}")
3989
+
3990
+ with self.assertRaisesRegex(
3991
+ ValueError, "Unrecognized input file extension"):
3992
+ write_extended_metadata(
3993
+ fake_fp, self.TEST_STUDY_CONFIG_FP,
3994
+ tmpdir, "test_output", stds_fp=self.TEST_STDS_FP)
3995
+
3996
+ def test_write_extended_metadata_csv_separator_output(self):
3997
+ """Test writing extended metadata with CSV separator for output."""
3998
+ with tempfile.TemporaryDirectory() as tmpdir:
3999
+ result_df = write_extended_metadata(
4000
+ self.TEST_METADATA_CSV_FP, self.TEST_STUDY_CONFIG_FP,
4001
+ tmpdir, "test_output", sep=",", stds_fp=self.TEST_STDS_FP)
4002
+
4003
+ # Verify returned DataFrame
4004
+ expected_result_df = pandas.DataFrame({
4005
+ SAMPLE_NAME_KEY: ["sample1", "sample2"],
4006
+ "body_product": ["UBERON:feces", "UBERON:feces"],
4007
+ "body_site": ["gut", "gut"],
4008
+ "description": ["human sample", "human sample"],
4009
+ "host_common_name": ["human", "human"],
4010
+ QIITA_SAMPLE_TYPE: ["stool", "stool"],
4011
+ SAMPLE_TYPE_KEY: ["stool", "stool"],
4012
+ "study_custom_field": ["custom_value", "custom_value"],
4013
+ "study_stool_field": ["stool_custom", "stool_custom"],
4014
+ HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
4015
+ SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
4016
+ QC_NOTE_KEY: ["", ""]
4017
+ })
4018
+ assert_frame_equal(expected_result_df, result_df)
4019
+
4020
+ # Verify output file has .csv extension
2576
4021
  output_files = glob.glob(os.path.join(tmpdir, "*_test_output.csv"))
2577
4022
  self.assertEqual(1, len(output_files))
4023
+ output_df = pandas.read_csv(output_files[0], sep=",")
4024
+ expected_output_df = pandas.DataFrame({
4025
+ SAMPLE_NAME_KEY: ["sample1", "sample2"],
4026
+ "body_product": ["UBERON:feces", "UBERON:feces"],
4027
+ "body_site": ["gut", "gut"],
4028
+ "description": ["human sample", "human sample"],
4029
+ "host_common_name": ["human", "human"],
4030
+ QIITA_SAMPLE_TYPE: ["stool", "stool"],
4031
+ SAMPLE_TYPE_KEY: ["stool", "stool"],
4032
+ "study_custom_field": ["custom_value", "custom_value"],
4033
+ "study_stool_field": ["stool_custom", "stool_custom"]
4034
+ })
4035
+ assert_frame_equal(expected_output_df, output_df)
2578
4036
 
2579
- # Read and verify contents (keep_default_na=False preserves empty strings)
2580
- result_df = pandas.read_csv(output_files[0], sep=",", keep_default_na=False)
2581
- expected_df = input_df
2582
- assert_frame_equal(expected_df, result_df)
4037
+ def test_write_extended_metadata_remove_internals_false(self):
4038
+ """Test writing extended metadata with remove_internals=False."""
4039
+ with tempfile.TemporaryDirectory() as tmpdir:
4040
+ result_df = write_extended_metadata(
4041
+ self.TEST_METADATA_CSV_FP, self.TEST_STUDY_CONFIG_FP,
4042
+ tmpdir, "test_output", remove_internals=False,
4043
+ stds_fp=self.TEST_STDS_FP)
4044
+
4045
+ # Verify returned DataFrame
4046
+ expected_result_df = pandas.DataFrame({
4047
+ SAMPLE_NAME_KEY: ["sample1", "sample2"],
4048
+ "body_product": ["UBERON:feces", "UBERON:feces"],
4049
+ "body_site": ["gut", "gut"],
4050
+ "description": ["human sample", "human sample"],
4051
+ "host_common_name": ["human", "human"],
4052
+ QIITA_SAMPLE_TYPE: ["stool", "stool"],
4053
+ SAMPLE_TYPE_KEY: ["stool", "stool"],
4054
+ "study_custom_field": ["custom_value", "custom_value"],
4055
+ "study_stool_field": ["stool_custom", "stool_custom"],
4056
+ HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
4057
+ SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
4058
+ QC_NOTE_KEY: ["", ""]
4059
+ })
4060
+ assert_frame_equal(expected_result_df, result_df)
2583
4061
 
2584
- def test__output_metadata_df_to_files_all_failures(self):
2585
- """Test output when all rows are failures."""
2586
- input_df = pandas.DataFrame({
2587
- SAMPLE_NAME_KEY: ["sample1", "sample2"],
2588
- "field_a": ["a1", "a2"],
2589
- HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
2590
- SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
2591
- QC_NOTE_KEY: ["invalid host_type", "invalid sample_type"]
2592
- })
4062
+ # Verify main output file includes internal columns
4063
+ output_files = glob.glob(os.path.join(tmpdir, "*_test_output.txt"))
4064
+ self.assertEqual(1, len(output_files))
4065
+ output_df = pandas.read_csv(output_files[0], sep="\t", keep_default_na=False)
4066
+ expected_output_df = pandas.DataFrame({
4067
+ SAMPLE_NAME_KEY: ["sample1", "sample2"],
4068
+ "body_product": ["UBERON:feces", "UBERON:feces"],
4069
+ "body_site": ["gut", "gut"],
4070
+ "description": ["human sample", "human sample"],
4071
+ "host_common_name": ["human", "human"],
4072
+ QIITA_SAMPLE_TYPE: ["stool", "stool"],
4073
+ SAMPLE_TYPE_KEY: ["stool", "stool"],
4074
+ "study_custom_field": ["custom_value", "custom_value"],
4075
+ "study_stool_field": ["stool_custom", "stool_custom"],
4076
+ HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
4077
+ SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
4078
+ QC_NOTE_KEY: ["", ""]
4079
+ })
4080
+ assert_frame_equal(expected_output_df, output_df)
4081
+
4082
+ # Verify no fails file was created (since remove_internals=False)
4083
+ fails_files = glob.glob(os.path.join(tmpdir, "*_test_output_fails.csv"))
4084
+ self.assertEqual(0, len(fails_files))
2593
4085
 
4086
+ def test_write_extended_metadata_suppress_empty_fails(self):
4087
+ """Test writing extended metadata with suppress_empty_fails=True."""
2594
4088
  with tempfile.TemporaryDirectory() as tmpdir:
2595
- _output_metadata_df_to_files(
2596
- input_df, tmpdir, "test_output", INTERNAL_COL_KEYS,
2597
- sep="\t", remove_internals_and_fails=True)
4089
+ result_df = write_extended_metadata(
4090
+ self.TEST_METADATA_CSV_FP, self.TEST_STUDY_CONFIG_FP,
4091
+ tmpdir, "test_output", suppress_empty_fails=True,
4092
+ stds_fp=self.TEST_STDS_FP)
4093
+
4094
+ # Verify returned DataFrame
4095
+ expected_result_df = pandas.DataFrame({
4096
+ SAMPLE_NAME_KEY: ["sample1", "sample2"],
4097
+ "body_product": ["UBERON:feces", "UBERON:feces"],
4098
+ "body_site": ["gut", "gut"],
4099
+ "description": ["human sample", "human sample"],
4100
+ "host_common_name": ["human", "human"],
4101
+ QIITA_SAMPLE_TYPE: ["stool", "stool"],
4102
+ SAMPLE_TYPE_KEY: ["stool", "stool"],
4103
+ "study_custom_field": ["custom_value", "custom_value"],
4104
+ "study_stool_field": ["stool_custom", "stool_custom"],
4105
+ HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
4106
+ SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
4107
+ QC_NOTE_KEY: ["", ""]
4108
+ })
4109
+ assert_frame_equal(expected_result_df, result_df)
2598
4110
 
2599
- # Main output file should have only headers (empty data)
4111
+ # Verify main output file was created
2600
4112
  output_files = glob.glob(os.path.join(tmpdir, "*_test_output.txt"))
2601
4113
  self.assertEqual(1, len(output_files))
2602
- result_df = pandas.read_csv(output_files[0], sep="\t")
2603
- self.assertTrue(result_df.empty)
2604
- self.assertEqual([SAMPLE_NAME_KEY, "field_a"], list(result_df.columns))
4114
+ output_df = pandas.read_csv(output_files[0], sep="\t")
4115
+ expected_output_df = pandas.DataFrame({
4116
+ SAMPLE_NAME_KEY: ["sample1", "sample2"],
4117
+ "body_product": ["UBERON:feces", "UBERON:feces"],
4118
+ "body_site": ["gut", "gut"],
4119
+ "description": ["human sample", "human sample"],
4120
+ "host_common_name": ["human", "human"],
4121
+ QIITA_SAMPLE_TYPE: ["stool", "stool"],
4122
+ SAMPLE_TYPE_KEY: ["stool", "stool"],
4123
+ "study_custom_field": ["custom_value", "custom_value"],
4124
+ "study_stool_field": ["stool_custom", "stool_custom"]
4125
+ })
4126
+ assert_frame_equal(expected_output_df, output_df)
2605
4127
 
2606
- # Fails file should have both rows
4128
+ # Verify no empty fails file was created (since suppress_empty_fails=True)
4129
+ fails_files = glob.glob(os.path.join(tmpdir, "*_test_output_fails.csv"))
4130
+ self.assertEqual(0, len(fails_files))
4131
+
4132
+ # Verify no empty validation errors file was created
4133
+ validation_files = glob.glob(
4134
+ os.path.join(tmpdir, "*_test_output_validation_errors.csv"))
4135
+ self.assertEqual(0, len(validation_files))
4136
+
4137
+ # Integration tests
4138
+
4139
+ TEST_PROJECT1_METADATA_FP = path.join(TEST_DIR, "data/test_project1_input_metadata.csv")
4140
+ TEST_PROJECT1_CONFIG_FP = path.join(TEST_DIR, "data/test_project1_config.yml")
4141
+ TEST_PROJECT1_EXPECTED_OUTPUT_FP = path.join(
4142
+ TEST_DIR, "data/test_project1_output_metadata.txt")
4143
+ TEST_PROJECT1_EXPECTED_FAILS_FP = path.join(
4144
+ TEST_DIR, "data/test_project1_output_fails.csv")
4145
+ def test_write_extended_metadata_from_df_project1_integration(self):
4146
+ """Integration test using project1 test data files."""
4147
+
4148
+ def write_mismatched_debug_files(expected_content, actual_content, file_name):
4149
+ """Write debug files to Desktop for unmatched content."""
4150
+ debug_dir = path.join(path.expanduser("~"), "Desktop")
4151
+ with open(path.join(debug_dir, f"UNMATCHED_1_{file_name}"), 'w') as debug_expected_file:
4152
+ debug_expected_file.write(expected_content)
4153
+ with open(path.join(debug_dir, f"UNMATCHED_2_{file_name}"), 'w') as debug_actual_file:
4154
+ debug_actual_file.write(actual_content)
4155
+
4156
+
4157
+ # Load input metadata CSV
4158
+ input_df = pandas.read_csv(self.TEST_PROJECT1_METADATA_FP, dtype=str)
4159
+ # for the columns "plating_notes" and "notes", fill NaN with empty string
4160
+ input_df["plating_notes"] = input_df["plating_notes"].fillna("")
4161
+ input_df["notes"] = input_df["notes"].fillna("")
4162
+
4163
+ # Load study config
4164
+ study_config = _get_study_specific_config(self.TEST_PROJECT1_CONFIG_FP)
4165
+
4166
+ with tempfile.TemporaryDirectory() as tmpdir:
4167
+ write_extended_metadata_from_df(
4168
+ input_df, study_config, tmpdir, "test_output",
4169
+ remove_internals=True)
4170
+
4171
+ # Compare main output file directly to expected file
4172
+ output_files = glob.glob(os.path.join(tmpdir, "*_test_output.txt"))
4173
+ self.assertEqual(1, len(output_files))
4174
+ with open(output_files[0], 'r') as actual_file:
4175
+ actual_content = actual_file.read()
4176
+ with open(self.TEST_PROJECT1_EXPECTED_OUTPUT_FP, 'r') as expected_file:
4177
+ expected_content = expected_file.read()
4178
+ try:
4179
+ self.assertEqual(expected_content, actual_content)
4180
+ except AssertionError:
4181
+ write_mismatched_debug_files(
4182
+ expected_content, actual_content,
4183
+ "project1_output.txt")
4184
+ raise
4185
+
4186
+ # Compare fails file directly to expected file
2607
4187
  fails_files = glob.glob(os.path.join(tmpdir, "*_test_output_fails.csv"))
2608
4188
  self.assertEqual(1, len(fails_files))
2609
- fails_df = pandas.read_csv(fails_files[0], sep=",")
2610
- self.assertEqual(2, len(fails_df))
4189
+ with open(fails_files[0], 'r') as actual_file:
4190
+ actual_fails_content = actual_file.read()
4191
+ with open(self.TEST_PROJECT1_EXPECTED_FAILS_FP, 'r') as expected_file:
4192
+ expected_fails_content = expected_file.read()
4193
+ try:
4194
+ self.assertEqual(expected_fails_content, actual_fails_content)
4195
+ except AssertionError:
4196
+ write_mismatched_debug_files(
4197
+ expected_fails_content, actual_fails_content,
4198
+ "project1_fails.csv")
4199
+ raise
4200
+
4201
+ # Verify validation errors file is empty
4202
+ validation_files = glob.glob(
4203
+ os.path.join(tmpdir, "*_test_output_validation_errors.csv"))
4204
+ self.assertEqual(1, len(validation_files))
4205
+ self.assertEqual(0, os.path.getsize(validation_files[0]))
4206
+
4207
+ # Tests for _get_specified_column_name
4208
+
4209
+ def test__get_specified_column_name_finds_column(self):
4210
+ """Test that _get_specified_column_name finds a column that exists."""
4211
+ input_df = pandas.DataFrame({
4212
+ "sample_name": ["s1"],
4213
+ "host_type": ["human"]
4214
+ })
4215
+ config_dict = {
4216
+ HOSTTYPE_COL_OPTIONS_KEY: ["host_type", "host_common_name"]
4217
+ }
4218
+ result = _get_specified_column_name(
4219
+ HOSTTYPE_COL_OPTIONS_KEY, input_df, config_dict)
4220
+ self.assertEqual("host_type", result)
4221
+
4222
+ def test__get_specified_column_name_returns_first_match(self):
4223
+ """Test that _get_specified_column_name returns the first match when multiple options exist."""
4224
+ input_df = pandas.DataFrame({
4225
+ "sample_name": ["s1"],
4226
+ "host_type": ["human"],
4227
+ "host_common_name": ["human"]
4228
+ })
4229
+ config_dict = {
4230
+ HOSTTYPE_COL_OPTIONS_KEY: ["host_type", "host_common_name"]
4231
+ }
4232
+ result = _get_specified_column_name(
4233
+ HOSTTYPE_COL_OPTIONS_KEY, input_df, config_dict)
4234
+ self.assertEqual("host_type", result)
4235
+
4236
+ def test__get_specified_column_name_returns_none_when_no_match(self):
4237
+ """Test that _get_specified_column_name returns None when no options match."""
4238
+ input_df = pandas.DataFrame({
4239
+ "sample_name": ["s1"],
4240
+ "other_column": ["value"]
4241
+ })
4242
+ config_dict = {
4243
+ HOSTTYPE_COL_OPTIONS_KEY: ["host_type", "host_common_name"]
4244
+ }
4245
+ result = _get_specified_column_name(
4246
+ HOSTTYPE_COL_OPTIONS_KEY, input_df, config_dict)
4247
+ self.assertIsNone(result)
4248
+
4249
+ def test__get_specified_column_name_returns_none_when_key_missing(self):
4250
+ """Test that _get_specified_column_name returns None when col_options_key is not in config."""
4251
+ input_df = pandas.DataFrame({
4252
+ "sample_name": ["s1"],
4253
+ "host_type": ["human"]
4254
+ })
4255
+ config_dict = {}
4256
+ result = _get_specified_column_name(
4257
+ HOSTTYPE_COL_OPTIONS_KEY, input_df, config_dict)
4258
+ self.assertIsNone(result)
4259
+
4260
+ def test__get_specified_column_name_returns_none_when_options_empty(self):
4261
+ """Test that _get_specified_column_name returns None when col_options is empty list."""
4262
+ input_df = pandas.DataFrame({
4263
+ "sample_name": ["s1"],
4264
+ "host_type": ["human"]
4265
+ })
4266
+ config_dict = {
4267
+ HOSTTYPE_COL_OPTIONS_KEY: []
4268
+ }
4269
+ result = _get_specified_column_name(
4270
+ HOSTTYPE_COL_OPTIONS_KEY, input_df, config_dict)
4271
+ self.assertIsNone(result)
4272
+
4273
+ def test__get_specified_column_name_with_sampletype_key(self):
4274
+ """Test that _get_specified_column_name works with sampletype column options."""
4275
+ input_df = pandas.DataFrame({
4276
+ "sample_name": ["s1"],
4277
+ "sample_type": ["stool"]
4278
+ })
4279
+ config_dict = {
4280
+ SAMPLETYPE_COL_OPTIONS_KEY: ["sample_type", "sampletype"]
4281
+ }
4282
+ result = _get_specified_column_name(
4283
+ SAMPLETYPE_COL_OPTIONS_KEY, input_df, config_dict)
4284
+ self.assertEqual("sample_type", result)
4285
+ # endregion _get_specified_column_name tests