metameq 2026.1.1__py3-none-any.whl → 2026.1.2__py3-none-any.whl

This diff shows the changes between publicly available package versions as they appear in their respective public registries and is provided for informational purposes only.
@@ -16,10 +16,12 @@ from metameq.src.util import \
     SOURCES_KEY, FUNCTION_KEY, PRE_TRANSFORMERS_KEY, POST_TRANSFORMERS_KEY, \
     STUDY_SPECIFIC_METADATA_KEY
 from metameq.src.metadata_extender import \
-    id_missing_cols, get_qc_failures, _reorder_df, \
-    _catch_nan_required_fields, _fill_na_if_default, \
-    _update_metadata_from_metadata_fields_dict, _update_metadata_from_dict, \
-    _construct_sample_type_metadata_fields_dict, \
+    id_missing_cols, get_qc_failures, get_reserved_cols, find_standard_cols, \
+    find_nonstandard_cols, write_metadata_results, \
+    get_extended_metadata_from_df_and_yaml, write_extended_metadata_from_df, \
+    write_extended_metadata, _reorder_df, _catch_nan_required_fields, \
+    _fill_na_if_default, _update_metadata_from_metadata_fields_dict, \
+    _update_metadata_from_dict, _construct_sample_type_metadata_fields_dict, \
     _generate_metadata_for_a_sample_type_in_a_host_type, \
     _generate_metadata_for_a_host_type, _generate_metadata_for_host_types, \
     _transform_metadata, _populate_metadata_df, extend_metadata_df, \
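
Editor's note (not part of the package contents): the import change above promotes several helpers to the public surface of metameq.src.metadata_extender. Below is a minimal sketch of how they appear to compose, with call signatures, argument order, and the literal column names ("sample_name", "hosttype_shorthand", "sampletype_shorthand") inferred from the tests added in this release; the standards-YAML path, output directory, and output base name are placeholders, and passing study_config=None to the column helpers is an assumption (the tests pass a nested study-config dict to them).

import pandas

from metameq.src.metadata_extender import (
    extend_metadata_df, find_nonstandard_cols, find_standard_cols,
    get_reserved_cols, write_metadata_results)

# Raw per-sample table with the three required columns named in the tests.
raw_df = pandas.DataFrame({
    "sample_name": ["sample1"],
    "hosttype_shorthand": ["human"],
    "sampletype_shorthand": ["stool"],
})
study_config = None                # assumption; the tests mostly pass a nested config dict
stds_fp = "path/to/standards.yml"  # placeholder standards file

# Column bookkeeping helpers exercised by the new tests below.
reserved = get_reserved_cols(raw_df, study_config, stds_fp)    # sorted reserved names
standard = find_standard_cols(raw_df, study_config, stds_fp)   # reserved cols present in raw_df
custom = find_nonstandard_cols(raw_df, study_config, stds_fp)  # raw_df cols outside the reserved set

# Extend the metadata, then write the tab-separated output plus the
# validation-errors / QC-fails side files described by the tests.
extended_df, validation_msgs_df = extend_metadata_df(
    raw_df, study_config, None, None, stds_fp)
write_metadata_results(
    extended_df, validation_msgs_df, "out_dir", "my_study",
    sep="\t", remove_internals=True, suppress_empty_fails=True)
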
@@ -67,157 +69,687 @@ class TestMetadataExtender(TestCase):
67
69
  expected = sorted(REQUIRED_RAW_METADATA_FIELDS)
68
70
  self.assertEqual(expected, result)
69
71
 
70
- # Tests for get_qc_failures
72
+ # Tests for get_reserved_cols
71
73
 
72
- def test_get_qc_failures_no_failures(self):
73
- """Test returns empty df when QC_NOTE_KEY is all empty strings."""
74
+ def test_get_reserved_cols_single_host_sample_type(self):
75
+ """Test returns sorted list of reserved column names for a single host/sample type."""
74
76
  input_df = pandas.DataFrame({
75
- SAMPLE_NAME_KEY: ["sample1", "sample2"],
76
- QC_NOTE_KEY: ["", ""]
77
+ SAMPLE_NAME_KEY: ["sample1"],
78
+ HOSTTYPE_SHORTHAND_KEY: ["human"],
79
+ SAMPLETYPE_SHORTHAND_KEY: ["stool"]
77
80
  })
81
+ study_config = {
82
+ DEFAULT_KEY: "not provided",
83
+ LEAVE_REQUIREDS_BLANK_KEY: True,
84
+ OVERWRITE_NON_NANS_KEY: False,
85
+ STUDY_SPECIFIC_METADATA_KEY: {
86
+ HOST_TYPE_SPECIFIC_METADATA_KEY: {
87
+ "human": {
88
+ METADATA_FIELDS_KEY: {
89
+ "host_common_name": {
90
+ DEFAULT_KEY: "human",
91
+ TYPE_KEY: "string"
92
+ }
93
+ },
94
+ SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
95
+ "stool": {
96
+ METADATA_FIELDS_KEY: {
97
+ "body_site": {
98
+ DEFAULT_KEY: "gut",
99
+ TYPE_KEY: "string"
100
+ },
101
+ "stool_consistency": {
102
+ DEFAULT_KEY: "normal",
103
+ TYPE_KEY: "string"
104
+ }
105
+ }
106
+ }
107
+ }
108
+ }
109
+ }
110
+ }
111
+ }
78
112
 
79
- result = get_qc_failures(input_df)
80
-
81
- self.assertTrue(result.empty)
113
+ result = get_reserved_cols(input_df, study_config, self.TEST_STDS_FP)
114
+
115
+ # Expected columns are union of study_config fields and test_standards.yml fields
116
+ # From standards: sample_name, sample_type (base), description (human overrides host_associated),
117
+ # body_site (host_associated stool), body_product (human stool), host_common_name (human)
118
+ expected = [
119
+ "body_product", # from human stool in test_standards.yml
120
+ "body_site",
121
+ "description", # from human in test_standards.yml (overrides host_associated)
122
+ "host_common_name",
123
+ HOSTTYPE_SHORTHAND_KEY,
124
+ QC_NOTE_KEY,
125
+ QIITA_SAMPLE_TYPE,
126
+ SAMPLE_NAME_KEY,
127
+ SAMPLE_TYPE_KEY,
128
+ SAMPLETYPE_SHORTHAND_KEY,
129
+ "stool_consistency"
130
+ ]
131
+ self.assertEqual(expected, result)
82
132
 
83
- def test_get_qc_failures_some_failures(self):
84
- """Test returns only rows where QC_NOTE_KEY is not empty."""
133
+ def test_get_reserved_cols_missing_hosttype_shorthand_raises(self):
134
+ """Test raises ValueError when hosttype_shorthand column is missing."""
85
135
  input_df = pandas.DataFrame({
86
- SAMPLE_NAME_KEY: ["sample1", "sample2", "sample3"],
87
- QC_NOTE_KEY: ["", "invalid host_type", ""]
136
+ SAMPLE_NAME_KEY: ["sample1"],
137
+ SAMPLETYPE_SHORTHAND_KEY: ["stool"]
88
138
  })
139
+ study_config = {}
89
140
 
90
- result = get_qc_failures(input_df)
91
-
92
- expected = pandas.DataFrame({
93
- SAMPLE_NAME_KEY: ["sample2"],
94
- QC_NOTE_KEY: ["invalid host_type"]
95
- }, index=[1])
96
- assert_frame_equal(expected, result)
141
+ with self.assertRaisesRegex(ValueError, HOSTTYPE_SHORTHAND_KEY):
142
+ get_reserved_cols(input_df, study_config)
97
143
 
98
- def test_get_qc_failures_all_failures(self):
99
- """Test returns all rows when all have QC notes."""
144
+ def test_get_reserved_cols_missing_sampletype_shorthand_raises(self):
145
+ """Test raises ValueError when sampletype_shorthand column is missing."""
100
146
  input_df = pandas.DataFrame({
101
- SAMPLE_NAME_KEY: ["sample1", "sample2"],
102
- QC_NOTE_KEY: ["invalid host_type", "invalid sample_type"]
147
+ SAMPLE_NAME_KEY: ["sample1"],
148
+ HOSTTYPE_SHORTHAND_KEY: ["human"]
103
149
  })
150
+ study_config = {}
104
151
 
105
- result = get_qc_failures(input_df)
152
+ with self.assertRaisesRegex(ValueError, SAMPLETYPE_SHORTHAND_KEY):
153
+ get_reserved_cols(input_df, study_config)
106
154
 
107
- assert_frame_equal(input_df, result)
155
+ def test_get_reserved_cols_multiple_host_sample_types(self):
156
+ """Test returns deduped union of reserved columns for multiple host/sample type combinations."""
157
+ input_df = pandas.DataFrame({
158
+ SAMPLE_NAME_KEY: ["sample1", "sample2", "sample3"],
159
+ HOSTTYPE_SHORTHAND_KEY: ["human", "human", "mouse"],
160
+ SAMPLETYPE_SHORTHAND_KEY: ["stool", "blood", "stool"]
161
+ })
162
+ # Both human and mouse define host_common_name and body_site - should appear only once each
163
+ study_config = {
164
+ DEFAULT_KEY: "not provided",
165
+ LEAVE_REQUIREDS_BLANK_KEY: True,
166
+ OVERWRITE_NON_NANS_KEY: False,
167
+ STUDY_SPECIFIC_METADATA_KEY: {
168
+ HOST_TYPE_SPECIFIC_METADATA_KEY: {
169
+ "human": {
170
+ METADATA_FIELDS_KEY: {
171
+ "host_common_name": {
172
+ DEFAULT_KEY: "human",
173
+ TYPE_KEY: "string"
174
+ },
175
+ "human_field": {
176
+ DEFAULT_KEY: "human_value",
177
+ TYPE_KEY: "string"
178
+ }
179
+ },
180
+ SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
181
+ "stool": {
182
+ METADATA_FIELDS_KEY: {
183
+ "body_site": {
184
+ DEFAULT_KEY: "gut",
185
+ TYPE_KEY: "string"
186
+ },
187
+ "stool_consistency": {
188
+ DEFAULT_KEY: "normal",
189
+ TYPE_KEY: "string"
190
+ }
191
+ }
192
+ },
193
+ "blood": {
194
+ METADATA_FIELDS_KEY: {
195
+ "body_site": {
196
+ DEFAULT_KEY: "blood",
197
+ TYPE_KEY: "string"
198
+ },
199
+ "blood_type": {
200
+ DEFAULT_KEY: "unknown",
201
+ TYPE_KEY: "string"
202
+ }
203
+ }
204
+ }
205
+ }
206
+ },
207
+ "mouse": {
208
+ METADATA_FIELDS_KEY: {
209
+ "host_common_name": {
210
+ DEFAULT_KEY: "mouse",
211
+ TYPE_KEY: "string"
212
+ },
213
+ "mouse_field": {
214
+ DEFAULT_KEY: "mouse_value",
215
+ TYPE_KEY: "string"
216
+ }
217
+ },
218
+ SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
219
+ "stool": {
220
+ METADATA_FIELDS_KEY: {
221
+ "body_site": {
222
+ DEFAULT_KEY: "gut",
223
+ TYPE_KEY: "string"
224
+ },
225
+ "mouse_stool_field": {
226
+ DEFAULT_KEY: "mouse_stool_value",
227
+ TYPE_KEY: "string"
228
+ }
229
+ }
230
+ }
231
+ }
232
+ }
233
+ }
234
+ }
235
+ }
108
236
 
109
- # Tests for _reorder_df
237
+ result = get_reserved_cols(input_df, study_config, self.TEST_STDS_FP)
238
+
239
+ # Expected columns are union of study_config fields and test_standards.yml fields
240
+ # From standards for human/stool: sample_name, sample_type (base), description (human),
241
+ # body_site (host_associated stool), body_product (human stool), host_common_name (human)
242
+ # From standards for human/blood: body_site (human blood), body_product (human blood),
243
+ # description (human), host_common_name (human)
244
+ # From standards for mouse/stool: sample_name, sample_type (base), description (host_associated),
245
+ # body_site (host_associated stool), host_common_name (mouse)
246
+ # TODO: cage_id from mouse stool in test_standards.yml SHOULD be included here
247
+ # but is currently excluded because it has required: false and no default.
248
+ # The function under test needs to be changed to include fields even when
249
+ # they have required: false and no default.
250
+ expected = [
251
+ "blood_type",
252
+ "body_product", # from human stool and human blood in test_standards.yml
253
+ "body_site",
254
+ "description", # from human (overrides host_associated) and host_associated (mouse inherits)
255
+ "host_common_name",
256
+ HOSTTYPE_SHORTHAND_KEY,
257
+ "human_field",
258
+ "mouse_field",
259
+ "mouse_stool_field",
260
+ QC_NOTE_KEY,
261
+ QIITA_SAMPLE_TYPE,
262
+ SAMPLE_NAME_KEY,
263
+ SAMPLE_TYPE_KEY,
264
+ SAMPLETYPE_SHORTHAND_KEY,
265
+ "stool_consistency"
266
+ ]
267
+ self.assertEqual(expected, result)
110
268
 
111
- def test__reorder_df_sample_name_first(self):
112
- """Test that sample_name becomes the first column."""
269
+ # Tests for find_standard_cols
270
+
271
+ def test_find_standard_cols_returns_standard_cols_in_df(self):
272
+ """Test returns standard columns that exist in the input DataFrame, excluding internals."""
113
273
  input_df = pandas.DataFrame({
114
- "zebra": ["z"],
115
274
  SAMPLE_NAME_KEY: ["sample1"],
116
- "apple": ["a"],
117
- QC_NOTE_KEY: [""],
118
275
  HOSTTYPE_SHORTHAND_KEY: ["human"],
119
- SAMPLETYPE_SHORTHAND_KEY: ["stool"]
276
+ SAMPLETYPE_SHORTHAND_KEY: ["stool"],
277
+ "body_site": ["gut"],
278
+ "host_common_name": ["human"],
279
+ "my_custom_column": ["custom_value"]
120
280
  })
281
+ study_config = {
282
+ DEFAULT_KEY: "not provided",
283
+ LEAVE_REQUIREDS_BLANK_KEY: True,
284
+ OVERWRITE_NON_NANS_KEY: False,
285
+ STUDY_SPECIFIC_METADATA_KEY: {
286
+ HOST_TYPE_SPECIFIC_METADATA_KEY: {
287
+ "human": {
288
+ METADATA_FIELDS_KEY: {
289
+ "host_common_name": {
290
+ DEFAULT_KEY: "human",
291
+ TYPE_KEY: "string"
292
+ }
293
+ },
294
+ SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
295
+ "stool": {
296
+ METADATA_FIELDS_KEY: {
297
+ "body_site": {
298
+ DEFAULT_KEY: "gut",
299
+ TYPE_KEY: "string"
300
+ }
301
+ }
302
+ }
303
+ }
304
+ }
305
+ }
306
+ }
307
+ }
121
308
 
122
- result = _reorder_df(input_df, INTERNAL_COL_KEYS)
309
+ result = find_standard_cols(input_df, study_config, self.TEST_STDS_FP)
123
310
 
124
- self.assertEqual(SAMPLE_NAME_KEY, result.columns[0])
311
+ # Returns intersection of reserved cols (minus internals) with df columns.
312
+ # body_site, host_common_name, sample_name are standard and in df
313
+ # hosttype_shorthand, sampletype_shorthand are internal (excluded)
314
+ # my_custom_column is nonstandard (excluded)
315
+ expected = ["body_site", "host_common_name", SAMPLE_NAME_KEY]
316
+ self.assertEqual(sorted(expected), sorted(result))
125
317
 
126
- def test__reorder_df_alphabetical_order(self):
127
- """Test that non-internal columns are sorted alphabetically after sample_name."""
318
+ def test_find_standard_cols_missing_hosttype_shorthand_raises(self):
319
+ """Test raises ValueError when hosttype_shorthand column is missing."""
128
320
  input_df = pandas.DataFrame({
129
- "zebra": ["z"],
130
321
  SAMPLE_NAME_KEY: ["sample1"],
131
- "apple": ["a"],
132
- QC_NOTE_KEY: [""],
133
- HOSTTYPE_SHORTHAND_KEY: ["human"],
134
322
  SAMPLETYPE_SHORTHAND_KEY: ["stool"]
135
323
  })
324
+ study_config = {}
136
325
 
137
- result = _reorder_df(input_df, INTERNAL_COL_KEYS)
138
-
139
- expected_order = [SAMPLE_NAME_KEY, "apple", "zebra"] + INTERNAL_COL_KEYS
140
- self.assertEqual(expected_order, list(result.columns))
326
+ with self.assertRaisesRegex(ValueError, HOSTTYPE_SHORTHAND_KEY):
327
+ find_standard_cols(input_df, study_config, self.TEST_STDS_FP)
141
328
 
142
- def test__reorder_df_internals_at_end(self):
143
- """Test that internal columns are moved to the end in the provided order."""
329
+ def test_find_standard_cols_missing_sampletype_shorthand_raises(self):
330
+ """Test raises ValueError when sampletype_shorthand column is missing."""
144
331
  input_df = pandas.DataFrame({
145
- "field1": ["value1"],
146
332
  SAMPLE_NAME_KEY: ["sample1"],
147
- QC_NOTE_KEY: [""],
333
+ HOSTTYPE_SHORTHAND_KEY: ["human"]
334
+ })
335
+ study_config = {}
336
+
337
+ with self.assertRaisesRegex(ValueError, SAMPLETYPE_SHORTHAND_KEY):
338
+ find_standard_cols(input_df, study_config, self.TEST_STDS_FP)
339
+
340
+ def test_find_standard_cols_missing_sample_name_raises(self):
341
+ """Test raises ValueError when sample_name column is missing."""
342
+ input_df = pandas.DataFrame({
148
343
  HOSTTYPE_SHORTHAND_KEY: ["human"],
149
344
  SAMPLETYPE_SHORTHAND_KEY: ["stool"]
150
345
  })
346
+ study_config = {}
151
347
 
152
- result = _reorder_df(input_df, INTERNAL_COL_KEYS)
153
-
154
- expected_order = [SAMPLE_NAME_KEY, "field1"] + INTERNAL_COL_KEYS
155
- self.assertEqual(expected_order, list(result.columns))
348
+ with self.assertRaisesRegex(ValueError, SAMPLE_NAME_KEY):
349
+ find_standard_cols(input_df, study_config, self.TEST_STDS_FP)
156
350
 
157
- def test__reorder_df_full_ordering(self):
158
- """Test complete column ordering: sample_name, alphabetical, internals."""
351
+ def test_find_standard_cols_suppress_missing_name_err(self):
352
+ """Test that suppress_missing_name_err=True allows missing sample_name."""
159
353
  input_df = pandas.DataFrame({
160
- "zebra": ["z"],
161
- SAMPLE_NAME_KEY: ["sample1"],
162
- "apple": ["a"],
163
- QC_NOTE_KEY: [""],
164
354
  HOSTTYPE_SHORTHAND_KEY: ["human"],
165
355
  SAMPLETYPE_SHORTHAND_KEY: ["stool"],
166
- "banana": ["b"]
356
+ "body_site": ["gut"]
167
357
  })
358
+ study_config = {
359
+ DEFAULT_KEY: "not provided",
360
+ LEAVE_REQUIREDS_BLANK_KEY: True,
361
+ OVERWRITE_NON_NANS_KEY: False,
362
+ STUDY_SPECIFIC_METADATA_KEY: {
363
+ HOST_TYPE_SPECIFIC_METADATA_KEY: {
364
+ "human": {
365
+ METADATA_FIELDS_KEY: {},
366
+ SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
367
+ "stool": {
368
+ METADATA_FIELDS_KEY: {
369
+ "body_site": {
370
+ DEFAULT_KEY: "gut",
371
+ TYPE_KEY: "string"
372
+ }
373
+ }
374
+ }
375
+ }
376
+ }
377
+ }
378
+ }
379
+ }
168
380
 
169
- result = _reorder_df(input_df, INTERNAL_COL_KEYS)
381
+ result = find_standard_cols(
382
+ input_df, study_config, self.TEST_STDS_FP,
383
+ suppress_missing_name_err=True)
170
384
 
171
- expected_order = [SAMPLE_NAME_KEY, "apple", "banana", "zebra"] + INTERNAL_COL_KEYS
172
- self.assertEqual(expected_order, list(result.columns))
385
+ # Only body_site is a standard col in df (sample_name is missing but allowed)
386
+ expected = ["body_site"]
387
+ self.assertEqual(expected, sorted(result))
173
388
 
174
- # Tests for _catch_nan_required_fields
389
+ # Tests for find_nonstandard_cols
175
390
 
176
- def test__catch_nan_required_fields_no_nans(self):
177
- """Test returns unchanged df when no NaNs in required fields."""
391
+ def test_find_nonstandard_cols_returns_nonstandard_cols(self):
392
+ """Test returns columns in df that are not in the reserved columns list."""
178
393
  input_df = pandas.DataFrame({
179
- SAMPLE_NAME_KEY: ["sample1", "sample2"],
180
- HOSTTYPE_SHORTHAND_KEY: ["human", "control"],
181
- SAMPLETYPE_SHORTHAND_KEY: ["stool", "blank"]
394
+ SAMPLE_NAME_KEY: ["sample1"],
395
+ HOSTTYPE_SHORTHAND_KEY: ["human"],
396
+ SAMPLETYPE_SHORTHAND_KEY: ["stool"],
397
+ "body_site": ["gut"],
398
+ "host_common_name": ["human"],
399
+ "my_custom_column": ["custom_value"],
400
+ "another_nonstandard": ["value"]
182
401
  })
402
+ study_config = {
403
+ DEFAULT_KEY: "not provided",
404
+ LEAVE_REQUIREDS_BLANK_KEY: True,
405
+ OVERWRITE_NON_NANS_KEY: False,
406
+ STUDY_SPECIFIC_METADATA_KEY: {
407
+ HOST_TYPE_SPECIFIC_METADATA_KEY: {
408
+ "human": {
409
+ METADATA_FIELDS_KEY: {
410
+ "host_common_name": {
411
+ DEFAULT_KEY: "human",
412
+ TYPE_KEY: "string"
413
+ }
414
+ },
415
+ SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
416
+ "stool": {
417
+ METADATA_FIELDS_KEY: {
418
+ "body_site": {
419
+ DEFAULT_KEY: "gut",
420
+ TYPE_KEY: "string"
421
+ }
422
+ }
423
+ }
424
+ }
425
+ }
426
+ }
427
+ }
428
+ }
183
429
 
184
- result = _catch_nan_required_fields(input_df)
430
+ result = find_nonstandard_cols(input_df, study_config, self.TEST_STDS_FP)
185
431
 
186
- assert_frame_equal(input_df, result)
432
+ # Only my_custom_column and another_nonstandard are not in the reserved list
433
+ # sample_name, body_site, host_common_name, hosttype_shorthand,
434
+ # sampletype_shorthand are all reserved
435
+ expected = ["another_nonstandard", "my_custom_column"]
436
+ self.assertEqual(sorted(expected), sorted(result))
187
437
 
188
- def test__catch_nan_required_fields_nan_sample_name_raises(self):
189
- """Test raises ValueError when sample_name contains NaN."""
438
+ def test_find_nonstandard_cols_missing_required_col_raises(self):
439
+ """Test raises ValueError when a required column is missing."""
190
440
  input_df = pandas.DataFrame({
191
- SAMPLE_NAME_KEY: ["sample1", np.nan],
192
- HOSTTYPE_SHORTHAND_KEY: ["human", "control"],
193
- SAMPLETYPE_SHORTHAND_KEY: ["stool", "blank"]
441
+ SAMPLE_NAME_KEY: ["sample1"],
442
+ SAMPLETYPE_SHORTHAND_KEY: ["stool"]
443
+ # missing HOSTTYPE_SHORTHAND_KEY
194
444
  })
445
+ study_config = {}
195
446
 
196
- with self.assertRaisesRegex(ValueError, "Metadata contains NaN sample names"):
197
- _catch_nan_required_fields(input_df)
447
+ with self.assertRaisesRegex(ValueError, HOSTTYPE_SHORTHAND_KEY):
448
+ find_nonstandard_cols(input_df, study_config, self.TEST_STDS_FP)
198
449
 
199
- def test__catch_nan_required_fields_nan_shorthand_fields_become_empty(self):
200
- """Test that NaN hosttype_shorthand and sampletype_shorthand values are set to 'empty'."""
201
- input_df = pandas.DataFrame({
202
- SAMPLE_NAME_KEY: ["sample1", "sample2"],
203
- HOSTTYPE_SHORTHAND_KEY: ["human", np.nan],
204
- SAMPLETYPE_SHORTHAND_KEY: [np.nan, "blank"]
450
+ # Tests for write_metadata_results
451
+
452
+ def test_write_metadata_results_creates_all_files(self):
453
+ """Test creates metadata file and validation errors file, includes failed rows."""
454
+ metadata_df = pandas.DataFrame({
455
+ SAMPLE_NAME_KEY: ["sample1", "sample2", "sample3"],
456
+ "field_a": ["a1", "a2", "a3"],
457
+ HOSTTYPE_SHORTHAND_KEY: ["human", "human", "human"],
458
+ SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool", "stool"],
459
+ QC_NOTE_KEY: ["", "invalid host_type", ""]
460
+ })
461
+ validation_msgs_df = pandas.DataFrame({
462
+ "field": ["field_a"],
463
+ "error": ["some validation error"]
205
464
  })
206
465
 
207
- result = _catch_nan_required_fields(input_df)
466
+ with tempfile.TemporaryDirectory() as tmpdir:
467
+ write_metadata_results(
468
+ metadata_df, validation_msgs_df, tmpdir, "test_output",
469
+ sep="\t", remove_internals=False)
470
+
471
+ # Find the main metadata file
472
+ metadata_files = glob.glob(os.path.join(tmpdir, "*_test_output.txt"))
473
+ self.assertEqual(1, len(metadata_files))
474
+
475
+ # Verify metadata file contents - includes failed row when remove_internals=False
476
+ result_df = pandas.read_csv(
477
+ metadata_files[0], sep="\t", keep_default_na=False)
478
+ assert_frame_equal(metadata_df, result_df)
479
+
480
+ # Find the validation errors file (uses comma separator)
481
+ validation_files = glob.glob(
482
+ os.path.join(tmpdir, "*_test_output_validation_errors.csv"))
483
+ self.assertEqual(1, len(validation_files))
484
+
485
+ # Verify validation errors file contents
486
+ result_validation_df = pandas.read_csv(validation_files[0], sep=",")
487
+ assert_frame_equal(validation_msgs_df, result_validation_df)
488
+
489
+ # No fails file should be created when remove_internals=False
490
+ fails_files = glob.glob(
491
+ os.path.join(tmpdir, "*_test_output_fails.csv"))
492
+ self.assertEqual(0, len(fails_files))
208
493
 
209
- expected = pandas.DataFrame({
210
- SAMPLE_NAME_KEY: ["sample1", "sample2"],
211
- HOSTTYPE_SHORTHAND_KEY: ["human", "empty"],
212
- SAMPLETYPE_SHORTHAND_KEY: ["empty", "blank"]
494
+ def test_write_metadata_results_remove_internals_creates_fails_file(self):
495
+ """Test with remove_internals=True creates fails file and removes internal cols."""
496
+ metadata_df = pandas.DataFrame({
497
+ SAMPLE_NAME_KEY: ["sample1", "sample2", "sample3"],
498
+ "field_a": ["a1", "a2", "a3"],
499
+ HOSTTYPE_SHORTHAND_KEY: ["human", "human", "human"],
500
+ SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool", "stool"],
501
+ QC_NOTE_KEY: ["", "invalid host_type", ""]
213
502
  })
214
- assert_frame_equal(expected, result)
503
+ validation_msgs_df = pandas.DataFrame()
215
504
 
216
- # Tests for _fill_na_if_default
505
+ with tempfile.TemporaryDirectory() as tmpdir:
506
+ write_metadata_results(
507
+ metadata_df, validation_msgs_df, tmpdir, "test_output",
508
+ sep="\t", remove_internals=True)
217
509
 
218
- def test__fill_na_if_default_specific_overrides_settings(self):
219
- """Test that specific_dict default takes precedence over settings_dict."""
220
- input_df = pandas.DataFrame({
510
+ # Find the main metadata file
511
+ metadata_files = glob.glob(os.path.join(tmpdir, "*_test_output.txt"))
512
+ self.assertEqual(1, len(metadata_files))
513
+
514
+ # Verify metadata has internal cols removed and no failures
515
+ result_df = pandas.read_csv(metadata_files[0], sep="\t")
516
+ expected_df = pandas.DataFrame({
517
+ SAMPLE_NAME_KEY: ["sample1", "sample3"],
518
+ "field_a": ["a1", "a3"]
519
+ })
520
+ assert_frame_equal(expected_df, result_df)
521
+
522
+ # Find the fails file
523
+ fails_files = glob.glob(
524
+ os.path.join(tmpdir, "*_test_output_fails.csv"))
525
+ self.assertEqual(1, len(fails_files))
526
+
527
+ # Verify fails file contains the failed row
528
+ fails_df = pandas.read_csv(fails_files[0], sep=",")
529
+ expected_fails_df = pandas.DataFrame({
530
+ SAMPLE_NAME_KEY: ["sample2"],
531
+ "field_a": ["a2"],
532
+ HOSTTYPE_SHORTHAND_KEY: ["human"],
533
+ SAMPLETYPE_SHORTHAND_KEY: ["stool"],
534
+ QC_NOTE_KEY: ["invalid host_type"]
535
+ })
536
+ assert_frame_equal(expected_fails_df, fails_df)
537
+
538
+ # Validation errors file should be empty (touched)
539
+ validation_files = glob.glob(
540
+ os.path.join(tmpdir, "*_test_output_validation_errors.csv"))
541
+ self.assertEqual(1, len(validation_files))
542
+ self.assertEqual(0, os.path.getsize(validation_files[0]))
543
+
544
+ def test_write_metadata_results_suppress_empty_fails(self):
545
+ """Test with suppress_empty_fails=True does not create empty files."""
546
+ metadata_df = pandas.DataFrame({
547
+ SAMPLE_NAME_KEY: ["sample1", "sample2"],
548
+ "field_a": ["a1", "a2"],
549
+ HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
550
+ SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
551
+ QC_NOTE_KEY: ["", ""]
552
+ })
553
+ validation_msgs_df = pandas.DataFrame()
554
+
555
+ with tempfile.TemporaryDirectory() as tmpdir:
556
+ write_metadata_results(
557
+ metadata_df, validation_msgs_df, tmpdir, "test_output",
558
+ sep="\t", remove_internals=True, suppress_empty_fails=True)
559
+
560
+ # Main metadata file should exist
561
+ metadata_files = glob.glob(os.path.join(tmpdir, "*_test_output.txt"))
562
+ self.assertEqual(1, len(metadata_files))
563
+
564
+ # Fails file should NOT exist (no failures, suppressed)
565
+ fails_files = glob.glob(
566
+ os.path.join(tmpdir, "*_test_output_fails.csv"))
567
+ self.assertEqual(0, len(fails_files))
568
+
569
+ # Validation errors file should NOT exist (empty, suppressed)
570
+ validation_files = glob.glob(
571
+ os.path.join(tmpdir, "*_test_output_validation_errors.csv"))
572
+ self.assertEqual(0, len(validation_files))
573
+
574
+ def test_write_metadata_results_custom_internal_col_names(self):
575
+ """Test with custom internal_col_names parameter."""
576
+ metadata_df = pandas.DataFrame({
577
+ SAMPLE_NAME_KEY: ["sample1", "sample2"],
578
+ "field_a": ["a1", "a2"],
579
+ "custom_internal": ["x", "y"],
580
+ QC_NOTE_KEY: ["", ""]
581
+ })
582
+ validation_msgs_df = pandas.DataFrame()
583
+
584
+ with tempfile.TemporaryDirectory() as tmpdir:
585
+ write_metadata_results(
586
+ metadata_df, validation_msgs_df, tmpdir, "test_output",
587
+ sep="\t", remove_internals=True, suppress_empty_fails=True,
588
+ internal_col_names=["custom_internal", QC_NOTE_KEY])
589
+
590
+ # Find the main metadata file
591
+ metadata_files = glob.glob(os.path.join(tmpdir, "*_test_output.txt"))
592
+ self.assertEqual(1, len(metadata_files))
593
+
594
+ # Verify custom internal cols are removed
595
+ result_df = pandas.read_csv(metadata_files[0], sep="\t")
596
+ expected_df = pandas.DataFrame({
597
+ SAMPLE_NAME_KEY: ["sample1", "sample2"],
598
+ "field_a": ["a1", "a2"]
599
+ })
600
+ assert_frame_equal(expected_df, result_df)
601
+
602
+ # Tests for get_qc_failures
603
+
604
+ def test_get_qc_failures_no_failures(self):
605
+ """Test returns empty df when QC_NOTE_KEY is all empty strings."""
606
+ input_df = pandas.DataFrame({
607
+ SAMPLE_NAME_KEY: ["sample1", "sample2"],
608
+ QC_NOTE_KEY: ["", ""]
609
+ })
610
+
611
+ result = get_qc_failures(input_df)
612
+
613
+ self.assertTrue(result.empty)
614
+
615
+ def test_get_qc_failures_some_failures(self):
616
+ """Test returns only rows where QC_NOTE_KEY is not empty."""
617
+ input_df = pandas.DataFrame({
618
+ SAMPLE_NAME_KEY: ["sample1", "sample2", "sample3"],
619
+ QC_NOTE_KEY: ["", "invalid host_type", ""]
620
+ })
621
+
622
+ result = get_qc_failures(input_df)
623
+
624
+ expected = pandas.DataFrame({
625
+ SAMPLE_NAME_KEY: ["sample2"],
626
+ QC_NOTE_KEY: ["invalid host_type"]
627
+ }, index=[1])
628
+ assert_frame_equal(expected, result)
629
+
630
+ def test_get_qc_failures_all_failures(self):
631
+ """Test returns all rows when all have QC notes."""
632
+ input_df = pandas.DataFrame({
633
+ SAMPLE_NAME_KEY: ["sample1", "sample2"],
634
+ QC_NOTE_KEY: ["invalid host_type", "invalid sample_type"]
635
+ })
636
+
637
+ result = get_qc_failures(input_df)
638
+
639
+ assert_frame_equal(input_df, result)
640
+
641
+ # Tests for _reorder_df
642
+
643
+ def test__reorder_df_sample_name_first(self):
644
+ """Test that sample_name becomes the first column."""
645
+ input_df = pandas.DataFrame({
646
+ "zebra": ["z"],
647
+ SAMPLE_NAME_KEY: ["sample1"],
648
+ "apple": ["a"],
649
+ QC_NOTE_KEY: [""],
650
+ HOSTTYPE_SHORTHAND_KEY: ["human"],
651
+ SAMPLETYPE_SHORTHAND_KEY: ["stool"]
652
+ })
653
+
654
+ result = _reorder_df(input_df, INTERNAL_COL_KEYS)
655
+
656
+ self.assertEqual(SAMPLE_NAME_KEY, result.columns[0])
657
+
658
+ def test__reorder_df_alphabetical_order(self):
659
+ """Test that non-internal columns are sorted alphabetically after sample_name."""
660
+ input_df = pandas.DataFrame({
661
+ "zebra": ["z"],
662
+ SAMPLE_NAME_KEY: ["sample1"],
663
+ "apple": ["a"],
664
+ QC_NOTE_KEY: [""],
665
+ HOSTTYPE_SHORTHAND_KEY: ["human"],
666
+ SAMPLETYPE_SHORTHAND_KEY: ["stool"]
667
+ })
668
+
669
+ result = _reorder_df(input_df, INTERNAL_COL_KEYS)
670
+
671
+ expected_order = [SAMPLE_NAME_KEY, "apple", "zebra"] + INTERNAL_COL_KEYS
672
+ self.assertEqual(expected_order, list(result.columns))
673
+
674
+ def test__reorder_df_internals_at_end(self):
675
+ """Test that internal columns are moved to the end in the provided order."""
676
+ input_df = pandas.DataFrame({
677
+ "field1": ["value1"],
678
+ SAMPLE_NAME_KEY: ["sample1"],
679
+ QC_NOTE_KEY: [""],
680
+ HOSTTYPE_SHORTHAND_KEY: ["human"],
681
+ SAMPLETYPE_SHORTHAND_KEY: ["stool"]
682
+ })
683
+
684
+ result = _reorder_df(input_df, INTERNAL_COL_KEYS)
685
+
686
+ expected_order = [SAMPLE_NAME_KEY, "field1"] + INTERNAL_COL_KEYS
687
+ self.assertEqual(expected_order, list(result.columns))
688
+
689
+ def test__reorder_df_full_ordering(self):
690
+ """Test complete column ordering: sample_name, alphabetical, internals."""
691
+ input_df = pandas.DataFrame({
692
+ "zebra": ["z"],
693
+ SAMPLE_NAME_KEY: ["sample1"],
694
+ "apple": ["a"],
695
+ QC_NOTE_KEY: [""],
696
+ HOSTTYPE_SHORTHAND_KEY: ["human"],
697
+ SAMPLETYPE_SHORTHAND_KEY: ["stool"],
698
+ "banana": ["b"]
699
+ })
700
+
701
+ result = _reorder_df(input_df, INTERNAL_COL_KEYS)
702
+
703
+ expected_order = [SAMPLE_NAME_KEY, "apple", "banana", "zebra"] + INTERNAL_COL_KEYS
704
+ self.assertEqual(expected_order, list(result.columns))
705
+
706
+ # Tests for _catch_nan_required_fields
707
+
708
+ def test__catch_nan_required_fields_no_nans(self):
709
+ """Test returns unchanged df when no NaNs in required fields."""
710
+ input_df = pandas.DataFrame({
711
+ SAMPLE_NAME_KEY: ["sample1", "sample2"],
712
+ HOSTTYPE_SHORTHAND_KEY: ["human", "control"],
713
+ SAMPLETYPE_SHORTHAND_KEY: ["stool", "blank"]
714
+ })
715
+
716
+ result = _catch_nan_required_fields(input_df)
717
+
718
+ assert_frame_equal(input_df, result)
719
+
720
+ def test__catch_nan_required_fields_nan_sample_name_raises(self):
721
+ """Test raises ValueError when sample_name contains NaN."""
722
+ input_df = pandas.DataFrame({
723
+ SAMPLE_NAME_KEY: ["sample1", np.nan],
724
+ HOSTTYPE_SHORTHAND_KEY: ["human", "control"],
725
+ SAMPLETYPE_SHORTHAND_KEY: ["stool", "blank"]
726
+ })
727
+
728
+ with self.assertRaisesRegex(ValueError, "Metadata contains NaN sample names"):
729
+ _catch_nan_required_fields(input_df)
730
+
731
+ def test__catch_nan_required_fields_nan_shorthand_fields_become_empty(self):
732
+ """Test that NaN hosttype_shorthand and sampletype_shorthand values are set to 'empty'."""
733
+ input_df = pandas.DataFrame({
734
+ SAMPLE_NAME_KEY: ["sample1", "sample2"],
735
+ HOSTTYPE_SHORTHAND_KEY: ["human", np.nan],
736
+ SAMPLETYPE_SHORTHAND_KEY: [np.nan, "blank"]
737
+ })
738
+
739
+ result = _catch_nan_required_fields(input_df)
740
+
741
+ expected = pandas.DataFrame({
742
+ SAMPLE_NAME_KEY: ["sample1", "sample2"],
743
+ HOSTTYPE_SHORTHAND_KEY: ["human", "empty"],
744
+ SAMPLETYPE_SHORTHAND_KEY: ["empty", "blank"]
745
+ })
746
+ assert_frame_equal(expected, result)
747
+
748
+ # Tests for _fill_na_if_default
749
+
750
+ def test__fill_na_if_default_specific_overrides_settings(self):
751
+ """Test that specific_dict default takes precedence over settings_dict."""
752
+ input_df = pandas.DataFrame({
221
753
  "field1": ["value1", np.nan, "value3"],
222
754
  "field2": [np.nan, "value2", np.nan]
223
755
  })
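
Editor's note (not part of the package contents): the write_metadata_results tests above pin down which files a call produces. The sketch below illustrates that behavior; the output directory and base name are placeholders, the prefix on the generated file names (matched only as "*_" in the tests) is not specified by this diff, and the literal internal column names ("hosttype_shorthand", "sampletype_shorthand", "qc_note") are assumptions standing in for the package's *_KEY constants.

import glob
import os
import tempfile

import pandas

from metameq.src.metadata_extender import write_metadata_results

metadata_df = pandas.DataFrame({
    "sample_name": ["sample1", "sample2"],
    "field_a": ["a1", "a2"],
    "hosttype_shorthand": ["human", "human"],    # assumed literal for HOSTTYPE_SHORTHAND_KEY
    "sampletype_shorthand": ["stool", "stool"],  # assumed literal for SAMPLETYPE_SHORTHAND_KEY
    "qc_note": ["", "invalid host_type"],        # assumed literal for QC_NOTE_KEY
})
validation_msgs_df = pandas.DataFrame()

with tempfile.TemporaryDirectory() as out_dir:
    # Per the tests: remove_internals=True drops the internal columns from the main
    # *_my_study.txt file and routes QC-failed rows to *_my_study_fails.csv, while
    # suppress_empty_fails=True skips writing empty fails/validation-errors files.
    write_metadata_results(
        metadata_df, validation_msgs_df, out_dir, "my_study",
        sep="\t", remove_internals=True, suppress_empty_fails=True)
    print(sorted(os.path.basename(p)
                 for p in glob.glob(os.path.join(out_dir, "*"))))
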
@@ -745,6 +1277,8 @@ class TestMetadataExtender(TestCase):
745
1277
  LEAVE_REQUIREDS_BLANK_KEY: False,
746
1278
  DEFAULT_KEY: "not provided"
747
1279
  }
1280
+ # Config is pre-resolved: sample type's metadata_fields already includes
1281
+ # host fields merged in, plus sample_type and qiita_sample_type
748
1282
  host_type_config_dict = {
749
1283
  METADATA_FIELDS_KEY: {
750
1284
  "host_field": {
@@ -755,9 +1289,23 @@ class TestMetadataExtender(TestCase):
755
1289
  SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
756
1290
  "stool": {
757
1291
  METADATA_FIELDS_KEY: {
1292
+ "host_field": {
1293
+ DEFAULT_KEY: "host_default",
1294
+ TYPE_KEY: "string"
1295
+ },
758
1296
  "stool_field": {
759
1297
  DEFAULT_KEY: "stool_default",
760
1298
  TYPE_KEY: "string"
1299
+ },
1300
+ SAMPLE_TYPE_KEY: {
1301
+ ALLOWED_KEY: ["stool"],
1302
+ DEFAULT_KEY: "stool",
1303
+ TYPE_KEY: "string"
1304
+ },
1305
+ QIITA_SAMPLE_TYPE: {
1306
+ ALLOWED_KEY: ["stool"],
1307
+ DEFAULT_KEY: "stool",
1308
+ TYPE_KEY: "string"
761
1309
  }
762
1310
  }
763
1311
  }
@@ -996,17 +1544,44 @@ class TestMetadataExtender(TestCase):
996
1544
  LEAVE_REQUIREDS_BLANK_KEY: False,
997
1545
  DEFAULT_KEY: "not provided"
998
1546
  }
1547
+ # Config is pre-resolved: alias "feces" has its own metadata_fields
1548
+ # that is a copy of "stool"'s resolved fields with sample_type="stool"
999
1549
  host_type_config_dict = {
1000
1550
  METADATA_FIELDS_KEY: {},
1001
1551
  SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
1002
1552
  "feces": {
1003
- ALIAS_KEY: "stool"
1553
+ METADATA_FIELDS_KEY: {
1554
+ "stool_field": {
1555
+ DEFAULT_KEY: "stool_value",
1556
+ TYPE_KEY: "string"
1557
+ },
1558
+ SAMPLE_TYPE_KEY: {
1559
+ ALLOWED_KEY: ["stool"],
1560
+ DEFAULT_KEY: "stool",
1561
+ TYPE_KEY: "string"
1562
+ },
1563
+ QIITA_SAMPLE_TYPE: {
1564
+ ALLOWED_KEY: ["stool"],
1565
+ DEFAULT_KEY: "stool",
1566
+ TYPE_KEY: "string"
1567
+ }
1568
+ }
1004
1569
  },
1005
1570
  "stool": {
1006
1571
  METADATA_FIELDS_KEY: {
1007
1572
  "stool_field": {
1008
1573
  DEFAULT_KEY: "stool_value",
1009
1574
  TYPE_KEY: "string"
1575
+ },
1576
+ SAMPLE_TYPE_KEY: {
1577
+ ALLOWED_KEY: ["stool"],
1578
+ DEFAULT_KEY: "stool",
1579
+ TYPE_KEY: "string"
1580
+ },
1581
+ QIITA_SAMPLE_TYPE: {
1582
+ ALLOWED_KEY: ["stool"],
1583
+ DEFAULT_KEY: "stool",
1584
+ TYPE_KEY: "string"
1010
1585
  }
1011
1586
  }
1012
1587
  }
@@ -1035,6 +1610,8 @@ class TestMetadataExtender(TestCase):
1035
1610
  LEAVE_REQUIREDS_BLANK_KEY: False,
1036
1611
  DEFAULT_KEY: "global_default"
1037
1612
  }
1613
+ # Config is pre-resolved: sample type's metadata_fields includes
1614
+ # host fields merged in, plus sample_type and qiita_sample_type
1038
1615
  full_flat_config_dict = {
1039
1616
  HOST_TYPE_SPECIFIC_METADATA_KEY: {
1040
1617
  "human": {
@@ -1048,9 +1625,23 @@ class TestMetadataExtender(TestCase):
1048
1625
  SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
1049
1626
  "stool": {
1050
1627
  METADATA_FIELDS_KEY: {
1628
+ "host_field": {
1629
+ DEFAULT_KEY: "host_value",
1630
+ TYPE_KEY: "string"
1631
+ },
1051
1632
  "stool_field": {
1052
1633
  DEFAULT_KEY: "stool_value",
1053
1634
  TYPE_KEY: "string"
1635
+ },
1636
+ SAMPLE_TYPE_KEY: {
1637
+ ALLOWED_KEY: ["stool"],
1638
+ DEFAULT_KEY: "stool",
1639
+ TYPE_KEY: "string"
1640
+ },
1641
+ QIITA_SAMPLE_TYPE: {
1642
+ ALLOWED_KEY: ["stool"],
1643
+ DEFAULT_KEY: "stool",
1644
+ TYPE_KEY: "string"
1054
1645
  }
1055
1646
  }
1056
1647
  }
@@ -1160,6 +1751,8 @@ class TestMetadataExtender(TestCase):
1160
1751
  LEAVE_REQUIREDS_BLANK_KEY: False,
1161
1752
  DEFAULT_KEY: "global_default"
1162
1753
  }
1754
+ # Config is pre-resolved: sample type's metadata_fields includes
1755
+ # host fields merged in, plus sample_type and qiita_sample_type
1163
1756
  full_flat_config_dict = {
1164
1757
  HOST_TYPE_SPECIFIC_METADATA_KEY: {
1165
1758
  "human": {
@@ -1171,7 +1764,22 @@ class TestMetadataExtender(TestCase):
1171
1764
  },
1172
1765
  SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
1173
1766
  "stool": {
1174
- METADATA_FIELDS_KEY: {}
1767
+ METADATA_FIELDS_KEY: {
1768
+ "human_field": {
1769
+ DEFAULT_KEY: "human_value",
1770
+ TYPE_KEY: "string"
1771
+ },
1772
+ SAMPLE_TYPE_KEY: {
1773
+ ALLOWED_KEY: ["stool"],
1774
+ DEFAULT_KEY: "stool",
1775
+ TYPE_KEY: "string"
1776
+ },
1777
+ QIITA_SAMPLE_TYPE: {
1778
+ ALLOWED_KEY: ["stool"],
1779
+ DEFAULT_KEY: "stool",
1780
+ TYPE_KEY: "string"
1781
+ }
1782
+ }
1175
1783
  }
1176
1784
  }
1177
1785
  },
@@ -1209,6 +1817,8 @@ class TestMetadataExtender(TestCase):
1209
1817
  LEAVE_REQUIREDS_BLANK_KEY: False,
1210
1818
  DEFAULT_KEY: "global_default"
1211
1819
  }
1820
+ # Config is pre-resolved: sample type's metadata_fields includes
1821
+ # host fields merged in, plus sample_type and qiita_sample_type
1212
1822
  full_flat_config_dict = {
1213
1823
  HOST_TYPE_SPECIFIC_METADATA_KEY: {
1214
1824
  "human": {
@@ -1220,6 +1830,16 @@ class TestMetadataExtender(TestCase):
1220
1830
  "required_field": {
1221
1831
  REQUIRED_KEY: True,
1222
1832
  TYPE_KEY: "string"
1833
+ },
1834
+ SAMPLE_TYPE_KEY: {
1835
+ ALLOWED_KEY: ["stool"],
1836
+ DEFAULT_KEY: "stool",
1837
+ TYPE_KEY: "string"
1838
+ },
1839
+ QIITA_SAMPLE_TYPE: {
1840
+ ALLOWED_KEY: ["stool"],
1841
+ DEFAULT_KEY: "stool",
1842
+ TYPE_KEY: "string"
1223
1843
  }
1224
1844
  }
1225
1845
  }
@@ -1255,6 +1875,8 @@ class TestMetadataExtender(TestCase):
1255
1875
  LEAVE_REQUIREDS_BLANK_KEY: False,
1256
1876
  DEFAULT_KEY: "global_default"
1257
1877
  }
1878
+ # Config is pre-resolved: sample type's metadata_fields includes
1879
+ # host fields merged in, plus sample_type and qiita_sample_type
1258
1880
  full_flat_config_dict = {
1259
1881
  HOST_TYPE_SPECIFIC_METADATA_KEY: {
1260
1882
  "human": {
@@ -1266,6 +1888,16 @@ class TestMetadataExtender(TestCase):
1266
1888
  "required_field": {
1267
1889
  REQUIRED_KEY: True,
1268
1890
  TYPE_KEY: "string"
1891
+ },
1892
+ SAMPLE_TYPE_KEY: {
1893
+ ALLOWED_KEY: ["stool"],
1894
+ DEFAULT_KEY: "stool",
1895
+ TYPE_KEY: "string"
1896
+ },
1897
+ QIITA_SAMPLE_TYPE: {
1898
+ ALLOWED_KEY: ["stool"],
1899
+ DEFAULT_KEY: "stool",
1900
+ TYPE_KEY: "string"
1269
1901
  }
1270
1902
  }
1271
1903
  }
@@ -1298,6 +1930,8 @@ class TestMetadataExtender(TestCase):
1298
1930
  SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
1299
1931
  QC_NOTE_KEY: ["", ""]
1300
1932
  })
1933
+ # Config is pre-resolved: sample type's metadata_fields includes
1934
+ # host fields merged in, plus sample_type and qiita_sample_type
1301
1935
  full_flat_config_dict = {
1302
1936
  DEFAULT_KEY: "global_default",
1303
1937
  LEAVE_REQUIREDS_BLANK_KEY: False,
@@ -1313,9 +1947,23 @@ class TestMetadataExtender(TestCase):
1313
1947
  SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
1314
1948
  "stool": {
1315
1949
  METADATA_FIELDS_KEY: {
1950
+ "host_field": {
1951
+ DEFAULT_KEY: "host_value",
1952
+ TYPE_KEY: "string"
1953
+ },
1316
1954
  "stool_field": {
1317
1955
  DEFAULT_KEY: "stool_value",
1318
1956
  TYPE_KEY: "string"
1957
+ },
1958
+ SAMPLE_TYPE_KEY: {
1959
+ ALLOWED_KEY: ["stool"],
1960
+ DEFAULT_KEY: "stool",
1961
+ TYPE_KEY: "string"
1962
+ },
1963
+ QIITA_SAMPLE_TYPE: {
1964
+ ALLOWED_KEY: ["stool"],
1965
+ DEFAULT_KEY: "stool",
1966
+ TYPE_KEY: "string"
1319
1967
  }
1320
1968
  }
1321
1969
  }
@@ -1348,6 +1996,8 @@ class TestMetadataExtender(TestCase):
1348
1996
  SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool", "blood"],
1349
1997
  QC_NOTE_KEY: ["", "", ""]
1350
1998
  })
1999
+ # Config is pre-resolved: sample type's metadata_fields includes
2000
+ # host fields merged in, plus sample_type and qiita_sample_type
1351
2001
  full_flat_config_dict = {
1352
2002
  DEFAULT_KEY: "global_default",
1353
2003
  LEAVE_REQUIREDS_BLANK_KEY: False,
@@ -1362,10 +2012,40 @@ class TestMetadataExtender(TestCase):
1362
2012
  },
1363
2013
  SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
1364
2014
  "stool": {
1365
- METADATA_FIELDS_KEY: {}
2015
+ METADATA_FIELDS_KEY: {
2016
+ "human_field": {
2017
+ DEFAULT_KEY: "human_value",
2018
+ TYPE_KEY: "string"
2019
+ },
2020
+ SAMPLE_TYPE_KEY: {
2021
+ ALLOWED_KEY: ["stool"],
2022
+ DEFAULT_KEY: "stool",
2023
+ TYPE_KEY: "string"
2024
+ },
2025
+ QIITA_SAMPLE_TYPE: {
2026
+ ALLOWED_KEY: ["stool"],
2027
+ DEFAULT_KEY: "stool",
2028
+ TYPE_KEY: "string"
2029
+ }
2030
+ }
1366
2031
  },
1367
2032
  "blood": {
1368
- METADATA_FIELDS_KEY: {}
2033
+ METADATA_FIELDS_KEY: {
2034
+ "human_field": {
2035
+ DEFAULT_KEY: "human_value",
2036
+ TYPE_KEY: "string"
2037
+ },
2038
+ SAMPLE_TYPE_KEY: {
2039
+ ALLOWED_KEY: ["blood"],
2040
+ DEFAULT_KEY: "blood",
2041
+ TYPE_KEY: "string"
2042
+ },
2043
+ QIITA_SAMPLE_TYPE: {
2044
+ ALLOWED_KEY: ["blood"],
2045
+ DEFAULT_KEY: "blood",
2046
+ TYPE_KEY: "string"
2047
+ }
2048
+ }
1369
2049
  }
1370
2050
  }
1371
2051
  },
@@ -1378,12 +2058,27 @@ class TestMetadataExtender(TestCase):
1378
2058
  },
1379
2059
  SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
1380
2060
  "stool": {
1381
- METADATA_FIELDS_KEY: {}
1382
- }
1383
- }
1384
- }
1385
- }
1386
- }
2061
+ METADATA_FIELDS_KEY: {
2062
+ "mouse_field": {
2063
+ DEFAULT_KEY: "mouse_value",
2064
+ TYPE_KEY: "string"
2065
+ },
2066
+ SAMPLE_TYPE_KEY: {
2067
+ ALLOWED_KEY: ["stool"],
2068
+ DEFAULT_KEY: "stool",
2069
+ TYPE_KEY: "string"
2070
+ },
2071
+ QIITA_SAMPLE_TYPE: {
2072
+ ALLOWED_KEY: ["stool"],
2073
+ DEFAULT_KEY: "stool",
2074
+ TYPE_KEY: "string"
2075
+ }
2076
+ }
2077
+ }
2078
+ }
2079
+ }
2080
+ }
2081
+ }
1387
2082
 
1388
2083
  result_df, validation_msgs = _generate_metadata_for_host_types(
1389
2084
  input_df, full_flat_config_dict)
@@ -1478,6 +2173,8 @@ class TestMetadataExtender(TestCase):
1478
2173
  SAMPLETYPE_SHORTHAND_KEY: ["stool"],
1479
2174
  QC_NOTE_KEY: [""]
1480
2175
  })
2176
+ # Config is pre-resolved: sample type's metadata_fields includes
2177
+ # host fields merged in, plus sample_type and qiita_sample_type
1481
2178
  full_flat_config_dict = {
1482
2179
  DEFAULT_KEY: "global_default",
1483
2180
  LEAVE_REQUIREDS_BLANK_KEY: True, # This causes required fields to get LEAVE_BLANK_VAL
@@ -1491,6 +2188,16 @@ class TestMetadataExtender(TestCase):
1491
2188
  "required_field": {
1492
2189
  REQUIRED_KEY: True,
1493
2190
  TYPE_KEY: "string"
2191
+ },
2192
+ SAMPLE_TYPE_KEY: {
2193
+ ALLOWED_KEY: ["stool"],
2194
+ DEFAULT_KEY: "stool",
2195
+ TYPE_KEY: "string"
2196
+ },
2197
+ QIITA_SAMPLE_TYPE: {
2198
+ ALLOWED_KEY: ["stool"],
2199
+ DEFAULT_KEY: "stool",
2200
+ TYPE_KEY: "string"
1494
2201
  }
1495
2202
  }
1496
2203
  }
@@ -1790,6 +2497,8 @@ class TestMetadataExtender(TestCase):
1790
2497
  HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
1791
2498
  SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"]
1792
2499
  })
2500
+ # Config is pre-resolved: sample type's metadata_fields includes
2501
+ # host fields merged in, plus sample_type and qiita_sample_type
1793
2502
  full_flat_config_dict = {
1794
2503
  DEFAULT_KEY: "not provided",
1795
2504
  LEAVE_REQUIREDS_BLANK_KEY: False,
@@ -1805,9 +2514,23 @@ class TestMetadataExtender(TestCase):
1805
2514
  SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
1806
2515
  "stool": {
1807
2516
  METADATA_FIELDS_KEY: {
2517
+ "host_field": {
2518
+ DEFAULT_KEY: "host_value",
2519
+ TYPE_KEY: "string"
2520
+ },
1808
2521
  "stool_field": {
1809
2522
  DEFAULT_KEY: "stool_value",
1810
2523
  TYPE_KEY: "string"
2524
+ },
2525
+ SAMPLE_TYPE_KEY: {
2526
+ ALLOWED_KEY: ["stool"],
2527
+ DEFAULT_KEY: "stool",
2528
+ TYPE_KEY: "string"
2529
+ },
2530
+ QIITA_SAMPLE_TYPE: {
2531
+ ALLOWED_KEY: ["stool"],
2532
+ DEFAULT_KEY: "stool",
2533
+ TYPE_KEY: "string"
1811
2534
  }
1812
2535
  }
1813
2536
  }
@@ -1840,6 +2563,8 @@ class TestMetadataExtender(TestCase):
1840
2563
  SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
1841
2564
  "input_sex": ["F", "Male"]
1842
2565
  })
2566
+ # Config is pre-resolved: sample type's metadata_fields includes
2567
+ # host fields merged in, plus sample_type and qiita_sample_type
1843
2568
  full_flat_config_dict = {
1844
2569
  DEFAULT_KEY: "not provided",
1845
2570
  LEAVE_REQUIREDS_BLANK_KEY: False,
@@ -1857,7 +2582,18 @@ class TestMetadataExtender(TestCase):
1857
2582
  METADATA_FIELDS_KEY: {},
1858
2583
  SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
1859
2584
  "stool": {
1860
- METADATA_FIELDS_KEY: {}
2585
+ METADATA_FIELDS_KEY: {
2586
+ SAMPLE_TYPE_KEY: {
2587
+ ALLOWED_KEY: ["stool"],
2588
+ DEFAULT_KEY: "stool",
2589
+ TYPE_KEY: "string"
2590
+ },
2591
+ QIITA_SAMPLE_TYPE: {
2592
+ ALLOWED_KEY: ["stool"],
2593
+ DEFAULT_KEY: "stool",
2594
+ TYPE_KEY: "string"
2595
+ }
2596
+ }
1861
2597
  }
1862
2598
  }
1863
2599
  }
@@ -1886,6 +2622,8 @@ class TestMetadataExtender(TestCase):
1886
2622
  HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
1887
2623
  SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"]
1888
2624
  })
2625
+ # Config is pre-resolved: sample type's metadata_fields includes
2626
+ # host fields merged in, plus sample_type and qiita_sample_type
1889
2627
  full_flat_config_dict = {
1890
2628
  DEFAULT_KEY: "not provided",
1891
2629
  LEAVE_REQUIREDS_BLANK_KEY: False,
@@ -1903,7 +2641,18 @@ class TestMetadataExtender(TestCase):
1903
2641
  METADATA_FIELDS_KEY: {},
1904
2642
  SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
1905
2643
  "stool": {
1906
- METADATA_FIELDS_KEY: {}
2644
+ METADATA_FIELDS_KEY: {
2645
+ SAMPLE_TYPE_KEY: {
2646
+ ALLOWED_KEY: ["stool"],
2647
+ DEFAULT_KEY: "stool",
2648
+ TYPE_KEY: "string"
2649
+ },
2650
+ QIITA_SAMPLE_TYPE: {
2651
+ ALLOWED_KEY: ["stool"],
2652
+ DEFAULT_KEY: "stool",
2653
+ TYPE_KEY: "string"
2654
+ }
2655
+ }
1907
2656
  }
1908
2657
  }
1909
2658
  }
@@ -1963,6 +2712,8 @@ class TestMetadataExtender(TestCase):
1963
2712
  HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
1964
2713
  SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"]
1965
2714
  })
2715
+ # Config is pre-resolved: sample type's metadata_fields includes
2716
+ # host fields merged in, plus sample_type and qiita_sample_type
1966
2717
  full_flat_config_dict = {
1967
2718
  DEFAULT_KEY: "not provided",
1968
2719
  LEAVE_REQUIREDS_BLANK_KEY: False,
@@ -1972,7 +2723,18 @@ class TestMetadataExtender(TestCase):
1972
2723
  METADATA_FIELDS_KEY: {},
1973
2724
  SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
1974
2725
  "stool": {
1975
- METADATA_FIELDS_KEY: {}
2726
+ METADATA_FIELDS_KEY: {
2727
+ SAMPLE_TYPE_KEY: {
2728
+ ALLOWED_KEY: ["stool"],
2729
+ DEFAULT_KEY: "stool",
2730
+ TYPE_KEY: "string"
2731
+ },
2732
+ QIITA_SAMPLE_TYPE: {
2733
+ ALLOWED_KEY: ["stool"],
2734
+ DEFAULT_KEY: "stool",
2735
+ TYPE_KEY: "string"
2736
+ }
2737
+ }
1976
2738
  }
1977
2739
  }
1978
2740
  }
@@ -2002,6 +2764,8 @@ class TestMetadataExtender(TestCase):
2002
2764
  SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
2003
2765
  "source_field": ["hello", "world"]
2004
2766
  })
2767
+ # Config is pre-resolved: sample type's metadata_fields includes
2768
+ # host fields merged in, plus sample_type and qiita_sample_type
2005
2769
  full_flat_config_dict = {
2006
2770
  DEFAULT_KEY: "not provided",
2007
2771
  LEAVE_REQUIREDS_BLANK_KEY: False,
@@ -2019,7 +2783,18 @@ class TestMetadataExtender(TestCase):
2019
2783
  METADATA_FIELDS_KEY: {},
2020
2784
  SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
2021
2785
  "stool": {
2022
- METADATA_FIELDS_KEY: {}
2786
+ METADATA_FIELDS_KEY: {
2787
+ SAMPLE_TYPE_KEY: {
2788
+ ALLOWED_KEY: ["stool"],
2789
+ DEFAULT_KEY: "stool",
2790
+ TYPE_KEY: "string"
2791
+ },
2792
+ QIITA_SAMPLE_TYPE: {
2793
+ ALLOWED_KEY: ["stool"],
2794
+ DEFAULT_KEY: "stool",
2795
+ TYPE_KEY: "string"
2796
+ }
2797
+ }
2023
2798
  }
2024
2799
  }
2025
2800
  }
@@ -2049,62 +2824,613 @@ class TestMetadataExtender(TestCase):
2049
2824
  def test__populate_metadata_df_nan_sample_name_raises(self):
2050
2825
  """Test that NaN sample name raises ValueError."""
2051
2826
  input_df = pandas.DataFrame({
2052
- SAMPLE_NAME_KEY: ["sample1", np.nan],
2827
+ SAMPLE_NAME_KEY: ["sample1", np.nan],
2828
+ HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
2829
+ SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"]
2830
+ })
2831
+ full_flat_config_dict = {
2832
+ HOST_TYPE_SPECIFIC_METADATA_KEY: {}
2833
+ }
2834
+
2835
+ with self.assertRaisesRegex(ValueError, "Metadata contains NaN sample names"):
2836
+ _populate_metadata_df(input_df, full_flat_config_dict, None)
2837
+
2838
+ # Tests for extend_metadata_df
2839
+
2840
+ TEST_DIR = path.dirname(__file__)
2841
+ TEST_STDS_FP = path.join(TEST_DIR, "data/test_standards.yml")
2842
+
2843
+ def test_extend_metadata_df_basic(self):
2844
+ """Test basic metadata extension with study config."""
2845
+ input_df = pandas.DataFrame({
2846
+ SAMPLE_NAME_KEY: ["sample1", "sample2"],
2847
+ HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
2848
+ SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"]
2849
+ })
2850
+ study_config = {
2851
+ DEFAULT_KEY: "not provided",
2852
+ LEAVE_REQUIREDS_BLANK_KEY: True,
2853
+ OVERWRITE_NON_NANS_KEY: False,
2854
+ STUDY_SPECIFIC_METADATA_KEY: {
2855
+ HOST_TYPE_SPECIFIC_METADATA_KEY: {
2856
+ "human": {
2857
+ METADATA_FIELDS_KEY: {
2858
+ "custom_field": {
2859
+ DEFAULT_KEY: "custom_value",
2860
+ TYPE_KEY: "string"
2861
+ }
2862
+ },
2863
+ SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
2864
+ "stool": {
2865
+ METADATA_FIELDS_KEY: {}
2866
+ }
2867
+ }
2868
+ }
2869
+ }
2870
+ }
2871
+ }
2872
+
2873
+ result_df, validation_msgs_df = extend_metadata_df(
2874
+ input_df, study_config, None, None, self.TEST_STDS_FP)
2875
+
2876
+ expected_df = pandas.DataFrame({
2877
+ SAMPLE_NAME_KEY: ["sample1", "sample2"],
2878
+ # body_product from human stool in test_standards.yml
2879
+ "body_product": ["UBERON:feces", "UBERON:feces"],
2880
+ # body_site inherited from host_associated stool
2881
+ "body_site": ["gut", "gut"],
2882
+ # custom_field from study_specific_metadata
2883
+ "custom_field": ["custom_value", "custom_value"],
2884
+ # description overridden at human level
2885
+ "description": ["human sample", "human sample"],
2886
+ # host_common_name from human level
2887
+ "host_common_name": ["human", "human"],
2888
+ QIITA_SAMPLE_TYPE: ["stool", "stool"],
2889
+ SAMPLE_TYPE_KEY: ["stool", "stool"],
2890
+ HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
2891
+ SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
2892
+ QC_NOTE_KEY: ["", ""]
2893
+ })
2894
+ assert_frame_equal(expected_df, result_df)
2895
+ self.assertTrue(validation_msgs_df.empty)
2896
+
2897
+ def test_extend_metadata_df_with_pre_transformer(self):
2898
+ """Test metadata extension with pre-transformer."""
2899
+ input_df = pandas.DataFrame({
2900
+ SAMPLE_NAME_KEY: ["sample1", "sample2"],
2901
+ HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
2902
+ SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
2903
+ "input_sex": ["F", "Male"]
2904
+ })
2905
+ study_config = {
2906
+ DEFAULT_KEY: "not provided",
2907
+ LEAVE_REQUIREDS_BLANK_KEY: True,
2908
+ OVERWRITE_NON_NANS_KEY: False,
2909
+ METADATA_TRANSFORMERS_KEY: {
2910
+ PRE_TRANSFORMERS_KEY: {
2911
+ "sex": {
2912
+ SOURCES_KEY: ["input_sex"],
2913
+ FUNCTION_KEY: "transform_input_sex_to_std_sex"
2914
+ }
2915
+ }
2916
+ },
2917
+ STUDY_SPECIFIC_METADATA_KEY: {
2918
+ HOST_TYPE_SPECIFIC_METADATA_KEY: {
2919
+ "human": {
2920
+ METADATA_FIELDS_KEY: {},
2921
+ SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
2922
+ "stool": {
2923
+ METADATA_FIELDS_KEY: {}
2924
+ }
2925
+ }
2926
+ }
2927
+ }
2928
+ }
2929
+ }
2930
+
2931
+ result_df, validation_msgs_df = extend_metadata_df(
2932
+ input_df, study_config, None, None, self.TEST_STDS_FP)
2933
+
2934
+ expected_df = pandas.DataFrame({
2935
+ SAMPLE_NAME_KEY: ["sample1", "sample2"],
2936
+ # body_product from human stool in test_standards.yml
2937
+ "body_product": ["UBERON:feces", "UBERON:feces"],
2938
+ "body_site": ["gut", "gut"],
2939
+ # description overridden at human level
2940
+ "description": ["human sample", "human sample"],
2941
+ "host_common_name": ["human", "human"],
2942
+ "input_sex": ["F", "Male"],
2943
+ QIITA_SAMPLE_TYPE: ["stool", "stool"],
2944
+ SAMPLE_TYPE_KEY: ["stool", "stool"],
2945
+ "sex": ["female", "male"],
2946
+ HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
2947
+ SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
2948
+ QC_NOTE_KEY: ["", ""]
2949
+ })
2950
+ assert_frame_equal(expected_df, result_df)
2951
+
2952
+ def test_extend_metadata_df_with_custom_transformer(self):
2953
+ """Test metadata extension with custom transformer function."""
2954
+ input_df = pandas.DataFrame({
2955
+ SAMPLE_NAME_KEY: ["sample1", "sample2"],
2956
+ HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
2957
+ SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
2958
+ "source_field": ["hello", "world"]
2959
+ })
2960
+ study_config = {
2961
+ DEFAULT_KEY: "not provided",
2962
+ LEAVE_REQUIREDS_BLANK_KEY: True,
2963
+ OVERWRITE_NON_NANS_KEY: False,
2964
+ METADATA_TRANSFORMERS_KEY: {
2965
+ PRE_TRANSFORMERS_KEY: {
2966
+ "upper_field": {
2967
+ SOURCES_KEY: ["source_field"],
2968
+ FUNCTION_KEY: "custom_upper"
2969
+ }
2970
+ }
2971
+ },
2972
+ STUDY_SPECIFIC_METADATA_KEY: {
2973
+ HOST_TYPE_SPECIFIC_METADATA_KEY: {
2974
+ "human": {
2975
+ METADATA_FIELDS_KEY: {},
2976
+ SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
2977
+ "stool": {
2978
+ METADATA_FIELDS_KEY: {}
2979
+ }
2980
+ }
2981
+ }
2982
+ }
2983
+ }
2984
+ }
2985
+
2986
+ def custom_upper(row, source_fields):
2987
+ return row[source_fields[0]].upper()
2988
+
2989
+ transformer_funcs_dict = {"custom_upper": custom_upper}
2990
+
2991
+ result_df, validation_msgs_df = extend_metadata_df(
2992
+ input_df, study_config, transformer_funcs_dict, None, self.TEST_STDS_FP)
2993
+
2994
+ expected_df = pandas.DataFrame({
2995
+ SAMPLE_NAME_KEY: ["sample1", "sample2"],
2996
+ "body_product": ["UBERON:feces", "UBERON:feces"],
2997
+ "body_site": ["gut", "gut"],
2998
+ "description": ["human sample", "human sample"],
2999
+ "host_common_name": ["human", "human"],
3000
+ QIITA_SAMPLE_TYPE: ["stool", "stool"],
3001
+ SAMPLE_TYPE_KEY: ["stool", "stool"],
3002
+ "source_field": ["hello", "world"],
3003
+ "upper_field": ["HELLO", "WORLD"],
3004
+ HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
3005
+ SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
3006
+ QC_NOTE_KEY: ["", ""]
3007
+ })
3008
+ assert_frame_equal(expected_df, result_df)
3009
+
3010
+ def test_extend_metadata_df_missing_required_columns_raises(self):
3011
+ """Test that missing required columns raises ValueError."""
3012
+ input_df = pandas.DataFrame({
3013
+ SAMPLE_NAME_KEY: ["sample1", "sample2"]
3014
+ # Missing HOSTTYPE_SHORTHAND_KEY and SAMPLETYPE_SHORTHAND_KEY
3015
+ })
3016
+ study_config = {}
3017
+
3018
+ with self.assertRaisesRegex(ValueError, "metadata missing required columns"):
3019
+ extend_metadata_df(input_df, study_config, None, None, self.TEST_STDS_FP)
3020
+
3021
+ def test_extend_metadata_df_none_study_config(self):
3022
+ """Test metadata extension with None study config uses standards only."""
3023
+ input_df = pandas.DataFrame({
3024
+ SAMPLE_NAME_KEY: ["sample1"],
3025
+ HOSTTYPE_SHORTHAND_KEY: ["human"],
3026
+ SAMPLETYPE_SHORTHAND_KEY: ["stool"]
3027
+ })
3028
+
3029
+ result_df, validation_msgs_df = extend_metadata_df(
3030
+ input_df, None, None, None, self.TEST_STDS_FP)
3031
+
3032
+ expected_df = pandas.DataFrame({
3033
+ SAMPLE_NAME_KEY: ["sample1"],
3034
+ "body_product": ["UBERON:feces"],
3035
+ "body_site": ["gut"],
3036
+ "description": ["human sample"],
3037
+ "host_common_name": ["human"],
3038
+ QIITA_SAMPLE_TYPE: ["stool"],
3039
+ SAMPLE_TYPE_KEY: ["stool"],
3040
+ HOSTTYPE_SHORTHAND_KEY: ["human"],
3041
+ SAMPLETYPE_SHORTHAND_KEY: ["stool"],
3042
+ QC_NOTE_KEY: [""]
3043
+ })
3044
+ assert_frame_equal(expected_df, result_df)
3045
+
3046
+ def test_extend_metadata_df_unknown_host_type(self):
3047
+ """Test that unknown host type adds QC note."""
3048
+ input_df = pandas.DataFrame({
3049
+ SAMPLE_NAME_KEY: ["sample1"],
3050
+ HOSTTYPE_SHORTHAND_KEY: ["unknown_host"],
3051
+ SAMPLETYPE_SHORTHAND_KEY: ["stool"]
3052
+ })
3053
+ study_config = {
3054
+ DEFAULT_KEY: "not provided",
3055
+ LEAVE_REQUIREDS_BLANK_KEY: True,
3056
+ OVERWRITE_NON_NANS_KEY: False,
3057
+ STUDY_SPECIFIC_METADATA_KEY: {
3058
+ HOST_TYPE_SPECIFIC_METADATA_KEY: {
3059
+ "human": {
3060
+ METADATA_FIELDS_KEY: {},
3061
+ SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
3062
+ "stool": {
3063
+ METADATA_FIELDS_KEY: {}
3064
+ }
3065
+ }
3066
+ }
3067
+ }
3068
+ }
3069
+ }
3070
+
3071
+ result_df, validation_msgs_df = extend_metadata_df(
3072
+ input_df, study_config, None, None, self.TEST_STDS_FP)
3073
+
3074
+ expected_df = pandas.DataFrame({
3075
+ SAMPLE_NAME_KEY: ["sample1"],
3076
+ HOSTTYPE_SHORTHAND_KEY: ["unknown_host"],
3077
+ SAMPLETYPE_SHORTHAND_KEY: ["stool"],
3078
+ QC_NOTE_KEY: ["invalid host_type"]
3079
+ })
3080
+ assert_frame_equal(expected_df, result_df)
3081
+
3082
+ def test_extend_metadata_df_multiple_host_types(self):
3083
+ """Test metadata extension with multiple host types."""
3084
+ input_df = pandas.DataFrame({
3085
+ SAMPLE_NAME_KEY: ["sample1", "sample2", "sample3"],
3086
+ HOSTTYPE_SHORTHAND_KEY: ["human", "mouse", "human"],
3087
+ SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool", "blood"]
3088
+ })
3089
+ study_config = {
3090
+ DEFAULT_KEY: "not provided",
3091
+ LEAVE_REQUIREDS_BLANK_KEY: True,
3092
+ OVERWRITE_NON_NANS_KEY: False,
3093
+ STUDY_SPECIFIC_METADATA_KEY: {
3094
+ HOST_TYPE_SPECIFIC_METADATA_KEY: {
3095
+ "human": {
3096
+ METADATA_FIELDS_KEY: {},
3097
+ SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
3098
+ "stool": {
3099
+ METADATA_FIELDS_KEY: {}
3100
+ },
3101
+ "blood": {
3102
+ METADATA_FIELDS_KEY: {}
3103
+ }
3104
+ }
3105
+ },
3106
+ "mouse": {
3107
+ METADATA_FIELDS_KEY: {},
3108
+ SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
3109
+ "stool": {
3110
+ METADATA_FIELDS_KEY: {}
3111
+ }
3112
+ }
3113
+ }
3114
+ }
3115
+ }
3116
+ }
3117
+
3118
+ result_df, validation_msgs_df = extend_metadata_df(
3119
+ input_df, study_config, None, None, self.TEST_STDS_FP)
3120
+
3121
+ # After processing multiple host types, rows may be reordered
3122
+ # Human samples are processed together, then mouse samples
3123
+ expected_df = pandas.DataFrame({
3124
+ SAMPLE_NAME_KEY: ["sample1", "sample3", "sample2"],
3125
+ # body_product: human stool/blood have it, mouse stool uses default
3126
+ "body_product": ["UBERON:feces", "UBERON:blood", "not provided"],
3127
+ "body_site": ["gut", "blood", "gut"],
3128
+ # description: human overrides to "human sample",
3129
+ # mouse inherits "host associated sample"
3130
+ "description": ["human sample", "human sample", "host associated sample"],
3131
+ "host_common_name": ["human", "human", "mouse"],
3132
+ QIITA_SAMPLE_TYPE: ["stool", "blood", "stool"],
3133
+ SAMPLE_TYPE_KEY: ["stool", "blood", "stool"],
3134
+ HOSTTYPE_SHORTHAND_KEY: ["human", "human", "mouse"],
3135
+ SAMPLETYPE_SHORTHAND_KEY: ["stool", "blood", "stool"],
3136
+ QC_NOTE_KEY: ["", "", ""]
3137
+ })
3138
+ assert_frame_equal(expected_df, result_df)
3139
+
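The expected values in the multiple-host-type test above imply a precedence order for defaults: a sample-type block inside a host type wins over the host type's own fields, which win over fields inherited from a more general host type, and anything still missing falls back to the study-wide default of "not provided". The diff does not show metameq's resolver, so the following is only an illustrative sketch of that precedence using plain dictionaries; all layer names are hypothetical.

# Resolve one field by walking layers from most specific to most general.
def resolve_field(field, layers, default="not provided"):
    for layer in layers:
        if field in layer:
            return layer[field]
    return default

# Layers mimicking what the expected DataFrame above asserts.
host_associated = {"description": "host associated sample", "body_site": "gut"}
human = {"description": "human sample", "host_common_name": "human"}
human_stool = {"body_product": "UBERON:feces"}
mouse = {"host_common_name": "mouse"}
mouse_stool = {}  # no mouse-stool-specific fields in these tests

print(resolve_field("description", [human_stool, human, host_associated]))   # human sample
print(resolve_field("description", [mouse_stool, mouse, host_associated]))   # host associated sample
print(resolve_field("body_product", [mouse_stool, mouse, host_associated]))  # not provided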
3140
+ def test_extend_metadata_df_with_software_config(self):
3141
+ """Test metadata extension with custom software config overrides defaults."""
3142
+ input_df = pandas.DataFrame({
3143
+ SAMPLE_NAME_KEY: ["sample1", "sample2"],
3144
+ HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
3145
+ SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"]
3146
+ })
3147
+ # Software config with custom default value
3148
+ software_config = {
3149
+ DEFAULT_KEY: "custom_software_default",
3150
+ LEAVE_REQUIREDS_BLANK_KEY: True,
3151
+ OVERWRITE_NON_NANS_KEY: False
3152
+ }
3153
+ # Study config that doesn't override DEFAULT_KEY
3154
+ study_config = {
3155
+ STUDY_SPECIFIC_METADATA_KEY: {
3156
+ HOST_TYPE_SPECIFIC_METADATA_KEY: {
3157
+ "human": {
3158
+ METADATA_FIELDS_KEY: {
3159
+ "study_field": {
3160
+ DEFAULT_KEY: "study_value",
3161
+ TYPE_KEY: "string"
3162
+ }
3163
+ },
3164
+ SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
3165
+ "stool": {
3166
+ METADATA_FIELDS_KEY: {}
3167
+ }
3168
+ }
3169
+ }
3170
+ }
3171
+ }
3172
+ }
3173
+
3174
+ result_df, validation_msgs_df = extend_metadata_df(
3175
+ input_df, study_config, None, software_config, self.TEST_STDS_FP)
3176
+
3177
+ expected_df = pandas.DataFrame({
3178
+ SAMPLE_NAME_KEY: ["sample1", "sample2"],
3179
+ "body_product": ["UBERON:feces", "UBERON:feces"],
3180
+ "body_site": ["gut", "gut"],
3181
+ "description": ["human sample", "human sample"],
3182
+ "host_common_name": ["human", "human"],
3183
+ QIITA_SAMPLE_TYPE: ["stool", "stool"],
3184
+ SAMPLE_TYPE_KEY: ["stool", "stool"],
3185
+ "study_field": ["study_value", "study_value"],
3186
+ HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
3187
+ SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
3188
+ QC_NOTE_KEY: ["", ""]
3189
+ })
3190
+ assert_frame_equal(expected_df, result_df)
3191
+
3192
+ # Tests for _get_study_specific_config
3193
+
3194
+ def test__get_study_specific_config_with_valid_file(self):
3195
+ """Test loading study-specific config from a valid YAML file."""
3196
+ config_fp = path.join(self.TEST_DIR, "data/test_config.yml")
3197
+
3198
+ result = _get_study_specific_config(config_fp)
3199
+
3200
+ expected = {
3201
+ HOST_TYPE_SPECIFIC_METADATA_KEY: {
3202
+ "base": {
3203
+ METADATA_FIELDS_KEY: {
3204
+ "sample_name": {
3205
+ TYPE_KEY: "string",
3206
+ "unique": True
3207
+ },
3208
+ "sample_type": {
3209
+ "empty": False,
3210
+ "is_phi": False
3211
+ }
3212
+ }
3213
+ }
3214
+ }
3215
+ }
3216
+ self.assertDictEqual(expected, result)
3217
+
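The expected dictionary above suggests the study config file is plain YAML with the same nesting (host types, then metadata_fields, then per-field rules). The real data/test_config.yml is not included in this diff, so the snippet below only illustrates a YAML shape that yaml.safe_load would turn into a comparable structure; the literal top-level key spelling is a guess at the constant's value, not copied from metameq.

import yaml  # PyYAML

# Illustrative only: key spellings are assumptions.
CONFIG_YAML = """
host_type_specific_metadata:
  base:
    metadata_fields:
      sample_name:
        type: string
        unique: true
      sample_type:
        empty: false
        is_phi: false
"""

config = yaml.safe_load(CONFIG_YAML)
print(config["host_type_specific_metadata"]["base"]["metadata_fields"]["sample_name"])
# -> {'type': 'string', 'unique': True}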
3218
+ def test__get_study_specific_config_with_none(self):
3219
+ """Test that None file path returns None."""
3220
+ result = _get_study_specific_config(None)
3221
+
3222
+ self.assertIsNone(result)
3223
+
3224
+ def test__get_study_specific_config_with_empty_string(self):
3225
+ """Test that empty string file path returns None."""
3226
+ result = _get_study_specific_config("")
3227
+
3228
+ self.assertIsNone(result)
3229
+
3230
+ def test__get_study_specific_config_nonexistent_file_raises(self):
3231
+ """Test that nonexistent file raises FileNotFoundError."""
3232
+ with self.assertRaises(FileNotFoundError):
3233
+ _get_study_specific_config("/nonexistent/path/config.yml")
3234
+
3235
+ def test__get_study_specific_config_invalid_yaml_raises(self):
3236
+ """Test that invalid YAML file raises an error."""
3237
+ invalid_fp = path.join(self.TEST_DIR, "data/invalid.yml")
3238
+
3239
+ with self.assertRaises(Exception):
3240
+ _get_study_specific_config(invalid_fp)
3241
+
3242
+ # Tests for _output_metadata_df_to_files
3243
+
3244
+ def test__output_metadata_df_to_files_basic(self):
3245
+ """Test basic output of metadata DataFrame to file."""
3246
+ input_df = pandas.DataFrame({
3247
+ SAMPLE_NAME_KEY: ["sample1", "sample2"],
3248
+ "field_a": ["a1", "a2"],
3249
+ HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
3250
+ SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
3251
+ QC_NOTE_KEY: ["", ""]
3252
+ })
3253
+
3254
+ with tempfile.TemporaryDirectory() as tmpdir:
3255
+ _output_metadata_df_to_files(
3256
+ input_df, tmpdir, "test_output", INTERNAL_COL_KEYS,
3257
+ sep="\t", remove_internals_and_fails=False)
3258
+
3259
+ # Find the output file (has timestamp prefix)
3260
+ output_files = glob.glob(os.path.join(tmpdir, "*_test_output.txt"))
3261
+ self.assertEqual(1, len(output_files))
3262
+
3263
+ # Read and verify contents (keep_default_na=False preserves empty strings)
3264
+ result_df = pandas.read_csv(output_files[0], sep="\t", keep_default_na=False)
3265
+ expected_df = input_df
3266
+ assert_frame_equal(expected_df, result_df)
3267
+
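Two details of the verification pattern above recur in the remaining output tests: the output filename carries a run-specific prefix (the comment calls it a timestamp), so the tests locate it with a glob rather than a fixed name, and the file is read back with keep_default_na=False so empty QC-note cells come back as empty strings rather than NaN. A self-contained sketch of that read-back pattern, with an assumed prefix format (the real prefix format is not shown in this diff):

import glob
import os
import tempfile
from datetime import datetime

import pandas

with tempfile.TemporaryDirectory() as tmpdir:
    # Write a file with an assumed timestamp-style prefix, as the glob pattern implies.
    prefix = datetime.now().strftime("%Y%m%d%H%M%S")
    out_fp = os.path.join(tmpdir, f"{prefix}_test_output.txt")
    pandas.DataFrame({"sample_name": ["s1"], "qc_note": [""]}).to_csv(
        out_fp, sep="\t", index=False)

    # Locate and re-read it the way the tests do, preserving empty strings.
    matches = glob.glob(os.path.join(tmpdir, "*_test_output.txt"))
    assert len(matches) == 1
    round_trip = pandas.read_csv(matches[0], sep="\t", keep_default_na=False)
    print(round_trip["qc_note"].tolist())  # [''] rather than [nan]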
3268
+ def test__output_metadata_df_to_files_remove_internals_and_fails(self):
3269
+ """Test output with internal columns and failures removed."""
3270
+ input_df = pandas.DataFrame({
3271
+ SAMPLE_NAME_KEY: ["sample1", "sample2", "sample3"],
3272
+ "field_a": ["a1", "a2", "a3"],
3273
+ HOSTTYPE_SHORTHAND_KEY: ["human", "human", "human"],
3274
+ SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool", "stool"],
3275
+ QC_NOTE_KEY: ["", "invalid host_type", ""]
3276
+ })
3277
+
3278
+ with tempfile.TemporaryDirectory() as tmpdir:
3279
+ _output_metadata_df_to_files(
3280
+ input_df, tmpdir, "test_output", INTERNAL_COL_KEYS,
3281
+ sep="\t", remove_internals_and_fails=True)
3282
+
3283
+ # Find the main output file
3284
+ output_files = glob.glob(os.path.join(tmpdir, "*_test_output.txt"))
3285
+ self.assertEqual(1, len(output_files))
3286
+
3287
+ # Verify main output has internal cols removed and no failures
3288
+ result_df = pandas.read_csv(output_files[0], sep="\t")
3289
+ expected_df = pandas.DataFrame({
3290
+ SAMPLE_NAME_KEY: ["sample1", "sample3"],
3291
+ "field_a": ["a1", "a3"]
3292
+ })
3293
+ assert_frame_equal(expected_df, result_df)
3294
+
3295
+ # Find the fails file
3296
+ fails_files = glob.glob(os.path.join(tmpdir, "*_test_output_fails.csv"))
3297
+ self.assertEqual(1, len(fails_files))
3298
+
3299
+ # Verify fails file contains the failed row
3300
+ fails_df = pandas.read_csv(fails_files[0], sep=",")
3301
+ expected_fails_df = pandas.DataFrame({
3302
+ SAMPLE_NAME_KEY: ["sample2"],
3303
+ "field_a": ["a2"],
3304
+ HOSTTYPE_SHORTHAND_KEY: ["human"],
3305
+ SAMPLETYPE_SHORTHAND_KEY: ["stool"],
3306
+ QC_NOTE_KEY: ["invalid host_type"]
3307
+ })
3308
+ assert_frame_equal(expected_fails_df, fails_df)
3309
+
3310
+ def test__output_metadata_df_to_files_no_failures_creates_empty_file(self):
3311
+ """Test that empty fails file is created when there are no failures."""
3312
+ input_df = pandas.DataFrame({
3313
+ SAMPLE_NAME_KEY: ["sample1", "sample2"],
3314
+ "field_a": ["a1", "a2"],
3315
+ HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
3316
+ SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
3317
+ QC_NOTE_KEY: ["", ""]
3318
+ })
3319
+
3320
+ with tempfile.TemporaryDirectory() as tmpdir:
3321
+ _output_metadata_df_to_files(
3322
+ input_df, tmpdir, "test_output", INTERNAL_COL_KEYS,
3323
+ sep="\t", remove_internals_and_fails=True,
3324
+ suppress_empty_fails=False)
3325
+
3326
+ # Find the fails file
3327
+ fails_files = glob.glob(os.path.join(tmpdir, "*_test_output_fails.csv"))
3328
+ self.assertEqual(1, len(fails_files))
3329
+
3330
+ # Verify fails file is empty (zero bytes)
3331
+ self.assertEqual(0, os.path.getsize(fails_files[0]))
3332
+
3333
+ def test__output_metadata_df_to_files_suppress_empty_fails(self):
3334
+ """Test that empty fails file is not created when suppress_empty_fails=True."""
3335
+ input_df = pandas.DataFrame({
3336
+ SAMPLE_NAME_KEY: ["sample1", "sample2"],
3337
+ "field_a": ["a1", "a2"],
3338
+ HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
3339
+ SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
3340
+ QC_NOTE_KEY: ["", ""]
3341
+ })
3342
+
3343
+ with tempfile.TemporaryDirectory() as tmpdir:
3344
+ _output_metadata_df_to_files(
3345
+ input_df, tmpdir, "test_output", INTERNAL_COL_KEYS,
3346
+ sep="\t", remove_internals_and_fails=True,
3347
+ suppress_empty_fails=True)
3348
+
3349
+ # Find the fails file - should not exist
3350
+ fails_files = glob.glob(os.path.join(tmpdir, "*_test_output_fails.csv"))
3351
+ self.assertEqual(0, len(fails_files))
3352
+
3353
+ # Main output file should still exist
3354
+ output_files = glob.glob(os.path.join(tmpdir, "*_test_output.txt"))
3355
+ self.assertEqual(1, len(output_files))
3356
+
3357
+ def test__output_metadata_df_to_files_csv_separator(self):
3358
+ """Test output with comma separator creates .csv file."""
3359
+ input_df = pandas.DataFrame({
3360
+ SAMPLE_NAME_KEY: ["sample1", "sample2"],
3361
+ "field_a": ["a1", "a2"],
3362
+ HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
3363
+ SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
3364
+ QC_NOTE_KEY: ["", ""]
3365
+ })
3366
+
3367
+ with tempfile.TemporaryDirectory() as tmpdir:
3368
+ _output_metadata_df_to_files(
3369
+ input_df, tmpdir, "test_output", INTERNAL_COL_KEYS,
3370
+ sep=",", remove_internals_and_fails=False)
3371
+
3372
+ # Find the output file with .csv extension
3373
+ output_files = glob.glob(os.path.join(tmpdir, "*_test_output.csv"))
3374
+ self.assertEqual(1, len(output_files))
3375
+
3376
+ # Read and verify contents (keep_default_na=False preserves empty strings)
3377
+ result_df = pandas.read_csv(output_files[0], sep=",", keep_default_na=False)
3378
+ expected_df = input_df
3379
+ assert_frame_equal(expected_df, result_df)
3380
+
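Taken together, the tab- and comma-separator tests imply that _output_metadata_df_to_files picks the output extension from the separator: tab-delimited output is written as .txt and comma-delimited output as .csv, while the fails file is always a .csv. The exact rule is not visible in this diff, so the mapping below is an inference from the asserted glob patterns rather than the library's code:

# Inferred separator-to-extension rule; verify against metameq before relying on it.
def extension_for_separator(sep):
    if sep == "\t":
        return ".txt"
    if sep == ",":
        return ".csv"
    raise ValueError(f"no extension rule inferred for separator {sep!r}")

print(extension_for_separator("\t"))  # .txt
print(extension_for_separator(","))   # .csv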
3381
+ def test__output_metadata_df_to_files_all_failures(self):
3382
+ """Test output when all rows are failures."""
3383
+ input_df = pandas.DataFrame({
3384
+ SAMPLE_NAME_KEY: ["sample1", "sample2"],
3385
+ "field_a": ["a1", "a2"],
2053
3386
  HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
2054
- SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"]
3387
+ SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
3388
+ QC_NOTE_KEY: ["invalid host_type", "invalid sample_type"]
2055
3389
  })
2056
- full_flat_config_dict = {
2057
- HOST_TYPE_SPECIFIC_METADATA_KEY: {}
2058
- }
2059
3390
 
2060
- with self.assertRaisesRegex(ValueError, "Metadata contains NaN sample names"):
2061
- _populate_metadata_df(input_df, full_flat_config_dict, None)
3391
+ with tempfile.TemporaryDirectory() as tmpdir:
3392
+ _output_metadata_df_to_files(
3393
+ input_df, tmpdir, "test_output", INTERNAL_COL_KEYS,
3394
+ sep="\t", remove_internals_and_fails=True)
2062
3395
 
2063
- # Tests for extend_metadata_df
3396
+ # Main output file should have only headers (empty data)
3397
+ output_files = glob.glob(os.path.join(tmpdir, "*_test_output.txt"))
3398
+ self.assertEqual(1, len(output_files))
3399
+ result_df = pandas.read_csv(output_files[0], sep="\t")
3400
+ self.assertTrue(result_df.empty)
3401
+ self.assertEqual([SAMPLE_NAME_KEY, "field_a"], list(result_df.columns))
2064
3402
 
2065
- TEST_DIR = path.dirname(__file__)
2066
- TEST_STDS_FP = path.join(TEST_DIR, "data/test_standards.yml")
3403
+ # Fails file should have both rows
3404
+ fails_files = glob.glob(os.path.join(tmpdir, "*_test_output_fails.csv"))
3405
+ self.assertEqual(1, len(fails_files))
3406
+ fails_df = pandas.read_csv(fails_files[0], sep=",")
3407
+ self.assertEqual(2, len(fails_df))
2067
3408
 
2068
- def test_extend_metadata_df_basic(self):
2069
- """Test basic metadata extension with study config."""
3409
+ # Tests for get_extended_metadata_from_df_and_yaml
3410
+
3411
+ TEST_STUDY_CONFIG_FP = path.join(TEST_DIR, "data/test_study_config.yml")
3412
+
3413
+ def test_get_extended_metadata_from_df_and_yaml_with_config(self):
3414
+ """Test extending metadata with a study-specific YAML config file."""
2070
3415
  input_df = pandas.DataFrame({
2071
3416
  SAMPLE_NAME_KEY: ["sample1", "sample2"],
2072
3417
  HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
2073
3418
  SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"]
2074
3419
  })
2075
- study_config = {
2076
- DEFAULT_KEY: "not provided",
2077
- LEAVE_REQUIREDS_BLANK_KEY: True,
2078
- OVERWRITE_NON_NANS_KEY: False,
2079
- STUDY_SPECIFIC_METADATA_KEY: {
2080
- HOST_TYPE_SPECIFIC_METADATA_KEY: {
2081
- "human": {
2082
- METADATA_FIELDS_KEY: {
2083
- "custom_field": {
2084
- DEFAULT_KEY: "custom_value",
2085
- TYPE_KEY: "string"
2086
- }
2087
- },
2088
- SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
2089
- "stool": {
2090
- METADATA_FIELDS_KEY: {}
2091
- }
2092
- }
2093
- }
2094
- }
2095
- }
2096
- }
2097
3420
 
2098
- result_df, validation_msgs_df = extend_metadata_df(
2099
- input_df, study_config, None, None, self.TEST_STDS_FP)
3421
+ result_df, validation_msgs_df = get_extended_metadata_from_df_and_yaml(
3422
+ input_df, self.TEST_STUDY_CONFIG_FP, self.TEST_STDS_FP)
2100
3423
 
2101
3424
  expected_df = pandas.DataFrame({
2102
3425
  SAMPLE_NAME_KEY: ["sample1", "sample2"],
3426
+ "body_product": ["UBERON:feces", "UBERON:feces"],
2103
3427
  "body_site": ["gut", "gut"],
2104
- "custom_field": ["custom_value", "custom_value"],
3428
+ "description": ["human sample", "human sample"],
2105
3429
  "host_common_name": ["human", "human"],
2106
3430
  QIITA_SAMPLE_TYPE: ["stool", "stool"],
2107
3431
  SAMPLE_TYPE_KEY: ["stool", "stool"],
3432
+ "study_custom_field": ["custom_value", "custom_value"],
3433
+ "study_stool_field": ["stool_custom", "stool_custom"],
2108
3434
  HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
2109
3435
  SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
2110
3436
  QC_NOTE_KEY: ["", ""]
@@ -2112,153 +3438,68 @@ class TestMetadataExtender(TestCase):
2112
3438
  assert_frame_equal(expected_df, result_df)
2113
3439
  self.assertTrue(validation_msgs_df.empty)
2114
3440
 
2115
- def test_extend_metadata_df_with_pre_transformer(self):
2116
- """Test metadata extension with pre-transformer."""
3441
+ def test_get_extended_metadata_from_df_and_yaml_none_config(self):
3442
+ """Test extending metadata with None for study_specific_config_fp."""
2117
3443
  input_df = pandas.DataFrame({
2118
3444
  SAMPLE_NAME_KEY: ["sample1", "sample2"],
2119
3445
  HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
2120
- SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
2121
- "input_sex": ["F", "Male"]
3446
+ SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"]
2122
3447
  })
2123
- study_config = {
2124
- DEFAULT_KEY: "not provided",
2125
- LEAVE_REQUIREDS_BLANK_KEY: True,
2126
- OVERWRITE_NON_NANS_KEY: False,
2127
- METADATA_TRANSFORMERS_KEY: {
2128
- PRE_TRANSFORMERS_KEY: {
2129
- "sex": {
2130
- SOURCES_KEY: ["input_sex"],
2131
- FUNCTION_KEY: "transform_input_sex_to_std_sex"
2132
- }
2133
- }
2134
- },
2135
- STUDY_SPECIFIC_METADATA_KEY: {
2136
- HOST_TYPE_SPECIFIC_METADATA_KEY: {
2137
- "human": {
2138
- METADATA_FIELDS_KEY: {},
2139
- SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
2140
- "stool": {
2141
- METADATA_FIELDS_KEY: {}
2142
- }
2143
- }
2144
- }
2145
- }
2146
- }
2147
- }
2148
3448
 
2149
- result_df, validation_msgs_df = extend_metadata_df(
2150
- input_df, study_config, None, None, self.TEST_STDS_FP)
3449
+ result_df, validation_msgs_df = get_extended_metadata_from_df_and_yaml(
3450
+ input_df, None, self.TEST_STDS_FP)
2151
3451
 
2152
3452
  expected_df = pandas.DataFrame({
2153
3453
  SAMPLE_NAME_KEY: ["sample1", "sample2"],
3454
+ "body_product": ["UBERON:feces", "UBERON:feces"],
2154
3455
  "body_site": ["gut", "gut"],
3456
+ "description": ["human sample", "human sample"],
2155
3457
  "host_common_name": ["human", "human"],
2156
- "input_sex": ["F", "Male"],
2157
3458
  QIITA_SAMPLE_TYPE: ["stool", "stool"],
2158
3459
  SAMPLE_TYPE_KEY: ["stool", "stool"],
2159
- "sex": ["female", "male"],
2160
3460
  HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
2161
3461
  SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
2162
3462
  QC_NOTE_KEY: ["", ""]
2163
3463
  })
2164
3464
  assert_frame_equal(expected_df, result_df)
3465
+ self.assertTrue(validation_msgs_df.empty)
2165
3466
 
2166
- def test_extend_metadata_df_with_custom_transformer(self):
2167
- """Test metadata extension with custom transformer function."""
3467
+ def test_get_extended_metadata_from_df_and_yaml_invalid_host_type(self):
3468
+ """Test that invalid host types are flagged with QC note."""
2168
3469
  input_df = pandas.DataFrame({
2169
3470
  SAMPLE_NAME_KEY: ["sample1", "sample2"],
2170
- HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
2171
- SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
2172
- "source_field": ["hello", "world"]
3471
+ HOSTTYPE_SHORTHAND_KEY: ["unknown_host", "human"],
3472
+ SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"]
2173
3473
  })
2174
- study_config = {
2175
- DEFAULT_KEY: "not provided",
2176
- LEAVE_REQUIREDS_BLANK_KEY: True,
2177
- OVERWRITE_NON_NANS_KEY: False,
2178
- METADATA_TRANSFORMERS_KEY: {
2179
- PRE_TRANSFORMERS_KEY: {
2180
- "upper_field": {
2181
- SOURCES_KEY: ["source_field"],
2182
- FUNCTION_KEY: "custom_upper"
2183
- }
2184
- }
2185
- },
2186
- STUDY_SPECIFIC_METADATA_KEY: {
2187
- HOST_TYPE_SPECIFIC_METADATA_KEY: {
2188
- "human": {
2189
- METADATA_FIELDS_KEY: {},
2190
- SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
2191
- "stool": {
2192
- METADATA_FIELDS_KEY: {}
2193
- }
2194
- }
2195
- }
2196
- }
2197
- }
2198
- }
2199
-
2200
- def custom_upper(row, source_fields):
2201
- return row[source_fields[0]].upper()
2202
-
2203
- transformer_funcs_dict = {"custom_upper": custom_upper}
2204
3474
 
2205
- result_df, validation_msgs_df = extend_metadata_df(
2206
- input_df, study_config, transformer_funcs_dict, None, self.TEST_STDS_FP)
3475
+ result_df, validation_msgs_df = get_extended_metadata_from_df_and_yaml(
3476
+ input_df, self.TEST_STUDY_CONFIG_FP, self.TEST_STDS_FP)
2207
3477
 
2208
3478
  expected_df = pandas.DataFrame({
2209
3479
  SAMPLE_NAME_KEY: ["sample1", "sample2"],
2210
- "body_site": ["gut", "gut"],
2211
- "host_common_name": ["human", "human"],
2212
- QIITA_SAMPLE_TYPE: ["stool", "stool"],
2213
- SAMPLE_TYPE_KEY: ["stool", "stool"],
2214
- "source_field": ["hello", "world"],
2215
- "upper_field": ["HELLO", "WORLD"],
2216
- HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
3480
+ "body_product": ["not provided", "UBERON:feces"],
3481
+ "body_site": ["not provided", "gut"],
3482
+ "description": ["not provided", "human sample"],
3483
+ "host_common_name": ["not provided", "human"],
3484
+ QIITA_SAMPLE_TYPE: ["not provided", "stool"],
3485
+ SAMPLE_TYPE_KEY: ["not provided", "stool"],
3486
+ "study_custom_field": ["not provided", "custom_value"],
3487
+ "study_stool_field": ["not provided", "stool_custom"],
3488
+ HOSTTYPE_SHORTHAND_KEY: ["unknown_host", "human"],
2217
3489
  SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
2218
- QC_NOTE_KEY: ["", ""]
3490
+ QC_NOTE_KEY: ["invalid host_type", ""]
2219
3491
  })
2220
3492
  assert_frame_equal(expected_df, result_df)
3493
+ self.assertTrue(validation_msgs_df.empty)
2221
3494
 
2222
- def test_extend_metadata_df_missing_required_columns_raises(self):
2223
- """Test that missing required columns raises ValueError."""
2224
- input_df = pandas.DataFrame({
2225
- SAMPLE_NAME_KEY: ["sample1", "sample2"]
2226
- # Missing HOSTTYPE_SHORTHAND_KEY and SAMPLETYPE_SHORTHAND_KEY
2227
- })
2228
- study_config = {}
2229
-
2230
- with self.assertRaisesRegex(ValueError, "metadata missing required columns"):
2231
- extend_metadata_df(input_df, study_config, None, None, self.TEST_STDS_FP)
2232
-
2233
- def test_extend_metadata_df_none_study_config(self):
2234
- """Test metadata extension with None study config uses standards only."""
2235
- input_df = pandas.DataFrame({
2236
- SAMPLE_NAME_KEY: ["sample1"],
2237
- HOSTTYPE_SHORTHAND_KEY: ["human"],
2238
- SAMPLETYPE_SHORTHAND_KEY: ["stool"]
2239
- })
2240
-
2241
- result_df, validation_msgs_df = extend_metadata_df(
2242
- input_df, None, None, None, self.TEST_STDS_FP)
2243
-
2244
- expected_df = pandas.DataFrame({
2245
- SAMPLE_NAME_KEY: ["sample1"],
2246
- "body_site": ["gut"],
2247
- "host_common_name": ["human"],
2248
- QIITA_SAMPLE_TYPE: ["stool"],
2249
- SAMPLE_TYPE_KEY: ["stool"],
2250
- HOSTTYPE_SHORTHAND_KEY: ["human"],
2251
- SAMPLETYPE_SHORTHAND_KEY: ["stool"],
2252
- QC_NOTE_KEY: [""]
2253
- })
2254
- assert_frame_equal(expected_df, result_df)
3495
+ # Tests for write_extended_metadata_from_df
2255
3496
 
2256
- def test_extend_metadata_df_unknown_host_type(self):
2257
- """Test that unknown host type adds QC note."""
3497
+ def test_write_extended_metadata_from_df_basic(self):
3498
+ """Test basic writing of extended metadata to files."""
2258
3499
  input_df = pandas.DataFrame({
2259
- SAMPLE_NAME_KEY: ["sample1"],
2260
- HOSTTYPE_SHORTHAND_KEY: ["unknown_host"],
2261
- SAMPLETYPE_SHORTHAND_KEY: ["stool"]
3500
+ SAMPLE_NAME_KEY: ["sample1", "sample2"],
3501
+ HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
3502
+ SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"]
2262
3503
  })
2263
3504
  study_config = {
2264
3505
  DEFAULT_KEY: "not provided",
@@ -2267,7 +3508,12 @@ class TestMetadataExtender(TestCase):
2267
3508
  STUDY_SPECIFIC_METADATA_KEY: {
2268
3509
  HOST_TYPE_SPECIFIC_METADATA_KEY: {
2269
3510
  "human": {
2270
- METADATA_FIELDS_KEY: {},
3511
+ METADATA_FIELDS_KEY: {
3512
+ "custom_field": {
3513
+ DEFAULT_KEY: "custom_value",
3514
+ TYPE_KEY: "string"
3515
+ }
3516
+ },
2271
3517
  SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
2272
3518
  "stool": {
2273
3519
  METADATA_FIELDS_KEY: {}
@@ -2278,23 +3524,60 @@ class TestMetadataExtender(TestCase):
2278
3524
  }
2279
3525
  }
2280
3526
 
2281
- result_df, validation_msgs_df = extend_metadata_df(
2282
- input_df, study_config, None, None, self.TEST_STDS_FP)
3527
+ with tempfile.TemporaryDirectory() as tmpdir:
3528
+ result_df = write_extended_metadata_from_df(
3529
+ input_df, study_config, tmpdir, "test_output",
3530
+ stds_fp=self.TEST_STDS_FP)
2283
3531
 
2284
- expected_df = pandas.DataFrame({
2285
- SAMPLE_NAME_KEY: ["sample1"],
2286
- HOSTTYPE_SHORTHAND_KEY: ["unknown_host"],
2287
- SAMPLETYPE_SHORTHAND_KEY: ["stool"],
2288
- QC_NOTE_KEY: ["invalid host_type"]
2289
- })
2290
- assert_frame_equal(expected_df, result_df)
3532
+ # Verify returned DataFrame
3533
+ expected_df = pandas.DataFrame({
3534
+ SAMPLE_NAME_KEY: ["sample1", "sample2"],
3535
+ "body_product": ["UBERON:feces", "UBERON:feces"],
3536
+ "body_site": ["gut", "gut"],
3537
+ "custom_field": ["custom_value", "custom_value"],
3538
+ "description": ["human sample", "human sample"],
3539
+ "host_common_name": ["human", "human"],
3540
+ QIITA_SAMPLE_TYPE: ["stool", "stool"],
3541
+ SAMPLE_TYPE_KEY: ["stool", "stool"],
3542
+ HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
3543
+ SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
3544
+ QC_NOTE_KEY: ["", ""]
3545
+ })
3546
+ assert_frame_equal(expected_df, result_df)
2291
3547
 
2292
- def test_extend_metadata_df_multiple_host_types(self):
2293
- """Test metadata extension with multiple host types."""
3548
+ # Verify main output file was created (internal cols removed by default)
3549
+ output_files = glob.glob(os.path.join(tmpdir, "*_test_output.txt"))
3550
+ self.assertEqual(1, len(output_files))
3551
+ output_df = pandas.read_csv(output_files[0], sep="\t")
3552
+ expected_output_df = pandas.DataFrame({
3553
+ SAMPLE_NAME_KEY: ["sample1", "sample2"],
3554
+ "body_product": ["UBERON:feces", "UBERON:feces"],
3555
+ "body_site": ["gut", "gut"],
3556
+ "custom_field": ["custom_value", "custom_value"],
3557
+ "description": ["human sample", "human sample"],
3558
+ "host_common_name": ["human", "human"],
3559
+ QIITA_SAMPLE_TYPE: ["stool", "stool"],
3560
+ SAMPLE_TYPE_KEY: ["stool", "stool"]
3561
+ })
3562
+ assert_frame_equal(expected_output_df, output_df)
3563
+
3564
+ # Verify empty fails file was created
3565
+ fails_files = glob.glob(os.path.join(tmpdir, "*_test_output_fails.csv"))
3566
+ self.assertEqual(1, len(fails_files))
3567
+ self.assertEqual(0, os.path.getsize(fails_files[0]))
3568
+
3569
+ # Verify validation errors file was created (empty)
3570
+ validation_files = glob.glob(
3571
+ os.path.join(tmpdir, "*_test_output_validation_errors.csv"))
3572
+ self.assertEqual(1, len(validation_files))
3573
+ self.assertEqual(0, os.path.getsize(validation_files[0]))
3574
+
3575
+ def test_write_extended_metadata_from_df_with_qc_failures(self):
3576
+ """Test writing extended metadata when some rows have QC failures."""
2294
3577
  input_df = pandas.DataFrame({
2295
3578
  SAMPLE_NAME_KEY: ["sample1", "sample2", "sample3"],
2296
- HOSTTYPE_SHORTHAND_KEY: ["human", "mouse", "human"],
2297
- SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool", "blood"]
3579
+ HOSTTYPE_SHORTHAND_KEY: ["human", "unknown_host", "human"],
3580
+ SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool", "stool"]
2298
3581
  })
2299
3582
  study_config = {
2300
3583
  DEFAULT_KEY: "not provided",
@@ -2303,17 +3586,6 @@ class TestMetadataExtender(TestCase):
2303
3586
  STUDY_SPECIFIC_METADATA_KEY: {
2304
3587
  HOST_TYPE_SPECIFIC_METADATA_KEY: {
2305
3588
  "human": {
2306
- METADATA_FIELDS_KEY: {},
2307
- SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
2308
- "stool": {
2309
- METADATA_FIELDS_KEY: {}
2310
- },
2311
- "blood": {
2312
- METADATA_FIELDS_KEY: {}
2313
- }
2314
- }
2315
- },
2316
- "mouse": {
2317
3589
  METADATA_FIELDS_KEY: {},
2318
3590
  SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
2319
3591
  "stool": {
@@ -2325,45 +3597,79 @@ class TestMetadataExtender(TestCase):
2325
3597
  }
2326
3598
  }
2327
3599
 
2328
- result_df, validation_msgs_df = extend_metadata_df(
2329
- input_df, study_config, None, None, self.TEST_STDS_FP)
3600
+ with tempfile.TemporaryDirectory() as tmpdir:
3601
+ result_df = write_extended_metadata_from_df(
3602
+ input_df, study_config, tmpdir, "test_output",
3603
+ stds_fp=self.TEST_STDS_FP)
3604
+
3605
+ # Verify returned DataFrame includes all rows (including failures)
3606
+ # Note: rows are reordered by host type processing (valid hosts first)
3607
+ expected_result_df = pandas.DataFrame({
3608
+ SAMPLE_NAME_KEY: ["sample1", "sample3", "sample2"],
3609
+ "body_product": ["UBERON:feces", "UBERON:feces", "not provided"],
3610
+ "body_site": ["gut", "gut", "not provided"],
3611
+ "description": ["human sample", "human sample", "not provided"],
3612
+ "host_common_name": ["human", "human", "not provided"],
3613
+ QIITA_SAMPLE_TYPE: ["stool", "stool", "not provided"],
3614
+ SAMPLE_TYPE_KEY: ["stool", "stool", "not provided"],
3615
+ HOSTTYPE_SHORTHAND_KEY: ["human", "human", "unknown_host"],
3616
+ SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool", "stool"],
3617
+ QC_NOTE_KEY: ["", "", "invalid host_type"]
3618
+ })
3619
+ assert_frame_equal(expected_result_df, result_df)
2330
3620
 
2331
- # After processing multiple host types, rows may be reordered
2332
- # Human samples are processed together, then mouse samples
2333
- expected_df = pandas.DataFrame({
2334
- SAMPLE_NAME_KEY: ["sample1", "sample3", "sample2"],
2335
- "body_site": ["gut", "blood", "gut"],
2336
- "host_common_name": ["human", "human", "mouse"],
2337
- QIITA_SAMPLE_TYPE: ["stool", "blood", "stool"],
2338
- SAMPLE_TYPE_KEY: ["stool", "blood", "stool"],
2339
- HOSTTYPE_SHORTHAND_KEY: ["human", "human", "mouse"],
2340
- SAMPLETYPE_SHORTHAND_KEY: ["stool", "blood", "stool"],
2341
- QC_NOTE_KEY: ["", "", ""]
2342
- })
2343
- assert_frame_equal(expected_df, result_df)
3621
+ # Verify main output file excludes failure rows
3622
+ output_files = glob.glob(os.path.join(tmpdir, "*_test_output.txt"))
3623
+ self.assertEqual(1, len(output_files))
3624
+ output_df = pandas.read_csv(output_files[0], sep="\t")
3625
+ expected_output_df = pandas.DataFrame({
3626
+ SAMPLE_NAME_KEY: ["sample1", "sample3"],
3627
+ "body_product": ["UBERON:feces", "UBERON:feces"],
3628
+ "body_site": ["gut", "gut"],
3629
+ "description": ["human sample", "human sample"],
3630
+ "host_common_name": ["human", "human"],
3631
+ QIITA_SAMPLE_TYPE: ["stool", "stool"],
3632
+ SAMPLE_TYPE_KEY: ["stool", "stool"]
3633
+ })
3634
+ assert_frame_equal(expected_output_df, output_df)
3635
+
3636
+ # Verify fails file contains the failed row
3637
+ fails_files = glob.glob(os.path.join(tmpdir, "*_test_output_fails.csv"))
3638
+ self.assertEqual(1, len(fails_files))
3639
+ fails_df = pandas.read_csv(fails_files[0], sep=",")
3640
+ expected_fails_df = pandas.DataFrame({
3641
+ SAMPLE_NAME_KEY: ["sample2"],
3642
+ "body_product": ["not provided"],
3643
+ "body_site": ["not provided"],
3644
+ "description": ["not provided"],
3645
+ "host_common_name": ["not provided"],
3646
+ QIITA_SAMPLE_TYPE: ["not provided"],
3647
+ SAMPLE_TYPE_KEY: ["not provided"],
3648
+ HOSTTYPE_SHORTHAND_KEY: ["unknown_host"],
3649
+ SAMPLETYPE_SHORTHAND_KEY: ["stool"],
3650
+ QC_NOTE_KEY: ["invalid host_type"]
3651
+ })
3652
+ assert_frame_equal(expected_fails_df, fails_df)
2344
3653
 
2345
- def test_extend_metadata_df_with_software_config(self):
2346
- """Test metadata extension with custom software config overrides defaults."""
3654
+ def test_write_extended_metadata_from_df_with_validation_errors(self):
3655
+ """Test writing extended metadata when validation errors occur."""
2347
3656
  input_df = pandas.DataFrame({
2348
3657
  SAMPLE_NAME_KEY: ["sample1", "sample2"],
2349
3658
  HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
2350
- SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"]
3659
+ SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
3660
+ "restricted_field": ["invalid_value", "allowed_value"]
2351
3661
  })
2352
- # Software config with custom default value
2353
- software_config = {
2354
- DEFAULT_KEY: "custom_software_default",
2355
- LEAVE_REQUIREDS_BLANK_KEY: True,
2356
- OVERWRITE_NON_NANS_KEY: False
2357
- }
2358
- # Study config that doesn't override DEFAULT_KEY
2359
3662
  study_config = {
3663
+ DEFAULT_KEY: "not provided",
3664
+ LEAVE_REQUIREDS_BLANK_KEY: True,
3665
+ OVERWRITE_NON_NANS_KEY: False,
2360
3666
  STUDY_SPECIFIC_METADATA_KEY: {
2361
3667
  HOST_TYPE_SPECIFIC_METADATA_KEY: {
2362
3668
  "human": {
2363
3669
  METADATA_FIELDS_KEY: {
2364
- "study_field": {
2365
- DEFAULT_KEY: "study_value",
2366
- TYPE_KEY: "string"
3670
+ "restricted_field": {
3671
+ TYPE_KEY: "string",
3672
+ ALLOWED_KEY: ["allowed_value"]
2367
3673
  }
2368
3674
  },
2369
3675
  SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
@@ -2376,235 +3682,466 @@ class TestMetadataExtender(TestCase):
2376
3682
  }
2377
3683
  }
2378
3684
 
2379
- result_df, validation_msgs_df = extend_metadata_df(
2380
- input_df, study_config, None, software_config, self.TEST_STDS_FP)
3685
+ with tempfile.TemporaryDirectory() as tmpdir:
3686
+ result_df = write_extended_metadata_from_df(
3687
+ input_df, study_config, tmpdir, "test_output",
3688
+ stds_fp=self.TEST_STDS_FP)
3689
+
3690
+ # Verify returned DataFrame
3691
+ expected_result_df = pandas.DataFrame({
3692
+ SAMPLE_NAME_KEY: ["sample1", "sample2"],
3693
+ "body_product": ["UBERON:feces", "UBERON:feces"],
3694
+ "body_site": ["gut", "gut"],
3695
+ "description": ["human sample", "human sample"],
3696
+ "host_common_name": ["human", "human"],
3697
+ QIITA_SAMPLE_TYPE: ["stool", "stool"],
3698
+ "restricted_field": ["invalid_value", "allowed_value"],
3699
+ SAMPLE_TYPE_KEY: ["stool", "stool"],
3700
+ HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
3701
+ SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
3702
+ QC_NOTE_KEY: ["", ""]
3703
+ })
3704
+ assert_frame_equal(expected_result_df, result_df)
3705
+
3706
+ # Verify validation errors file contains the error
3707
+ validation_files = glob.glob(
3708
+ os.path.join(tmpdir, "*_test_output_validation_errors.csv"))
3709
+ self.assertEqual(1, len(validation_files))
3710
+ validation_df = pandas.read_csv(validation_files[0], sep=",")
3711
+ expected_validation_df = pandas.DataFrame({
3712
+ "sample_name": ["sample1"],
3713
+ "field_name": ["restricted_field"],
3714
+ "error_message": ["['unallowed value invalid_value']"]
3715
+ })
3716
+ assert_frame_equal(expected_validation_df, validation_df)
2381
3717
 
2382
- expected_df = pandas.DataFrame({
2383
- SAMPLE_NAME_KEY: ["sample1", "sample2"],
2384
- "body_site": ["gut", "gut"],
2385
- "host_common_name": ["human", "human"],
2386
- QIITA_SAMPLE_TYPE: ["stool", "stool"],
2387
- SAMPLE_TYPE_KEY: ["stool", "stool"],
2388
- "study_field": ["study_value", "study_value"],
2389
- HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
2390
- SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
2391
- QC_NOTE_KEY: ["", ""]
3718
+ def test_write_extended_metadata_from_df_remove_internals_false(self):
3719
+ """Test writing extended metadata with remove_internals=False."""
3720
+ input_df = pandas.DataFrame({
3721
+ SAMPLE_NAME_KEY: ["sample1"],
3722
+ HOSTTYPE_SHORTHAND_KEY: ["human"],
3723
+ SAMPLETYPE_SHORTHAND_KEY: ["stool"]
2392
3724
  })
2393
- assert_frame_equal(expected_df, result_df)
2394
-
2395
- # Tests for _get_study_specific_config
2396
-
2397
- def test__get_study_specific_config_with_valid_file(self):
2398
- """Test loading study-specific config from a valid YAML file."""
2399
- config_fp = path.join(self.TEST_DIR, "data/test_config.yml")
2400
-
2401
- result = _get_study_specific_config(config_fp)
2402
-
2403
- expected = {
2404
- HOST_TYPE_SPECIFIC_METADATA_KEY: {
2405
- "base": {
2406
- METADATA_FIELDS_KEY: {
2407
- "sample_name": {
2408
- TYPE_KEY: "string",
2409
- "unique": True
2410
- },
2411
- "sample_type": {
2412
- "empty": False,
2413
- "is_phi": False
3725
+ study_config = {
3726
+ DEFAULT_KEY: "not provided",
3727
+ LEAVE_REQUIREDS_BLANK_KEY: True,
3728
+ OVERWRITE_NON_NANS_KEY: False,
3729
+ STUDY_SPECIFIC_METADATA_KEY: {
3730
+ HOST_TYPE_SPECIFIC_METADATA_KEY: {
3731
+ "human": {
3732
+ METADATA_FIELDS_KEY: {},
3733
+ SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
3734
+ "stool": {
3735
+ METADATA_FIELDS_KEY: {}
3736
+ }
2414
3737
  }
2415
3738
  }
2416
3739
  }
2417
3740
  }
2418
3741
  }
2419
- self.assertDictEqual(expected, result)
2420
-
2421
- def test__get_study_specific_config_with_none(self):
2422
- """Test that None file path returns None."""
2423
- result = _get_study_specific_config(None)
2424
-
2425
- self.assertIsNone(result)
2426
-
2427
- def test__get_study_specific_config_with_empty_string(self):
2428
- """Test that empty string file path returns None."""
2429
- result = _get_study_specific_config("")
2430
-
2431
- self.assertIsNone(result)
2432
3742
 
2433
- def test__get_study_specific_config_nonexistent_file_raises(self):
2434
- """Test that nonexistent file raises FileNotFoundError."""
2435
- with self.assertRaises(FileNotFoundError):
2436
- _get_study_specific_config("/nonexistent/path/config.yml")
3743
+ with tempfile.TemporaryDirectory() as tmpdir:
3744
+ write_extended_metadata_from_df(
3745
+ input_df, study_config, tmpdir, "test_output",
3746
+ remove_internals=False, stds_fp=self.TEST_STDS_FP)
2437
3747
 
2438
- def test__get_study_specific_config_invalid_yaml_raises(self):
2439
- """Test that invalid YAML file raises an error."""
2440
- invalid_fp = path.join(self.TEST_DIR, "data/invalid.yml")
3748
+ # Verify main output file includes internal columns
3749
+ output_files = glob.glob(os.path.join(tmpdir, "*_test_output.txt"))
3750
+ self.assertEqual(1, len(output_files))
3751
+ output_df = pandas.read_csv(output_files[0], sep="\t", keep_default_na=False)
3752
+ expected_output_df = pandas.DataFrame({
3753
+ SAMPLE_NAME_KEY: ["sample1"],
3754
+ "body_product": ["UBERON:feces"],
3755
+ "body_site": ["gut"],
3756
+ "description": ["human sample"],
3757
+ "host_common_name": ["human"],
3758
+ QIITA_SAMPLE_TYPE: ["stool"],
3759
+ SAMPLE_TYPE_KEY: ["stool"],
3760
+ HOSTTYPE_SHORTHAND_KEY: ["human"],
3761
+ SAMPLETYPE_SHORTHAND_KEY: ["stool"],
3762
+ QC_NOTE_KEY: [""]
3763
+ })
3764
+ assert_frame_equal(expected_output_df, output_df)
2441
3765
 
2442
- with self.assertRaises(Exception):
2443
- _get_study_specific_config(invalid_fp)
3766
+ # Verify no fails file was created (since remove_internals=False)
3767
+ fails_files = glob.glob(os.path.join(tmpdir, "*_test_output_fails.csv"))
3768
+ self.assertEqual(0, len(fails_files))
2444
3769
 
2445
- # Tests for _output_metadata_df_to_files
3770
+ # Tests for write_extended_metadata
2446
3771
 
2447
- def test__output_metadata_df_to_files_basic(self):
2448
- """Test basic output of metadata DataFrame to file."""
2449
- input_df = pandas.DataFrame({
2450
- SAMPLE_NAME_KEY: ["sample1", "sample2"],
2451
- "field_a": ["a1", "a2"],
2452
- HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
2453
- SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
2454
- QC_NOTE_KEY: ["", ""]
2455
- })
3772
+ TEST_METADATA_CSV_FP = path.join(TEST_DIR, "data/test_metadata.csv")
3773
+ TEST_METADATA_TXT_FP = path.join(TEST_DIR, "data/test_metadata.txt")
3774
+ TEST_METADATA_WITH_ERRORS_FP = path.join(
3775
+ TEST_DIR, "data/test_metadata_with_errors.csv")
3776
+ TEST_STUDY_CONFIG_WITH_VALIDATION_FP = path.join(
3777
+ TEST_DIR, "data/test_study_config_with_validation.yml")
2456
3778
 
3779
+ def test_write_extended_metadata_csv_input(self):
3780
+ """Test writing extended metadata from a CSV input file."""
2457
3781
  with tempfile.TemporaryDirectory() as tmpdir:
2458
- _output_metadata_df_to_files(
2459
- input_df, tmpdir, "test_output", INTERNAL_COL_KEYS,
2460
- sep="\t", remove_internals_and_fails=False)
3782
+ result_df = write_extended_metadata(
3783
+ self.TEST_METADATA_CSV_FP, self.TEST_STUDY_CONFIG_FP,
3784
+ tmpdir, "test_output", stds_fp=self.TEST_STDS_FP)
3785
+
3786
+ # Verify returned DataFrame
3787
+ expected_result_df = pandas.DataFrame({
3788
+ SAMPLE_NAME_KEY: ["sample1", "sample2"],
3789
+ "body_product": ["UBERON:feces", "UBERON:feces"],
3790
+ "body_site": ["gut", "gut"],
3791
+ "description": ["human sample", "human sample"],
3792
+ "host_common_name": ["human", "human"],
3793
+ QIITA_SAMPLE_TYPE: ["stool", "stool"],
3794
+ SAMPLE_TYPE_KEY: ["stool", "stool"],
3795
+ "study_custom_field": ["custom_value", "custom_value"],
3796
+ "study_stool_field": ["stool_custom", "stool_custom"],
3797
+ HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
3798
+ SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
3799
+ QC_NOTE_KEY: ["", ""]
3800
+ })
3801
+ assert_frame_equal(expected_result_df, result_df)
2461
3802
 
2462
- # Find the output file (has timestamp prefix)
3803
+ # Verify main output file was created (internal cols removed by default)
2463
3804
  output_files = glob.glob(os.path.join(tmpdir, "*_test_output.txt"))
2464
3805
  self.assertEqual(1, len(output_files))
3806
+ output_df = pandas.read_csv(output_files[0], sep="\t")
3807
+ expected_output_df = pandas.DataFrame({
3808
+ SAMPLE_NAME_KEY: ["sample1", "sample2"],
3809
+ "body_product": ["UBERON:feces", "UBERON:feces"],
3810
+ "body_site": ["gut", "gut"],
3811
+ "description": ["human sample", "human sample"],
3812
+ "host_common_name": ["human", "human"],
3813
+ QIITA_SAMPLE_TYPE: ["stool", "stool"],
3814
+ SAMPLE_TYPE_KEY: ["stool", "stool"],
3815
+ "study_custom_field": ["custom_value", "custom_value"],
3816
+ "study_stool_field": ["stool_custom", "stool_custom"]
3817
+ })
3818
+ assert_frame_equal(expected_output_df, output_df)
2465
3819
 
2466
- # Read and verify contents (keep_default_na=False preserves empty strings)
2467
- result_df = pandas.read_csv(output_files[0], sep="\t", keep_default_na=False)
2468
- expected_df = input_df
2469
- assert_frame_equal(expected_df, result_df)
3820
+ # Verify empty fails file was created
3821
+ fails_files = glob.glob(os.path.join(tmpdir, "*_test_output_fails.csv"))
3822
+ self.assertEqual(1, len(fails_files))
3823
+ self.assertEqual(0, os.path.getsize(fails_files[0]))
2470
3824
 
2471
- def test__output_metadata_df_to_files_remove_internals_and_fails(self):
2472
- """Test output with internal columns and failures removed."""
2473
- input_df = pandas.DataFrame({
2474
- SAMPLE_NAME_KEY: ["sample1", "sample2", "sample3"],
2475
- "field_a": ["a1", "a2", "a3"],
2476
- HOSTTYPE_SHORTHAND_KEY: ["human", "human", "human"],
2477
- SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool", "stool"],
2478
- QC_NOTE_KEY: ["", "invalid host_type", ""]
2479
- })
3825
+ # Verify empty validation errors file was created
3826
+ validation_files = glob.glob(
3827
+ os.path.join(tmpdir, "*_test_output_validation_errors.csv"))
3828
+ self.assertEqual(1, len(validation_files))
3829
+ self.assertEqual(0, os.path.getsize(validation_files[0]))
2480
3830
 
3831
+ def test_write_extended_metadata_txt_input(self):
3832
+ """Test writing extended metadata from a tab-delimited TXT input file."""
2481
3833
  with tempfile.TemporaryDirectory() as tmpdir:
2482
- _output_metadata_df_to_files(
2483
- input_df, tmpdir, "test_output", INTERNAL_COL_KEYS,
2484
- sep="\t", remove_internals_and_fails=True)
3834
+ result_df = write_extended_metadata(
3835
+ self.TEST_METADATA_TXT_FP, self.TEST_STUDY_CONFIG_FP,
3836
+ tmpdir, "test_output", stds_fp=self.TEST_STDS_FP)
3837
+
3838
+ # Verify returned DataFrame
3839
+ expected_result_df = pandas.DataFrame({
3840
+ SAMPLE_NAME_KEY: ["sample1", "sample2"],
3841
+ "body_product": ["UBERON:feces", "UBERON:feces"],
3842
+ "body_site": ["gut", "gut"],
3843
+ "description": ["human sample", "human sample"],
3844
+ "host_common_name": ["human", "human"],
3845
+ QIITA_SAMPLE_TYPE: ["stool", "stool"],
3846
+ SAMPLE_TYPE_KEY: ["stool", "stool"],
3847
+ "study_custom_field": ["custom_value", "custom_value"],
3848
+ "study_stool_field": ["stool_custom", "stool_custom"],
3849
+ HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
3850
+ SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
3851
+ QC_NOTE_KEY: ["", ""]
3852
+ })
3853
+ assert_frame_equal(expected_result_df, result_df)
2485
3854
 
2486
- # Find the main output file
3855
+ # Verify main output file was created
2487
3856
  output_files = glob.glob(os.path.join(tmpdir, "*_test_output.txt"))
2488
3857
  self.assertEqual(1, len(output_files))
3858
+ output_df = pandas.read_csv(output_files[0], sep="\t")
3859
+ expected_output_df = pandas.DataFrame({
3860
+ SAMPLE_NAME_KEY: ["sample1", "sample2"],
3861
+ "body_product": ["UBERON:feces", "UBERON:feces"],
3862
+ "body_site": ["gut", "gut"],
3863
+ "description": ["human sample", "human sample"],
3864
+ "host_common_name": ["human", "human"],
3865
+ QIITA_SAMPLE_TYPE: ["stool", "stool"],
3866
+ SAMPLE_TYPE_KEY: ["stool", "stool"],
3867
+ "study_custom_field": ["custom_value", "custom_value"],
3868
+ "study_stool_field": ["stool_custom", "stool_custom"]
3869
+ })
3870
+ assert_frame_equal(expected_output_df, output_df)
2489
3871
 
2490
- # Verify main output has internal cols removed and no failures
2491
- result_df = pandas.read_csv(output_files[0], sep="\t")
2492
- expected_df = pandas.DataFrame({
2493
- SAMPLE_NAME_KEY: ["sample1", "sample3"],
2494
- "field_a": ["a1", "a3"]
3872
+ def test_write_extended_metadata_with_validation_errors(self):
3873
+ """Test writing extended metadata when validation errors occur."""
3874
+ with tempfile.TemporaryDirectory() as tmpdir:
3875
+ result_df = write_extended_metadata(
3876
+ self.TEST_METADATA_WITH_ERRORS_FP,
3877
+ self.TEST_STUDY_CONFIG_WITH_VALIDATION_FP,
3878
+ tmpdir, "test_output", stds_fp=self.TEST_STDS_FP)
3879
+
3880
+ # Verify returned DataFrame
3881
+ expected_result_df = pandas.DataFrame({
3882
+ SAMPLE_NAME_KEY: ["sample1", "sample2"],
3883
+ "body_product": ["UBERON:feces", "UBERON:feces"],
3884
+ "body_site": ["gut", "gut"],
3885
+ "description": ["human sample", "human sample"],
3886
+ "host_common_name": ["human", "human"],
3887
+ QIITA_SAMPLE_TYPE: ["stool", "stool"],
3888
+ "restricted_field": ["invalid_value", "allowed_value"],
3889
+ SAMPLE_TYPE_KEY: ["stool", "stool"],
3890
+ HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
3891
+ SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
3892
+ QC_NOTE_KEY: ["", ""]
2495
3893
  })
2496
- assert_frame_equal(expected_df, result_df)
3894
+ assert_frame_equal(expected_result_df, result_df)
2497
3895
 
2498
- # Find the fails file
2499
- fails_files = glob.glob(os.path.join(tmpdir, "*_test_output_fails.csv"))
2500
- self.assertEqual(1, len(fails_files))
3896
+ # Verify main output file was created
3897
+ output_files = glob.glob(os.path.join(tmpdir, "*_test_output.txt"))
3898
+ self.assertEqual(1, len(output_files))
3899
+ output_df = pandas.read_csv(output_files[0], sep="\t")
3900
+ expected_output_df = pandas.DataFrame({
3901
+ SAMPLE_NAME_KEY: ["sample1", "sample2"],
3902
+ "body_product": ["UBERON:feces", "UBERON:feces"],
3903
+ "body_site": ["gut", "gut"],
3904
+ "description": ["human sample", "human sample"],
3905
+ "host_common_name": ["human", "human"],
3906
+ QIITA_SAMPLE_TYPE: ["stool", "stool"],
3907
+ "restricted_field": ["invalid_value", "allowed_value"],
3908
+ SAMPLE_TYPE_KEY: ["stool", "stool"]
3909
+ })
3910
+ assert_frame_equal(expected_output_df, output_df)
3911
+
3912
+ # Verify validation errors file contains the error
3913
+ validation_files = glob.glob(
3914
+ os.path.join(tmpdir, "*_test_output_validation_errors.csv"))
3915
+ self.assertEqual(1, len(validation_files))
3916
+ validation_df = pandas.read_csv(validation_files[0], sep=",")
3917
+ expected_validation_df = pandas.DataFrame({
3918
+ "sample_name": ["sample1"],
3919
+ "field_name": ["restricted_field"],
3920
+ "error_message": ["['unallowed value invalid_value']"]
3921
+ })
3922
+ assert_frame_equal(expected_validation_df, validation_df)
2501
3923
 
2502
- # Verify fails file contains the failed row
2503
- fails_df = pandas.read_csv(fails_files[0], sep=",")
2504
- expected_fails_df = pandas.DataFrame({
2505
- SAMPLE_NAME_KEY: ["sample2"],
2506
- "field_a": ["a2"],
2507
- HOSTTYPE_SHORTHAND_KEY: ["human"],
2508
- SAMPLETYPE_SHORTHAND_KEY: ["stool"],
2509
- QC_NOTE_KEY: ["invalid host_type"]
3924
+ def test_write_extended_metadata_unrecognized_extension_raises(self):
3925
+ """Test that unrecognized file extension raises ValueError."""
3926
+ with tempfile.TemporaryDirectory() as tmpdir:
3927
+ fake_fp = path.join(tmpdir, "test.json")
3928
+ # Create a dummy file so the path exists
3929
+ with open(fake_fp, "w") as f:
3930
+ f.write("{}")
3931
+
3932
+ with self.assertRaisesRegex(
3933
+ ValueError, "Unrecognized input file extension"):
3934
+ write_extended_metadata(
3935
+ fake_fp, self.TEST_STUDY_CONFIG_FP,
3936
+ tmpdir, "test_output", stds_fp=self.TEST_STDS_FP)
3937
+
3938
+ def test_write_extended_metadata_csv_separator_output(self):
3939
+ """Test writing extended metadata with CSV separator for output."""
3940
+ with tempfile.TemporaryDirectory() as tmpdir:
3941
+ result_df = write_extended_metadata(
3942
+ self.TEST_METADATA_CSV_FP, self.TEST_STUDY_CONFIG_FP,
3943
+ tmpdir, "test_output", sep=",", stds_fp=self.TEST_STDS_FP)
3944
+
3945
+ # Verify returned DataFrame
3946
+ expected_result_df = pandas.DataFrame({
3947
+ SAMPLE_NAME_KEY: ["sample1", "sample2"],
3948
+ "body_product": ["UBERON:feces", "UBERON:feces"],
3949
+ "body_site": ["gut", "gut"],
3950
+ "description": ["human sample", "human sample"],
3951
+ "host_common_name": ["human", "human"],
3952
+ QIITA_SAMPLE_TYPE: ["stool", "stool"],
3953
+ SAMPLE_TYPE_KEY: ["stool", "stool"],
3954
+ "study_custom_field": ["custom_value", "custom_value"],
3955
+ "study_stool_field": ["stool_custom", "stool_custom"],
3956
+ HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
3957
+ SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
3958
+ QC_NOTE_KEY: ["", ""]
2510
3959
  })
2511
- assert_frame_equal(expected_fails_df, fails_df)
3960
+ assert_frame_equal(expected_result_df, result_df)
2512
3961
 
2513
- def test__output_metadata_df_to_files_no_failures_creates_empty_file(self):
2514
- """Test that empty fails file is created when there are no failures."""
2515
- input_df = pandas.DataFrame({
2516
- SAMPLE_NAME_KEY: ["sample1", "sample2"],
2517
- "field_a": ["a1", "a2"],
2518
- HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
2519
- SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
2520
- QC_NOTE_KEY: ["", ""]
2521
- })
3962
+ # Verify output file has .csv extension
3963
+ output_files = glob.glob(os.path.join(tmpdir, "*_test_output.csv"))
3964
+ self.assertEqual(1, len(output_files))
3965
+ output_df = pandas.read_csv(output_files[0], sep=",")
3966
+ expected_output_df = pandas.DataFrame({
3967
+ SAMPLE_NAME_KEY: ["sample1", "sample2"],
3968
+ "body_product": ["UBERON:feces", "UBERON:feces"],
3969
+ "body_site": ["gut", "gut"],
3970
+ "description": ["human sample", "human sample"],
3971
+ "host_common_name": ["human", "human"],
3972
+ QIITA_SAMPLE_TYPE: ["stool", "stool"],
3973
+ SAMPLE_TYPE_KEY: ["stool", "stool"],
3974
+ "study_custom_field": ["custom_value", "custom_value"],
3975
+ "study_stool_field": ["stool_custom", "stool_custom"]
3976
+ })
3977
+ assert_frame_equal(expected_output_df, output_df)
2522
3978
 
3979
+ def test_write_extended_metadata_remove_internals_false(self):
3980
+ """Test writing extended metadata with remove_internals=False."""
2523
3981
  with tempfile.TemporaryDirectory() as tmpdir:
2524
- _output_metadata_df_to_files(
2525
- input_df, tmpdir, "test_output", INTERNAL_COL_KEYS,
2526
- sep="\t", remove_internals_and_fails=True,
2527
- suppress_empty_fails=False)
2528
-
2529
- # Find the fails file
2530
- fails_files = glob.glob(os.path.join(tmpdir, "*_test_output_fails.csv"))
2531
- self.assertEqual(1, len(fails_files))
3982
+ result_df = write_extended_metadata(
3983
+ self.TEST_METADATA_CSV_FP, self.TEST_STUDY_CONFIG_FP,
3984
+ tmpdir, "test_output", remove_internals=False,
3985
+ stds_fp=self.TEST_STDS_FP)
3986
+
3987
+ # Verify returned DataFrame
3988
+ expected_result_df = pandas.DataFrame({
3989
+ SAMPLE_NAME_KEY: ["sample1", "sample2"],
3990
+ "body_product": ["UBERON:feces", "UBERON:feces"],
3991
+ "body_site": ["gut", "gut"],
3992
+ "description": ["human sample", "human sample"],
3993
+ "host_common_name": ["human", "human"],
3994
+ QIITA_SAMPLE_TYPE: ["stool", "stool"],
3995
+ SAMPLE_TYPE_KEY: ["stool", "stool"],
3996
+ "study_custom_field": ["custom_value", "custom_value"],
3997
+ "study_stool_field": ["stool_custom", "stool_custom"],
3998
+ HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
3999
+ SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
4000
+ QC_NOTE_KEY: ["", ""]
4001
+ })
4002
+ assert_frame_equal(expected_result_df, result_df)
2532
4003
 
2533
- # Verify fails file is empty (zero bytes)
2534
- self.assertEqual(0, os.path.getsize(fails_files[0]))
4004
+ # Verify main output file includes internal columns
4005
+ output_files = glob.glob(os.path.join(tmpdir, "*_test_output.txt"))
4006
+ self.assertEqual(1, len(output_files))
4007
+ output_df = pandas.read_csv(output_files[0], sep="\t", keep_default_na=False)
4008
+ expected_output_df = pandas.DataFrame({
4009
+ SAMPLE_NAME_KEY: ["sample1", "sample2"],
4010
+ "body_product": ["UBERON:feces", "UBERON:feces"],
4011
+ "body_site": ["gut", "gut"],
4012
+ "description": ["human sample", "human sample"],
4013
+ "host_common_name": ["human", "human"],
4014
+ QIITA_SAMPLE_TYPE: ["stool", "stool"],
4015
+ SAMPLE_TYPE_KEY: ["stool", "stool"],
4016
+ "study_custom_field": ["custom_value", "custom_value"],
4017
+ "study_stool_field": ["stool_custom", "stool_custom"],
4018
+ HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
4019
+ SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
4020
+ QC_NOTE_KEY: ["", ""]
4021
+ })
4022
+ assert_frame_equal(expected_output_df, output_df)
2535
4023
 
2536
- def test__output_metadata_df_to_files_suppress_empty_fails(self):
2537
- """Test that empty fails file is not created when suppress_empty_fails=True."""
2538
- input_df = pandas.DataFrame({
2539
- SAMPLE_NAME_KEY: ["sample1", "sample2"],
2540
- "field_a": ["a1", "a2"],
2541
- HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
2542
- SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
2543
- QC_NOTE_KEY: ["", ""]
2544
- })
4024
+ # Verify no fails file was created (since remove_internals=False)
4025
+ fails_files = glob.glob(os.path.join(tmpdir, "*_test_output_fails.csv"))
4026
+ self.assertEqual(0, len(fails_files))
2545
4027
 
4028
+ def test_write_extended_metadata_suppress_empty_fails(self):
4029
+ """Test writing extended metadata with suppress_empty_fails=True."""
2546
4030
  with tempfile.TemporaryDirectory() as tmpdir:
2547
- _output_metadata_df_to_files(
2548
- input_df, tmpdir, "test_output", INTERNAL_COL_KEYS,
2549
- sep="\t", remove_internals_and_fails=True,
2550
- suppress_empty_fails=True)
4031
+ result_df = write_extended_metadata(
4032
+ self.TEST_METADATA_CSV_FP, self.TEST_STUDY_CONFIG_FP,
4033
+ tmpdir, "test_output", suppress_empty_fails=True,
4034
+ stds_fp=self.TEST_STDS_FP)
4035
+
4036
+ # Verify returned DataFrame
4037
+ expected_result_df = pandas.DataFrame({
4038
+ SAMPLE_NAME_KEY: ["sample1", "sample2"],
4039
+ "body_product": ["UBERON:feces", "UBERON:feces"],
4040
+ "body_site": ["gut", "gut"],
4041
+ "description": ["human sample", "human sample"],
4042
+ "host_common_name": ["human", "human"],
4043
+ QIITA_SAMPLE_TYPE: ["stool", "stool"],
4044
+ SAMPLE_TYPE_KEY: ["stool", "stool"],
4045
+ "study_custom_field": ["custom_value", "custom_value"],
4046
+ "study_stool_field": ["stool_custom", "stool_custom"],
4047
+ HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
4048
+ SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
4049
+ QC_NOTE_KEY: ["", ""]
4050
+ })
4051
+ assert_frame_equal(expected_result_df, result_df)
2551
4052
 
2552
- # Find the fails file - should not exist
4053
+ # Verify main output file was created
4054
+ output_files = glob.glob(os.path.join(tmpdir, "*_test_output.txt"))
4055
+ self.assertEqual(1, len(output_files))
4056
+ output_df = pandas.read_csv(output_files[0], sep="\t")
4057
+ expected_output_df = pandas.DataFrame({
4058
+ SAMPLE_NAME_KEY: ["sample1", "sample2"],
4059
+ "body_product": ["UBERON:feces", "UBERON:feces"],
4060
+ "body_site": ["gut", "gut"],
4061
+ "description": ["human sample", "human sample"],
4062
+ "host_common_name": ["human", "human"],
4063
+ QIITA_SAMPLE_TYPE: ["stool", "stool"],
4064
+ SAMPLE_TYPE_KEY: ["stool", "stool"],
4065
+ "study_custom_field": ["custom_value", "custom_value"],
4066
+ "study_stool_field": ["stool_custom", "stool_custom"]
4067
+ })
4068
+ assert_frame_equal(expected_output_df, output_df)
4069
+
4070
+ # Verify no empty fails file was created (since suppress_empty_fails=True)
2553
4071
  fails_files = glob.glob(os.path.join(tmpdir, "*_test_output_fails.csv"))
2554
4072
  self.assertEqual(0, len(fails_files))
2555
4073
 
2556
- # Main output file should still exist
2557
- output_files = glob.glob(os.path.join(tmpdir, "*_test_output.txt"))
2558
- self.assertEqual(1, len(output_files))
4074
+ # Verify no empty validation errors file was created
4075
+ validation_files = glob.glob(
4076
+ os.path.join(tmpdir, "*_test_output_validation_errors.csv"))
4077
+ self.assertEqual(0, len(validation_files))
2559
4078
 
2560
- def test__output_metadata_df_to_files_csv_separator(self):
2561
- """Test output with comma separator creates .csv file."""
2562
- input_df = pandas.DataFrame({
2563
- SAMPLE_NAME_KEY: ["sample1", "sample2"],
2564
- "field_a": ["a1", "a2"],
2565
- HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
2566
- SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
2567
- QC_NOTE_KEY: ["", ""]
2568
- })
4079
+ # Integration tests
2569
4080
 
2570
- with tempfile.TemporaryDirectory() as tmpdir:
2571
- _output_metadata_df_to_files(
2572
- input_df, tmpdir, "test_output", INTERNAL_COL_KEYS,
2573
- sep=",", remove_internals_and_fails=False)
4081
+ TEST_PROJECT1_METADATA_FP = path.join(TEST_DIR, "data/test_project1_input_metadata.csv")
4082
+ TEST_PROJECT1_CONFIG_FP = path.join(TEST_DIR, "data/test_project1_config.yml")
4083
+ TEST_PROJECT1_EXPECTED_OUTPUT_FP = path.join(
4084
+ TEST_DIR, "data/test_project1_output_metadata.txt")
4085
+ TEST_PROJECT1_EXPECTED_FAILS_FP = path.join(
4086
+ TEST_DIR, "data/test_project1_output_fails.csv")
4087
+ def test_write_extended_metadata_from_df_project1_integration(self):
4088
+ """Integration test using project1 test data files."""
2574
4089
 
2575
- # Find the output file with .csv extension
2576
- output_files = glob.glob(os.path.join(tmpdir, "*_test_output.csv"))
2577
- self.assertEqual(1, len(output_files))
4090
+ def write_mismatched_debug_files(expected_content, actual_content, file_name):
4091
+ """Write debug files to Desktop for unmatched content."""
4092
+ debug_dir = path.join(path.expanduser("~"), "Desktop")
4093
+ with open(path.join(debug_dir, f"UNMATCHED_1_{file_name}"), 'w') as debug_expected_file:
4094
+ debug_expected_file.write(expected_content)
4095
+ with open(path.join(debug_dir, f"UNMATCHED_2_{file_name}"), 'w') as debug_actual_file:
4096
+ debug_actual_file.write(actual_content)
2578
4097
 
2579
- # Read and verify contents (keep_default_na=False preserves empty strings)
2580
- result_df = pandas.read_csv(output_files[0], sep=",", keep_default_na=False)
2581
- expected_df = input_df
2582
- assert_frame_equal(expected_df, result_df)
2583
4098
 
2584
- def test__output_metadata_df_to_files_all_failures(self):
2585
- """Test output when all rows are failures."""
2586
- input_df = pandas.DataFrame({
2587
- SAMPLE_NAME_KEY: ["sample1", "sample2"],
2588
- "field_a": ["a1", "a2"],
2589
- HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
2590
- SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
2591
- QC_NOTE_KEY: ["invalid host_type", "invalid sample_type"]
2592
- })
4099
+ # Load input metadata CSV
4100
+ input_df = pandas.read_csv(self.TEST_PROJECT1_METADATA_FP, dtype=str)
4101
+ # for the columns "plating_notes" and "notes", fill NaN with empty string
4102
+ input_df["plating_notes"] = input_df["plating_notes"].fillna("")
4103
+ input_df["notes"] = input_df["notes"].fillna("")
4104
+
4105
+ # Load study config
4106
+ study_config = _get_study_specific_config(self.TEST_PROJECT1_CONFIG_FP)
2593
4107
 
2594
4108
  with tempfile.TemporaryDirectory() as tmpdir:
2595
- _output_metadata_df_to_files(
2596
- input_df, tmpdir, "test_output", INTERNAL_COL_KEYS,
2597
- sep="\t", remove_internals_and_fails=True)
4109
+ write_extended_metadata_from_df(
4110
+ input_df, study_config, tmpdir, "test_output",
4111
+ remove_internals=True)
2598
4112
 
2599
- # Main output file should have only headers (empty data)
4113
+ # Compare main output file directly to expected file
2600
4114
  output_files = glob.glob(os.path.join(tmpdir, "*_test_output.txt"))
2601
4115
  self.assertEqual(1, len(output_files))
2602
- result_df = pandas.read_csv(output_files[0], sep="\t")
2603
- self.assertTrue(result_df.empty)
2604
- self.assertEqual([SAMPLE_NAME_KEY, "field_a"], list(result_df.columns))
2605
-
2606
- # Fails file should have both rows
4116
+ with open(output_files[0], 'r') as actual_file:
4117
+ actual_content = actual_file.read()
4118
+ with open(self.TEST_PROJECT1_EXPECTED_OUTPUT_FP, 'r') as expected_file:
4119
+ expected_content = expected_file.read()
4120
+ try:
4121
+ self.assertEqual(expected_content, actual_content)
4122
+ except AssertionError:
4123
+ write_mismatched_debug_files(
4124
+ expected_content, actual_content,
4125
+ "project1_output.txt")
4126
+ raise
4127
+
4128
+ # Compare fails file directly to expected file
2607
4129
  fails_files = glob.glob(os.path.join(tmpdir, "*_test_output_fails.csv"))
2608
4130
  self.assertEqual(1, len(fails_files))
2609
- fails_df = pandas.read_csv(fails_files[0], sep=",")
2610
- self.assertEqual(2, len(fails_df))
4131
+ with open(fails_files[0], 'r') as actual_file:
4132
+ actual_fails_content = actual_file.read()
4133
+ with open(self.TEST_PROJECT1_EXPECTED_FAILS_FP, 'r') as expected_file:
4134
+ expected_fails_content = expected_file.read()
4135
+ try:
4136
+ self.assertEqual(expected_fails_content, actual_fails_content)
4137
+ except AssertionError:
4138
+ write_mismatched_debug_files(
4139
+ expected_fails_content, actual_fails_content,
4140
+ "project1_fails.csv")
4141
+ raise
4142
+
4143
+ # Verify validation errors file is empty
4144
+ validation_files = glob.glob(
4145
+ os.path.join(tmpdir, "*_test_output_validation_errors.csv"))
4146
+ self.assertEqual(1, len(validation_files))
4147
+ self.assertEqual(0, os.path.getsize(validation_files[0]))