metameq 2026.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- metameq/__init__.py +42 -0
- metameq/_version.py +21 -0
- metameq/config/__init__.py +0 -0
- metameq/config/config.yml +3 -0
- metameq/config/standards.yml +1648 -0
- metameq/src/__init__.py +0 -0
- metameq/src/__main__.py +34 -0
- metameq/src/metadata_configurator.py +512 -0
- metameq/src/metadata_extender.py +1168 -0
- metameq/src/metadata_merger.py +362 -0
- metameq/src/metadata_transformers.py +335 -0
- metameq/src/metadata_validator.py +387 -0
- metameq/src/util.py +299 -0
- metameq/tests/__init__.py +0 -0
- metameq/tests/data/invalid.yml +1 -0
- metameq/tests/data/test_config.yml +9 -0
- metameq/tests/test_metadata_configurator.py +2334 -0
- metameq/tests/test_metadata_extender.py +2610 -0
- metameq/tests/test_metadata_merger.py +657 -0
- metameq/tests/test_metadata_transformers.py +277 -0
- metameq/tests/test_metadata_validator.py +1191 -0
- metameq/tests/test_util.py +436 -0
- metameq-2026.1.1.dist-info/METADATA +21 -0
- metameq-2026.1.1.dist-info/RECORD +27 -0
- metameq-2026.1.1.dist-info/WHEEL +5 -0
- metameq-2026.1.1.dist-info/entry_points.txt +2 -0
- metameq-2026.1.1.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,2610 @@
|
|
|
1
|
+
import glob
|
|
2
|
+
import numpy as np
|
|
3
|
+
import os
|
|
4
|
+
import os.path as path
|
|
5
|
+
import pandas
|
|
6
|
+
import tempfile
|
|
7
|
+
from pandas.testing import assert_frame_equal
|
|
8
|
+
from unittest import TestCase
|
|
9
|
+
from metameq.src.util import \
|
|
10
|
+
SAMPLE_NAME_KEY, HOSTTYPE_SHORTHAND_KEY, SAMPLETYPE_SHORTHAND_KEY, \
|
|
11
|
+
QC_NOTE_KEY, DEFAULT_KEY, REQUIRED_RAW_METADATA_FIELDS, REQUIRED_KEY, \
|
|
12
|
+
METADATA_FIELDS_KEY, ALIAS_KEY, BASE_TYPE_KEY, ALLOWED_KEY, TYPE_KEY, \
|
|
13
|
+
SAMPLE_TYPE_KEY, QIITA_SAMPLE_TYPE, SAMPLE_TYPE_SPECIFIC_METADATA_KEY, \
|
|
14
|
+
OVERWRITE_NON_NANS_KEY, LEAVE_REQUIREDS_BLANK_KEY, LEAVE_BLANK_VAL, \
|
|
15
|
+
HOST_TYPE_SPECIFIC_METADATA_KEY, METADATA_TRANSFORMERS_KEY, \
|
|
16
|
+
SOURCES_KEY, FUNCTION_KEY, PRE_TRANSFORMERS_KEY, POST_TRANSFORMERS_KEY, \
|
|
17
|
+
STUDY_SPECIFIC_METADATA_KEY
|
|
18
|
+
from metameq.src.metadata_extender import \
|
|
19
|
+
id_missing_cols, get_qc_failures, _reorder_df, \
|
|
20
|
+
_catch_nan_required_fields, _fill_na_if_default, \
|
|
21
|
+
_update_metadata_from_metadata_fields_dict, _update_metadata_from_dict, \
|
|
22
|
+
_construct_sample_type_metadata_fields_dict, \
|
|
23
|
+
_generate_metadata_for_a_sample_type_in_a_host_type, \
|
|
24
|
+
_generate_metadata_for_a_host_type, _generate_metadata_for_host_types, \
|
|
25
|
+
_transform_metadata, _populate_metadata_df, extend_metadata_df, \
|
|
26
|
+
_get_study_specific_config, _output_metadata_df_to_files, \
|
|
27
|
+
INTERNAL_COL_KEYS, REQ_PLACEHOLDER
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class TestMetadataExtender(TestCase):
|
|
31
|
+
"""Test suite for metadata_extender module."""
|
|
32
|
+
|
|
33
|
+
# Tests for id_missing_cols
|
|
34
|
+
|
|
35
|
+
def test_id_missing_cols_all_present(self):
    """A frame holding every required column reports nothing missing."""
    columns = {
        SAMPLE_NAME_KEY: ["sample1"],
        HOSTTYPE_SHORTHAND_KEY: ["human"],
        SAMPLETYPE_SHORTHAND_KEY: ["stool"],
    }

    missing = id_missing_cols(pandas.DataFrame(columns))

    self.assertEqual([], missing)
|
|
47
|
+
|
|
48
|
+
def test_id_missing_cols_some_missing(self):
    """Only the absent required columns are reported, in sorted order."""
    only_sample_names = pandas.DataFrame({SAMPLE_NAME_KEY: ["sample1"]})

    missing = id_missing_cols(only_sample_names)

    absent = [HOSTTYPE_SHORTHAND_KEY, SAMPLETYPE_SHORTHAND_KEY]
    self.assertEqual(sorted(absent), missing)
|
|
58
|
+
|
|
59
|
+
def test_id_missing_cols_all_missing(self):
    """A frame with none of the required columns reports all of them."""
    unrelated_df = pandas.DataFrame({"other_col": ["value1"]})

    missing = id_missing_cols(unrelated_df)

    self.assertEqual(sorted(REQUIRED_RAW_METADATA_FIELDS), missing)
|
|
69
|
+
|
|
70
|
+
# Tests for get_qc_failures
|
|
71
|
+
|
|
72
|
+
def test_get_qc_failures_no_failures(self):
    """No rows come back when every QC note is an empty string."""
    clean_df = pandas.DataFrame({
        SAMPLE_NAME_KEY: ["sample1", "sample2"],
        QC_NOTE_KEY: ["", ""],
    })

    failures = get_qc_failures(clean_df)

    self.assertTrue(failures.empty)
|
|
82
|
+
|
|
83
|
+
def test_get_qc_failures_some_failures(self):
    """Only the row carrying a non-empty QC note is returned."""
    note = "invalid host_type"
    mixed_df = pandas.DataFrame({
        SAMPLE_NAME_KEY: ["sample1", "sample2", "sample3"],
        QC_NOTE_KEY: ["", note, ""],
    })

    failures = get_qc_failures(mixed_df)

    # The expected frame keeps index label 1, the flagged row's position
    # in the input.
    flagged_row = pandas.DataFrame(
        {SAMPLE_NAME_KEY: ["sample2"], QC_NOTE_KEY: [note]},
        index=[1])
    assert_frame_equal(flagged_row, failures)
|
|
97
|
+
|
|
98
|
+
def test_get_qc_failures_all_failures(self):
    """Every row is returned when each one has a QC note."""
    qc_notes = ["invalid host_type", "invalid sample_type"]
    input_df = pandas.DataFrame({
        SAMPLE_NAME_KEY: ["sample1", "sample2"],
        QC_NOTE_KEY: qc_notes,
    })

    failures = get_qc_failures(input_df)

    # The result is compared against the very frame that was passed in.
    assert_frame_equal(input_df, failures)
|
|
108
|
+
|
|
109
|
+
# Tests for _reorder_df
|
|
110
|
+
|
|
111
|
+
def test__reorder_df_sample_name_first(self):
    """The sample name column leads the reordered frame."""
    shuffled_columns = {
        "zebra": ["z"],
        SAMPLE_NAME_KEY: ["sample1"],
        "apple": ["a"],
        QC_NOTE_KEY: [""],
        HOSTTYPE_SHORTHAND_KEY: ["human"],
        SAMPLETYPE_SHORTHAND_KEY: ["stool"],
    }
    input_df = pandas.DataFrame(shuffled_columns)

    reordered = _reorder_df(input_df, INTERNAL_COL_KEYS)

    leading_col = reordered.columns[0]
    self.assertEqual(SAMPLE_NAME_KEY, leading_col)
|
|
125
|
+
|
|
126
|
+
def test__reorder_df_alphabetical_order(self):
    """Non-internal columns follow sample_name in alphabetical order."""
    shuffled_columns = {
        "zebra": ["z"],
        SAMPLE_NAME_KEY: ["sample1"],
        "apple": ["a"],
        QC_NOTE_KEY: [""],
        HOSTTYPE_SHORTHAND_KEY: ["human"],
        SAMPLETYPE_SHORTHAND_KEY: ["stool"],
    }
    input_df = pandas.DataFrame(shuffled_columns)

    reordered = _reorder_df(input_df, INTERNAL_COL_KEYS)

    leading_cols = [SAMPLE_NAME_KEY, "apple", "zebra"]
    self.assertEqual(
        leading_cols + INTERNAL_COL_KEYS, list(reordered.columns))
|
|
141
|
+
|
|
142
|
+
def test__reorder_df_internals_at_end(self):
    """Internal columns trail the frame in the order they were passed."""
    input_df = pandas.DataFrame({
        "field1": ["value1"],
        SAMPLE_NAME_KEY: ["sample1"],
        QC_NOTE_KEY: [""],
        HOSTTYPE_SHORTHAND_KEY: ["human"],
        SAMPLETYPE_SHORTHAND_KEY: ["stool"],
    })

    reordered = _reorder_df(input_df, INTERNAL_COL_KEYS)

    leading_cols = [SAMPLE_NAME_KEY, "field1"]
    actual_cols = list(reordered.columns)
    self.assertEqual(leading_cols + INTERNAL_COL_KEYS, actual_cols)
|
|
156
|
+
|
|
157
|
+
def test__reorder_df_full_ordering(self):
    """Columns come out as sample_name, then alphabetical, then internals."""
    shuffled_columns = {
        "zebra": ["z"],
        SAMPLE_NAME_KEY: ["sample1"],
        "apple": ["a"],
        QC_NOTE_KEY: [""],
        HOSTTYPE_SHORTHAND_KEY: ["human"],
        SAMPLETYPE_SHORTHAND_KEY: ["stool"],
        "banana": ["b"],
    }
    input_df = pandas.DataFrame(shuffled_columns)

    reordered = _reorder_df(input_df, INTERNAL_COL_KEYS)

    leading_cols = [SAMPLE_NAME_KEY, "apple", "banana", "zebra"]
    self.assertEqual(
        leading_cols + INTERNAL_COL_KEYS, list(reordered.columns))
|
|
173
|
+
|
|
174
|
+
# Tests for _catch_nan_required_fields
|
|
175
|
+
|
|
176
|
+
def test__catch_nan_required_fields_no_nans(self):
    """A frame without NaNs in its required fields comes back equal to the input."""
    complete_columns = {
        SAMPLE_NAME_KEY: ["sample1", "sample2"],
        HOSTTYPE_SHORTHAND_KEY: ["human", "control"],
        SAMPLETYPE_SHORTHAND_KEY: ["stool", "blank"],
    }
    input_df = pandas.DataFrame(complete_columns)

    checked_df = _catch_nan_required_fields(input_df)

    assert_frame_equal(input_df, checked_df)
|
|
187
|
+
|
|
188
|
+
def test__catch_nan_required_fields_nan_sample_name_raises(self):
    """A NaN among the sample names is rejected with a ValueError."""
    nan_name_df = pandas.DataFrame({
        SAMPLE_NAME_KEY: ["sample1", np.nan],
        HOSTTYPE_SHORTHAND_KEY: ["human", "control"],
        SAMPLETYPE_SHORTHAND_KEY: ["stool", "blank"],
    })

    self.assertRaisesRegex(
        ValueError, "Metadata contains NaN sample names",
        _catch_nan_required_fields, nan_name_df)
|
|
198
|
+
|
|
199
|
+
def test__catch_nan_required_fields_nan_shorthand_fields_become_empty(self):
    """NaN host-type and sample-type shorthands are replaced with 'empty'."""
    sample_names = ["sample1", "sample2"]
    input_df = pandas.DataFrame({
        SAMPLE_NAME_KEY: sample_names,
        HOSTTYPE_SHORTHAND_KEY: ["human", np.nan],
        SAMPLETYPE_SHORTHAND_KEY: [np.nan, "blank"],
    })

    cleaned_df = _catch_nan_required_fields(input_df)

    expected_df = pandas.DataFrame({
        SAMPLE_NAME_KEY: ["sample1", "sample2"],
        HOSTTYPE_SHORTHAND_KEY: ["human", "empty"],
        SAMPLETYPE_SHORTHAND_KEY: ["empty", "blank"],
    })
    assert_frame_equal(expected_df, cleaned_df)
|
|
215
|
+
|
|
216
|
+
# Tests for _fill_na_if_default
|
|
217
|
+
|
|
218
|
+
def test__fill_na_if_default_specific_overrides_settings(self):
    """The specific dict's default wins over the settings dict's default."""
    input_df = pandas.DataFrame({
        "field1": ["value1", np.nan, "value3"],
        "field2": [np.nan, "value2", np.nan],
    })

    filled_df = _fill_na_if_default(
        input_df, {DEFAULT_KEY: "filled"}, {DEFAULT_KEY: "unused"})

    expected_df = pandas.DataFrame({
        "field1": ["value1", "filled", "value3"],
        "field2": ["filled", "value2", "filled"],
    })
    assert_frame_equal(expected_df, filled_df)
|
|
234
|
+
|
|
235
|
+
def test__fill_na_if_default_uses_settings_when_specific_missing(self):
    """The settings default is applied when the specific dict has none."""
    input_df = pandas.DataFrame({"field1": [np.nan]})
    no_specific_default = {}
    settings = {DEFAULT_KEY: "settings_default"}

    filled_df = _fill_na_if_default(input_df, no_specific_default, settings)

    expected_df = pandas.DataFrame({"field1": ["settings_default"]})
    assert_frame_equal(expected_df, filled_df)
|
|
249
|
+
|
|
250
|
+
# Tests for _update_metadata_from_metadata_fields_dict
|
|
251
|
+
|
|
252
|
+
def test__update_metadata_from_metadata_fields_dict_adds_new_column_with_default(self):
    """A field with a default that is absent from the frame is added, filled with it."""
    input_df = pandas.DataFrame({SAMPLE_NAME_KEY: ["sample1", "sample2"]})
    fields = {"new_field": {DEFAULT_KEY: "default_value"}}

    updated_df = _update_metadata_from_metadata_fields_dict(
        input_df, fields, overwrite_non_nans=False)

    expected_df = pandas.DataFrame({
        SAMPLE_NAME_KEY: ["sample1", "sample2"],
        "new_field": ["default_value"] * 2,
    })
    assert_frame_equal(expected_df, updated_df)
|
|
271
|
+
|
|
272
|
+
def test__update_metadata_from_metadata_fields_dict_fills_nans_with_default(self):
    """NaNs in an already-present column are replaced by the field default."""
    input_df = pandas.DataFrame({
        SAMPLE_NAME_KEY: ["sample1", "sample2"],
        "existing_field": ["value1", np.nan],
    })
    fields = {"existing_field": {DEFAULT_KEY: "default_value"}}

    updated_df = _update_metadata_from_metadata_fields_dict(
        input_df, fields, overwrite_non_nans=False)

    expected_df = pandas.DataFrame({
        SAMPLE_NAME_KEY: ["sample1", "sample2"],
        "existing_field": ["value1", "default_value"],
    })
    assert_frame_equal(expected_df, updated_df)
|
|
292
|
+
|
|
293
|
+
def test__update_metadata_from_metadata_fields_dict_overwrite_non_nans_false(self):
    """With overwrite_non_nans=False, real values stay and only NaNs get the default."""
    input_df = pandas.DataFrame({
        SAMPLE_NAME_KEY: ["sample1", "sample2"],
        "existing_field": ["original", np.nan],
    })
    fields = {"existing_field": {DEFAULT_KEY: "default_value"}}

    updated_df = _update_metadata_from_metadata_fields_dict(
        input_df, fields, overwrite_non_nans=False)

    expected_df = pandas.DataFrame({
        SAMPLE_NAME_KEY: ["sample1", "sample2"],
        "existing_field": ["original", "default_value"],
    })
    assert_frame_equal(expected_df, updated_df)
|
|
313
|
+
|
|
314
|
+
def test__update_metadata_from_metadata_fields_dict_overwrite_non_nans_true(self):
    """With overwrite_non_nans=True, existing values are replaced by the default."""
    input_df = pandas.DataFrame({
        SAMPLE_NAME_KEY: ["sample1", "sample2"],
        "existing_field": ["original", "also_original"],
    })
    fields = {"existing_field": {DEFAULT_KEY: "default_value"}}

    updated_df = _update_metadata_from_metadata_fields_dict(
        input_df, fields, overwrite_non_nans=True)

    expected_df = pandas.DataFrame({
        SAMPLE_NAME_KEY: ["sample1", "sample2"],
        "existing_field": ["default_value"] * 2,
    })
    assert_frame_equal(expected_df, updated_df)
|
|
334
|
+
|
|
335
|
+
def test__update_metadata_from_metadata_fields_dict_adds_required_placeholder(self):
    """A required, default-less field missing from the frame is added as the placeholder."""
    input_df = pandas.DataFrame({SAMPLE_NAME_KEY: ["sample1", "sample2"]})
    fields = {"required_field": {REQUIRED_KEY: True}}

    updated_df = _update_metadata_from_metadata_fields_dict(
        input_df, fields, overwrite_non_nans=False)

    expected_df = pandas.DataFrame({
        SAMPLE_NAME_KEY: ["sample1", "sample2"],
        "required_field": [REQ_PLACEHOLDER] * 2,
    })
    assert_frame_equal(expected_df, updated_df)
|
|
354
|
+
|
|
355
|
+
def test__update_metadata_from_metadata_fields_dict_preserves_existing_required(self):
    """Values already present in a required, default-less field are kept as-is."""
    input_df = pandas.DataFrame({
        SAMPLE_NAME_KEY: ["sample1", "sample2"],
        "required_field": ["existing1", "existing2"],
    })
    fields = {"required_field": {REQUIRED_KEY: True}}

    updated_df = _update_metadata_from_metadata_fields_dict(
        input_df, fields, overwrite_non_nans=False)

    # No placeholder is expected: the column already has real values.
    expected_df = pandas.DataFrame({
        SAMPLE_NAME_KEY: ["sample1", "sample2"],
        "required_field": ["existing1", "existing2"],
    })
    assert_frame_equal(expected_df, updated_df)
|
|
375
|
+
|
|
376
|
+
def test__update_metadata_from_metadata_fields_dict_required_false_no_placeholder(self):
    """A field marked required=False with no default adds no column."""
    input_df = pandas.DataFrame({SAMPLE_NAME_KEY: ["sample1", "sample2"]})
    fields = {"optional_field": {REQUIRED_KEY: False}}

    updated_df = _update_metadata_from_metadata_fields_dict(
        input_df, fields, overwrite_non_nans=False)

    expected_df = pandas.DataFrame(
        {SAMPLE_NAME_KEY: ["sample1", "sample2"]})
    assert_frame_equal(expected_df, updated_df)
|
|
394
|
+
|
|
395
|
+
def test__update_metadata_from_metadata_fields_dict_default_takes_precedence(self):
    """A field that is both required and defaulted is filled with the default."""
    input_df = pandas.DataFrame({SAMPLE_NAME_KEY: ["sample1", "sample2"]})
    field_settings = {DEFAULT_KEY: "the_default", REQUIRED_KEY: True}
    fields = {"field_with_both": field_settings}

    updated_df = _update_metadata_from_metadata_fields_dict(
        input_df, fields, overwrite_non_nans=False)

    expected_df = pandas.DataFrame({
        SAMPLE_NAME_KEY: ["sample1", "sample2"],
        "field_with_both": ["the_default"] * 2,
    })
    assert_frame_equal(expected_df, updated_df)
|
|
415
|
+
|
|
416
|
+
def test__update_metadata_from_metadata_fields_dict_multiple_fields(self):
    """Several field definitions are applied in a single call."""
    input_df = pandas.DataFrame({
        SAMPLE_NAME_KEY: ["sample1", "sample2"],
        "existing": ["val1", np.nan],
    })
    # One NaN fill, one new defaulted column, one new required column.
    fields = {
        "existing": {DEFAULT_KEY: "filled"},
        "new_default": {DEFAULT_KEY: "new_val"},
        "new_required": {REQUIRED_KEY: True},
    }

    updated_df = _update_metadata_from_metadata_fields_dict(
        input_df, fields, overwrite_non_nans=False)

    expected_df = pandas.DataFrame({
        SAMPLE_NAME_KEY: ["sample1", "sample2"],
        "existing": ["val1", "filled"],
        "new_default": ["new_val"] * 2,
        "new_required": [REQ_PLACEHOLDER] * 2,
    })
    assert_frame_equal(expected_df, updated_df)
|
|
444
|
+
|
|
445
|
+
# Tests for _update_metadata_from_dict
|
|
446
|
+
|
|
447
|
+
def test__update_metadata_from_dict_extracts_metadata_fields(self):
    """With dict_is_metadata_fields=False, the METADATA_FIELDS_KEY entry is what gets applied."""
    input_df = pandas.DataFrame({SAMPLE_NAME_KEY: ["sample1", "sample2"]})
    section = {
        METADATA_FIELDS_KEY: {
            "new_field": {DEFAULT_KEY: "default_value"},
        },
        "other_key": "ignored",
    }

    updated_df = _update_metadata_from_dict(
        input_df, section,
        dict_is_metadata_fields=False, overwrite_non_nans=False)

    expected_df = pandas.DataFrame({
        SAMPLE_NAME_KEY: ["sample1", "sample2"],
        "new_field": ["default_value"] * 2,
    })
    assert_frame_equal(expected_df, updated_df)
|
|
470
|
+
|
|
471
|
+
def test__update_metadata_from_dict_uses_dict_directly(self):
    """With dict_is_metadata_fields=True, the dict itself is treated as the fields."""
    input_df = pandas.DataFrame({SAMPLE_NAME_KEY: ["sample1", "sample2"]})
    fields = {"new_field": {DEFAULT_KEY: "default_value"}}

    updated_df = _update_metadata_from_dict(
        input_df, fields,
        dict_is_metadata_fields=True, overwrite_non_nans=False)

    expected_df = pandas.DataFrame({
        SAMPLE_NAME_KEY: ["sample1", "sample2"],
        "new_field": ["default_value"] * 2,
    })
    assert_frame_equal(expected_df, updated_df)
|
|
491
|
+
|
|
492
|
+
def test__update_metadata_from_dict_passes_overwrite_non_nans(self):
    """overwrite_non_nans=True takes effect: existing values are replaced."""
    input_df = pandas.DataFrame({
        SAMPLE_NAME_KEY: ["sample1", "sample2"],
        "existing_field": ["original", "also_original"],
    })
    fields = {"existing_field": {DEFAULT_KEY: "new_value"}}

    updated_df = _update_metadata_from_dict(
        input_df, fields,
        dict_is_metadata_fields=True, overwrite_non_nans=True)

    expected_df = pandas.DataFrame({
        SAMPLE_NAME_KEY: ["sample1", "sample2"],
        "existing_field": ["new_value"] * 2,
    })
    assert_frame_equal(expected_df, updated_df)
|
|
513
|
+
|
|
514
|
+
# Tests for _construct_sample_type_metadata_fields_dict
|
|
515
|
+
|
|
516
|
+
def test__construct_sample_type_metadata_fields_dict_simple(self):
    """Host fields and sample-type fields are combined for a plain sample type."""
    sample_types_config = {
        "stool": {
            METADATA_FIELDS_KEY: {
                "sample_field": {DEFAULT_KEY: "sample_default"},
            },
        },
    }
    host_fields = {"host_field": {DEFAULT_KEY: "host_default"}}

    result = _construct_sample_type_metadata_fields_dict(
        "stool", sample_types_config, host_fields)

    expected = {
        "host_field": {DEFAULT_KEY: "host_default"},
        "sample_field": {DEFAULT_KEY: "sample_default"},
    }
    # Both type fields are expected to be pinned to the sample type name.
    for type_field in (SAMPLE_TYPE_KEY, QIITA_SAMPLE_TYPE):
        expected[type_field] = {
            ALLOWED_KEY: ["stool"],
            DEFAULT_KEY: "stool",
            TYPE_KEY: "string",
        }
    self.assertDictEqual(expected, result)
|
|
555
|
+
|
|
556
|
+
def test__construct_sample_type_metadata_fields_dict_with_alias(self):
    """An aliased sample type yields the fields of the type it points at."""
    sample_types_config = {
        "feces": {ALIAS_KEY: "stool"},
        "stool": {
            METADATA_FIELDS_KEY: {
                "stool_field": {DEFAULT_KEY: "stool_value"},
            },
        },
    }
    no_host_fields = {}

    result = _construct_sample_type_metadata_fields_dict(
        "feces", sample_types_config, no_host_fields)

    expected = {"stool_field": {DEFAULT_KEY: "stool_value"}}
    # The type fields are expected to name the alias target, not the alias.
    for type_field in (SAMPLE_TYPE_KEY, QIITA_SAMPLE_TYPE):
        expected[type_field] = {
            ALLOWED_KEY: ["stool"],
            DEFAULT_KEY: "stool",
            TYPE_KEY: "string",
        }
    self.assertDictEqual(expected, result)
|
|
591
|
+
|
|
592
|
+
def test__construct_sample_type_metadata_fields_dict_chained_alias_raises(self):
    """An alias whose target is itself an alias is rejected with a ValueError."""
    sample_types_config = {
        "feces": {ALIAS_KEY: "stool"},
        "stool": {ALIAS_KEY: "poop"},
        "poop": {METADATA_FIELDS_KEY: {}},
    }
    no_host_fields = {}

    self.assertRaisesRegex(
        ValueError, "May not chain aliases",
        _construct_sample_type_metadata_fields_dict,
        "feces", sample_types_config, no_host_fields)
|
|
610
|
+
|
|
611
|
+
def test__construct_sample_type_metadata_fields_dict_with_base_type(self):
    """A derived sample type carries its base type's fields plus its own."""
    sample_types_config = {
        "base_sample": {
            METADATA_FIELDS_KEY: {
                "base_field": {DEFAULT_KEY: "base_value"},
            },
        },
        "derived_sample": {
            BASE_TYPE_KEY: "base_sample",
            METADATA_FIELDS_KEY: {
                "derived_field": {DEFAULT_KEY: "derived_value"},
            },
        },
    }
    no_host_fields = {}

    result = _construct_sample_type_metadata_fields_dict(
        "derived_sample", sample_types_config, no_host_fields)

    expected = {
        "base_field": {DEFAULT_KEY: "base_value"},
        "derived_field": {DEFAULT_KEY: "derived_value"},
    }
    # The type fields are expected to name the derived type, not its base.
    for type_field in (SAMPLE_TYPE_KEY, QIITA_SAMPLE_TYPE):
        expected[type_field] = {
            ALLOWED_KEY: ["derived_sample"],
            DEFAULT_KEY: "derived_sample",
            TYPE_KEY: "string",
        }
    self.assertDictEqual(expected, result)
|
|
654
|
+
|
|
655
|
+
def test__construct_sample_type_metadata_fields_dict_base_type_invalid_raises(self):
    """A base type holding keys besides its metadata fields is rejected."""
    invalid_base = {
        METADATA_FIELDS_KEY: {
            "base_field": {DEFAULT_KEY: "value"},
        },
        "extra_key": "not_allowed",
    }
    sample_types_config = {
        "base_sample": invalid_base,
        "derived_sample": {
            BASE_TYPE_KEY: "base_sample",
            METADATA_FIELDS_KEY: {},
        },
    }
    no_host_fields = {}

    self.assertRaisesRegex(
        ValueError, "must only have metadata fields",
        _construct_sample_type_metadata_fields_dict,
        "derived_sample", sample_types_config, no_host_fields)
|
|
674
|
+
|
|
675
|
+
def test__construct_sample_type_metadata_fields_dict_sets_sample_type(self):
    """With no other fields configured, only the two type fields are produced."""
    sample_types_config = {"blood": {METADATA_FIELDS_KEY: {}}}
    no_host_fields = {}

    result = _construct_sample_type_metadata_fields_dict(
        "blood", sample_types_config, no_host_fields)

    expected = {
        type_field: {
            ALLOWED_KEY: ["blood"],
            DEFAULT_KEY: "blood",
            TYPE_KEY: "string",
        }
        for type_field in (SAMPLE_TYPE_KEY, QIITA_SAMPLE_TYPE)
    }
    self.assertDictEqual(expected, result)
|
|
700
|
+
|
|
701
|
+
def test__construct_sample_type_metadata_fields_dict_preserves_existing_qiita_sample_type(self):
    """A qiita_sample_type configured on the sample type is kept, not replaced."""
    sample_types_config = {
        "stool": {
            METADATA_FIELDS_KEY: {
                QIITA_SAMPLE_TYPE: {
                    ALLOWED_KEY: ["custom_type"],
                    DEFAULT_KEY: "custom_type",
                    TYPE_KEY: "string",
                },
            },
        },
    }
    no_host_fields = {}

    result = _construct_sample_type_metadata_fields_dict(
        "stool", sample_types_config, no_host_fields)

    expected_sample_type = {
        ALLOWED_KEY: ["stool"],
        DEFAULT_KEY: "stool",
        TYPE_KEY: "string",
    }
    expected_qiita_type = {
        ALLOWED_KEY: ["custom_type"],
        DEFAULT_KEY: "custom_type",
        TYPE_KEY: "string",
    }
    expected = {
        SAMPLE_TYPE_KEY: expected_sample_type,
        QIITA_SAMPLE_TYPE: expected_qiita_type,
    }
    self.assertDictEqual(expected, result)
|
|
732
|
+
|
|
733
|
+
# Tests for _generate_metadata_for_a_sample_type_in_a_host_type
|
|
734
|
+
|
|
735
|
+
def test__generate_metadata_for_a_sample_type_in_a_host_type_basic(self):
    """A configured sample type gains host, sample-type and type columns."""
    input_df = pandas.DataFrame({
        SAMPLE_NAME_KEY: ["sample1", "sample2"],
        HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
        SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
        QC_NOTE_KEY: ["", ""],
    })
    settings = {
        OVERWRITE_NON_NANS_KEY: False,
        LEAVE_REQUIREDS_BLANK_KEY: False,
        DEFAULT_KEY: "not provided",
    }
    host_fields = {
        "host_field": {DEFAULT_KEY: "host_default", TYPE_KEY: "string"},
    }
    stool_fields = {
        "stool_field": {DEFAULT_KEY: "stool_default", TYPE_KEY: "string"},
    }
    host_type_config = {
        METADATA_FIELDS_KEY: host_fields,
        SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
            "stool": {METADATA_FIELDS_KEY: stool_fields},
        },
    }

    result_df, validation_msgs = \
        _generate_metadata_for_a_sample_type_in_a_host_type(
            input_df, "stool", settings, host_type_config)

    # Input columns first, then the generated ones in this order.
    expected_df = pandas.DataFrame({
        SAMPLE_NAME_KEY: ["sample1", "sample2"],
        HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
        SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
        QC_NOTE_KEY: ["", ""],
        "host_field": ["host_default"] * 2,
        "stool_field": ["stool_default"] * 2,
        SAMPLE_TYPE_KEY: ["stool"] * 2,
        QIITA_SAMPLE_TYPE: ["stool"] * 2,
    })
    assert_frame_equal(expected_df, result_df)
    self.assertEqual([], validation_msgs)
|
|
782
|
+
|
|
783
|
+
def test__generate_metadata_for_a_sample_type_in_a_host_type_unknown_sample_type(self):
    """Sample type absent from the host config: a QC note is expected.

    The config only declares "stool", so the "unknown_type" row is expected
    back with "invalid sample_type" in the QC note column, no extra columns,
    and no validation messages.
    """
    sample_cols = {
        SAMPLE_NAME_KEY: ["sample1"],
        HOSTTYPE_SHORTHAND_KEY: ["human"],
        SAMPLETYPE_SHORTHAND_KEY: ["unknown_type"],
        QC_NOTE_KEY: [""]
    }
    settings = {
        OVERWRITE_NON_NANS_KEY: False,
        LEAVE_REQUIREDS_BLANK_KEY: False,
        DEFAULT_KEY: "not provided"
    }
    host_config = {
        METADATA_FIELDS_KEY: {},
        SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
            "stool": {METADATA_FIELDS_KEY: {}}
        }
    }

    actual_df, messages = _generate_metadata_for_a_sample_type_in_a_host_type(
        pandas.DataFrame(sample_cols), "unknown_type", settings, host_config)

    # Same columns as the input; only the QC note value differs.
    wanted_df = pandas.DataFrame(
        {**sample_cols, QC_NOTE_KEY: ["invalid sample_type"]})
    assert_frame_equal(wanted_df, actual_df)
    self.assertEqual([], messages)
|
|
816
|
+
|
|
817
|
+
def test__generate_metadata_for_a_sample_type_in_a_host_type_filters_by_sample_type(self):
    """Only rows of the requested sample type are expected in the result.

    The input mixes two "stool" rows with one "blood" row; asking for
    "stool" should yield exactly the two stool samples, each carrying the
    stool-level default.
    """
    raw_df = pandas.DataFrame({
        SAMPLE_NAME_KEY: ["sample1", "sample2", "sample3"],
        HOSTTYPE_SHORTHAND_KEY: ["human", "human", "human"],
        SAMPLETYPE_SHORTHAND_KEY: ["stool", "blood", "stool"],
        QC_NOTE_KEY: ["", "", ""]
    })
    settings = {
        OVERWRITE_NON_NANS_KEY: False,
        LEAVE_REQUIREDS_BLANK_KEY: False,
        DEFAULT_KEY: "not provided"
    }
    stool_config = {
        METADATA_FIELDS_KEY: {
            "stool_field": {DEFAULT_KEY: "stool_value", TYPE_KEY: "string"}
        }
    }
    blood_config = {METADATA_FIELDS_KEY: {}}
    host_config = {
        METADATA_FIELDS_KEY: {},
        SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
            "stool": stool_config,
            "blood": blood_config
        }
    }

    actual_df, _ = _generate_metadata_for_a_sample_type_in_a_host_type(
        raw_df, "stool", settings, host_config)

    # The blood row (sample2) must not appear in the output.
    self.assertEqual(2, len(actual_df))
    kept_names = actual_df[SAMPLE_NAME_KEY].tolist()
    self.assertEqual(["sample1", "sample3"], kept_names)
    stool_values = actual_df["stool_field"].tolist()
    self.assertEqual(["stool_value", "stool_value"], stool_values)
|
|
854
|
+
|
|
855
|
+
def test__generate_metadata_for_a_sample_type_in_a_host_type_leave_requireds_blank_true(self):
    """With LEAVE_REQUIREDS_BLANK_KEY True, a required field should hold LEAVE_BLANK_VAL.

    The stool config declares a required field with no default and the input
    has no such column, so the placeholder is the expected value.
    """
    raw_df = pandas.DataFrame({
        SAMPLE_NAME_KEY: ["sample1"],
        HOSTTYPE_SHORTHAND_KEY: ["human"],
        SAMPLETYPE_SHORTHAND_KEY: ["stool"],
        QC_NOTE_KEY: [""]
    })
    settings = {
        OVERWRITE_NON_NANS_KEY: False,
        LEAVE_REQUIREDS_BLANK_KEY: True,
        DEFAULT_KEY: "not provided"
    }
    required_spec = {REQUIRED_KEY: True, TYPE_KEY: "string"}
    host_config = {
        METADATA_FIELDS_KEY: {},
        SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
            "stool": {
                METADATA_FIELDS_KEY: {"required_field": required_spec}
            }
        }
    }

    actual_df, _ = _generate_metadata_for_a_sample_type_in_a_host_type(
        raw_df, "stool", settings, host_config)

    first_value = actual_df["required_field"].iloc[0]
    self.assertEqual(LEAVE_BLANK_VAL, first_value)
|
|
886
|
+
|
|
887
|
+
def test__generate_metadata_for_a_sample_type_in_a_host_type_leave_requireds_blank_false(self):
    """With LEAVE_REQUIREDS_BLANK_KEY False, a required field should hold the default.

    The stool config declares a required field with no default of its own,
    so the value from the settings' DEFAULT_KEY is the expected fill.
    """
    raw_df = pandas.DataFrame({
        SAMPLE_NAME_KEY: ["sample1"],
        HOSTTYPE_SHORTHAND_KEY: ["human"],
        SAMPLETYPE_SHORTHAND_KEY: ["stool"],
        QC_NOTE_KEY: [""]
    })
    settings = {
        OVERWRITE_NON_NANS_KEY: False,
        LEAVE_REQUIREDS_BLANK_KEY: False,
        DEFAULT_KEY: "global_default"
    }
    required_spec = {REQUIRED_KEY: True, TYPE_KEY: "string"}
    host_config = {
        METADATA_FIELDS_KEY: {},
        SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
            "stool": {
                METADATA_FIELDS_KEY: {"required_field": required_spec}
            }
        }
    }

    actual_df, _ = _generate_metadata_for_a_sample_type_in_a_host_type(
        raw_df, "stool", settings, host_config)

    # Not left blank, so the missing value is expected to become the
    # settings-level default.
    first_value = actual_df["required_field"].iloc[0]
    self.assertEqual("global_default", first_value)
|
|
919
|
+
|
|
920
|
+
def test__generate_metadata_for_a_sample_type_in_a_host_type_overwrite_non_nans_true(self):
    """With OVERWRITE_NON_NANS_KEY True, the configured default should replace input data.

    The input already holds "original_value" in a column that the stool
    config assigns the default "new_value".
    """
    raw_df = pandas.DataFrame({
        SAMPLE_NAME_KEY: ["sample1"],
        HOSTTYPE_SHORTHAND_KEY: ["human"],
        SAMPLETYPE_SHORTHAND_KEY: ["stool"],
        QC_NOTE_KEY: [""],
        "existing_field": ["original_value"]
    })
    settings = {
        OVERWRITE_NON_NANS_KEY: True,
        LEAVE_REQUIREDS_BLANK_KEY: False,
        DEFAULT_KEY: "not provided"
    }
    field_spec = {DEFAULT_KEY: "new_value", TYPE_KEY: "string"}
    host_config = {
        METADATA_FIELDS_KEY: {},
        SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
            "stool": {
                METADATA_FIELDS_KEY: {"existing_field": field_spec}
            }
        }
    }

    actual_df, _ = _generate_metadata_for_a_sample_type_in_a_host_type(
        raw_df, "stool", settings, host_config)

    first_value = actual_df["existing_field"].iloc[0]
    self.assertEqual("new_value", first_value)
|
|
952
|
+
|
|
953
|
+
def test__generate_metadata_for_a_sample_type_in_a_host_type_overwrite_non_nans_false(self):
    """With OVERWRITE_NON_NANS_KEY False, input data should survive a configured default.

    The input already holds "original_value" in a column that the stool
    config assigns the default "new_value"; the original is expected back.
    """
    raw_df = pandas.DataFrame({
        SAMPLE_NAME_KEY: ["sample1"],
        HOSTTYPE_SHORTHAND_KEY: ["human"],
        SAMPLETYPE_SHORTHAND_KEY: ["stool"],
        QC_NOTE_KEY: [""],
        "existing_field": ["original_value"]
    })
    settings = {
        OVERWRITE_NON_NANS_KEY: False,
        LEAVE_REQUIREDS_BLANK_KEY: False,
        DEFAULT_KEY: "not provided"
    }
    field_spec = {DEFAULT_KEY: "new_value", TYPE_KEY: "string"}
    host_config = {
        METADATA_FIELDS_KEY: {},
        SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
            "stool": {
                METADATA_FIELDS_KEY: {"existing_field": field_spec}
            }
        }
    }

    actual_df, _ = _generate_metadata_for_a_sample_type_in_a_host_type(
        raw_df, "stool", settings, host_config)

    first_value = actual_df["existing_field"].iloc[0]
    self.assertEqual("original_value", first_value)
|
|
985
|
+
|
|
986
|
+
def test__generate_metadata_for_a_sample_type_in_a_host_type_with_alias(self):
    """An aliased sample type should pick up the metadata of its alias target.

    "feces" is configured as an alias of "stool"; the "feces" row is expected
    to receive the stool-level default and to report "stool" under
    SAMPLE_TYPE_KEY.
    """
    raw_df = pandas.DataFrame({
        SAMPLE_NAME_KEY: ["sample1"],
        HOSTTYPE_SHORTHAND_KEY: ["human"],
        SAMPLETYPE_SHORTHAND_KEY: ["feces"],
        QC_NOTE_KEY: [""]
    })
    settings = {
        OVERWRITE_NON_NANS_KEY: False,
        LEAVE_REQUIREDS_BLANK_KEY: False,
        DEFAULT_KEY: "not provided"
    }
    feces_config = {ALIAS_KEY: "stool"}
    stool_config = {
        METADATA_FIELDS_KEY: {
            "stool_field": {DEFAULT_KEY: "stool_value", TYPE_KEY: "string"}
        }
    }
    host_config = {
        METADATA_FIELDS_KEY: {},
        SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
            "feces": feces_config,
            "stool": stool_config
        }
    }

    actual_df, _ = _generate_metadata_for_a_sample_type_in_a_host_type(
        raw_df, "feces", settings, host_config)

    first_row = actual_df.iloc[0]
    self.assertEqual("stool_value", first_row["stool_field"])
    # The reported sample type should be the alias target, not the shorthand.
    self.assertEqual("stool", first_row[SAMPLE_TYPE_KEY])
|
|
1022
|
+
|
|
1023
|
+
# Tests for _generate_metadata_for_a_host_type
|
|
1024
|
+
|
|
1025
|
+
def test__generate_metadata_for_a_host_type_basic(self):
    """Known host type: host-level and sample-type-level defaults are expected.

    Two human stool rows are processed against a full config whose "human"
    entry declares one host-level field and one stool-level field. No
    validation messages are expected.
    """
    sample_cols = {
        SAMPLE_NAME_KEY: ["sample1", "sample2"],
        HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
        SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
        QC_NOTE_KEY: ["", ""]
    }
    settings = {
        OVERWRITE_NON_NANS_KEY: False,
        LEAVE_REQUIREDS_BLANK_KEY: False,
        DEFAULT_KEY: "global_default"
    }
    stool_config = {
        METADATA_FIELDS_KEY: {
            "stool_field": {DEFAULT_KEY: "stool_value", TYPE_KEY: "string"}
        }
    }
    human_config = {
        DEFAULT_KEY: "human_default",
        METADATA_FIELDS_KEY: {
            "host_field": {DEFAULT_KEY: "host_value", TYPE_KEY: "string"}
        },
        SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {"stool": stool_config}
    }
    full_config = {HOST_TYPE_SPECIFIC_METADATA_KEY: {"human": human_config}}

    actual_df, messages = _generate_metadata_for_a_host_type(
        pandas.DataFrame(sample_cols), "human", settings, full_config)

    wanted_df = pandas.DataFrame({
        **sample_cols,
        "host_field": ["host_value", "host_value"],
        "stool_field": ["stool_value", "stool_value"],
        SAMPLE_TYPE_KEY: ["stool", "stool"],
        QIITA_SAMPLE_TYPE: ["stool", "stool"]
    })
    assert_frame_equal(wanted_df, actual_df)
    self.assertEqual([], messages)
|
|
1077
|
+
|
|
1078
|
+
def test__generate_metadata_for_a_host_type_unknown_host_type(self):
    """Host type absent from the config: a QC note is expected.

    Only "human" is configured, so the "unknown_host" row is expected back
    with "invalid host_type" in the QC note column, no extra columns, and
    no validation messages.
    """
    sample_cols = {
        SAMPLE_NAME_KEY: ["sample1"],
        HOSTTYPE_SHORTHAND_KEY: ["unknown_host"],
        SAMPLETYPE_SHORTHAND_KEY: ["stool"],
        QC_NOTE_KEY: [""]
    }
    settings = {
        OVERWRITE_NON_NANS_KEY: False,
        LEAVE_REQUIREDS_BLANK_KEY: False,
        DEFAULT_KEY: "global_default"
    }
    human_config = {
        METADATA_FIELDS_KEY: {},
        SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {}
    }
    full_config = {HOST_TYPE_SPECIFIC_METADATA_KEY: {"human": human_config}}

    actual_df, messages = _generate_metadata_for_a_host_type(
        pandas.DataFrame(sample_cols), "unknown_host", settings, full_config)

    # Same columns as the input; only the QC note value differs.
    wanted_df = pandas.DataFrame(
        {**sample_cols, QC_NOTE_KEY: ["invalid host_type"]})
    assert_frame_equal(wanted_df, actual_df)
    self.assertEqual([], messages)
|
|
1111
|
+
|
|
1112
|
+
def test__generate_metadata_for_a_host_type_unknown_sample_type(self):
    """Known host type but unconfigured sample type: a QC note is expected.

    "human" only declares "stool", so the "unknown_sample" row is expected
    back with "invalid sample_type" in the QC note column, no extra columns,
    and no validation messages.
    """
    sample_cols = {
        SAMPLE_NAME_KEY: ["sample1"],
        HOSTTYPE_SHORTHAND_KEY: ["human"],
        SAMPLETYPE_SHORTHAND_KEY: ["unknown_sample"],
        QC_NOTE_KEY: [""]
    }
    settings = {
        OVERWRITE_NON_NANS_KEY: False,
        LEAVE_REQUIREDS_BLANK_KEY: False,
        DEFAULT_KEY: "global_default"
    }
    human_config = {
        METADATA_FIELDS_KEY: {},
        SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
            "stool": {METADATA_FIELDS_KEY: {}}
        }
    }
    full_config = {HOST_TYPE_SPECIFIC_METADATA_KEY: {"human": human_config}}

    actual_df, messages = _generate_metadata_for_a_host_type(
        pandas.DataFrame(sample_cols), "human", settings, full_config)

    wanted_df = pandas.DataFrame(
        {**sample_cols, QC_NOTE_KEY: ["invalid sample_type"]})
    assert_frame_equal(wanted_df, actual_df)
    self.assertEqual([], messages)
|
|
1149
|
+
|
|
1150
|
+
def test__generate_metadata_for_a_host_type_filters_by_host_type(self):
    """Only rows of the requested host type are expected in the result.

    The input mixes two "human" rows with one "mouse" row; asking for
    "human" should yield just the two human samples, each carrying the
    human-level default.
    """
    raw_df = pandas.DataFrame({
        SAMPLE_NAME_KEY: ["sample1", "sample2", "sample3"],
        HOSTTYPE_SHORTHAND_KEY: ["human", "mouse", "human"],
        SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool", "stool"],
        QC_NOTE_KEY: ["", "", ""]
    })
    settings = {
        OVERWRITE_NON_NANS_KEY: False,
        LEAVE_REQUIREDS_BLANK_KEY: False,
        DEFAULT_KEY: "global_default"
    }
    human_config = {
        METADATA_FIELDS_KEY: {
            "human_field": {DEFAULT_KEY: "human_value", TYPE_KEY: "string"}
        },
        SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
            "stool": {METADATA_FIELDS_KEY: {}}
        }
    }
    mouse_config = {
        METADATA_FIELDS_KEY: {},
        SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {}
    }
    full_config = {
        HOST_TYPE_SPECIFIC_METADATA_KEY: {
            "human": human_config,
            "mouse": mouse_config
        }
    }

    actual_df, _ = _generate_metadata_for_a_host_type(
        raw_df, "human", settings, full_config)

    # The mouse row (sample2) is absent, and the expected frame has a fresh
    # default index.
    wanted_df = pandas.DataFrame({
        SAMPLE_NAME_KEY: ["sample1", "sample3"],
        HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
        SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
        QC_NOTE_KEY: ["", ""],
        "human_field": ["human_value", "human_value"],
        SAMPLE_TYPE_KEY: ["stool", "stool"],
        QIITA_SAMPLE_TYPE: ["stool", "stool"]
    })
    assert_frame_equal(wanted_df, actual_df)
|
|
1198
|
+
|
|
1199
|
+
def test__generate_metadata_for_a_host_type_uses_host_default(self):
    """A host-type-level DEFAULT_KEY should take precedence over the settings default.

    The "human" entry sets its own default, and the stool config declares a
    required field without a value; that field is expected to be filled
    with the host-specific default rather than "global_default".
    """
    sample_cols = {
        SAMPLE_NAME_KEY: ["sample1"],
        HOSTTYPE_SHORTHAND_KEY: ["human"],
        SAMPLETYPE_SHORTHAND_KEY: ["stool"],
        QC_NOTE_KEY: [""]
    }
    settings = {
        OVERWRITE_NON_NANS_KEY: False,
        LEAVE_REQUIREDS_BLANK_KEY: False,
        DEFAULT_KEY: "global_default"
    }
    required_spec = {REQUIRED_KEY: True, TYPE_KEY: "string"}
    human_config = {
        DEFAULT_KEY: "human_specific_default",
        METADATA_FIELDS_KEY: {},
        SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
            "stool": {
                METADATA_FIELDS_KEY: {"required_field": required_spec}
            }
        }
    }
    full_config = {HOST_TYPE_SPECIFIC_METADATA_KEY: {"human": human_config}}

    actual_df, _ = _generate_metadata_for_a_host_type(
        pandas.DataFrame(sample_cols), "human", settings, full_config)

    wanted_df = pandas.DataFrame({
        **sample_cols,
        "required_field": ["human_specific_default"],
        SAMPLE_TYPE_KEY: ["stool"],
        QIITA_SAMPLE_TYPE: ["stool"]
    })
    assert_frame_equal(wanted_df, actual_df)
|
|
1244
|
+
|
|
1245
|
+
def test__generate_metadata_for_a_host_type_uses_global_default_when_no_host_default(self):
    """Without a host-type-level default, the settings default should be used.

    The "human" entry has no DEFAULT_KEY, so the required stool field is
    expected to be filled with "global_default" from the settings.
    """
    sample_cols = {
        SAMPLE_NAME_KEY: ["sample1"],
        HOSTTYPE_SHORTHAND_KEY: ["human"],
        SAMPLETYPE_SHORTHAND_KEY: ["stool"],
        QC_NOTE_KEY: [""]
    }
    settings = {
        OVERWRITE_NON_NANS_KEY: False,
        LEAVE_REQUIREDS_BLANK_KEY: False,
        DEFAULT_KEY: "global_default"
    }
    required_spec = {REQUIRED_KEY: True, TYPE_KEY: "string"}
    # Deliberately no DEFAULT_KEY at the host-type level.
    human_config = {
        METADATA_FIELDS_KEY: {},
        SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
            "stool": {
                METADATA_FIELDS_KEY: {"required_field": required_spec}
            }
        }
    }
    full_config = {HOST_TYPE_SPECIFIC_METADATA_KEY: {"human": human_config}}

    actual_df, _ = _generate_metadata_for_a_host_type(
        pandas.DataFrame(sample_cols), "human", settings, full_config)

    wanted_df = pandas.DataFrame({
        **sample_cols,
        "required_field": ["global_default"],
        SAMPLE_TYPE_KEY: ["stool"],
        QIITA_SAMPLE_TYPE: ["stool"]
    })
    assert_frame_equal(wanted_df, actual_df)
|
|
1290
|
+
|
|
1291
|
+
# Tests for _generate_metadata_for_host_types
|
|
1292
|
+
|
|
1293
|
+
def test__generate_metadata_for_host_types_single_host_type(self):
    """All rows share one host type: its defaults are expected on every row.

    Two human stool rows are run against a flat config that carries the
    global settings alongside the "human" entry. No validation messages
    are expected.
    """
    sample_cols = {
        SAMPLE_NAME_KEY: ["sample1", "sample2"],
        HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
        SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
        QC_NOTE_KEY: ["", ""]
    }
    stool_config = {
        METADATA_FIELDS_KEY: {
            "stool_field": {DEFAULT_KEY: "stool_value", TYPE_KEY: "string"}
        }
    }
    human_config = {
        METADATA_FIELDS_KEY: {
            "host_field": {DEFAULT_KEY: "host_value", TYPE_KEY: "string"}
        },
        SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {"stool": stool_config}
    }
    flat_config = {
        DEFAULT_KEY: "global_default",
        LEAVE_REQUIREDS_BLANK_KEY: False,
        OVERWRITE_NON_NANS_KEY: False,
        HOST_TYPE_SPECIFIC_METADATA_KEY: {"human": human_config}
    }

    actual_df, messages = _generate_metadata_for_host_types(
        pandas.DataFrame(sample_cols), flat_config)

    wanted_df = pandas.DataFrame({
        **sample_cols,
        "host_field": ["host_value", "host_value"],
        "stool_field": ["stool_value", "stool_value"],
        SAMPLE_TYPE_KEY: ["stool", "stool"],
        QIITA_SAMPLE_TYPE: ["stool", "stool"]
    })
    assert_frame_equal(wanted_df, actual_df)
    self.assertEqual([], messages)
|
|
1342
|
+
|
|
1343
|
+
def test__generate_metadata_for_host_types_multiple_host_types(self):
    """Rows from several host types: per-host columns are filled across hosts.

    Human and mouse each declare a field of their own. The expected frame
    lists the human rows first, then the mouse row, and holds
    "global_default" wherever a row's host type does not define a column.
    """
    raw_df = pandas.DataFrame({
        SAMPLE_NAME_KEY: ["sample1", "sample2", "sample3"],
        HOSTTYPE_SHORTHAND_KEY: ["human", "mouse", "human"],
        SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool", "blood"],
        QC_NOTE_KEY: ["", "", ""]
    })
    human_config = {
        METADATA_FIELDS_KEY: {
            "human_field": {DEFAULT_KEY: "human_value", TYPE_KEY: "string"}
        },
        SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
            "stool": {METADATA_FIELDS_KEY: {}},
            "blood": {METADATA_FIELDS_KEY: {}}
        }
    }
    mouse_config = {
        METADATA_FIELDS_KEY: {
            "mouse_field": {DEFAULT_KEY: "mouse_value", TYPE_KEY: "string"}
        },
        SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
            "stool": {METADATA_FIELDS_KEY: {}}
        }
    }
    flat_config = {
        DEFAULT_KEY: "global_default",
        LEAVE_REQUIREDS_BLANK_KEY: False,
        OVERWRITE_NON_NANS_KEY: False,
        HOST_TYPE_SPECIFIC_METADATA_KEY: {
            "human": human_config,
            "mouse": mouse_config
        }
    }

    actual_df, messages = _generate_metadata_for_host_types(
        raw_df, flat_config)

    # Columns defined by only one host type are expected to carry the global
    # default on the other host type's rows.
    wanted_df = pandas.DataFrame({
        SAMPLE_NAME_KEY: ["sample1", "sample3", "sample2"],
        HOSTTYPE_SHORTHAND_KEY: ["human", "human", "mouse"],
        SAMPLETYPE_SHORTHAND_KEY: ["stool", "blood", "stool"],
        QC_NOTE_KEY: ["", "", ""],
        "human_field": ["human_value", "human_value", "global_default"],
        SAMPLE_TYPE_KEY: ["stool", "blood", "stool"],
        QIITA_SAMPLE_TYPE: ["stool", "blood", "stool"],
        "mouse_field": ["global_default", "global_default", "mouse_value"]
    })
    assert_frame_equal(wanted_df, actual_df)
    self.assertEqual([], messages)
|
|
1404
|
+
|
|
1405
|
+
def test__generate_metadata_for_host_types_unknown_host_type(self):
    """Host type absent from the flat config: a QC note is expected.

    Only "human" is configured, so the "unknown_host" row is expected back
    with "invalid host_type" in the QC note column, no extra columns, and
    no validation messages.
    """
    sample_cols = {
        SAMPLE_NAME_KEY: ["sample1"],
        HOSTTYPE_SHORTHAND_KEY: ["unknown_host"],
        SAMPLETYPE_SHORTHAND_KEY: ["stool"],
        QC_NOTE_KEY: [""]
    }
    human_config = {
        METADATA_FIELDS_KEY: {},
        SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {}
    }
    flat_config = {
        DEFAULT_KEY: "global_default",
        LEAVE_REQUIREDS_BLANK_KEY: False,
        OVERWRITE_NON_NANS_KEY: False,
        HOST_TYPE_SPECIFIC_METADATA_KEY: {"human": human_config}
    }

    actual_df, messages = _generate_metadata_for_host_types(
        pandas.DataFrame(sample_cols), flat_config)

    wanted_df = pandas.DataFrame(
        {**sample_cols, QC_NOTE_KEY: ["invalid host_type"]})
    assert_frame_equal(wanted_df, actual_df)
    self.assertEqual([], messages)
|
|
1436
|
+
|
|
1437
|
+
def test__generate_metadata_for_host_types_unknown_sample_type(self):
    """Known host type but unconfigured sample type: a QC note is expected.

    "human" only declares "stool", so the "unknown_sample" row is expected
    back with "invalid sample_type" in the QC note column, no extra columns,
    and no validation messages.
    """
    sample_cols = {
        SAMPLE_NAME_KEY: ["sample1"],
        HOSTTYPE_SHORTHAND_KEY: ["human"],
        SAMPLETYPE_SHORTHAND_KEY: ["unknown_sample"],
        QC_NOTE_KEY: [""]
    }
    human_config = {
        METADATA_FIELDS_KEY: {},
        SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
            "stool": {METADATA_FIELDS_KEY: {}}
        }
    }
    flat_config = {
        DEFAULT_KEY: "global_default",
        LEAVE_REQUIREDS_BLANK_KEY: False,
        OVERWRITE_NON_NANS_KEY: False,
        HOST_TYPE_SPECIFIC_METADATA_KEY: {"human": human_config}
    }

    actual_df, messages = _generate_metadata_for_host_types(
        pandas.DataFrame(sample_cols), flat_config)

    wanted_df = pandas.DataFrame(
        {**sample_cols, QC_NOTE_KEY: ["invalid sample_type"]})
    assert_frame_equal(wanted_df, actual_df)
    self.assertEqual([], messages)
|
|
1472
|
+
|
|
1473
|
+
def test__generate_metadata_for_host_types_replaces_leave_blank_val(self):
    """A required field left blank should surface as an empty string.

    LEAVE_REQUIREDS_BLANK_KEY is True and the stool config declares a
    required field with no value, so the final frame is expected to hold ""
    in that column rather than the LEAVE_BLANK_VAL placeholder.
    """
    sample_cols = {
        SAMPLE_NAME_KEY: ["sample1"],
        HOSTTYPE_SHORTHAND_KEY: ["human"],
        SAMPLETYPE_SHORTHAND_KEY: ["stool"],
        QC_NOTE_KEY: [""]
    }
    required_spec = {REQUIRED_KEY: True, TYPE_KEY: "string"}
    human_config = {
        METADATA_FIELDS_KEY: {},
        SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
            "stool": {
                METADATA_FIELDS_KEY: {"required_field": required_spec}
            }
        }
    }
    flat_config = {
        DEFAULT_KEY: "global_default",
        # True here is what makes the required field a "leave blank" case.
        LEAVE_REQUIREDS_BLANK_KEY: True,
        OVERWRITE_NON_NANS_KEY: False,
        HOST_TYPE_SPECIFIC_METADATA_KEY: {"human": human_config}
    }

    actual_df, _ = _generate_metadata_for_host_types(
        pandas.DataFrame(sample_cols), flat_config)

    wanted_df = pandas.DataFrame({
        **sample_cols,
        # Empty string expected in place of the placeholder.
        "required_field": [""],
        SAMPLE_TYPE_KEY: ["stool"],
        QIITA_SAMPLE_TYPE: ["stool"]
    })
    assert_frame_equal(wanted_df, actual_df)
|
|
1515
|
+
|
|
1516
|
+
# Tests for _transform_metadata
|
|
1517
|
+
|
|
1518
|
+
def test__transform_metadata_no_transformers(self):
    """Test that df is returned unchanged when no transformers are configured.

    The expected frame is an independent copy taken before the call. If the
    expectation were the input object itself and _transform_metadata handed
    that same object back, the comparison would be a frame against itself
    and could never detect an unintended in-place modification.
    """
    input_df = pandas.DataFrame({
        SAMPLE_NAME_KEY: ["sample1", "sample2"],
        "field1": ["value1", "value2"]
    })
    full_flat_config_dict = {}
    # Snapshot the input before the call so the expectation cannot be
    # affected by anything the call does to input_df.
    expected_df = input_df.copy()

    result_df = _transform_metadata(
        input_df, full_flat_config_dict, "pre", None)

    assert_frame_equal(expected_df, result_df)
|
|
1532
|
+
|
|
1533
|
+
def test__transform_metadata_no_stage_transformers(self):
    """Test that df is returned unchanged when stage has no transformers.

    Transformers are configured only for the "post" stage while the call
    requests "pre". The expected frame is an independent copy taken before
    the call. If the expectation were the input object itself and
    _transform_metadata handed that same object back, the comparison would
    be a frame against itself and could never detect an unintended in-place
    modification (such as a stray "target_field" column).
    """
    input_df = pandas.DataFrame({
        SAMPLE_NAME_KEY: ["sample1", "sample2"],
        "field1": ["value1", "value2"]
    })
    full_flat_config_dict = {
        METADATA_TRANSFORMERS_KEY: {
            "post": {
                "target_field": {
                    SOURCES_KEY: ["field1"],
                    FUNCTION_KEY: "pass_through"
                }
            }
        }
    }
    # Snapshot the input before the call so the expectation cannot be
    # affected by anything the call does to input_df.
    expected_df = input_df.copy()

    result_df = _transform_metadata(
        input_df, full_flat_config_dict, "pre", None)

    assert_frame_equal(expected_df, result_df)
|
|
1556
|
+
|
|
1557
|
+
def test__transform_metadata_builtin_pass_through(self):
    """The built-in "pass_through" transformer should fill the target from the source.

    A "pre"-stage transformer maps "source_field" onto "target_field"; the
    expected frame has the new column holding the source values.
    """
    source_cols = {
        SAMPLE_NAME_KEY: ["sample1", "sample2"],
        "source_field": ["value1", "value2"]
    }
    transformer_spec = {
        SOURCES_KEY: ["source_field"],
        FUNCTION_KEY: "pass_through"
    }
    flat_config = {
        METADATA_TRANSFORMERS_KEY: {
            "pre": {"target_field": transformer_spec}
        }
    }

    # No custom transformer functions are supplied (None); the name is
    # expected to resolve to a built-in.
    actual_df = _transform_metadata(
        pandas.DataFrame(source_cols), flat_config, "pre", None)

    wanted_df = pandas.DataFrame(
        {**source_cols, "target_field": ["value1", "value2"]})
    assert_frame_equal(wanted_df, actual_df)
|
|
1583
|
+
|
|
1584
|
+
def test__transform_metadata_builtin_sex_transformer(self):
    """The built-in "transform_input_sex_to_std_sex" transformer should standardize sex.

    Inputs "F", "Male" and "female" are expected to become "female", "male"
    and "female" in the new "sex" column.
    """
    source_cols = {
        SAMPLE_NAME_KEY: ["sample1", "sample2", "sample3"],
        "input_sex": ["F", "Male", "female"]
    }
    transformer_spec = {
        SOURCES_KEY: ["input_sex"],
        FUNCTION_KEY: "transform_input_sex_to_std_sex"
    }
    flat_config = {
        METADATA_TRANSFORMERS_KEY: {
            "pre": {"sex": transformer_spec}
        }
    }

    actual_df = _transform_metadata(
        pandas.DataFrame(source_cols), flat_config, "pre", None)

    wanted_df = pandas.DataFrame(
        {**source_cols, "sex": ["female", "male", "female"]})
    assert_frame_equal(wanted_df, actual_df)
|
|
1610
|
+
|
|
1611
|
+
def test__transform_metadata_builtin_age_to_life_stage(self):
    """Verify the built-in transform_age_to_life_stage transformer fills life_stage."""
    raw_df = pandas.DataFrame({
        SAMPLE_NAME_KEY: ["sample1", "sample2", "sample3"],
        "age_years": [10, 17, 45],
    })

    life_stage_spec = {
        SOURCES_KEY: ["age_years"],
        FUNCTION_KEY: "transform_age_to_life_stage",
    }
    config = {
        METADATA_TRANSFORMERS_KEY: {"pre": {"life_stage": life_stage_spec}},
    }

    observed_df = _transform_metadata(raw_df, config, "pre", None)

    # Per this test's expectation, 10 maps to "child" while 17 and 45
    # both map to "adult".
    wanted_df = pandas.DataFrame({
        SAMPLE_NAME_KEY: ["sample1", "sample2", "sample3"],
        "age_years": [10, 17, 45],
        "life_stage": ["child", "adult", "adult"],
    })
    assert_frame_equal(wanted_df, observed_df)
|
|
1637
|
+
|
|
1638
|
+
def test__transform_metadata_custom_transformer(self):
    """Verify a caller-supplied function in transformer_funcs_dict is applied by name."""
    def custom_upper(row, source_fields):
        # Upper-case the value held in the first listed source column.
        first_source = source_fields[0]
        return row[first_source].upper()

    raw_df = pandas.DataFrame({
        SAMPLE_NAME_KEY: ["sample1", "sample2"],
        "source_field": ["hello", "world"],
    })
    upper_spec = {
        SOURCES_KEY: ["source_field"],
        FUNCTION_KEY: "custom_upper",
    }
    config = {
        METADATA_TRANSFORMERS_KEY: {"pre": {"target_field": upper_spec}},
    }
    custom_funcs = {"custom_upper": custom_upper}

    observed_df = _transform_metadata(raw_df, config, "pre", custom_funcs)

    wanted_df = pandas.DataFrame({
        SAMPLE_NAME_KEY: ["sample1", "sample2"],
        "source_field": ["hello", "world"],
        "target_field": ["HELLO", "WORLD"],
    })
    assert_frame_equal(wanted_df, observed_df)
|
|
1671
|
+
|
|
1672
|
+
def test__transform_metadata_unknown_transformer_raises(self):
    """Verify a transformer name that cannot be resolved raises ValueError."""
    raw_df = pandas.DataFrame({
        SAMPLE_NAME_KEY: ["sample1"],
        "source_field": ["value1"],
    })
    missing_spec = {
        SOURCES_KEY: ["source_field"],
        FUNCTION_KEY: "nonexistent_function",
    }
    config = {
        METADATA_TRANSFORMERS_KEY: {"pre": {"target_field": missing_spec}},
    }

    expected_msg = "Unable to find transformer 'nonexistent_function'"
    with self.assertRaisesRegex(ValueError, expected_msg):
        _transform_metadata(raw_df, config, "pre", None)
|
|
1691
|
+
|
|
1692
|
+
def test__transform_metadata_overwrite_non_nans_false(self):
    """Verify only NaN target cells are filled when overwrite_non_nans is False."""
    raw_df = pandas.DataFrame({
        SAMPLE_NAME_KEY: ["sample1", "sample2"],
        "source_field": ["value1", "value2"],
        # first row already has a value; second row is missing
        "target_field": ["existing", np.nan],
    })
    passthrough_spec = {
        SOURCES_KEY: ["source_field"],
        FUNCTION_KEY: "pass_through",
    }
    config = {
        OVERWRITE_NON_NANS_KEY: False,
        METADATA_TRANSFORMERS_KEY: {"pre": {"target_field": passthrough_spec}},
    }

    observed_df = _transform_metadata(raw_df, config, "pre", None)

    wanted_df = pandas.DataFrame({
        SAMPLE_NAME_KEY: ["sample1", "sample2"],
        "source_field": ["value1", "value2"],
        "target_field": ["existing", "value2"],
    })
    assert_frame_equal(wanted_df, observed_df)
|
|
1720
|
+
|
|
1721
|
+
def test__transform_metadata_overwrite_non_nans_true(self):
    """Verify populated target cells are replaced when overwrite_non_nans is True."""
    raw_df = pandas.DataFrame({
        SAMPLE_NAME_KEY: ["sample1", "sample2"],
        "source_field": ["value1", "value2"],
        "target_field": ["existing", "also_existing"],
    })
    passthrough_spec = {
        SOURCES_KEY: ["source_field"],
        FUNCTION_KEY: "pass_through",
    }
    config = {
        OVERWRITE_NON_NANS_KEY: True,
        METADATA_TRANSFORMERS_KEY: {"pre": {"target_field": passthrough_spec}},
    }

    observed_df = _transform_metadata(raw_df, config, "pre", None)

    # Both pre-existing target values are expected to be overwritten.
    wanted_df = pandas.DataFrame({
        SAMPLE_NAME_KEY: ["sample1", "sample2"],
        "source_field": ["value1", "value2"],
        "target_field": ["value1", "value2"],
    })
    assert_frame_equal(wanted_df, observed_df)
|
|
1749
|
+
|
|
1750
|
+
def test__transform_metadata_multiple_transformers(self):
    """Verify two transformers configured for the same stage are both applied."""
    raw_df = pandas.DataFrame({
        SAMPLE_NAME_KEY: ["sample1", "sample2"],
        "field_a": ["a1", "a2"],
        "field_b": ["b1", "b2"],
    })

    # Two independent pass_through transformers, declared in a/b order.
    pre_stage = {
        "target_a": {
            SOURCES_KEY: ["field_a"],
            FUNCTION_KEY: "pass_through",
        },
        "target_b": {
            SOURCES_KEY: ["field_b"],
            FUNCTION_KEY: "pass_through",
        },
    }
    config = {METADATA_TRANSFORMERS_KEY: {"pre": pre_stage}}

    observed_df = _transform_metadata(raw_df, config, "pre", None)

    wanted_df = pandas.DataFrame({
        SAMPLE_NAME_KEY: ["sample1", "sample2"],
        "field_a": ["a1", "a2"],
        "field_b": ["b1", "b2"],
        "target_a": ["a1", "a2"],
        "target_b": ["b1", "b2"],
    })
    assert_frame_equal(wanted_df, observed_df)
|
|
1783
|
+
|
|
1784
|
+
# Tests for _populate_metadata_df
|
|
1785
|
+
|
|
1786
|
+
def test__populate_metadata_df_basic(self):
    """Verify host- and sample-type defaults are added for a simple config."""
    raw_df = pandas.DataFrame({
        SAMPLE_NAME_KEY: ["sample1", "sample2"],
        HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
        SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
    })

    # Build the nested config from the innermost level outwards.
    stool_config = {
        METADATA_FIELDS_KEY: {
            "stool_field": {
                DEFAULT_KEY: "stool_value",
                TYPE_KEY: "string",
            },
        },
    }
    human_config = {
        METADATA_FIELDS_KEY: {
            "host_field": {
                DEFAULT_KEY: "host_value",
                TYPE_KEY: "string",
            },
        },
        SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {"stool": stool_config},
    }
    config = {
        DEFAULT_KEY: "not provided",
        LEAVE_REQUIREDS_BLANK_KEY: False,
        OVERWRITE_NON_NANS_KEY: False,
        HOST_TYPE_SPECIFIC_METADATA_KEY: {"human": human_config},
    }

    observed_df, msgs_df = _populate_metadata_df(raw_df, config, None)

    wanted_df = pandas.DataFrame({
        SAMPLE_NAME_KEY: ["sample1", "sample2"],
        "host_field": ["host_value", "host_value"],
        QIITA_SAMPLE_TYPE: ["stool", "stool"],
        SAMPLE_TYPE_KEY: ["stool", "stool"],
        "stool_field": ["stool_value", "stool_value"],
        HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
        SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
        QC_NOTE_KEY: ["", ""],
    })
    assert_frame_equal(wanted_df, observed_df)
    # No validation messages are expected for this input.
    self.assertTrue(msgs_df.empty)
|
|
1834
|
+
|
|
1835
|
+
def test__populate_metadata_df_with_pre_transformer(self):
    """Verify a configured pre-transformer is applied during metadata population."""
    raw_df = pandas.DataFrame({
        SAMPLE_NAME_KEY: ["sample1", "sample2"],
        HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
        SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
        "input_sex": ["F", "Male"],
    })

    sex_spec = {
        SOURCES_KEY: ["input_sex"],
        FUNCTION_KEY: "transform_input_sex_to_std_sex",
    }
    human_config = {
        METADATA_FIELDS_KEY: {},
        SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
            "stool": {METADATA_FIELDS_KEY: {}},
        },
    }
    config = {
        DEFAULT_KEY: "not provided",
        LEAVE_REQUIREDS_BLANK_KEY: False,
        OVERWRITE_NON_NANS_KEY: False,
        METADATA_TRANSFORMERS_KEY: {
            PRE_TRANSFORMERS_KEY: {"sex": sex_spec},
        },
        HOST_TYPE_SPECIFIC_METADATA_KEY: {"human": human_config},
    }

    # The validation-messages frame is returned but not inspected here.
    observed_df, _ = _populate_metadata_df(raw_df, config, None)

    wanted_df = pandas.DataFrame({
        SAMPLE_NAME_KEY: ["sample1", "sample2"],
        "input_sex": ["F", "Male"],
        QIITA_SAMPLE_TYPE: ["stool", "stool"],
        SAMPLE_TYPE_KEY: ["stool", "stool"],
        "sex": ["female", "male"],
        HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
        SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
        QC_NOTE_KEY: ["", ""],
    })
    assert_frame_equal(wanted_df, observed_df)
|
|
1881
|
+
|
|
1882
|
+
def test__populate_metadata_df_with_post_transformer(self):
    """Verify a configured post-transformer is applied during metadata population."""
    raw_df = pandas.DataFrame({
        SAMPLE_NAME_KEY: ["sample1", "sample2"],
        HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
        SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
    })

    # The post-transformer's source (SAMPLE_TYPE_KEY) is not in the input
    # frame; it appears only in the expected output below.
    copy_spec = {
        SOURCES_KEY: [SAMPLE_TYPE_KEY],
        FUNCTION_KEY: "pass_through",
    }
    human_config = {
        METADATA_FIELDS_KEY: {},
        SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
            "stool": {METADATA_FIELDS_KEY: {}},
        },
    }
    config = {
        DEFAULT_KEY: "not provided",
        LEAVE_REQUIREDS_BLANK_KEY: False,
        OVERWRITE_NON_NANS_KEY: False,
        METADATA_TRANSFORMERS_KEY: {
            POST_TRANSFORMERS_KEY: {"copied_sample_type": copy_spec},
        },
        HOST_TYPE_SPECIFIC_METADATA_KEY: {"human": human_config},
    }

    # The validation-messages frame is returned but not inspected here.
    observed_df, _ = _populate_metadata_df(raw_df, config, None)

    wanted_df = pandas.DataFrame({
        SAMPLE_NAME_KEY: ["sample1", "sample2"],
        "copied_sample_type": ["stool", "stool"],
        QIITA_SAMPLE_TYPE: ["stool", "stool"],
        SAMPLE_TYPE_KEY: ["stool", "stool"],
        HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
        SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
        QC_NOTE_KEY: ["", ""],
    })
    assert_frame_equal(wanted_df, observed_df)
|
|
1926
|
+
|
|
1927
|
+
def test__populate_metadata_df_unknown_host_type(self):
    """Verify a host type absent from the config yields an "invalid host_type" QC note."""
    raw_df = pandas.DataFrame({
        SAMPLE_NAME_KEY: ["sample1"],
        HOSTTYPE_SHORTHAND_KEY: ["unknown_host"],
        SAMPLETYPE_SHORTHAND_KEY: ["stool"],
    })

    # Only "human" is configured, so "unknown_host" has no match.
    human_config = {
        METADATA_FIELDS_KEY: {},
        SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {},
    }
    config = {
        DEFAULT_KEY: "not provided",
        LEAVE_REQUIREDS_BLANK_KEY: False,
        OVERWRITE_NON_NANS_KEY: False,
        HOST_TYPE_SPECIFIC_METADATA_KEY: {"human": human_config},
    }

    # The validation-messages frame is returned but not inspected here.
    observed_df, _ = _populate_metadata_df(raw_df, config, None)

    wanted_df = pandas.DataFrame({
        SAMPLE_NAME_KEY: ["sample1"],
        HOSTTYPE_SHORTHAND_KEY: ["unknown_host"],
        SAMPLETYPE_SHORTHAND_KEY: ["stool"],
        QC_NOTE_KEY: ["invalid host_type"],
    })
    assert_frame_equal(wanted_df, observed_df)
|
|
1956
|
+
|
|
1957
|
+
def test__populate_metadata_df_columns_reordered(self):
    """Verify output columns come back in the expected order regardless of input order."""
    # Input columns are deliberately scrambled (zebra first, sample name second).
    raw_df = pandas.DataFrame({
        "zebra_field": ["z1", "z2"],
        SAMPLE_NAME_KEY: ["sample1", "sample2"],
        "apple_field": ["a1", "a2"],
        HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
        SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
    })

    human_config = {
        METADATA_FIELDS_KEY: {},
        SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
            "stool": {METADATA_FIELDS_KEY: {}},
        },
    }
    config = {
        DEFAULT_KEY: "not provided",
        LEAVE_REQUIREDS_BLANK_KEY: False,
        OVERWRITE_NON_NANS_KEY: False,
        HOST_TYPE_SPECIFIC_METADATA_KEY: {"human": human_config},
    }

    # The validation-messages frame is returned but not inspected here.
    observed_df, _ = _populate_metadata_df(raw_df, config, None)

    # Expected layout: sample name first, then the other fields with
    # apple_field before zebra_field, then the shorthand and QC-note columns.
    wanted_df = pandas.DataFrame({
        SAMPLE_NAME_KEY: ["sample1", "sample2"],
        "apple_field": ["a1", "a2"],
        QIITA_SAMPLE_TYPE: ["stool", "stool"],
        SAMPLE_TYPE_KEY: ["stool", "stool"],
        "zebra_field": ["z1", "z2"],
        HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
        SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
        QC_NOTE_KEY: ["", ""],
    })
    assert_frame_equal(wanted_df, observed_df)
|
|
1996
|
+
|
|
1997
|
+
def test__populate_metadata_df_with_custom_transformer(self):
    """Verify a caller-supplied transformer function is used during metadata population."""
    def custom_upper(row, source_fields):
        # Upper-case the value held in the first listed source column.
        first_source = source_fields[0]
        return row[first_source].upper()

    raw_df = pandas.DataFrame({
        SAMPLE_NAME_KEY: ["sample1", "sample2"],
        HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
        SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
        "source_field": ["hello", "world"],
    })

    upper_spec = {
        SOURCES_KEY: ["source_field"],
        FUNCTION_KEY: "custom_upper",
    }
    human_config = {
        METADATA_FIELDS_KEY: {},
        SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
            "stool": {METADATA_FIELDS_KEY: {}},
        },
    }
    config = {
        DEFAULT_KEY: "not provided",
        LEAVE_REQUIREDS_BLANK_KEY: False,
        OVERWRITE_NON_NANS_KEY: False,
        METADATA_TRANSFORMERS_KEY: {
            PRE_TRANSFORMERS_KEY: {"upper_field": upper_spec},
        },
        HOST_TYPE_SPECIFIC_METADATA_KEY: {"human": human_config},
    }
    custom_funcs = {"custom_upper": custom_upper}

    # The validation-messages frame is returned but not inspected here.
    observed_df, _ = _populate_metadata_df(raw_df, config, custom_funcs)

    wanted_df = pandas.DataFrame({
        SAMPLE_NAME_KEY: ["sample1", "sample2"],
        QIITA_SAMPLE_TYPE: ["stool", "stool"],
        SAMPLE_TYPE_KEY: ["stool", "stool"],
        "source_field": ["hello", "world"],
        "upper_field": ["HELLO", "WORLD"],
        HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
        SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
        QC_NOTE_KEY: ["", ""],
    })
    assert_frame_equal(wanted_df, observed_df)
|
|
2048
|
+
|
|
2049
|
+
def test__populate_metadata_df_nan_sample_name_raises(self):
    """Verify a NaN in the sample-name column is rejected with ValueError."""
    raw_df = pandas.DataFrame({
        # second sample name is missing
        SAMPLE_NAME_KEY: ["sample1", np.nan],
        HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
        SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
    })
    config = {HOST_TYPE_SPECIFIC_METADATA_KEY: {}}

    expected_msg = "Metadata contains NaN sample names"
    with self.assertRaisesRegex(ValueError, expected_msg):
        _populate_metadata_df(raw_df, config, None)
|
|
2062
|
+
|
|
2063
|
+
# Tests for extend_metadata_df
|
|
2064
|
+
|
|
2065
|
+
# Directory containing this test module; test data files are resolved
# relative to it.
TEST_DIR = os.path.dirname(__file__)
# Standards file passed to extend_metadata_df by the tests below.
TEST_STDS_FP = os.path.join(TEST_DIR, "data/test_standards.yml")
|
|
2067
|
+
|
|
2068
|
+
def test_extend_metadata_df_basic(self):
    """Verify extension combines the test standards file with a study config."""
    raw_df = pandas.DataFrame({
        SAMPLE_NAME_KEY: ["sample1", "sample2"],
        HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
        SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
    })

    # The study adds one defaulted field at the human host level.
    human_config = {
        METADATA_FIELDS_KEY: {
            "custom_field": {
                DEFAULT_KEY: "custom_value",
                TYPE_KEY: "string",
            },
        },
        SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
            "stool": {METADATA_FIELDS_KEY: {}},
        },
    }
    study_config = {
        DEFAULT_KEY: "not provided",
        LEAVE_REQUIREDS_BLANK_KEY: True,
        OVERWRITE_NON_NANS_KEY: False,
        STUDY_SPECIFIC_METADATA_KEY: {
            HOST_TYPE_SPECIFIC_METADATA_KEY: {"human": human_config},
        },
    }

    observed_df, msgs_df = extend_metadata_df(
        raw_df, study_config, None, None, self.TEST_STDS_FP)

    # body_site and host_common_name are not set by the study config above.
    wanted_df = pandas.DataFrame({
        SAMPLE_NAME_KEY: ["sample1", "sample2"],
        "body_site": ["gut", "gut"],
        "custom_field": ["custom_value", "custom_value"],
        "host_common_name": ["human", "human"],
        QIITA_SAMPLE_TYPE: ["stool", "stool"],
        SAMPLE_TYPE_KEY: ["stool", "stool"],
        HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
        SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
        QC_NOTE_KEY: ["", ""],
    })
    assert_frame_equal(wanted_df, observed_df)
    self.assertTrue(msgs_df.empty)
|
|
2114
|
+
|
|
2115
|
+
def test_extend_metadata_df_with_pre_transformer(self):
    """Verify a pre-transformer from the study config is applied during extension."""
    raw_df = pandas.DataFrame({
        SAMPLE_NAME_KEY: ["sample1", "sample2"],
        HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
        SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
        "input_sex": ["F", "Male"],
    })

    sex_spec = {
        SOURCES_KEY: ["input_sex"],
        FUNCTION_KEY: "transform_input_sex_to_std_sex",
    }
    human_config = {
        METADATA_FIELDS_KEY: {},
        SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
            "stool": {METADATA_FIELDS_KEY: {}},
        },
    }
    study_config = {
        DEFAULT_KEY: "not provided",
        LEAVE_REQUIREDS_BLANK_KEY: True,
        OVERWRITE_NON_NANS_KEY: False,
        METADATA_TRANSFORMERS_KEY: {
            PRE_TRANSFORMERS_KEY: {"sex": sex_spec},
        },
        STUDY_SPECIFIC_METADATA_KEY: {
            HOST_TYPE_SPECIFIC_METADATA_KEY: {"human": human_config},
        },
    }

    # The validation-messages frame is returned but not inspected here.
    observed_df, _ = extend_metadata_df(
        raw_df, study_config, None, None, self.TEST_STDS_FP)

    wanted_df = pandas.DataFrame({
        SAMPLE_NAME_KEY: ["sample1", "sample2"],
        "body_site": ["gut", "gut"],
        "host_common_name": ["human", "human"],
        "input_sex": ["F", "Male"],
        QIITA_SAMPLE_TYPE: ["stool", "stool"],
        SAMPLE_TYPE_KEY: ["stool", "stool"],
        "sex": ["female", "male"],
        HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
        SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
        QC_NOTE_KEY: ["", ""],
    })
    assert_frame_equal(wanted_df, observed_df)
|
|
2165
|
+
|
|
2166
|
+
def test_extend_metadata_df_with_custom_transformer(self):
    """Verify a caller-supplied transformer function is used during extension."""
    def custom_upper(row, source_fields):
        # Upper-case the value held in the first listed source column.
        first_source = source_fields[0]
        return row[first_source].upper()

    raw_df = pandas.DataFrame({
        SAMPLE_NAME_KEY: ["sample1", "sample2"],
        HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
        SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
        "source_field": ["hello", "world"],
    })

    upper_spec = {
        SOURCES_KEY: ["source_field"],
        FUNCTION_KEY: "custom_upper",
    }
    human_config = {
        METADATA_FIELDS_KEY: {},
        SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
            "stool": {METADATA_FIELDS_KEY: {}},
        },
    }
    study_config = {
        DEFAULT_KEY: "not provided",
        LEAVE_REQUIREDS_BLANK_KEY: True,
        OVERWRITE_NON_NANS_KEY: False,
        METADATA_TRANSFORMERS_KEY: {
            PRE_TRANSFORMERS_KEY: {"upper_field": upper_spec},
        },
        STUDY_SPECIFIC_METADATA_KEY: {
            HOST_TYPE_SPECIFIC_METADATA_KEY: {"human": human_config},
        },
    }
    custom_funcs = {"custom_upper": custom_upper}

    # The validation-messages frame is returned but not inspected here.
    observed_df, _ = extend_metadata_df(
        raw_df, study_config, custom_funcs, None, self.TEST_STDS_FP)

    wanted_df = pandas.DataFrame({
        SAMPLE_NAME_KEY: ["sample1", "sample2"],
        "body_site": ["gut", "gut"],
        "host_common_name": ["human", "human"],
        QIITA_SAMPLE_TYPE: ["stool", "stool"],
        SAMPLE_TYPE_KEY: ["stool", "stool"],
        "source_field": ["hello", "world"],
        "upper_field": ["HELLO", "WORLD"],
        HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
        SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
        QC_NOTE_KEY: ["", ""],
    })
    assert_frame_equal(wanted_df, observed_df)
|
|
2221
|
+
|
|
2222
|
+
def test_extend_metadata_df_missing_required_columns_raises(self):
    """Verify input lacking the host/sample-type shorthand columns raises ValueError."""
    # Only the sample-name column is present; HOSTTYPE_SHORTHAND_KEY and
    # SAMPLETYPE_SHORTHAND_KEY are intentionally omitted.
    raw_df = pandas.DataFrame({SAMPLE_NAME_KEY: ["sample1", "sample2"]})
    study_config = {}

    expected_msg = "metadata missing required columns"
    with self.assertRaisesRegex(ValueError, expected_msg):
        extend_metadata_df(
            raw_df, study_config, None, None, self.TEST_STDS_FP)
|
|
2232
|
+
|
|
2233
|
+
def test_extend_metadata_df_none_study_config(self):
    """Verify extension works with study_config=None, relying on the standards file alone."""
    raw_df = pandas.DataFrame({
        SAMPLE_NAME_KEY: ["sample1"],
        HOSTTYPE_SHORTHAND_KEY: ["human"],
        SAMPLETYPE_SHORTHAND_KEY: ["stool"],
    })

    # The validation-messages frame is returned but not inspected here.
    observed_df, _ = extend_metadata_df(
        raw_df, None, None, None, self.TEST_STDS_FP)

    wanted_df = pandas.DataFrame({
        SAMPLE_NAME_KEY: ["sample1"],
        "body_site": ["gut"],
        "host_common_name": ["human"],
        QIITA_SAMPLE_TYPE: ["stool"],
        SAMPLE_TYPE_KEY: ["stool"],
        HOSTTYPE_SHORTHAND_KEY: ["human"],
        SAMPLETYPE_SHORTHAND_KEY: ["stool"],
        QC_NOTE_KEY: [""],
    })
    assert_frame_equal(wanted_df, observed_df)
|
|
2255
|
+
|
|
2256
|
+
def test_extend_metadata_df_unknown_host_type(self):
    """Verify an unrecognized host type yields an "invalid host_type" QC note."""
    raw_df = pandas.DataFrame({
        SAMPLE_NAME_KEY: ["sample1"],
        HOSTTYPE_SHORTHAND_KEY: ["unknown_host"],
        SAMPLETYPE_SHORTHAND_KEY: ["stool"],
    })

    # The study config only describes "human".
    human_config = {
        METADATA_FIELDS_KEY: {},
        SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
            "stool": {METADATA_FIELDS_KEY: {}},
        },
    }
    study_config = {
        DEFAULT_KEY: "not provided",
        LEAVE_REQUIREDS_BLANK_KEY: True,
        OVERWRITE_NON_NANS_KEY: False,
        STUDY_SPECIFIC_METADATA_KEY: {
            HOST_TYPE_SPECIFIC_METADATA_KEY: {"human": human_config},
        },
    }

    # The validation-messages frame is returned but not inspected here.
    observed_df, _ = extend_metadata_df(
        raw_df, study_config, None, None, self.TEST_STDS_FP)

    wanted_df = pandas.DataFrame({
        SAMPLE_NAME_KEY: ["sample1"],
        HOSTTYPE_SHORTHAND_KEY: ["unknown_host"],
        SAMPLETYPE_SHORTHAND_KEY: ["stool"],
        QC_NOTE_KEY: ["invalid host_type"],
    })
    assert_frame_equal(wanted_df, observed_df)
|
|
2291
|
+
|
|
2292
|
+
def test_extend_metadata_df_multiple_host_types(self):
    """Verify extension handles a frame mixing several host and sample types."""
    raw_df = pandas.DataFrame({
        SAMPLE_NAME_KEY: ["sample1", "sample2", "sample3"],
        HOSTTYPE_SHORTHAND_KEY: ["human", "mouse", "human"],
        SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool", "blood"],
    })

    # Each host type gets its own, independent config dictionaries.
    human_config = {
        METADATA_FIELDS_KEY: {},
        SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
            "stool": {METADATA_FIELDS_KEY: {}},
            "blood": {METADATA_FIELDS_KEY: {}},
        },
    }
    mouse_config = {
        METADATA_FIELDS_KEY: {},
        SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
            "stool": {METADATA_FIELDS_KEY: {}},
        },
    }
    study_config = {
        DEFAULT_KEY: "not provided",
        LEAVE_REQUIREDS_BLANK_KEY: True,
        OVERWRITE_NON_NANS_KEY: False,
        STUDY_SPECIFIC_METADATA_KEY: {
            HOST_TYPE_SPECIFIC_METADATA_KEY: {
                "human": human_config,
                "mouse": mouse_config,
            },
        },
    }

    # The validation-messages frame is returned but not inspected here.
    observed_df, _ = extend_metadata_df(
        raw_df, study_config, None, None, self.TEST_STDS_FP)

    # After processing multiple host types, rows may be reordered:
    # human samples are processed together, then mouse samples.
    wanted_df = pandas.DataFrame({
        SAMPLE_NAME_KEY: ["sample1", "sample3", "sample2"],
        "body_site": ["gut", "blood", "gut"],
        "host_common_name": ["human", "human", "mouse"],
        QIITA_SAMPLE_TYPE: ["stool", "blood", "stool"],
        SAMPLE_TYPE_KEY: ["stool", "blood", "stool"],
        HOSTTYPE_SHORTHAND_KEY: ["human", "human", "mouse"],
        SAMPLETYPE_SHORTHAND_KEY: ["stool", "blood", "stool"],
        QC_NOTE_KEY: ["", "", ""],
    })
    assert_frame_equal(wanted_df, observed_df)
|
|
2344
|
+
|
|
2345
|
+
def test_extend_metadata_df_with_software_config(self):
    """Verify extension runs when an explicit software config is supplied."""
    raw_df = pandas.DataFrame({
        SAMPLE_NAME_KEY: ["sample1", "sample2"],
        HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
        SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
    })

    # Software config with a custom default value.
    # NOTE(review): "custom_software_default" never appears in the expected
    # frame below, so this test does not demonstrate the default being used.
    software_config = {
        DEFAULT_KEY: "custom_software_default",
        LEAVE_REQUIREDS_BLANK_KEY: True,
        OVERWRITE_NON_NANS_KEY: False,
    }

    # Study config that does not override DEFAULT_KEY.
    human_config = {
        METADATA_FIELDS_KEY: {
            "study_field": {
                DEFAULT_KEY: "study_value",
                TYPE_KEY: "string",
            },
        },
        SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
            "stool": {METADATA_FIELDS_KEY: {}},
        },
    }
    study_config = {
        STUDY_SPECIFIC_METADATA_KEY: {
            HOST_TYPE_SPECIFIC_METADATA_KEY: {"human": human_config},
        },
    }

    # The validation-messages frame is returned but not inspected here.
    observed_df, _ = extend_metadata_df(
        raw_df, study_config, None, software_config, self.TEST_STDS_FP)

    wanted_df = pandas.DataFrame({
        SAMPLE_NAME_KEY: ["sample1", "sample2"],
        "body_site": ["gut", "gut"],
        "host_common_name": ["human", "human"],
        QIITA_SAMPLE_TYPE: ["stool", "stool"],
        SAMPLE_TYPE_KEY: ["stool", "stool"],
        "study_field": ["study_value", "study_value"],
        HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
        SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
        QC_NOTE_KEY: ["", ""],
    })
    assert_frame_equal(wanted_df, observed_df)
|
|
2394
|
+
|
|
2395
|
+
# Tests for _get_study_specific_config
|
|
2396
|
+
|
|
2397
|
+
def test__get_study_specific_config_with_valid_file(self):
    """Verify a valid YAML config file is loaded into the expected nested dict."""
    yaml_fp = path.join(self.TEST_DIR, "data/test_config.yml")

    loaded_config = _get_study_specific_config(yaml_fp)

    # Expected contents of data/test_config.yml.
    base_fields = {
        "sample_name": {
            TYPE_KEY: "string",
            "unique": True,
        },
        "sample_type": {
            "empty": False,
            "is_phi": False,
        },
    }
    wanted_config = {
        HOST_TYPE_SPECIFIC_METADATA_KEY: {
            "base": {METADATA_FIELDS_KEY: base_fields},
        },
    }
    self.assertDictEqual(wanted_config, loaded_config)
|
|
2420
|
+
|
|
2421
|
+
def test__get_study_specific_config_with_none(self):
    """Verify passing None as the file path gives back None."""
    loaded_config = _get_study_specific_config(None)
    self.assertIsNone(loaded_config)
|
|
2426
|
+
|
|
2427
|
+
def test__get_study_specific_config_with_empty_string(self):
    """Verify passing an empty-string file path gives back None."""
    loaded_config = _get_study_specific_config("")
    self.assertIsNone(loaded_config)
|
|
2432
|
+
|
|
2433
|
+
def test__get_study_specific_config_nonexistent_file_raises(self):
    """A path that does not exist triggers FileNotFoundError."""
    missing_fp = "/nonexistent/path/config.yml"
    self.assertRaises(
        FileNotFoundError, _get_study_specific_config, missing_fp)
|
|
2437
|
+
|
|
2438
|
+
def test__get_study_specific_config_invalid_yaml_raises(self):
    """Loading the invalid YAML fixture file raises an exception."""
    malformed_fp = path.join(self.TEST_DIR, "data/invalid.yml")
    self.assertRaises(
        Exception, _get_study_specific_config, malformed_fp)
|
|
2444
|
+
|
|
2445
|
+
# Tests for _output_metadata_df_to_files
|
|
2446
|
+
|
|
2447
|
+
def test__output_metadata_df_to_files_basic(self):
    """A frame written as tab-separated text reads back unchanged."""
    columns = {
        SAMPLE_NAME_KEY: ["sample1", "sample2"],
        "field_a": ["a1", "a2"],
        HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
        SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
        QC_NOTE_KEY: ["", ""],
    }
    input_df = pandas.DataFrame(columns)

    with tempfile.TemporaryDirectory() as out_dir:
        _output_metadata_df_to_files(
            input_df, out_dir, "test_output", INTERNAL_COL_KEYS,
            sep="\t", remove_internals_and_fails=False)

        # The written file name has a timestamp prefix, hence the glob
        written_fps = glob.glob(os.path.join(out_dir, "*_test_output.txt"))
        self.assertEqual(1, len(written_fps))

        # keep_default_na=False keeps empty strings from becoming NaN
        round_tripped_df = pandas.read_csv(
            written_fps[0], sep="\t", keep_default_na=False)
        assert_frame_equal(input_df, round_tripped_df)
|
|
2470
|
+
|
|
2471
|
+
def test__output_metadata_df_to_files_remove_internals_and_fails(self):
    """Failed rows go to the fails file; the main file drops internals."""
    input_df = pandas.DataFrame({
        SAMPLE_NAME_KEY: ["sample1", "sample2", "sample3"],
        "field_a": ["a1", "a2", "a3"],
        HOSTTYPE_SHORTHAND_KEY: ["human", "human", "human"],
        SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool", "stool"],
        # Only the middle row carries a QC note, i.e. is a failure
        QC_NOTE_KEY: ["", "invalid host_type", ""],
    })

    # Main output: the two passing rows, without the internal columns
    expected_main_df = pandas.DataFrame({
        SAMPLE_NAME_KEY: ["sample1", "sample3"],
        "field_a": ["a1", "a3"],
    })
    # Fails output: the failing row with every column retained
    expected_fails_df = pandas.DataFrame({
        SAMPLE_NAME_KEY: ["sample2"],
        "field_a": ["a2"],
        HOSTTYPE_SHORTHAND_KEY: ["human"],
        SAMPLETYPE_SHORTHAND_KEY: ["stool"],
        QC_NOTE_KEY: ["invalid host_type"],
    })

    with tempfile.TemporaryDirectory() as out_dir:
        _output_metadata_df_to_files(
            input_df, out_dir, "test_output", INTERNAL_COL_KEYS,
            sep="\t", remove_internals_and_fails=True)

        main_fps = glob.glob(os.path.join(out_dir, "*_test_output.txt"))
        self.assertEqual(1, len(main_fps))
        main_df = pandas.read_csv(main_fps[0], sep="\t")
        assert_frame_equal(expected_main_df, main_df)

        fails_fps = glob.glob(
            os.path.join(out_dir, "*_test_output_fails.csv"))
        self.assertEqual(1, len(fails_fps))
        fails_df = pandas.read_csv(fails_fps[0], sep=",")
        assert_frame_equal(expected_fails_df, fails_df)
|
|
2512
|
+
|
|
2513
|
+
def test__output_metadata_df_to_files_no_failures_creates_empty_file(self):
    """With no failures and no suppression, a zero-byte fails file exists."""
    columns = {
        SAMPLE_NAME_KEY: ["sample1", "sample2"],
        "field_a": ["a1", "a2"],
        HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
        SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
        QC_NOTE_KEY: ["", ""],
    }
    input_df = pandas.DataFrame(columns)

    with tempfile.TemporaryDirectory() as out_dir:
        _output_metadata_df_to_files(
            input_df, out_dir, "test_output", INTERNAL_COL_KEYS,
            sep="\t", remove_internals_and_fails=True,
            suppress_empty_fails=False)

        fails_pattern = os.path.join(out_dir, "*_test_output_fails.csv")
        fails_fps = glob.glob(fails_pattern)
        self.assertEqual(1, len(fails_fps))

        # The fails file must be present but contain nothing at all
        fails_size = os.path.getsize(fails_fps[0])
        self.assertEqual(0, fails_size)
|
|
2535
|
+
|
|
2536
|
+
def test__output_metadata_df_to_files_suppress_empty_fails(self):
    """With suppress_empty_fails=True, no empty fails file is written."""
    columns = {
        SAMPLE_NAME_KEY: ["sample1", "sample2"],
        "field_a": ["a1", "a2"],
        HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
        SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
        QC_NOTE_KEY: ["", ""],
    }
    input_df = pandas.DataFrame(columns)

    with tempfile.TemporaryDirectory() as out_dir:
        _output_metadata_df_to_files(
            input_df, out_dir, "test_output", INTERNAL_COL_KEYS,
            sep="\t", remove_internals_and_fails=True,
            suppress_empty_fails=True)

        # No fails file is expected because nothing failed
        fails_pattern = os.path.join(out_dir, "*_test_output_fails.csv")
        self.assertEqual(0, len(glob.glob(fails_pattern)))

        # The main output is still written
        main_pattern = os.path.join(out_dir, "*_test_output.txt")
        self.assertEqual(1, len(glob.glob(main_pattern)))
|
|
2559
|
+
|
|
2560
|
+
def test__output_metadata_df_to_files_csv_separator(self):
    """A comma separator produces a .csv file that reads back unchanged."""
    columns = {
        SAMPLE_NAME_KEY: ["sample1", "sample2"],
        "field_a": ["a1", "a2"],
        HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
        SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
        QC_NOTE_KEY: ["", ""],
    }
    input_df = pandas.DataFrame(columns)

    with tempfile.TemporaryDirectory() as out_dir:
        _output_metadata_df_to_files(
            input_df, out_dir, "test_output", INTERNAL_COL_KEYS,
            sep=",", remove_internals_and_fails=False)

        # The output is looked up by its .csv extension
        written_fps = glob.glob(os.path.join(out_dir, "*_test_output.csv"))
        self.assertEqual(1, len(written_fps))

        # keep_default_na=False keeps empty strings from becoming NaN
        round_tripped_df = pandas.read_csv(
            written_fps[0], sep=",", keep_default_na=False)
        assert_frame_equal(input_df, round_tripped_df)
|
|
2583
|
+
|
|
2584
|
+
def test__output_metadata_df_to_files_all_failures(self):
    """When every row fails, the main file is header-only."""
    columns = {
        SAMPLE_NAME_KEY: ["sample1", "sample2"],
        "field_a": ["a1", "a2"],
        HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
        SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
        # Both rows carry a QC note, so both count as failures
        QC_NOTE_KEY: ["invalid host_type", "invalid sample_type"],
    }
    input_df = pandas.DataFrame(columns)

    with tempfile.TemporaryDirectory() as out_dir:
        _output_metadata_df_to_files(
            input_df, out_dir, "test_output", INTERNAL_COL_KEYS,
            sep="\t", remove_internals_and_fails=True)

        # Main output: column headers only, no data rows
        main_fps = glob.glob(os.path.join(out_dir, "*_test_output.txt"))
        self.assertEqual(1, len(main_fps))
        main_df = pandas.read_csv(main_fps[0], sep="\t")
        self.assertTrue(main_df.empty)
        expected_columns = [SAMPLE_NAME_KEY, "field_a"]
        self.assertEqual(expected_columns, list(main_df.columns))

        # Fails output: both input rows
        fails_fps = glob.glob(
            os.path.join(out_dir, "*_test_output_fails.csv"))
        self.assertEqual(1, len(fails_fps))
        fails_df = pandas.read_csv(fails_fps[0], sep=",")
        self.assertEqual(2, len(fails_df))
|