metameq 2026.1.1__py3-none-any.whl → 2026.1.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- metameq/_version.py +3 -3
- metameq/src/metadata_configurator.py +146 -1
- metameq/src/metadata_extender.py +44 -38
- metameq/tests/test_metadata_configurator.py +2741 -208
- metameq/tests/test_metadata_extender.py +2034 -497
- metameq/tests/test_metadata_merger.py +1 -1
- metameq/tests/test_util.py +1 -1
- {metameq-2026.1.1.dist-info → metameq-2026.1.2.dist-info}/METADATA +1 -1
- {metameq-2026.1.1.dist-info → metameq-2026.1.2.dist-info}/RECORD +12 -12
- {metameq-2026.1.1.dist-info → metameq-2026.1.2.dist-info}/WHEEL +0 -0
- {metameq-2026.1.1.dist-info → metameq-2026.1.2.dist-info}/entry_points.txt +0 -0
- {metameq-2026.1.1.dist-info → metameq-2026.1.2.dist-info}/top_level.txt +0 -0
|
@@ -16,10 +16,12 @@ from metameq.src.util import \
|
|
|
16
16
|
SOURCES_KEY, FUNCTION_KEY, PRE_TRANSFORMERS_KEY, POST_TRANSFORMERS_KEY, \
|
|
17
17
|
STUDY_SPECIFIC_METADATA_KEY
|
|
18
18
|
from metameq.src.metadata_extender import \
|
|
19
|
-
id_missing_cols, get_qc_failures,
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
19
|
+
id_missing_cols, get_qc_failures, get_reserved_cols, find_standard_cols, \
|
|
20
|
+
find_nonstandard_cols, write_metadata_results, \
|
|
21
|
+
get_extended_metadata_from_df_and_yaml, write_extended_metadata_from_df, \
|
|
22
|
+
write_extended_metadata, _reorder_df, _catch_nan_required_fields, \
|
|
23
|
+
_fill_na_if_default, _update_metadata_from_metadata_fields_dict, \
|
|
24
|
+
_update_metadata_from_dict, _construct_sample_type_metadata_fields_dict, \
|
|
23
25
|
_generate_metadata_for_a_sample_type_in_a_host_type, \
|
|
24
26
|
_generate_metadata_for_a_host_type, _generate_metadata_for_host_types, \
|
|
25
27
|
_transform_metadata, _populate_metadata_df, extend_metadata_df, \
|
|
@@ -67,157 +69,687 @@ class TestMetadataExtender(TestCase):
|
|
|
67
69
|
expected = sorted(REQUIRED_RAW_METADATA_FIELDS)
|
|
68
70
|
self.assertEqual(expected, result)
|
|
69
71
|
|
|
70
|
-
# Tests for
|
|
72
|
+
# Tests for get_reserved_cols
|
|
71
73
|
|
|
72
|
-
def
|
|
73
|
-
"""Test returns
|
|
74
|
+
def test_get_reserved_cols_single_host_sample_type(self):
|
|
75
|
+
"""Test returns sorted list of reserved column names for a single host/sample type."""
|
|
74
76
|
input_df = pandas.DataFrame({
|
|
75
|
-
SAMPLE_NAME_KEY: ["sample1"
|
|
76
|
-
|
|
77
|
+
SAMPLE_NAME_KEY: ["sample1"],
|
|
78
|
+
HOSTTYPE_SHORTHAND_KEY: ["human"],
|
|
79
|
+
SAMPLETYPE_SHORTHAND_KEY: ["stool"]
|
|
77
80
|
})
|
|
81
|
+
study_config = {
|
|
82
|
+
DEFAULT_KEY: "not provided",
|
|
83
|
+
LEAVE_REQUIREDS_BLANK_KEY: True,
|
|
84
|
+
OVERWRITE_NON_NANS_KEY: False,
|
|
85
|
+
STUDY_SPECIFIC_METADATA_KEY: {
|
|
86
|
+
HOST_TYPE_SPECIFIC_METADATA_KEY: {
|
|
87
|
+
"human": {
|
|
88
|
+
METADATA_FIELDS_KEY: {
|
|
89
|
+
"host_common_name": {
|
|
90
|
+
DEFAULT_KEY: "human",
|
|
91
|
+
TYPE_KEY: "string"
|
|
92
|
+
}
|
|
93
|
+
},
|
|
94
|
+
SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
|
|
95
|
+
"stool": {
|
|
96
|
+
METADATA_FIELDS_KEY: {
|
|
97
|
+
"body_site": {
|
|
98
|
+
DEFAULT_KEY: "gut",
|
|
99
|
+
TYPE_KEY: "string"
|
|
100
|
+
},
|
|
101
|
+
"stool_consistency": {
|
|
102
|
+
DEFAULT_KEY: "normal",
|
|
103
|
+
TYPE_KEY: "string"
|
|
104
|
+
}
|
|
105
|
+
}
|
|
106
|
+
}
|
|
107
|
+
}
|
|
108
|
+
}
|
|
109
|
+
}
|
|
110
|
+
}
|
|
111
|
+
}
|
|
78
112
|
|
|
79
|
-
result =
|
|
80
|
-
|
|
81
|
-
|
|
113
|
+
result = get_reserved_cols(input_df, study_config, self.TEST_STDS_FP)
|
|
114
|
+
|
|
115
|
+
# Expected columns are union of study_config fields and test_standards.yml fields
|
|
116
|
+
# From standards: sample_name, sample_type (base), description (human overrides host_associated),
|
|
117
|
+
# body_site (host_associated stool), body_product (human stool), host_common_name (human)
|
|
118
|
+
expected = [
|
|
119
|
+
"body_product", # from human stool in test_standards.yml
|
|
120
|
+
"body_site",
|
|
121
|
+
"description", # from human in test_standards.yml (overrides host_associated)
|
|
122
|
+
"host_common_name",
|
|
123
|
+
HOSTTYPE_SHORTHAND_KEY,
|
|
124
|
+
QC_NOTE_KEY,
|
|
125
|
+
QIITA_SAMPLE_TYPE,
|
|
126
|
+
SAMPLE_NAME_KEY,
|
|
127
|
+
SAMPLE_TYPE_KEY,
|
|
128
|
+
SAMPLETYPE_SHORTHAND_KEY,
|
|
129
|
+
"stool_consistency"
|
|
130
|
+
]
|
|
131
|
+
self.assertEqual(expected, result)
|
|
82
132
|
|
|
83
|
-
def
|
|
84
|
-
"""Test
|
|
133
|
+
def test_get_reserved_cols_missing_hosttype_shorthand_raises(self):
|
|
134
|
+
"""Test raises ValueError when hosttype_shorthand column is missing."""
|
|
85
135
|
input_df = pandas.DataFrame({
|
|
86
|
-
SAMPLE_NAME_KEY: ["sample1"
|
|
87
|
-
|
|
136
|
+
SAMPLE_NAME_KEY: ["sample1"],
|
|
137
|
+
SAMPLETYPE_SHORTHAND_KEY: ["stool"]
|
|
88
138
|
})
|
|
139
|
+
study_config = {}
|
|
89
140
|
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
expected = pandas.DataFrame({
|
|
93
|
-
SAMPLE_NAME_KEY: ["sample2"],
|
|
94
|
-
QC_NOTE_KEY: ["invalid host_type"]
|
|
95
|
-
}, index=[1])
|
|
96
|
-
assert_frame_equal(expected, result)
|
|
141
|
+
with self.assertRaisesRegex(ValueError, HOSTTYPE_SHORTHAND_KEY):
|
|
142
|
+
get_reserved_cols(input_df, study_config)
|
|
97
143
|
|
|
98
|
-
def
|
|
99
|
-
"""Test
|
|
144
|
+
def test_get_reserved_cols_missing_sampletype_shorthand_raises(self):
|
|
145
|
+
"""Test raises ValueError when sampletype_shorthand column is missing."""
|
|
100
146
|
input_df = pandas.DataFrame({
|
|
101
|
-
SAMPLE_NAME_KEY: ["sample1"
|
|
102
|
-
|
|
147
|
+
SAMPLE_NAME_KEY: ["sample1"],
|
|
148
|
+
HOSTTYPE_SHORTHAND_KEY: ["human"]
|
|
103
149
|
})
|
|
150
|
+
study_config = {}
|
|
104
151
|
|
|
105
|
-
|
|
152
|
+
with self.assertRaisesRegex(ValueError, SAMPLETYPE_SHORTHAND_KEY):
|
|
153
|
+
get_reserved_cols(input_df, study_config)
|
|
106
154
|
|
|
107
|
-
|
|
155
|
+
def test_get_reserved_cols_multiple_host_sample_types(self):
|
|
156
|
+
"""Test returns deduped union of reserved columns for multiple host/sample type combinations."""
|
|
157
|
+
input_df = pandas.DataFrame({
|
|
158
|
+
SAMPLE_NAME_KEY: ["sample1", "sample2", "sample3"],
|
|
159
|
+
HOSTTYPE_SHORTHAND_KEY: ["human", "human", "mouse"],
|
|
160
|
+
SAMPLETYPE_SHORTHAND_KEY: ["stool", "blood", "stool"]
|
|
161
|
+
})
|
|
162
|
+
# Both human and mouse define host_common_name and body_site - should appear only once each
|
|
163
|
+
study_config = {
|
|
164
|
+
DEFAULT_KEY: "not provided",
|
|
165
|
+
LEAVE_REQUIREDS_BLANK_KEY: True,
|
|
166
|
+
OVERWRITE_NON_NANS_KEY: False,
|
|
167
|
+
STUDY_SPECIFIC_METADATA_KEY: {
|
|
168
|
+
HOST_TYPE_SPECIFIC_METADATA_KEY: {
|
|
169
|
+
"human": {
|
|
170
|
+
METADATA_FIELDS_KEY: {
|
|
171
|
+
"host_common_name": {
|
|
172
|
+
DEFAULT_KEY: "human",
|
|
173
|
+
TYPE_KEY: "string"
|
|
174
|
+
},
|
|
175
|
+
"human_field": {
|
|
176
|
+
DEFAULT_KEY: "human_value",
|
|
177
|
+
TYPE_KEY: "string"
|
|
178
|
+
}
|
|
179
|
+
},
|
|
180
|
+
SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
|
|
181
|
+
"stool": {
|
|
182
|
+
METADATA_FIELDS_KEY: {
|
|
183
|
+
"body_site": {
|
|
184
|
+
DEFAULT_KEY: "gut",
|
|
185
|
+
TYPE_KEY: "string"
|
|
186
|
+
},
|
|
187
|
+
"stool_consistency": {
|
|
188
|
+
DEFAULT_KEY: "normal",
|
|
189
|
+
TYPE_KEY: "string"
|
|
190
|
+
}
|
|
191
|
+
}
|
|
192
|
+
},
|
|
193
|
+
"blood": {
|
|
194
|
+
METADATA_FIELDS_KEY: {
|
|
195
|
+
"body_site": {
|
|
196
|
+
DEFAULT_KEY: "blood",
|
|
197
|
+
TYPE_KEY: "string"
|
|
198
|
+
},
|
|
199
|
+
"blood_type": {
|
|
200
|
+
DEFAULT_KEY: "unknown",
|
|
201
|
+
TYPE_KEY: "string"
|
|
202
|
+
}
|
|
203
|
+
}
|
|
204
|
+
}
|
|
205
|
+
}
|
|
206
|
+
},
|
|
207
|
+
"mouse": {
|
|
208
|
+
METADATA_FIELDS_KEY: {
|
|
209
|
+
"host_common_name": {
|
|
210
|
+
DEFAULT_KEY: "mouse",
|
|
211
|
+
TYPE_KEY: "string"
|
|
212
|
+
},
|
|
213
|
+
"mouse_field": {
|
|
214
|
+
DEFAULT_KEY: "mouse_value",
|
|
215
|
+
TYPE_KEY: "string"
|
|
216
|
+
}
|
|
217
|
+
},
|
|
218
|
+
SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
|
|
219
|
+
"stool": {
|
|
220
|
+
METADATA_FIELDS_KEY: {
|
|
221
|
+
"body_site": {
|
|
222
|
+
DEFAULT_KEY: "gut",
|
|
223
|
+
TYPE_KEY: "string"
|
|
224
|
+
},
|
|
225
|
+
"mouse_stool_field": {
|
|
226
|
+
DEFAULT_KEY: "mouse_stool_value",
|
|
227
|
+
TYPE_KEY: "string"
|
|
228
|
+
}
|
|
229
|
+
}
|
|
230
|
+
}
|
|
231
|
+
}
|
|
232
|
+
}
|
|
233
|
+
}
|
|
234
|
+
}
|
|
235
|
+
}
|
|
108
236
|
|
|
109
|
-
|
|
237
|
+
result = get_reserved_cols(input_df, study_config, self.TEST_STDS_FP)
|
|
238
|
+
|
|
239
|
+
# Expected columns are union of study_config fields and test_standards.yml fields
|
|
240
|
+
# From standards for human/stool: sample_name, sample_type (base), description (human),
|
|
241
|
+
# body_site (host_associated stool), body_product (human stool), host_common_name (human)
|
|
242
|
+
# From standards for human/blood: body_site (human blood), body_product (human blood),
|
|
243
|
+
# description (human), host_common_name (human)
|
|
244
|
+
# From standards for mouse/stool: sample_name, sample_type (base), description (host_associated),
|
|
245
|
+
# body_site (host_associated stool), host_common_name (mouse)
|
|
246
|
+
# TODO: cage_id from mouse stool in test_standards.yml SHOULD be included here
|
|
247
|
+
# but is currently excluded because it has required: false and no default.
|
|
248
|
+
# The function under test needs to be changed to include fields even when
|
|
249
|
+
# they have required: false and no default.
|
|
250
|
+
expected = [
|
|
251
|
+
"blood_type",
|
|
252
|
+
"body_product", # from human stool and human blood in test_standards.yml
|
|
253
|
+
"body_site",
|
|
254
|
+
"description", # from human (overrides host_associated) and host_associated (mouse inherits)
|
|
255
|
+
"host_common_name",
|
|
256
|
+
HOSTTYPE_SHORTHAND_KEY,
|
|
257
|
+
"human_field",
|
|
258
|
+
"mouse_field",
|
|
259
|
+
"mouse_stool_field",
|
|
260
|
+
QC_NOTE_KEY,
|
|
261
|
+
QIITA_SAMPLE_TYPE,
|
|
262
|
+
SAMPLE_NAME_KEY,
|
|
263
|
+
SAMPLE_TYPE_KEY,
|
|
264
|
+
SAMPLETYPE_SHORTHAND_KEY,
|
|
265
|
+
"stool_consistency"
|
|
266
|
+
]
|
|
267
|
+
self.assertEqual(expected, result)
|
|
110
268
|
|
|
111
|
-
|
|
112
|
-
|
|
269
|
+
# Tests for find_standard_cols
|
|
270
|
+
|
|
271
|
+
def test_find_standard_cols_returns_standard_cols_in_df(self):
|
|
272
|
+
"""Test returns standard columns that exist in the input DataFrame, excluding internals."""
|
|
113
273
|
input_df = pandas.DataFrame({
|
|
114
|
-
"zebra": ["z"],
|
|
115
274
|
SAMPLE_NAME_KEY: ["sample1"],
|
|
116
|
-
"apple": ["a"],
|
|
117
|
-
QC_NOTE_KEY: [""],
|
|
118
275
|
HOSTTYPE_SHORTHAND_KEY: ["human"],
|
|
119
|
-
SAMPLETYPE_SHORTHAND_KEY: ["stool"]
|
|
276
|
+
SAMPLETYPE_SHORTHAND_KEY: ["stool"],
|
|
277
|
+
"body_site": ["gut"],
|
|
278
|
+
"host_common_name": ["human"],
|
|
279
|
+
"my_custom_column": ["custom_value"]
|
|
120
280
|
})
|
|
281
|
+
study_config = {
|
|
282
|
+
DEFAULT_KEY: "not provided",
|
|
283
|
+
LEAVE_REQUIREDS_BLANK_KEY: True,
|
|
284
|
+
OVERWRITE_NON_NANS_KEY: False,
|
|
285
|
+
STUDY_SPECIFIC_METADATA_KEY: {
|
|
286
|
+
HOST_TYPE_SPECIFIC_METADATA_KEY: {
|
|
287
|
+
"human": {
|
|
288
|
+
METADATA_FIELDS_KEY: {
|
|
289
|
+
"host_common_name": {
|
|
290
|
+
DEFAULT_KEY: "human",
|
|
291
|
+
TYPE_KEY: "string"
|
|
292
|
+
}
|
|
293
|
+
},
|
|
294
|
+
SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
|
|
295
|
+
"stool": {
|
|
296
|
+
METADATA_FIELDS_KEY: {
|
|
297
|
+
"body_site": {
|
|
298
|
+
DEFAULT_KEY: "gut",
|
|
299
|
+
TYPE_KEY: "string"
|
|
300
|
+
}
|
|
301
|
+
}
|
|
302
|
+
}
|
|
303
|
+
}
|
|
304
|
+
}
|
|
305
|
+
}
|
|
306
|
+
}
|
|
307
|
+
}
|
|
121
308
|
|
|
122
|
-
result =
|
|
309
|
+
result = find_standard_cols(input_df, study_config, self.TEST_STDS_FP)
|
|
123
310
|
|
|
124
|
-
|
|
311
|
+
# Returns intersection of reserved cols (minus internals) with df columns.
|
|
312
|
+
# body_site, host_common_name, sample_name are standard and in df
|
|
313
|
+
# hosttype_shorthand, sampletype_shorthand are internal (excluded)
|
|
314
|
+
# my_custom_column is nonstandard (excluded)
|
|
315
|
+
expected = ["body_site", "host_common_name", SAMPLE_NAME_KEY]
|
|
316
|
+
self.assertEqual(sorted(expected), sorted(result))
|
|
125
317
|
|
|
126
|
-
def
|
|
127
|
-
"""Test
|
|
318
|
+
def test_find_standard_cols_missing_hosttype_shorthand_raises(self):
|
|
319
|
+
"""Test raises ValueError when hosttype_shorthand column is missing."""
|
|
128
320
|
input_df = pandas.DataFrame({
|
|
129
|
-
"zebra": ["z"],
|
|
130
321
|
SAMPLE_NAME_KEY: ["sample1"],
|
|
131
|
-
"apple": ["a"],
|
|
132
|
-
QC_NOTE_KEY: [""],
|
|
133
|
-
HOSTTYPE_SHORTHAND_KEY: ["human"],
|
|
134
322
|
SAMPLETYPE_SHORTHAND_KEY: ["stool"]
|
|
135
323
|
})
|
|
324
|
+
study_config = {}
|
|
136
325
|
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
expected_order = [SAMPLE_NAME_KEY, "apple", "zebra"] + INTERNAL_COL_KEYS
|
|
140
|
-
self.assertEqual(expected_order, list(result.columns))
|
|
326
|
+
with self.assertRaisesRegex(ValueError, HOSTTYPE_SHORTHAND_KEY):
|
|
327
|
+
find_standard_cols(input_df, study_config, self.TEST_STDS_FP)
|
|
141
328
|
|
|
142
|
-
def
|
|
143
|
-
"""Test
|
|
329
|
+
def test_find_standard_cols_missing_sampletype_shorthand_raises(self):
|
|
330
|
+
"""Test raises ValueError when sampletype_shorthand column is missing."""
|
|
144
331
|
input_df = pandas.DataFrame({
|
|
145
|
-
"field1": ["value1"],
|
|
146
332
|
SAMPLE_NAME_KEY: ["sample1"],
|
|
147
|
-
|
|
333
|
+
HOSTTYPE_SHORTHAND_KEY: ["human"]
|
|
334
|
+
})
|
|
335
|
+
study_config = {}
|
|
336
|
+
|
|
337
|
+
with self.assertRaisesRegex(ValueError, SAMPLETYPE_SHORTHAND_KEY):
|
|
338
|
+
find_standard_cols(input_df, study_config, self.TEST_STDS_FP)
|
|
339
|
+
|
|
340
|
+
def test_find_standard_cols_missing_sample_name_raises(self):
|
|
341
|
+
"""Test raises ValueError when sample_name column is missing."""
|
|
342
|
+
input_df = pandas.DataFrame({
|
|
148
343
|
HOSTTYPE_SHORTHAND_KEY: ["human"],
|
|
149
344
|
SAMPLETYPE_SHORTHAND_KEY: ["stool"]
|
|
150
345
|
})
|
|
346
|
+
study_config = {}
|
|
151
347
|
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
expected_order = [SAMPLE_NAME_KEY, "field1"] + INTERNAL_COL_KEYS
|
|
155
|
-
self.assertEqual(expected_order, list(result.columns))
|
|
348
|
+
with self.assertRaisesRegex(ValueError, SAMPLE_NAME_KEY):
|
|
349
|
+
find_standard_cols(input_df, study_config, self.TEST_STDS_FP)
|
|
156
350
|
|
|
157
|
-
def
|
|
158
|
-
"""Test
|
|
351
|
+
def test_find_standard_cols_suppress_missing_name_err(self):
|
|
352
|
+
"""Test that suppress_missing_name_err=True allows missing sample_name."""
|
|
159
353
|
input_df = pandas.DataFrame({
|
|
160
|
-
"zebra": ["z"],
|
|
161
|
-
SAMPLE_NAME_KEY: ["sample1"],
|
|
162
|
-
"apple": ["a"],
|
|
163
|
-
QC_NOTE_KEY: [""],
|
|
164
354
|
HOSTTYPE_SHORTHAND_KEY: ["human"],
|
|
165
355
|
SAMPLETYPE_SHORTHAND_KEY: ["stool"],
|
|
166
|
-
"
|
|
356
|
+
"body_site": ["gut"]
|
|
167
357
|
})
|
|
358
|
+
study_config = {
|
|
359
|
+
DEFAULT_KEY: "not provided",
|
|
360
|
+
LEAVE_REQUIREDS_BLANK_KEY: True,
|
|
361
|
+
OVERWRITE_NON_NANS_KEY: False,
|
|
362
|
+
STUDY_SPECIFIC_METADATA_KEY: {
|
|
363
|
+
HOST_TYPE_SPECIFIC_METADATA_KEY: {
|
|
364
|
+
"human": {
|
|
365
|
+
METADATA_FIELDS_KEY: {},
|
|
366
|
+
SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
|
|
367
|
+
"stool": {
|
|
368
|
+
METADATA_FIELDS_KEY: {
|
|
369
|
+
"body_site": {
|
|
370
|
+
DEFAULT_KEY: "gut",
|
|
371
|
+
TYPE_KEY: "string"
|
|
372
|
+
}
|
|
373
|
+
}
|
|
374
|
+
}
|
|
375
|
+
}
|
|
376
|
+
}
|
|
377
|
+
}
|
|
378
|
+
}
|
|
379
|
+
}
|
|
168
380
|
|
|
169
|
-
result =
|
|
381
|
+
result = find_standard_cols(
|
|
382
|
+
input_df, study_config, self.TEST_STDS_FP,
|
|
383
|
+
suppress_missing_name_err=True)
|
|
170
384
|
|
|
171
|
-
|
|
172
|
-
|
|
385
|
+
# Only body_site is a standard col in df (sample_name is missing but allowed)
|
|
386
|
+
expected = ["body_site"]
|
|
387
|
+
self.assertEqual(expected, sorted(result))
|
|
173
388
|
|
|
174
|
-
# Tests for
|
|
389
|
+
# Tests for find_nonstandard_cols
|
|
175
390
|
|
|
176
|
-
def
|
|
177
|
-
"""Test returns
|
|
391
|
+
def test_find_nonstandard_cols_returns_nonstandard_cols(self):
|
|
392
|
+
"""Test returns columns in df that are not in the reserved columns list."""
|
|
178
393
|
input_df = pandas.DataFrame({
|
|
179
|
-
SAMPLE_NAME_KEY: ["sample1"
|
|
180
|
-
HOSTTYPE_SHORTHAND_KEY: ["human"
|
|
181
|
-
SAMPLETYPE_SHORTHAND_KEY: ["stool",
|
|
394
|
+
SAMPLE_NAME_KEY: ["sample1"],
|
|
395
|
+
HOSTTYPE_SHORTHAND_KEY: ["human"],
|
|
396
|
+
SAMPLETYPE_SHORTHAND_KEY: ["stool"],
|
|
397
|
+
"body_site": ["gut"],
|
|
398
|
+
"host_common_name": ["human"],
|
|
399
|
+
"my_custom_column": ["custom_value"],
|
|
400
|
+
"another_nonstandard": ["value"]
|
|
182
401
|
})
|
|
402
|
+
study_config = {
|
|
403
|
+
DEFAULT_KEY: "not provided",
|
|
404
|
+
LEAVE_REQUIREDS_BLANK_KEY: True,
|
|
405
|
+
OVERWRITE_NON_NANS_KEY: False,
|
|
406
|
+
STUDY_SPECIFIC_METADATA_KEY: {
|
|
407
|
+
HOST_TYPE_SPECIFIC_METADATA_KEY: {
|
|
408
|
+
"human": {
|
|
409
|
+
METADATA_FIELDS_KEY: {
|
|
410
|
+
"host_common_name": {
|
|
411
|
+
DEFAULT_KEY: "human",
|
|
412
|
+
TYPE_KEY: "string"
|
|
413
|
+
}
|
|
414
|
+
},
|
|
415
|
+
SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
|
|
416
|
+
"stool": {
|
|
417
|
+
METADATA_FIELDS_KEY: {
|
|
418
|
+
"body_site": {
|
|
419
|
+
DEFAULT_KEY: "gut",
|
|
420
|
+
TYPE_KEY: "string"
|
|
421
|
+
}
|
|
422
|
+
}
|
|
423
|
+
}
|
|
424
|
+
}
|
|
425
|
+
}
|
|
426
|
+
}
|
|
427
|
+
}
|
|
428
|
+
}
|
|
183
429
|
|
|
184
|
-
result =
|
|
430
|
+
result = find_nonstandard_cols(input_df, study_config, self.TEST_STDS_FP)
|
|
185
431
|
|
|
186
|
-
|
|
432
|
+
# Only my_custom_column and another_nonstandard are not in the reserved list
|
|
433
|
+
# sample_name, body_site, host_common_name, hosttype_shorthand,
|
|
434
|
+
# sampletype_shorthand are all reserved
|
|
435
|
+
expected = ["another_nonstandard", "my_custom_column"]
|
|
436
|
+
self.assertEqual(sorted(expected), sorted(result))
|
|
187
437
|
|
|
188
|
-
def
|
|
189
|
-
"""Test raises ValueError when
|
|
438
|
+
def test_find_nonstandard_cols_missing_required_col_raises(self):
|
|
439
|
+
"""Test raises ValueError when a required column is missing."""
|
|
190
440
|
input_df = pandas.DataFrame({
|
|
191
|
-
SAMPLE_NAME_KEY: ["sample1"
|
|
192
|
-
|
|
193
|
-
|
|
441
|
+
SAMPLE_NAME_KEY: ["sample1"],
|
|
442
|
+
SAMPLETYPE_SHORTHAND_KEY: ["stool"]
|
|
443
|
+
# missing HOSTTYPE_SHORTHAND_KEY
|
|
194
444
|
})
|
|
445
|
+
study_config = {}
|
|
195
446
|
|
|
196
|
-
with self.assertRaisesRegex(ValueError,
|
|
197
|
-
|
|
447
|
+
with self.assertRaisesRegex(ValueError, HOSTTYPE_SHORTHAND_KEY):
|
|
448
|
+
find_nonstandard_cols(input_df, study_config, self.TEST_STDS_FP)
|
|
198
449
|
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
450
|
+
# Tests for write_metadata_results
|
|
451
|
+
|
|
452
|
+
def test_write_metadata_results_creates_all_files(self):
|
|
453
|
+
"""Test creates metadata file and validation errors file, includes failed rows."""
|
|
454
|
+
metadata_df = pandas.DataFrame({
|
|
455
|
+
SAMPLE_NAME_KEY: ["sample1", "sample2", "sample3"],
|
|
456
|
+
"field_a": ["a1", "a2", "a3"],
|
|
457
|
+
HOSTTYPE_SHORTHAND_KEY: ["human", "human", "human"],
|
|
458
|
+
SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool", "stool"],
|
|
459
|
+
QC_NOTE_KEY: ["", "invalid host_type", ""]
|
|
460
|
+
})
|
|
461
|
+
validation_msgs_df = pandas.DataFrame({
|
|
462
|
+
"field": ["field_a"],
|
|
463
|
+
"error": ["some validation error"]
|
|
205
464
|
})
|
|
206
465
|
|
|
207
|
-
|
|
466
|
+
with tempfile.TemporaryDirectory() as tmpdir:
|
|
467
|
+
write_metadata_results(
|
|
468
|
+
metadata_df, validation_msgs_df, tmpdir, "test_output",
|
|
469
|
+
sep="\t", remove_internals=False)
|
|
470
|
+
|
|
471
|
+
# Find the main metadata file
|
|
472
|
+
metadata_files = glob.glob(os.path.join(tmpdir, "*_test_output.txt"))
|
|
473
|
+
self.assertEqual(1, len(metadata_files))
|
|
474
|
+
|
|
475
|
+
# Verify metadata file contents - includes failed row when remove_internals=False
|
|
476
|
+
result_df = pandas.read_csv(
|
|
477
|
+
metadata_files[0], sep="\t", keep_default_na=False)
|
|
478
|
+
assert_frame_equal(metadata_df, result_df)
|
|
479
|
+
|
|
480
|
+
# Find the validation errors file (uses comma separator)
|
|
481
|
+
validation_files = glob.glob(
|
|
482
|
+
os.path.join(tmpdir, "*_test_output_validation_errors.csv"))
|
|
483
|
+
self.assertEqual(1, len(validation_files))
|
|
484
|
+
|
|
485
|
+
# Verify validation errors file contents
|
|
486
|
+
result_validation_df = pandas.read_csv(validation_files[0], sep=",")
|
|
487
|
+
assert_frame_equal(validation_msgs_df, result_validation_df)
|
|
488
|
+
|
|
489
|
+
# No fails file should be created when remove_internals=False
|
|
490
|
+
fails_files = glob.glob(
|
|
491
|
+
os.path.join(tmpdir, "*_test_output_fails.csv"))
|
|
492
|
+
self.assertEqual(0, len(fails_files))
|
|
208
493
|
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
494
|
+
def test_write_metadata_results_remove_internals_creates_fails_file(self):
|
|
495
|
+
"""Test with remove_internals=True creates fails file and removes internal cols."""
|
|
496
|
+
metadata_df = pandas.DataFrame({
|
|
497
|
+
SAMPLE_NAME_KEY: ["sample1", "sample2", "sample3"],
|
|
498
|
+
"field_a": ["a1", "a2", "a3"],
|
|
499
|
+
HOSTTYPE_SHORTHAND_KEY: ["human", "human", "human"],
|
|
500
|
+
SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool", "stool"],
|
|
501
|
+
QC_NOTE_KEY: ["", "invalid host_type", ""]
|
|
213
502
|
})
|
|
214
|
-
|
|
503
|
+
validation_msgs_df = pandas.DataFrame()
|
|
215
504
|
|
|
216
|
-
|
|
505
|
+
with tempfile.TemporaryDirectory() as tmpdir:
|
|
506
|
+
write_metadata_results(
|
|
507
|
+
metadata_df, validation_msgs_df, tmpdir, "test_output",
|
|
508
|
+
sep="\t", remove_internals=True)
|
|
217
509
|
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
510
|
+
# Find the main metadata file
|
|
511
|
+
metadata_files = glob.glob(os.path.join(tmpdir, "*_test_output.txt"))
|
|
512
|
+
self.assertEqual(1, len(metadata_files))
|
|
513
|
+
|
|
514
|
+
# Verify metadata has internal cols removed and no failures
|
|
515
|
+
result_df = pandas.read_csv(metadata_files[0], sep="\t")
|
|
516
|
+
expected_df = pandas.DataFrame({
|
|
517
|
+
SAMPLE_NAME_KEY: ["sample1", "sample3"],
|
|
518
|
+
"field_a": ["a1", "a3"]
|
|
519
|
+
})
|
|
520
|
+
assert_frame_equal(expected_df, result_df)
|
|
521
|
+
|
|
522
|
+
# Find the fails file
|
|
523
|
+
fails_files = glob.glob(
|
|
524
|
+
os.path.join(tmpdir, "*_test_output_fails.csv"))
|
|
525
|
+
self.assertEqual(1, len(fails_files))
|
|
526
|
+
|
|
527
|
+
# Verify fails file contains the failed row
|
|
528
|
+
fails_df = pandas.read_csv(fails_files[0], sep=",")
|
|
529
|
+
expected_fails_df = pandas.DataFrame({
|
|
530
|
+
SAMPLE_NAME_KEY: ["sample2"],
|
|
531
|
+
"field_a": ["a2"],
|
|
532
|
+
HOSTTYPE_SHORTHAND_KEY: ["human"],
|
|
533
|
+
SAMPLETYPE_SHORTHAND_KEY: ["stool"],
|
|
534
|
+
QC_NOTE_KEY: ["invalid host_type"]
|
|
535
|
+
})
|
|
536
|
+
assert_frame_equal(expected_fails_df, fails_df)
|
|
537
|
+
|
|
538
|
+
# Validation errors file should be empty (touched)
|
|
539
|
+
validation_files = glob.glob(
|
|
540
|
+
os.path.join(tmpdir, "*_test_output_validation_errors.csv"))
|
|
541
|
+
self.assertEqual(1, len(validation_files))
|
|
542
|
+
self.assertEqual(0, os.path.getsize(validation_files[0]))
|
|
543
|
+
|
|
544
|
+
def test_write_metadata_results_suppress_empty_fails(self):
|
|
545
|
+
"""Test with suppress_empty_fails=True does not create empty files."""
|
|
546
|
+
metadata_df = pandas.DataFrame({
|
|
547
|
+
SAMPLE_NAME_KEY: ["sample1", "sample2"],
|
|
548
|
+
"field_a": ["a1", "a2"],
|
|
549
|
+
HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
|
|
550
|
+
SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
|
|
551
|
+
QC_NOTE_KEY: ["", ""]
|
|
552
|
+
})
|
|
553
|
+
validation_msgs_df = pandas.DataFrame()
|
|
554
|
+
|
|
555
|
+
with tempfile.TemporaryDirectory() as tmpdir:
|
|
556
|
+
write_metadata_results(
|
|
557
|
+
metadata_df, validation_msgs_df, tmpdir, "test_output",
|
|
558
|
+
sep="\t", remove_internals=True, suppress_empty_fails=True)
|
|
559
|
+
|
|
560
|
+
# Main metadata file should exist
|
|
561
|
+
metadata_files = glob.glob(os.path.join(tmpdir, "*_test_output.txt"))
|
|
562
|
+
self.assertEqual(1, len(metadata_files))
|
|
563
|
+
|
|
564
|
+
# Fails file should NOT exist (no failures, suppressed)
|
|
565
|
+
fails_files = glob.glob(
|
|
566
|
+
os.path.join(tmpdir, "*_test_output_fails.csv"))
|
|
567
|
+
self.assertEqual(0, len(fails_files))
|
|
568
|
+
|
|
569
|
+
# Validation errors file should NOT exist (empty, suppressed)
|
|
570
|
+
validation_files = glob.glob(
|
|
571
|
+
os.path.join(tmpdir, "*_test_output_validation_errors.csv"))
|
|
572
|
+
self.assertEqual(0, len(validation_files))
|
|
573
|
+
|
|
574
|
+
def test_write_metadata_results_custom_internal_col_names(self):
|
|
575
|
+
"""Test with custom internal_col_names parameter."""
|
|
576
|
+
metadata_df = pandas.DataFrame({
|
|
577
|
+
SAMPLE_NAME_KEY: ["sample1", "sample2"],
|
|
578
|
+
"field_a": ["a1", "a2"],
|
|
579
|
+
"custom_internal": ["x", "y"],
|
|
580
|
+
QC_NOTE_KEY: ["", ""]
|
|
581
|
+
})
|
|
582
|
+
validation_msgs_df = pandas.DataFrame()
|
|
583
|
+
|
|
584
|
+
with tempfile.TemporaryDirectory() as tmpdir:
|
|
585
|
+
write_metadata_results(
|
|
586
|
+
metadata_df, validation_msgs_df, tmpdir, "test_output",
|
|
587
|
+
sep="\t", remove_internals=True, suppress_empty_fails=True,
|
|
588
|
+
internal_col_names=["custom_internal", QC_NOTE_KEY])
|
|
589
|
+
|
|
590
|
+
# Find the main metadata file
|
|
591
|
+
metadata_files = glob.glob(os.path.join(tmpdir, "*_test_output.txt"))
|
|
592
|
+
self.assertEqual(1, len(metadata_files))
|
|
593
|
+
|
|
594
|
+
# Verify custom internal cols are removed
|
|
595
|
+
result_df = pandas.read_csv(metadata_files[0], sep="\t")
|
|
596
|
+
expected_df = pandas.DataFrame({
|
|
597
|
+
SAMPLE_NAME_KEY: ["sample1", "sample2"],
|
|
598
|
+
"field_a": ["a1", "a2"]
|
|
599
|
+
})
|
|
600
|
+
assert_frame_equal(expected_df, result_df)
|
|
601
|
+
|
|
602
|
+
# Tests for get_qc_failures
|
|
603
|
+
|
|
604
|
+
def test_get_qc_failures_no_failures(self):
|
|
605
|
+
"""Test returns empty df when QC_NOTE_KEY is all empty strings."""
|
|
606
|
+
input_df = pandas.DataFrame({
|
|
607
|
+
SAMPLE_NAME_KEY: ["sample1", "sample2"],
|
|
608
|
+
QC_NOTE_KEY: ["", ""]
|
|
609
|
+
})
|
|
610
|
+
|
|
611
|
+
result = get_qc_failures(input_df)
|
|
612
|
+
|
|
613
|
+
self.assertTrue(result.empty)
|
|
614
|
+
|
|
615
|
+
def test_get_qc_failures_some_failures(self):
|
|
616
|
+
"""Test returns only rows where QC_NOTE_KEY is not empty."""
|
|
617
|
+
input_df = pandas.DataFrame({
|
|
618
|
+
SAMPLE_NAME_KEY: ["sample1", "sample2", "sample3"],
|
|
619
|
+
QC_NOTE_KEY: ["", "invalid host_type", ""]
|
|
620
|
+
})
|
|
621
|
+
|
|
622
|
+
result = get_qc_failures(input_df)
|
|
623
|
+
|
|
624
|
+
expected = pandas.DataFrame({
|
|
625
|
+
SAMPLE_NAME_KEY: ["sample2"],
|
|
626
|
+
QC_NOTE_KEY: ["invalid host_type"]
|
|
627
|
+
}, index=[1])
|
|
628
|
+
assert_frame_equal(expected, result)
|
|
629
|
+
|
|
630
|
+
def test_get_qc_failures_all_failures(self):
|
|
631
|
+
"""Test returns all rows when all have QC notes."""
|
|
632
|
+
input_df = pandas.DataFrame({
|
|
633
|
+
SAMPLE_NAME_KEY: ["sample1", "sample2"],
|
|
634
|
+
QC_NOTE_KEY: ["invalid host_type", "invalid sample_type"]
|
|
635
|
+
})
|
|
636
|
+
|
|
637
|
+
result = get_qc_failures(input_df)
|
|
638
|
+
|
|
639
|
+
assert_frame_equal(input_df, result)
|
|
640
|
+
|
|
641
|
+
# Tests for _reorder_df
|
|
642
|
+
|
|
643
|
+
def test__reorder_df_sample_name_first(self):
|
|
644
|
+
"""Test that sample_name becomes the first column."""
|
|
645
|
+
input_df = pandas.DataFrame({
|
|
646
|
+
"zebra": ["z"],
|
|
647
|
+
SAMPLE_NAME_KEY: ["sample1"],
|
|
648
|
+
"apple": ["a"],
|
|
649
|
+
QC_NOTE_KEY: [""],
|
|
650
|
+
HOSTTYPE_SHORTHAND_KEY: ["human"],
|
|
651
|
+
SAMPLETYPE_SHORTHAND_KEY: ["stool"]
|
|
652
|
+
})
|
|
653
|
+
|
|
654
|
+
result = _reorder_df(input_df, INTERNAL_COL_KEYS)
|
|
655
|
+
|
|
656
|
+
self.assertEqual(SAMPLE_NAME_KEY, result.columns[0])
|
|
657
|
+
|
|
658
|
+
def test__reorder_df_alphabetical_order(self):
|
|
659
|
+
"""Test that non-internal columns are sorted alphabetically after sample_name."""
|
|
660
|
+
input_df = pandas.DataFrame({
|
|
661
|
+
"zebra": ["z"],
|
|
662
|
+
SAMPLE_NAME_KEY: ["sample1"],
|
|
663
|
+
"apple": ["a"],
|
|
664
|
+
QC_NOTE_KEY: [""],
|
|
665
|
+
HOSTTYPE_SHORTHAND_KEY: ["human"],
|
|
666
|
+
SAMPLETYPE_SHORTHAND_KEY: ["stool"]
|
|
667
|
+
})
|
|
668
|
+
|
|
669
|
+
result = _reorder_df(input_df, INTERNAL_COL_KEYS)
|
|
670
|
+
|
|
671
|
+
expected_order = [SAMPLE_NAME_KEY, "apple", "zebra"] + INTERNAL_COL_KEYS
|
|
672
|
+
self.assertEqual(expected_order, list(result.columns))
|
|
673
|
+
|
|
674
|
+
def test__reorder_df_internals_at_end(self):
|
|
675
|
+
"""Test that internal columns are moved to the end in the provided order."""
|
|
676
|
+
input_df = pandas.DataFrame({
|
|
677
|
+
"field1": ["value1"],
|
|
678
|
+
SAMPLE_NAME_KEY: ["sample1"],
|
|
679
|
+
QC_NOTE_KEY: [""],
|
|
680
|
+
HOSTTYPE_SHORTHAND_KEY: ["human"],
|
|
681
|
+
SAMPLETYPE_SHORTHAND_KEY: ["stool"]
|
|
682
|
+
})
|
|
683
|
+
|
|
684
|
+
result = _reorder_df(input_df, INTERNAL_COL_KEYS)
|
|
685
|
+
|
|
686
|
+
expected_order = [SAMPLE_NAME_KEY, "field1"] + INTERNAL_COL_KEYS
|
|
687
|
+
self.assertEqual(expected_order, list(result.columns))
|
|
688
|
+
|
|
689
|
+
def test__reorder_df_full_ordering(self):
|
|
690
|
+
"""Test complete column ordering: sample_name, alphabetical, internals."""
|
|
691
|
+
input_df = pandas.DataFrame({
|
|
692
|
+
"zebra": ["z"],
|
|
693
|
+
SAMPLE_NAME_KEY: ["sample1"],
|
|
694
|
+
"apple": ["a"],
|
|
695
|
+
QC_NOTE_KEY: [""],
|
|
696
|
+
HOSTTYPE_SHORTHAND_KEY: ["human"],
|
|
697
|
+
SAMPLETYPE_SHORTHAND_KEY: ["stool"],
|
|
698
|
+
"banana": ["b"]
|
|
699
|
+
})
|
|
700
|
+
|
|
701
|
+
result = _reorder_df(input_df, INTERNAL_COL_KEYS)
|
|
702
|
+
|
|
703
|
+
expected_order = [SAMPLE_NAME_KEY, "apple", "banana", "zebra"] + INTERNAL_COL_KEYS
|
|
704
|
+
self.assertEqual(expected_order, list(result.columns))
|
|
705
|
+
|
|
706
|
+
# Tests for _catch_nan_required_fields
|
|
707
|
+
|
|
708
|
+
def test__catch_nan_required_fields_no_nans(self):
|
|
709
|
+
"""Test returns unchanged df when no NaNs in required fields."""
|
|
710
|
+
input_df = pandas.DataFrame({
|
|
711
|
+
SAMPLE_NAME_KEY: ["sample1", "sample2"],
|
|
712
|
+
HOSTTYPE_SHORTHAND_KEY: ["human", "control"],
|
|
713
|
+
SAMPLETYPE_SHORTHAND_KEY: ["stool", "blank"]
|
|
714
|
+
})
|
|
715
|
+
|
|
716
|
+
result = _catch_nan_required_fields(input_df)
|
|
717
|
+
|
|
718
|
+
assert_frame_equal(input_df, result)
|
|
719
|
+
|
|
720
|
+
def test__catch_nan_required_fields_nan_sample_name_raises(self):
|
|
721
|
+
"""Test raises ValueError when sample_name contains NaN."""
|
|
722
|
+
input_df = pandas.DataFrame({
|
|
723
|
+
SAMPLE_NAME_KEY: ["sample1", np.nan],
|
|
724
|
+
HOSTTYPE_SHORTHAND_KEY: ["human", "control"],
|
|
725
|
+
SAMPLETYPE_SHORTHAND_KEY: ["stool", "blank"]
|
|
726
|
+
})
|
|
727
|
+
|
|
728
|
+
with self.assertRaisesRegex(ValueError, "Metadata contains NaN sample names"):
|
|
729
|
+
_catch_nan_required_fields(input_df)
|
|
730
|
+
|
|
731
|
+
def test__catch_nan_required_fields_nan_shorthand_fields_become_empty(self):
    """Test that NaN hosttype_shorthand and sampletype_shorthand values are set to 'empty'."""
    # One NaN in each shorthand column, on different rows; sample names
    # are all present so no error is expected
    input_df = pandas.DataFrame({
        SAMPLE_NAME_KEY: ["sample1", "sample2"],
        HOSTTYPE_SHORTHAND_KEY: ["human", np.nan],
        SAMPLETYPE_SHORTHAND_KEY: [np.nan, "blank"]
    })

    result = _catch_nan_required_fields(input_df)

    # Each NaN is expected to be replaced by the literal string "empty"
    expected = pandas.DataFrame({
        SAMPLE_NAME_KEY: ["sample1", "sample2"],
        HOSTTYPE_SHORTHAND_KEY: ["human", "empty"],
        SAMPLETYPE_SHORTHAND_KEY: ["empty", "blank"]
    })
    assert_frame_equal(expected, result)
|
|
747
|
+
|
|
748
|
+
# Tests for _fill_na_if_default
|
|
749
|
+
|
|
750
|
+
def test__fill_na_if_default_specific_overrides_settings(self):
|
|
751
|
+
"""Test that specific_dict default takes precedence over settings_dict."""
|
|
752
|
+
input_df = pandas.DataFrame({
|
|
221
753
|
"field1": ["value1", np.nan, "value3"],
|
|
222
754
|
"field2": [np.nan, "value2", np.nan]
|
|
223
755
|
})
|
|
@@ -745,6 +1277,8 @@ class TestMetadataExtender(TestCase):
|
|
|
745
1277
|
LEAVE_REQUIREDS_BLANK_KEY: False,
|
|
746
1278
|
DEFAULT_KEY: "not provided"
|
|
747
1279
|
}
|
|
1280
|
+
# Config is pre-resolved: sample type's metadata_fields already includes
|
|
1281
|
+
# host fields merged in, plus sample_type and qiita_sample_type
|
|
748
1282
|
host_type_config_dict = {
|
|
749
1283
|
METADATA_FIELDS_KEY: {
|
|
750
1284
|
"host_field": {
|
|
@@ -755,9 +1289,23 @@ class TestMetadataExtender(TestCase):
|
|
|
755
1289
|
SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
|
|
756
1290
|
"stool": {
|
|
757
1291
|
METADATA_FIELDS_KEY: {
|
|
1292
|
+
"host_field": {
|
|
1293
|
+
DEFAULT_KEY: "host_default",
|
|
1294
|
+
TYPE_KEY: "string"
|
|
1295
|
+
},
|
|
758
1296
|
"stool_field": {
|
|
759
1297
|
DEFAULT_KEY: "stool_default",
|
|
760
1298
|
TYPE_KEY: "string"
|
|
1299
|
+
},
|
|
1300
|
+
SAMPLE_TYPE_KEY: {
|
|
1301
|
+
ALLOWED_KEY: ["stool"],
|
|
1302
|
+
DEFAULT_KEY: "stool",
|
|
1303
|
+
TYPE_KEY: "string"
|
|
1304
|
+
},
|
|
1305
|
+
QIITA_SAMPLE_TYPE: {
|
|
1306
|
+
ALLOWED_KEY: ["stool"],
|
|
1307
|
+
DEFAULT_KEY: "stool",
|
|
1308
|
+
TYPE_KEY: "string"
|
|
761
1309
|
}
|
|
762
1310
|
}
|
|
763
1311
|
}
|
|
@@ -996,17 +1544,44 @@ class TestMetadataExtender(TestCase):
|
|
|
996
1544
|
LEAVE_REQUIREDS_BLANK_KEY: False,
|
|
997
1545
|
DEFAULT_KEY: "not provided"
|
|
998
1546
|
}
|
|
1547
|
+
# Config is pre-resolved: alias "feces" has its own metadata_fields
|
|
1548
|
+
# that is a copy of "stool"'s resolved fields with sample_type="stool"
|
|
999
1549
|
host_type_config_dict = {
|
|
1000
1550
|
METADATA_FIELDS_KEY: {},
|
|
1001
1551
|
SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
|
|
1002
1552
|
"feces": {
|
|
1003
|
-
|
|
1553
|
+
METADATA_FIELDS_KEY: {
|
|
1554
|
+
"stool_field": {
|
|
1555
|
+
DEFAULT_KEY: "stool_value",
|
|
1556
|
+
TYPE_KEY: "string"
|
|
1557
|
+
},
|
|
1558
|
+
SAMPLE_TYPE_KEY: {
|
|
1559
|
+
ALLOWED_KEY: ["stool"],
|
|
1560
|
+
DEFAULT_KEY: "stool",
|
|
1561
|
+
TYPE_KEY: "string"
|
|
1562
|
+
},
|
|
1563
|
+
QIITA_SAMPLE_TYPE: {
|
|
1564
|
+
ALLOWED_KEY: ["stool"],
|
|
1565
|
+
DEFAULT_KEY: "stool",
|
|
1566
|
+
TYPE_KEY: "string"
|
|
1567
|
+
}
|
|
1568
|
+
}
|
|
1004
1569
|
},
|
|
1005
1570
|
"stool": {
|
|
1006
1571
|
METADATA_FIELDS_KEY: {
|
|
1007
1572
|
"stool_field": {
|
|
1008
1573
|
DEFAULT_KEY: "stool_value",
|
|
1009
1574
|
TYPE_KEY: "string"
|
|
1575
|
+
},
|
|
1576
|
+
SAMPLE_TYPE_KEY: {
|
|
1577
|
+
ALLOWED_KEY: ["stool"],
|
|
1578
|
+
DEFAULT_KEY: "stool",
|
|
1579
|
+
TYPE_KEY: "string"
|
|
1580
|
+
},
|
|
1581
|
+
QIITA_SAMPLE_TYPE: {
|
|
1582
|
+
ALLOWED_KEY: ["stool"],
|
|
1583
|
+
DEFAULT_KEY: "stool",
|
|
1584
|
+
TYPE_KEY: "string"
|
|
1010
1585
|
}
|
|
1011
1586
|
}
|
|
1012
1587
|
}
|
|
@@ -1035,6 +1610,8 @@ class TestMetadataExtender(TestCase):
|
|
|
1035
1610
|
LEAVE_REQUIREDS_BLANK_KEY: False,
|
|
1036
1611
|
DEFAULT_KEY: "global_default"
|
|
1037
1612
|
}
|
|
1613
|
+
# Config is pre-resolved: sample type's metadata_fields includes
|
|
1614
|
+
# host fields merged in, plus sample_type and qiita_sample_type
|
|
1038
1615
|
full_flat_config_dict = {
|
|
1039
1616
|
HOST_TYPE_SPECIFIC_METADATA_KEY: {
|
|
1040
1617
|
"human": {
|
|
@@ -1048,9 +1625,23 @@ class TestMetadataExtender(TestCase):
|
|
|
1048
1625
|
SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
|
|
1049
1626
|
"stool": {
|
|
1050
1627
|
METADATA_FIELDS_KEY: {
|
|
1628
|
+
"host_field": {
|
|
1629
|
+
DEFAULT_KEY: "host_value",
|
|
1630
|
+
TYPE_KEY: "string"
|
|
1631
|
+
},
|
|
1051
1632
|
"stool_field": {
|
|
1052
1633
|
DEFAULT_KEY: "stool_value",
|
|
1053
1634
|
TYPE_KEY: "string"
|
|
1635
|
+
},
|
|
1636
|
+
SAMPLE_TYPE_KEY: {
|
|
1637
|
+
ALLOWED_KEY: ["stool"],
|
|
1638
|
+
DEFAULT_KEY: "stool",
|
|
1639
|
+
TYPE_KEY: "string"
|
|
1640
|
+
},
|
|
1641
|
+
QIITA_SAMPLE_TYPE: {
|
|
1642
|
+
ALLOWED_KEY: ["stool"],
|
|
1643
|
+
DEFAULT_KEY: "stool",
|
|
1644
|
+
TYPE_KEY: "string"
|
|
1054
1645
|
}
|
|
1055
1646
|
}
|
|
1056
1647
|
}
|
|
@@ -1160,6 +1751,8 @@ class TestMetadataExtender(TestCase):
|
|
|
1160
1751
|
LEAVE_REQUIREDS_BLANK_KEY: False,
|
|
1161
1752
|
DEFAULT_KEY: "global_default"
|
|
1162
1753
|
}
|
|
1754
|
+
# Config is pre-resolved: sample type's metadata_fields includes
|
|
1755
|
+
# host fields merged in, plus sample_type and qiita_sample_type
|
|
1163
1756
|
full_flat_config_dict = {
|
|
1164
1757
|
HOST_TYPE_SPECIFIC_METADATA_KEY: {
|
|
1165
1758
|
"human": {
|
|
@@ -1171,7 +1764,22 @@ class TestMetadataExtender(TestCase):
|
|
|
1171
1764
|
},
|
|
1172
1765
|
SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
|
|
1173
1766
|
"stool": {
|
|
1174
|
-
METADATA_FIELDS_KEY: {
|
|
1767
|
+
METADATA_FIELDS_KEY: {
|
|
1768
|
+
"human_field": {
|
|
1769
|
+
DEFAULT_KEY: "human_value",
|
|
1770
|
+
TYPE_KEY: "string"
|
|
1771
|
+
},
|
|
1772
|
+
SAMPLE_TYPE_KEY: {
|
|
1773
|
+
ALLOWED_KEY: ["stool"],
|
|
1774
|
+
DEFAULT_KEY: "stool",
|
|
1775
|
+
TYPE_KEY: "string"
|
|
1776
|
+
},
|
|
1777
|
+
QIITA_SAMPLE_TYPE: {
|
|
1778
|
+
ALLOWED_KEY: ["stool"],
|
|
1779
|
+
DEFAULT_KEY: "stool",
|
|
1780
|
+
TYPE_KEY: "string"
|
|
1781
|
+
}
|
|
1782
|
+
}
|
|
1175
1783
|
}
|
|
1176
1784
|
}
|
|
1177
1785
|
},
|
|
@@ -1209,6 +1817,8 @@ class TestMetadataExtender(TestCase):
|
|
|
1209
1817
|
LEAVE_REQUIREDS_BLANK_KEY: False,
|
|
1210
1818
|
DEFAULT_KEY: "global_default"
|
|
1211
1819
|
}
|
|
1820
|
+
# Config is pre-resolved: sample type's metadata_fields includes
|
|
1821
|
+
# host fields merged in, plus sample_type and qiita_sample_type
|
|
1212
1822
|
full_flat_config_dict = {
|
|
1213
1823
|
HOST_TYPE_SPECIFIC_METADATA_KEY: {
|
|
1214
1824
|
"human": {
|
|
@@ -1220,6 +1830,16 @@ class TestMetadataExtender(TestCase):
|
|
|
1220
1830
|
"required_field": {
|
|
1221
1831
|
REQUIRED_KEY: True,
|
|
1222
1832
|
TYPE_KEY: "string"
|
|
1833
|
+
},
|
|
1834
|
+
SAMPLE_TYPE_KEY: {
|
|
1835
|
+
ALLOWED_KEY: ["stool"],
|
|
1836
|
+
DEFAULT_KEY: "stool",
|
|
1837
|
+
TYPE_KEY: "string"
|
|
1838
|
+
},
|
|
1839
|
+
QIITA_SAMPLE_TYPE: {
|
|
1840
|
+
ALLOWED_KEY: ["stool"],
|
|
1841
|
+
DEFAULT_KEY: "stool",
|
|
1842
|
+
TYPE_KEY: "string"
|
|
1223
1843
|
}
|
|
1224
1844
|
}
|
|
1225
1845
|
}
|
|
@@ -1255,6 +1875,8 @@ class TestMetadataExtender(TestCase):
|
|
|
1255
1875
|
LEAVE_REQUIREDS_BLANK_KEY: False,
|
|
1256
1876
|
DEFAULT_KEY: "global_default"
|
|
1257
1877
|
}
|
|
1878
|
+
# Config is pre-resolved: sample type's metadata_fields includes
|
|
1879
|
+
# host fields merged in, plus sample_type and qiita_sample_type
|
|
1258
1880
|
full_flat_config_dict = {
|
|
1259
1881
|
HOST_TYPE_SPECIFIC_METADATA_KEY: {
|
|
1260
1882
|
"human": {
|
|
@@ -1266,6 +1888,16 @@ class TestMetadataExtender(TestCase):
|
|
|
1266
1888
|
"required_field": {
|
|
1267
1889
|
REQUIRED_KEY: True,
|
|
1268
1890
|
TYPE_KEY: "string"
|
|
1891
|
+
},
|
|
1892
|
+
SAMPLE_TYPE_KEY: {
|
|
1893
|
+
ALLOWED_KEY: ["stool"],
|
|
1894
|
+
DEFAULT_KEY: "stool",
|
|
1895
|
+
TYPE_KEY: "string"
|
|
1896
|
+
},
|
|
1897
|
+
QIITA_SAMPLE_TYPE: {
|
|
1898
|
+
ALLOWED_KEY: ["stool"],
|
|
1899
|
+
DEFAULT_KEY: "stool",
|
|
1900
|
+
TYPE_KEY: "string"
|
|
1269
1901
|
}
|
|
1270
1902
|
}
|
|
1271
1903
|
}
|
|
@@ -1298,6 +1930,8 @@ class TestMetadataExtender(TestCase):
|
|
|
1298
1930
|
SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
|
|
1299
1931
|
QC_NOTE_KEY: ["", ""]
|
|
1300
1932
|
})
|
|
1933
|
+
# Config is pre-resolved: sample type's metadata_fields includes
|
|
1934
|
+
# host fields merged in, plus sample_type and qiita_sample_type
|
|
1301
1935
|
full_flat_config_dict = {
|
|
1302
1936
|
DEFAULT_KEY: "global_default",
|
|
1303
1937
|
LEAVE_REQUIREDS_BLANK_KEY: False,
|
|
@@ -1313,9 +1947,23 @@ class TestMetadataExtender(TestCase):
|
|
|
1313
1947
|
SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
|
|
1314
1948
|
"stool": {
|
|
1315
1949
|
METADATA_FIELDS_KEY: {
|
|
1950
|
+
"host_field": {
|
|
1951
|
+
DEFAULT_KEY: "host_value",
|
|
1952
|
+
TYPE_KEY: "string"
|
|
1953
|
+
},
|
|
1316
1954
|
"stool_field": {
|
|
1317
1955
|
DEFAULT_KEY: "stool_value",
|
|
1318
1956
|
TYPE_KEY: "string"
|
|
1957
|
+
},
|
|
1958
|
+
SAMPLE_TYPE_KEY: {
|
|
1959
|
+
ALLOWED_KEY: ["stool"],
|
|
1960
|
+
DEFAULT_KEY: "stool",
|
|
1961
|
+
TYPE_KEY: "string"
|
|
1962
|
+
},
|
|
1963
|
+
QIITA_SAMPLE_TYPE: {
|
|
1964
|
+
ALLOWED_KEY: ["stool"],
|
|
1965
|
+
DEFAULT_KEY: "stool",
|
|
1966
|
+
TYPE_KEY: "string"
|
|
1319
1967
|
}
|
|
1320
1968
|
}
|
|
1321
1969
|
}
|
|
@@ -1348,6 +1996,8 @@ class TestMetadataExtender(TestCase):
|
|
|
1348
1996
|
SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool", "blood"],
|
|
1349
1997
|
QC_NOTE_KEY: ["", "", ""]
|
|
1350
1998
|
})
|
|
1999
|
+
# Config is pre-resolved: sample type's metadata_fields includes
|
|
2000
|
+
# host fields merged in, plus sample_type and qiita_sample_type
|
|
1351
2001
|
full_flat_config_dict = {
|
|
1352
2002
|
DEFAULT_KEY: "global_default",
|
|
1353
2003
|
LEAVE_REQUIREDS_BLANK_KEY: False,
|
|
@@ -1362,10 +2012,40 @@ class TestMetadataExtender(TestCase):
|
|
|
1362
2012
|
},
|
|
1363
2013
|
SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
|
|
1364
2014
|
"stool": {
|
|
1365
|
-
METADATA_FIELDS_KEY: {
|
|
2015
|
+
METADATA_FIELDS_KEY: {
|
|
2016
|
+
"human_field": {
|
|
2017
|
+
DEFAULT_KEY: "human_value",
|
|
2018
|
+
TYPE_KEY: "string"
|
|
2019
|
+
},
|
|
2020
|
+
SAMPLE_TYPE_KEY: {
|
|
2021
|
+
ALLOWED_KEY: ["stool"],
|
|
2022
|
+
DEFAULT_KEY: "stool",
|
|
2023
|
+
TYPE_KEY: "string"
|
|
2024
|
+
},
|
|
2025
|
+
QIITA_SAMPLE_TYPE: {
|
|
2026
|
+
ALLOWED_KEY: ["stool"],
|
|
2027
|
+
DEFAULT_KEY: "stool",
|
|
2028
|
+
TYPE_KEY: "string"
|
|
2029
|
+
}
|
|
2030
|
+
}
|
|
1366
2031
|
},
|
|
1367
2032
|
"blood": {
|
|
1368
|
-
METADATA_FIELDS_KEY: {
|
|
2033
|
+
METADATA_FIELDS_KEY: {
|
|
2034
|
+
"human_field": {
|
|
2035
|
+
DEFAULT_KEY: "human_value",
|
|
2036
|
+
TYPE_KEY: "string"
|
|
2037
|
+
},
|
|
2038
|
+
SAMPLE_TYPE_KEY: {
|
|
2039
|
+
ALLOWED_KEY: ["blood"],
|
|
2040
|
+
DEFAULT_KEY: "blood",
|
|
2041
|
+
TYPE_KEY: "string"
|
|
2042
|
+
},
|
|
2043
|
+
QIITA_SAMPLE_TYPE: {
|
|
2044
|
+
ALLOWED_KEY: ["blood"],
|
|
2045
|
+
DEFAULT_KEY: "blood",
|
|
2046
|
+
TYPE_KEY: "string"
|
|
2047
|
+
}
|
|
2048
|
+
}
|
|
1369
2049
|
}
|
|
1370
2050
|
}
|
|
1371
2051
|
},
|
|
@@ -1378,12 +2058,27 @@ class TestMetadataExtender(TestCase):
|
|
|
1378
2058
|
},
|
|
1379
2059
|
SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
|
|
1380
2060
|
"stool": {
|
|
1381
|
-
METADATA_FIELDS_KEY: {
|
|
1382
|
-
|
|
1383
|
-
|
|
1384
|
-
|
|
1385
|
-
|
|
1386
|
-
|
|
2061
|
+
METADATA_FIELDS_KEY: {
|
|
2062
|
+
"mouse_field": {
|
|
2063
|
+
DEFAULT_KEY: "mouse_value",
|
|
2064
|
+
TYPE_KEY: "string"
|
|
2065
|
+
},
|
|
2066
|
+
SAMPLE_TYPE_KEY: {
|
|
2067
|
+
ALLOWED_KEY: ["stool"],
|
|
2068
|
+
DEFAULT_KEY: "stool",
|
|
2069
|
+
TYPE_KEY: "string"
|
|
2070
|
+
},
|
|
2071
|
+
QIITA_SAMPLE_TYPE: {
|
|
2072
|
+
ALLOWED_KEY: ["stool"],
|
|
2073
|
+
DEFAULT_KEY: "stool",
|
|
2074
|
+
TYPE_KEY: "string"
|
|
2075
|
+
}
|
|
2076
|
+
}
|
|
2077
|
+
}
|
|
2078
|
+
}
|
|
2079
|
+
}
|
|
2080
|
+
}
|
|
2081
|
+
}
|
|
1387
2082
|
|
|
1388
2083
|
result_df, validation_msgs = _generate_metadata_for_host_types(
|
|
1389
2084
|
input_df, full_flat_config_dict)
|
|
@@ -1478,6 +2173,8 @@ class TestMetadataExtender(TestCase):
|
|
|
1478
2173
|
SAMPLETYPE_SHORTHAND_KEY: ["stool"],
|
|
1479
2174
|
QC_NOTE_KEY: [""]
|
|
1480
2175
|
})
|
|
2176
|
+
# Config is pre-resolved: sample type's metadata_fields includes
|
|
2177
|
+
# host fields merged in, plus sample_type and qiita_sample_type
|
|
1481
2178
|
full_flat_config_dict = {
|
|
1482
2179
|
DEFAULT_KEY: "global_default",
|
|
1483
2180
|
LEAVE_REQUIREDS_BLANK_KEY: True, # This causes required fields to get LEAVE_BLANK_VAL
|
|
@@ -1491,6 +2188,16 @@ class TestMetadataExtender(TestCase):
|
|
|
1491
2188
|
"required_field": {
|
|
1492
2189
|
REQUIRED_KEY: True,
|
|
1493
2190
|
TYPE_KEY: "string"
|
|
2191
|
+
},
|
|
2192
|
+
SAMPLE_TYPE_KEY: {
|
|
2193
|
+
ALLOWED_KEY: ["stool"],
|
|
2194
|
+
DEFAULT_KEY: "stool",
|
|
2195
|
+
TYPE_KEY: "string"
|
|
2196
|
+
},
|
|
2197
|
+
QIITA_SAMPLE_TYPE: {
|
|
2198
|
+
ALLOWED_KEY: ["stool"],
|
|
2199
|
+
DEFAULT_KEY: "stool",
|
|
2200
|
+
TYPE_KEY: "string"
|
|
1494
2201
|
}
|
|
1495
2202
|
}
|
|
1496
2203
|
}
|
|
@@ -1790,6 +2497,8 @@ class TestMetadataExtender(TestCase):
|
|
|
1790
2497
|
HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
|
|
1791
2498
|
SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"]
|
|
1792
2499
|
})
|
|
2500
|
+
# Config is pre-resolved: sample type's metadata_fields includes
|
|
2501
|
+
# host fields merged in, plus sample_type and qiita_sample_type
|
|
1793
2502
|
full_flat_config_dict = {
|
|
1794
2503
|
DEFAULT_KEY: "not provided",
|
|
1795
2504
|
LEAVE_REQUIREDS_BLANK_KEY: False,
|
|
@@ -1805,9 +2514,23 @@ class TestMetadataExtender(TestCase):
|
|
|
1805
2514
|
SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
|
|
1806
2515
|
"stool": {
|
|
1807
2516
|
METADATA_FIELDS_KEY: {
|
|
2517
|
+
"host_field": {
|
|
2518
|
+
DEFAULT_KEY: "host_value",
|
|
2519
|
+
TYPE_KEY: "string"
|
|
2520
|
+
},
|
|
1808
2521
|
"stool_field": {
|
|
1809
2522
|
DEFAULT_KEY: "stool_value",
|
|
1810
2523
|
TYPE_KEY: "string"
|
|
2524
|
+
},
|
|
2525
|
+
SAMPLE_TYPE_KEY: {
|
|
2526
|
+
ALLOWED_KEY: ["stool"],
|
|
2527
|
+
DEFAULT_KEY: "stool",
|
|
2528
|
+
TYPE_KEY: "string"
|
|
2529
|
+
},
|
|
2530
|
+
QIITA_SAMPLE_TYPE: {
|
|
2531
|
+
ALLOWED_KEY: ["stool"],
|
|
2532
|
+
DEFAULT_KEY: "stool",
|
|
2533
|
+
TYPE_KEY: "string"
|
|
1811
2534
|
}
|
|
1812
2535
|
}
|
|
1813
2536
|
}
|
|
@@ -1840,6 +2563,8 @@ class TestMetadataExtender(TestCase):
|
|
|
1840
2563
|
SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
|
|
1841
2564
|
"input_sex": ["F", "Male"]
|
|
1842
2565
|
})
|
|
2566
|
+
# Config is pre-resolved: sample type's metadata_fields includes
|
|
2567
|
+
# host fields merged in, plus sample_type and qiita_sample_type
|
|
1843
2568
|
full_flat_config_dict = {
|
|
1844
2569
|
DEFAULT_KEY: "not provided",
|
|
1845
2570
|
LEAVE_REQUIREDS_BLANK_KEY: False,
|
|
@@ -1857,7 +2582,18 @@ class TestMetadataExtender(TestCase):
|
|
|
1857
2582
|
METADATA_FIELDS_KEY: {},
|
|
1858
2583
|
SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
|
|
1859
2584
|
"stool": {
|
|
1860
|
-
METADATA_FIELDS_KEY: {
|
|
2585
|
+
METADATA_FIELDS_KEY: {
|
|
2586
|
+
SAMPLE_TYPE_KEY: {
|
|
2587
|
+
ALLOWED_KEY: ["stool"],
|
|
2588
|
+
DEFAULT_KEY: "stool",
|
|
2589
|
+
TYPE_KEY: "string"
|
|
2590
|
+
},
|
|
2591
|
+
QIITA_SAMPLE_TYPE: {
|
|
2592
|
+
ALLOWED_KEY: ["stool"],
|
|
2593
|
+
DEFAULT_KEY: "stool",
|
|
2594
|
+
TYPE_KEY: "string"
|
|
2595
|
+
}
|
|
2596
|
+
}
|
|
1861
2597
|
}
|
|
1862
2598
|
}
|
|
1863
2599
|
}
|
|
@@ -1886,6 +2622,8 @@ class TestMetadataExtender(TestCase):
|
|
|
1886
2622
|
HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
|
|
1887
2623
|
SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"]
|
|
1888
2624
|
})
|
|
2625
|
+
# Config is pre-resolved: sample type's metadata_fields includes
|
|
2626
|
+
# host fields merged in, plus sample_type and qiita_sample_type
|
|
1889
2627
|
full_flat_config_dict = {
|
|
1890
2628
|
DEFAULT_KEY: "not provided",
|
|
1891
2629
|
LEAVE_REQUIREDS_BLANK_KEY: False,
|
|
@@ -1903,7 +2641,18 @@ class TestMetadataExtender(TestCase):
|
|
|
1903
2641
|
METADATA_FIELDS_KEY: {},
|
|
1904
2642
|
SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
|
|
1905
2643
|
"stool": {
|
|
1906
|
-
METADATA_FIELDS_KEY: {
|
|
2644
|
+
METADATA_FIELDS_KEY: {
|
|
2645
|
+
SAMPLE_TYPE_KEY: {
|
|
2646
|
+
ALLOWED_KEY: ["stool"],
|
|
2647
|
+
DEFAULT_KEY: "stool",
|
|
2648
|
+
TYPE_KEY: "string"
|
|
2649
|
+
},
|
|
2650
|
+
QIITA_SAMPLE_TYPE: {
|
|
2651
|
+
ALLOWED_KEY: ["stool"],
|
|
2652
|
+
DEFAULT_KEY: "stool",
|
|
2653
|
+
TYPE_KEY: "string"
|
|
2654
|
+
}
|
|
2655
|
+
}
|
|
1907
2656
|
}
|
|
1908
2657
|
}
|
|
1909
2658
|
}
|
|
@@ -1963,6 +2712,8 @@ class TestMetadataExtender(TestCase):
|
|
|
1963
2712
|
HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
|
|
1964
2713
|
SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"]
|
|
1965
2714
|
})
|
|
2715
|
+
# Config is pre-resolved: sample type's metadata_fields includes
|
|
2716
|
+
# host fields merged in, plus sample_type and qiita_sample_type
|
|
1966
2717
|
full_flat_config_dict = {
|
|
1967
2718
|
DEFAULT_KEY: "not provided",
|
|
1968
2719
|
LEAVE_REQUIREDS_BLANK_KEY: False,
|
|
@@ -1972,7 +2723,18 @@ class TestMetadataExtender(TestCase):
|
|
|
1972
2723
|
METADATA_FIELDS_KEY: {},
|
|
1973
2724
|
SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
|
|
1974
2725
|
"stool": {
|
|
1975
|
-
METADATA_FIELDS_KEY: {
|
|
2726
|
+
METADATA_FIELDS_KEY: {
|
|
2727
|
+
SAMPLE_TYPE_KEY: {
|
|
2728
|
+
ALLOWED_KEY: ["stool"],
|
|
2729
|
+
DEFAULT_KEY: "stool",
|
|
2730
|
+
TYPE_KEY: "string"
|
|
2731
|
+
},
|
|
2732
|
+
QIITA_SAMPLE_TYPE: {
|
|
2733
|
+
ALLOWED_KEY: ["stool"],
|
|
2734
|
+
DEFAULT_KEY: "stool",
|
|
2735
|
+
TYPE_KEY: "string"
|
|
2736
|
+
}
|
|
2737
|
+
}
|
|
1976
2738
|
}
|
|
1977
2739
|
}
|
|
1978
2740
|
}
|
|
@@ -2002,6 +2764,8 @@ class TestMetadataExtender(TestCase):
|
|
|
2002
2764
|
SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
|
|
2003
2765
|
"source_field": ["hello", "world"]
|
|
2004
2766
|
})
|
|
2767
|
+
# Config is pre-resolved: sample type's metadata_fields includes
|
|
2768
|
+
# host fields merged in, plus sample_type and qiita_sample_type
|
|
2005
2769
|
full_flat_config_dict = {
|
|
2006
2770
|
DEFAULT_KEY: "not provided",
|
|
2007
2771
|
LEAVE_REQUIREDS_BLANK_KEY: False,
|
|
@@ -2019,7 +2783,18 @@ class TestMetadataExtender(TestCase):
|
|
|
2019
2783
|
METADATA_FIELDS_KEY: {},
|
|
2020
2784
|
SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
|
|
2021
2785
|
"stool": {
|
|
2022
|
-
METADATA_FIELDS_KEY: {
|
|
2786
|
+
METADATA_FIELDS_KEY: {
|
|
2787
|
+
SAMPLE_TYPE_KEY: {
|
|
2788
|
+
ALLOWED_KEY: ["stool"],
|
|
2789
|
+
DEFAULT_KEY: "stool",
|
|
2790
|
+
TYPE_KEY: "string"
|
|
2791
|
+
},
|
|
2792
|
+
QIITA_SAMPLE_TYPE: {
|
|
2793
|
+
ALLOWED_KEY: ["stool"],
|
|
2794
|
+
DEFAULT_KEY: "stool",
|
|
2795
|
+
TYPE_KEY: "string"
|
|
2796
|
+
}
|
|
2797
|
+
}
|
|
2023
2798
|
}
|
|
2024
2799
|
}
|
|
2025
2800
|
}
|
|
@@ -2049,62 +2824,613 @@ class TestMetadataExtender(TestCase):
|
|
|
2049
2824
|
def test__populate_metadata_df_nan_sample_name_raises(self):
    """Test that NaN sample name raises ValueError."""
    # Second row has a NaN sample name
    input_df = pandas.DataFrame({
        SAMPLE_NAME_KEY: ["sample1", np.nan],
        HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
        SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"]
    })
    # Config with no host types defined
    full_flat_config_dict = {
        HOST_TYPE_SPECIFIC_METADATA_KEY: {}
    }

    with self.assertRaisesRegex(ValueError, "Metadata contains NaN sample names"):
        _populate_metadata_df(input_df, full_flat_config_dict, None)
|
|
2837
|
+
|
|
2838
|
+
# Tests for extend_metadata_df
|
|
2839
|
+
|
|
2840
|
+
# Directory that contains this test module
TEST_DIR = path.dirname(__file__)
# Standards file handed to extend_metadata_df by the tests that use it
TEST_STDS_FP = path.join(TEST_DIR, "data/test_standards.yml")
|
|
2842
|
+
|
|
2843
|
+
def test_extend_metadata_df_basic(self):
    """Test basic metadata extension with study config.

    Two human stool samples are extended using a study config that adds
    one study-specific field at the human host level; the standards are
    read from the file at self.TEST_STDS_FP.
    """
    input_df = pandas.DataFrame({
        SAMPLE_NAME_KEY: ["sample1", "sample2"],
        HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
        SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"]
    })
    study_config = {
        DEFAULT_KEY: "not provided",
        LEAVE_REQUIREDS_BLANK_KEY: True,
        OVERWRITE_NON_NANS_KEY: False,
        STUDY_SPECIFIC_METADATA_KEY: {
            HOST_TYPE_SPECIFIC_METADATA_KEY: {
                "human": {
                    METADATA_FIELDS_KEY: {
                        "custom_field": {
                            DEFAULT_KEY: "custom_value",
                            TYPE_KEY: "string"
                        }
                    },
                    SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
                        "stool": {
                            METADATA_FIELDS_KEY: {}
                        }
                    }
                }
            }
        }
    }

    result_df, validation_msgs_df = extend_metadata_df(
        input_df, study_config, None, None, self.TEST_STDS_FP)

    expected_df = pandas.DataFrame({
        SAMPLE_NAME_KEY: ["sample1", "sample2"],
        # body_product from human stool in test_standards.yml
        "body_product": ["UBERON:feces", "UBERON:feces"],
        # body_site inherited from host_associated stool
        "body_site": ["gut", "gut"],
        # custom_field from study_specific_metadata
        "custom_field": ["custom_value", "custom_value"],
        # description overridden at human level
        "description": ["human sample", "human sample"],
        # host_common_name from human level
        "host_common_name": ["human", "human"],
        QIITA_SAMPLE_TYPE: ["stool", "stool"],
        SAMPLE_TYPE_KEY: ["stool", "stool"],
        HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
        SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
        QC_NOTE_KEY: ["", ""]
    })
    assert_frame_equal(expected_df, result_df)
    # No validation messages are expected for this input
    self.assertTrue(validation_msgs_df.empty)
|
|
2896
|
+
|
|
2897
|
+
def test_extend_metadata_df_with_pre_transformer(self):
    """Test metadata extension with pre-transformer.

    The study config declares a pre-transformer that derives a "sex"
    column from the "input_sex" column via the function named
    "transform_input_sex_to_std_sex"; no transformer function dict is
    passed by the test itself.
    """
    input_df = pandas.DataFrame({
        SAMPLE_NAME_KEY: ["sample1", "sample2"],
        HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
        SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
        "input_sex": ["F", "Male"]
    })
    study_config = {
        DEFAULT_KEY: "not provided",
        LEAVE_REQUIREDS_BLANK_KEY: True,
        OVERWRITE_NON_NANS_KEY: False,
        METADATA_TRANSFORMERS_KEY: {
            PRE_TRANSFORMERS_KEY: {
                "sex": {
                    SOURCES_KEY: ["input_sex"],
                    FUNCTION_KEY: "transform_input_sex_to_std_sex"
                }
            }
        },
        STUDY_SPECIFIC_METADATA_KEY: {
            HOST_TYPE_SPECIFIC_METADATA_KEY: {
                "human": {
                    METADATA_FIELDS_KEY: {},
                    SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
                        "stool": {
                            METADATA_FIELDS_KEY: {}
                        }
                    }
                }
            }
        }
    }

    result_df, validation_msgs_df = extend_metadata_df(
        input_df, study_config, None, None, self.TEST_STDS_FP)

    expected_df = pandas.DataFrame({
        SAMPLE_NAME_KEY: ["sample1", "sample2"],
        # body_product from human stool in test_standards.yml
        "body_product": ["UBERON:feces", "UBERON:feces"],
        "body_site": ["gut", "gut"],
        # description overridden at human level
        "description": ["human sample", "human sample"],
        "host_common_name": ["human", "human"],
        # the source column is retained alongside the derived one
        "input_sex": ["F", "Male"],
        QIITA_SAMPLE_TYPE: ["stool", "stool"],
        SAMPLE_TYPE_KEY: ["stool", "stool"],
        "sex": ["female", "male"],
        HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
        SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
        QC_NOTE_KEY: ["", ""]
    })
    assert_frame_equal(expected_df, result_df)
|
|
2951
|
+
|
|
2952
|
+
def test_extend_metadata_df_with_custom_transformer(self):
    """Test metadata extension with custom transformer function.

    The study config names a pre-transformer function "custom_upper"; the
    test supplies the implementation under that same name through the
    transformer function dict passed to extend_metadata_df.
    """
    input_df = pandas.DataFrame({
        SAMPLE_NAME_KEY: ["sample1", "sample2"],
        HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
        SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
        "source_field": ["hello", "world"]
    })
    study_config = {
        DEFAULT_KEY: "not provided",
        LEAVE_REQUIREDS_BLANK_KEY: True,
        OVERWRITE_NON_NANS_KEY: False,
        METADATA_TRANSFORMERS_KEY: {
            PRE_TRANSFORMERS_KEY: {
                "upper_field": {
                    SOURCES_KEY: ["source_field"],
                    FUNCTION_KEY: "custom_upper"
                }
            }
        },
        STUDY_SPECIFIC_METADATA_KEY: {
            HOST_TYPE_SPECIFIC_METADATA_KEY: {
                "human": {
                    METADATA_FIELDS_KEY: {},
                    SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
                        "stool": {
                            METADATA_FIELDS_KEY: {}
                        }
                    }
                }
            }
        }
    }

    def custom_upper(row, source_fields):
        # Upper-case the value found in the first source field of the row
        return row[source_fields[0]].upper()

    # Key must match the FUNCTION_KEY value used in the config above
    transformer_funcs_dict = {"custom_upper": custom_upper}

    result_df, validation_msgs_df = extend_metadata_df(
        input_df, study_config, transformer_funcs_dict, None, self.TEST_STDS_FP)

    expected_df = pandas.DataFrame({
        SAMPLE_NAME_KEY: ["sample1", "sample2"],
        "body_product": ["UBERON:feces", "UBERON:feces"],
        "body_site": ["gut", "gut"],
        "description": ["human sample", "human sample"],
        "host_common_name": ["human", "human"],
        QIITA_SAMPLE_TYPE: ["stool", "stool"],
        SAMPLE_TYPE_KEY: ["stool", "stool"],
        "source_field": ["hello", "world"],
        "upper_field": ["HELLO", "WORLD"],
        HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
        SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
        QC_NOTE_KEY: ["", ""]
    })
    assert_frame_equal(expected_df, result_df)
|
|
3009
|
+
|
|
3010
|
+
def test_extend_metadata_df_missing_required_columns_raises(self):
    """Test that missing required columns raises ValueError."""
    input_df = pandas.DataFrame({
        SAMPLE_NAME_KEY: ["sample1", "sample2"]
        # Missing HOSTTYPE_SHORTHAND_KEY and SAMPLETYPE_SHORTHAND_KEY
    })
    # An empty study config is used here
    study_config = {}

    with self.assertRaisesRegex(ValueError, "metadata missing required columns"):
        extend_metadata_df(input_df, study_config, None, None, self.TEST_STDS_FP)
|
|
3020
|
+
|
|
3021
|
+
def test_extend_metadata_df_none_study_config(self):
    """Test metadata extension with None study config uses standards only."""
    input_df = pandas.DataFrame({
        SAMPLE_NAME_KEY: ["sample1"],
        HOSTTYPE_SHORTHAND_KEY: ["human"],
        SAMPLETYPE_SHORTHAND_KEY: ["stool"]
    })

    # None is passed as the study config argument
    result_df, validation_msgs_df = extend_metadata_df(
        input_df, None, None, None, self.TEST_STDS_FP)

    expected_df = pandas.DataFrame({
        SAMPLE_NAME_KEY: ["sample1"],
        "body_product": ["UBERON:feces"],
        "body_site": ["gut"],
        "description": ["human sample"],
        "host_common_name": ["human"],
        QIITA_SAMPLE_TYPE: ["stool"],
        SAMPLE_TYPE_KEY: ["stool"],
        HOSTTYPE_SHORTHAND_KEY: ["human"],
        SAMPLETYPE_SHORTHAND_KEY: ["stool"],
        QC_NOTE_KEY: [""]
    })
    assert_frame_equal(expected_df, result_df)
|
|
3045
|
+
|
|
3046
|
+
def test_extend_metadata_df_unknown_host_type(self):
    """Test that unknown host type adds QC note.

    The sample's host type shorthand ("unknown_host") does not appear in
    the study config, which only defines "human". The expected output
    keeps the input columns and adds a QC note of "invalid host_type".
    """
    input_df = pandas.DataFrame({
        SAMPLE_NAME_KEY: ["sample1"],
        HOSTTYPE_SHORTHAND_KEY: ["unknown_host"],
        SAMPLETYPE_SHORTHAND_KEY: ["stool"]
    })
    study_config = {
        DEFAULT_KEY: "not provided",
        LEAVE_REQUIREDS_BLANK_KEY: True,
        OVERWRITE_NON_NANS_KEY: False,
        STUDY_SPECIFIC_METADATA_KEY: {
            HOST_TYPE_SPECIFIC_METADATA_KEY: {
                "human": {
                    METADATA_FIELDS_KEY: {},
                    SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
                        "stool": {
                            METADATA_FIELDS_KEY: {}
                        }
                    }
                }
            }
        }
    }

    result_df, validation_msgs_df = extend_metadata_df(
        input_df, study_config, None, None, self.TEST_STDS_FP)

    expected_df = pandas.DataFrame({
        SAMPLE_NAME_KEY: ["sample1"],
        HOSTTYPE_SHORTHAND_KEY: ["unknown_host"],
        SAMPLETYPE_SHORTHAND_KEY: ["stool"],
        QC_NOTE_KEY: ["invalid host_type"]
    })
    assert_frame_equal(expected_df, result_df)
|
|
3081
|
+
|
|
3082
|
+
def test_extend_metadata_df_multiple_host_types(self):
    """Test metadata extension with multiple host types.

    The input interleaves human and mouse samples; the expected output
    lists both human samples before the mouse sample.
    """
    input_df = pandas.DataFrame({
        SAMPLE_NAME_KEY: ["sample1", "sample2", "sample3"],
        HOSTTYPE_SHORTHAND_KEY: ["human", "mouse", "human"],
        SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool", "blood"]
    })
    study_config = {
        DEFAULT_KEY: "not provided",
        LEAVE_REQUIREDS_BLANK_KEY: True,
        OVERWRITE_NON_NANS_KEY: False,
        STUDY_SPECIFIC_METADATA_KEY: {
            HOST_TYPE_SPECIFIC_METADATA_KEY: {
                "human": {
                    METADATA_FIELDS_KEY: {},
                    SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
                        "stool": {
                            METADATA_FIELDS_KEY: {}
                        },
                        "blood": {
                            METADATA_FIELDS_KEY: {}
                        }
                    }
                },
                "mouse": {
                    METADATA_FIELDS_KEY: {},
                    SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
                        "stool": {
                            METADATA_FIELDS_KEY: {}
                        }
                    }
                }
            }
        }
    }

    result_df, validation_msgs_df = extend_metadata_df(
        input_df, study_config, None, None, self.TEST_STDS_FP)

    # After processing multiple host types, rows may be reordered
    # Human samples are processed together, then mouse samples
    expected_df = pandas.DataFrame({
        SAMPLE_NAME_KEY: ["sample1", "sample3", "sample2"],
        # body_product: human stool/blood have it, mouse stool uses default
        "body_product": ["UBERON:feces", "UBERON:blood", "not provided"],
        "body_site": ["gut", "blood", "gut"],
        # description: human overrides to "human sample",
        # mouse inherits "host associated sample"
        "description": ["human sample", "human sample", "host associated sample"],
        "host_common_name": ["human", "human", "mouse"],
        QIITA_SAMPLE_TYPE: ["stool", "blood", "stool"],
        SAMPLE_TYPE_KEY: ["stool", "blood", "stool"],
        HOSTTYPE_SHORTHAND_KEY: ["human", "human", "mouse"],
        SAMPLETYPE_SHORTHAND_KEY: ["stool", "blood", "stool"],
        QC_NOTE_KEY: ["", "", ""]
    })
    assert_frame_equal(expected_df, result_df)
|
|
3139
|
+
|
|
3140
|
+
def test_extend_metadata_df_with_software_config(self):
|
|
3141
|
+
"""Test metadata extension with custom software config overrides defaults."""
|
|
3142
|
+
input_df = pandas.DataFrame({
|
|
3143
|
+
SAMPLE_NAME_KEY: ["sample1", "sample2"],
|
|
3144
|
+
HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
|
|
3145
|
+
SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"]
|
|
3146
|
+
})
|
|
3147
|
+
# Software config with custom default value
|
|
3148
|
+
software_config = {
|
|
3149
|
+
DEFAULT_KEY: "custom_software_default",
|
|
3150
|
+
LEAVE_REQUIREDS_BLANK_KEY: True,
|
|
3151
|
+
OVERWRITE_NON_NANS_KEY: False
|
|
3152
|
+
}
|
|
3153
|
+
# Study config that doesn't override DEFAULT_KEY
|
|
3154
|
+
study_config = {
|
|
3155
|
+
STUDY_SPECIFIC_METADATA_KEY: {
|
|
3156
|
+
HOST_TYPE_SPECIFIC_METADATA_KEY: {
|
|
3157
|
+
"human": {
|
|
3158
|
+
METADATA_FIELDS_KEY: {
|
|
3159
|
+
"study_field": {
|
|
3160
|
+
DEFAULT_KEY: "study_value",
|
|
3161
|
+
TYPE_KEY: "string"
|
|
3162
|
+
}
|
|
3163
|
+
},
|
|
3164
|
+
SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
|
|
3165
|
+
"stool": {
|
|
3166
|
+
METADATA_FIELDS_KEY: {}
|
|
3167
|
+
}
|
|
3168
|
+
}
|
|
3169
|
+
}
|
|
3170
|
+
}
|
|
3171
|
+
}
|
|
3172
|
+
}
|
|
3173
|
+
|
|
3174
|
+
result_df, validation_msgs_df = extend_metadata_df(
|
|
3175
|
+
input_df, study_config, None, software_config, self.TEST_STDS_FP)
|
|
3176
|
+
|
|
3177
|
+
expected_df = pandas.DataFrame({
|
|
3178
|
+
SAMPLE_NAME_KEY: ["sample1", "sample2"],
|
|
3179
|
+
"body_product": ["UBERON:feces", "UBERON:feces"],
|
|
3180
|
+
"body_site": ["gut", "gut"],
|
|
3181
|
+
"description": ["human sample", "human sample"],
|
|
3182
|
+
"host_common_name": ["human", "human"],
|
|
3183
|
+
QIITA_SAMPLE_TYPE: ["stool", "stool"],
|
|
3184
|
+
SAMPLE_TYPE_KEY: ["stool", "stool"],
|
|
3185
|
+
"study_field": ["study_value", "study_value"],
|
|
3186
|
+
HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
|
|
3187
|
+
SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
|
|
3188
|
+
QC_NOTE_KEY: ["", ""]
|
|
3189
|
+
})
|
|
3190
|
+
assert_frame_equal(expected_df, result_df)
|
|
3191
|
+
|
|
3192
|
+
# Tests for _get_study_specific_config
|
|
3193
|
+
|
|
3194
|
+
def test__get_study_specific_config_with_valid_file(self):
|
|
3195
|
+
"""Test loading study-specific config from a valid YAML file."""
|
|
3196
|
+
config_fp = path.join(self.TEST_DIR, "data/test_config.yml")
|
|
3197
|
+
|
|
3198
|
+
result = _get_study_specific_config(config_fp)
|
|
3199
|
+
|
|
3200
|
+
expected = {
|
|
3201
|
+
HOST_TYPE_SPECIFIC_METADATA_KEY: {
|
|
3202
|
+
"base": {
|
|
3203
|
+
METADATA_FIELDS_KEY: {
|
|
3204
|
+
"sample_name": {
|
|
3205
|
+
TYPE_KEY: "string",
|
|
3206
|
+
"unique": True
|
|
3207
|
+
},
|
|
3208
|
+
"sample_type": {
|
|
3209
|
+
"empty": False,
|
|
3210
|
+
"is_phi": False
|
|
3211
|
+
}
|
|
3212
|
+
}
|
|
3213
|
+
}
|
|
3214
|
+
}
|
|
3215
|
+
}
|
|
3216
|
+
self.assertDictEqual(expected, result)
|
|
3217
|
+
|
|
3218
|
+
def test__get_study_specific_config_with_none(self):
|
|
3219
|
+
"""Test that None file path returns None."""
|
|
3220
|
+
result = _get_study_specific_config(None)
|
|
3221
|
+
|
|
3222
|
+
self.assertIsNone(result)
|
|
3223
|
+
|
|
3224
|
+
def test__get_study_specific_config_with_empty_string(self):
|
|
3225
|
+
"""Test that empty string file path returns None."""
|
|
3226
|
+
result = _get_study_specific_config("")
|
|
3227
|
+
|
|
3228
|
+
self.assertIsNone(result)
|
|
3229
|
+
|
|
3230
|
+
def test__get_study_specific_config_nonexistent_file_raises(self):
|
|
3231
|
+
"""Test that nonexistent file raises FileNotFoundError."""
|
|
3232
|
+
with self.assertRaises(FileNotFoundError):
|
|
3233
|
+
_get_study_specific_config("/nonexistent/path/config.yml")
|
|
3234
|
+
|
|
3235
|
+
def test__get_study_specific_config_invalid_yaml_raises(self):
|
|
3236
|
+
"""Test that invalid YAML file raises an error."""
|
|
3237
|
+
invalid_fp = path.join(self.TEST_DIR, "data/invalid.yml")
|
|
3238
|
+
|
|
3239
|
+
with self.assertRaises(Exception):
|
|
3240
|
+
_get_study_specific_config(invalid_fp)
|
|
3241
|
+
|
|
3242
|
+
# Tests for _output_metadata_df_to_files
|
|
3243
|
+
|
|
3244
|
+
def test__output_metadata_df_to_files_basic(self):
|
|
3245
|
+
"""Test basic output of metadata DataFrame to file."""
|
|
3246
|
+
input_df = pandas.DataFrame({
|
|
3247
|
+
SAMPLE_NAME_KEY: ["sample1", "sample2"],
|
|
3248
|
+
"field_a": ["a1", "a2"],
|
|
3249
|
+
HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
|
|
3250
|
+
SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
|
|
3251
|
+
QC_NOTE_KEY: ["", ""]
|
|
3252
|
+
})
|
|
3253
|
+
|
|
3254
|
+
with tempfile.TemporaryDirectory() as tmpdir:
|
|
3255
|
+
_output_metadata_df_to_files(
|
|
3256
|
+
input_df, tmpdir, "test_output", INTERNAL_COL_KEYS,
|
|
3257
|
+
sep="\t", remove_internals_and_fails=False)
|
|
3258
|
+
|
|
3259
|
+
# Find the output file (has timestamp prefix)
|
|
3260
|
+
output_files = glob.glob(os.path.join(tmpdir, "*_test_output.txt"))
|
|
3261
|
+
self.assertEqual(1, len(output_files))
|
|
3262
|
+
|
|
3263
|
+
# Read and verify contents (keep_default_na=False preserves empty strings)
|
|
3264
|
+
result_df = pandas.read_csv(output_files[0], sep="\t", keep_default_na=False)
|
|
3265
|
+
expected_df = input_df
|
|
3266
|
+
assert_frame_equal(expected_df, result_df)
|
|
3267
|
+
|
|
3268
|
+
def test__output_metadata_df_to_files_remove_internals_and_fails(self):
|
|
3269
|
+
"""Test output with internal columns and failures removed."""
|
|
3270
|
+
input_df = pandas.DataFrame({
|
|
3271
|
+
SAMPLE_NAME_KEY: ["sample1", "sample2", "sample3"],
|
|
3272
|
+
"field_a": ["a1", "a2", "a3"],
|
|
3273
|
+
HOSTTYPE_SHORTHAND_KEY: ["human", "human", "human"],
|
|
3274
|
+
SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool", "stool"],
|
|
3275
|
+
QC_NOTE_KEY: ["", "invalid host_type", ""]
|
|
3276
|
+
})
|
|
3277
|
+
|
|
3278
|
+
with tempfile.TemporaryDirectory() as tmpdir:
|
|
3279
|
+
_output_metadata_df_to_files(
|
|
3280
|
+
input_df, tmpdir, "test_output", INTERNAL_COL_KEYS,
|
|
3281
|
+
sep="\t", remove_internals_and_fails=True)
|
|
3282
|
+
|
|
3283
|
+
# Find the main output file
|
|
3284
|
+
output_files = glob.glob(os.path.join(tmpdir, "*_test_output.txt"))
|
|
3285
|
+
self.assertEqual(1, len(output_files))
|
|
3286
|
+
|
|
3287
|
+
# Verify main output has internal cols removed and no failures
|
|
3288
|
+
result_df = pandas.read_csv(output_files[0], sep="\t")
|
|
3289
|
+
expected_df = pandas.DataFrame({
|
|
3290
|
+
SAMPLE_NAME_KEY: ["sample1", "sample3"],
|
|
3291
|
+
"field_a": ["a1", "a3"]
|
|
3292
|
+
})
|
|
3293
|
+
assert_frame_equal(expected_df, result_df)
|
|
3294
|
+
|
|
3295
|
+
# Find the fails file
|
|
3296
|
+
fails_files = glob.glob(os.path.join(tmpdir, "*_test_output_fails.csv"))
|
|
3297
|
+
self.assertEqual(1, len(fails_files))
|
|
3298
|
+
|
|
3299
|
+
# Verify fails file contains the failed row
|
|
3300
|
+
fails_df = pandas.read_csv(fails_files[0], sep=",")
|
|
3301
|
+
expected_fails_df = pandas.DataFrame({
|
|
3302
|
+
SAMPLE_NAME_KEY: ["sample2"],
|
|
3303
|
+
"field_a": ["a2"],
|
|
3304
|
+
HOSTTYPE_SHORTHAND_KEY: ["human"],
|
|
3305
|
+
SAMPLETYPE_SHORTHAND_KEY: ["stool"],
|
|
3306
|
+
QC_NOTE_KEY: ["invalid host_type"]
|
|
3307
|
+
})
|
|
3308
|
+
assert_frame_equal(expected_fails_df, fails_df)
|
|
3309
|
+
|
|
3310
|
+
def test__output_metadata_df_to_files_no_failures_creates_empty_file(self):
|
|
3311
|
+
"""Test that empty fails file is created when there are no failures."""
|
|
3312
|
+
input_df = pandas.DataFrame({
|
|
3313
|
+
SAMPLE_NAME_KEY: ["sample1", "sample2"],
|
|
3314
|
+
"field_a": ["a1", "a2"],
|
|
3315
|
+
HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
|
|
3316
|
+
SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
|
|
3317
|
+
QC_NOTE_KEY: ["", ""]
|
|
3318
|
+
})
|
|
3319
|
+
|
|
3320
|
+
with tempfile.TemporaryDirectory() as tmpdir:
|
|
3321
|
+
_output_metadata_df_to_files(
|
|
3322
|
+
input_df, tmpdir, "test_output", INTERNAL_COL_KEYS,
|
|
3323
|
+
sep="\t", remove_internals_and_fails=True,
|
|
3324
|
+
suppress_empty_fails=False)
|
|
3325
|
+
|
|
3326
|
+
# Find the fails file
|
|
3327
|
+
fails_files = glob.glob(os.path.join(tmpdir, "*_test_output_fails.csv"))
|
|
3328
|
+
self.assertEqual(1, len(fails_files))
|
|
3329
|
+
|
|
3330
|
+
# Verify fails file is empty (zero bytes)
|
|
3331
|
+
self.assertEqual(0, os.path.getsize(fails_files[0]))
|
|
3332
|
+
|
|
3333
|
+
def test__output_metadata_df_to_files_suppress_empty_fails(self):
|
|
3334
|
+
"""Test that empty fails file is not created when suppress_empty_fails=True."""
|
|
3335
|
+
input_df = pandas.DataFrame({
|
|
3336
|
+
SAMPLE_NAME_KEY: ["sample1", "sample2"],
|
|
3337
|
+
"field_a": ["a1", "a2"],
|
|
3338
|
+
HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
|
|
3339
|
+
SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
|
|
3340
|
+
QC_NOTE_KEY: ["", ""]
|
|
3341
|
+
})
|
|
3342
|
+
|
|
3343
|
+
with tempfile.TemporaryDirectory() as tmpdir:
|
|
3344
|
+
_output_metadata_df_to_files(
|
|
3345
|
+
input_df, tmpdir, "test_output", INTERNAL_COL_KEYS,
|
|
3346
|
+
sep="\t", remove_internals_and_fails=True,
|
|
3347
|
+
suppress_empty_fails=True)
|
|
3348
|
+
|
|
3349
|
+
# Find the fails file - should not exist
|
|
3350
|
+
fails_files = glob.glob(os.path.join(tmpdir, "*_test_output_fails.csv"))
|
|
3351
|
+
self.assertEqual(0, len(fails_files))
|
|
3352
|
+
|
|
3353
|
+
# Main output file should still exist
|
|
3354
|
+
output_files = glob.glob(os.path.join(tmpdir, "*_test_output.txt"))
|
|
3355
|
+
self.assertEqual(1, len(output_files))
|
|
3356
|
+
|
|
3357
|
+
def test__output_metadata_df_to_files_csv_separator(self):
|
|
3358
|
+
"""Test output with comma separator creates .csv file."""
|
|
3359
|
+
input_df = pandas.DataFrame({
|
|
3360
|
+
SAMPLE_NAME_KEY: ["sample1", "sample2"],
|
|
3361
|
+
"field_a": ["a1", "a2"],
|
|
3362
|
+
HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
|
|
3363
|
+
SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
|
|
3364
|
+
QC_NOTE_KEY: ["", ""]
|
|
3365
|
+
})
|
|
3366
|
+
|
|
3367
|
+
with tempfile.TemporaryDirectory() as tmpdir:
|
|
3368
|
+
_output_metadata_df_to_files(
|
|
3369
|
+
input_df, tmpdir, "test_output", INTERNAL_COL_KEYS,
|
|
3370
|
+
sep=",", remove_internals_and_fails=False)
|
|
3371
|
+
|
|
3372
|
+
# Find the output file with .csv extension
|
|
3373
|
+
output_files = glob.glob(os.path.join(tmpdir, "*_test_output.csv"))
|
|
3374
|
+
self.assertEqual(1, len(output_files))
|
|
3375
|
+
|
|
3376
|
+
# Read and verify contents (keep_default_na=False preserves empty strings)
|
|
3377
|
+
result_df = pandas.read_csv(output_files[0], sep=",", keep_default_na=False)
|
|
3378
|
+
expected_df = input_df
|
|
3379
|
+
assert_frame_equal(expected_df, result_df)
|
|
3380
|
+
|
|
3381
|
+
def test__output_metadata_df_to_files_all_failures(self):
|
|
3382
|
+
"""Test output when all rows are failures."""
|
|
3383
|
+
input_df = pandas.DataFrame({
|
|
3384
|
+
SAMPLE_NAME_KEY: ["sample1", "sample2"],
|
|
3385
|
+
"field_a": ["a1", "a2"],
|
|
2053
3386
|
HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
|
|
2054
|
-
SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"]
|
|
3387
|
+
SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
|
|
3388
|
+
QC_NOTE_KEY: ["invalid host_type", "invalid sample_type"]
|
|
2055
3389
|
})
|
|
2056
|
-
full_flat_config_dict = {
|
|
2057
|
-
HOST_TYPE_SPECIFIC_METADATA_KEY: {}
|
|
2058
|
-
}
|
|
2059
3390
|
|
|
2060
|
-
with
|
|
2061
|
-
|
|
3391
|
+
with tempfile.TemporaryDirectory() as tmpdir:
|
|
3392
|
+
_output_metadata_df_to_files(
|
|
3393
|
+
input_df, tmpdir, "test_output", INTERNAL_COL_KEYS,
|
|
3394
|
+
sep="\t", remove_internals_and_fails=True)
|
|
2062
3395
|
|
|
2063
|
-
|
|
3396
|
+
# Main output file should have only headers (empty data)
|
|
3397
|
+
output_files = glob.glob(os.path.join(tmpdir, "*_test_output.txt"))
|
|
3398
|
+
self.assertEqual(1, len(output_files))
|
|
3399
|
+
result_df = pandas.read_csv(output_files[0], sep="\t")
|
|
3400
|
+
self.assertTrue(result_df.empty)
|
|
3401
|
+
self.assertEqual([SAMPLE_NAME_KEY, "field_a"], list(result_df.columns))
|
|
2064
3402
|
|
|
2065
|
-
|
|
2066
|
-
|
|
3403
|
+
# Fails file should have both rows
|
|
3404
|
+
fails_files = glob.glob(os.path.join(tmpdir, "*_test_output_fails.csv"))
|
|
3405
|
+
self.assertEqual(1, len(fails_files))
|
|
3406
|
+
fails_df = pandas.read_csv(fails_files[0], sep=",")
|
|
3407
|
+
self.assertEqual(2, len(fails_df))
|
|
2067
3408
|
|
|
2068
|
-
|
|
2069
|
-
|
|
3409
|
+
# Tests for get_extended_metadata_from_df_and_yaml
|
|
3410
|
+
|
|
3411
|
+
TEST_STUDY_CONFIG_FP = path.join(TEST_DIR, "data/test_study_config.yml")
|
|
3412
|
+
|
|
3413
|
+
def test_get_extended_metadata_from_df_and_yaml_with_config(self):
|
|
3414
|
+
"""Test extending metadata with a study-specific YAML config file."""
|
|
2070
3415
|
input_df = pandas.DataFrame({
|
|
2071
3416
|
SAMPLE_NAME_KEY: ["sample1", "sample2"],
|
|
2072
3417
|
HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
|
|
2073
3418
|
SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"]
|
|
2074
3419
|
})
|
|
2075
|
-
study_config = {
|
|
2076
|
-
DEFAULT_KEY: "not provided",
|
|
2077
|
-
LEAVE_REQUIREDS_BLANK_KEY: True,
|
|
2078
|
-
OVERWRITE_NON_NANS_KEY: False,
|
|
2079
|
-
STUDY_SPECIFIC_METADATA_KEY: {
|
|
2080
|
-
HOST_TYPE_SPECIFIC_METADATA_KEY: {
|
|
2081
|
-
"human": {
|
|
2082
|
-
METADATA_FIELDS_KEY: {
|
|
2083
|
-
"custom_field": {
|
|
2084
|
-
DEFAULT_KEY: "custom_value",
|
|
2085
|
-
TYPE_KEY: "string"
|
|
2086
|
-
}
|
|
2087
|
-
},
|
|
2088
|
-
SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
|
|
2089
|
-
"stool": {
|
|
2090
|
-
METADATA_FIELDS_KEY: {}
|
|
2091
|
-
}
|
|
2092
|
-
}
|
|
2093
|
-
}
|
|
2094
|
-
}
|
|
2095
|
-
}
|
|
2096
|
-
}
|
|
2097
3420
|
|
|
2098
|
-
result_df, validation_msgs_df =
|
|
2099
|
-
input_df,
|
|
3421
|
+
result_df, validation_msgs_df = get_extended_metadata_from_df_and_yaml(
|
|
3422
|
+
input_df, self.TEST_STUDY_CONFIG_FP, self.TEST_STDS_FP)
|
|
2100
3423
|
|
|
2101
3424
|
expected_df = pandas.DataFrame({
|
|
2102
3425
|
SAMPLE_NAME_KEY: ["sample1", "sample2"],
|
|
3426
|
+
"body_product": ["UBERON:feces", "UBERON:feces"],
|
|
2103
3427
|
"body_site": ["gut", "gut"],
|
|
2104
|
-
"
|
|
3428
|
+
"description": ["human sample", "human sample"],
|
|
2105
3429
|
"host_common_name": ["human", "human"],
|
|
2106
3430
|
QIITA_SAMPLE_TYPE: ["stool", "stool"],
|
|
2107
3431
|
SAMPLE_TYPE_KEY: ["stool", "stool"],
|
|
3432
|
+
"study_custom_field": ["custom_value", "custom_value"],
|
|
3433
|
+
"study_stool_field": ["stool_custom", "stool_custom"],
|
|
2108
3434
|
HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
|
|
2109
3435
|
SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
|
|
2110
3436
|
QC_NOTE_KEY: ["", ""]
|
|
@@ -2112,153 +3438,68 @@ class TestMetadataExtender(TestCase):
|
|
|
2112
3438
|
assert_frame_equal(expected_df, result_df)
|
|
2113
3439
|
self.assertTrue(validation_msgs_df.empty)
|
|
2114
3440
|
|
|
2115
|
-
def
|
|
2116
|
-
"""Test metadata
|
|
3441
|
+
def test_get_extended_metadata_from_df_and_yaml_none_config(self):
|
|
3442
|
+
"""Test extending metadata with None for study_specific_config_fp."""
|
|
2117
3443
|
input_df = pandas.DataFrame({
|
|
2118
3444
|
SAMPLE_NAME_KEY: ["sample1", "sample2"],
|
|
2119
3445
|
HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
|
|
2120
|
-
SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"]
|
|
2121
|
-
"input_sex": ["F", "Male"]
|
|
3446
|
+
SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"]
|
|
2122
3447
|
})
|
|
2123
|
-
study_config = {
|
|
2124
|
-
DEFAULT_KEY: "not provided",
|
|
2125
|
-
LEAVE_REQUIREDS_BLANK_KEY: True,
|
|
2126
|
-
OVERWRITE_NON_NANS_KEY: False,
|
|
2127
|
-
METADATA_TRANSFORMERS_KEY: {
|
|
2128
|
-
PRE_TRANSFORMERS_KEY: {
|
|
2129
|
-
"sex": {
|
|
2130
|
-
SOURCES_KEY: ["input_sex"],
|
|
2131
|
-
FUNCTION_KEY: "transform_input_sex_to_std_sex"
|
|
2132
|
-
}
|
|
2133
|
-
}
|
|
2134
|
-
},
|
|
2135
|
-
STUDY_SPECIFIC_METADATA_KEY: {
|
|
2136
|
-
HOST_TYPE_SPECIFIC_METADATA_KEY: {
|
|
2137
|
-
"human": {
|
|
2138
|
-
METADATA_FIELDS_KEY: {},
|
|
2139
|
-
SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
|
|
2140
|
-
"stool": {
|
|
2141
|
-
METADATA_FIELDS_KEY: {}
|
|
2142
|
-
}
|
|
2143
|
-
}
|
|
2144
|
-
}
|
|
2145
|
-
}
|
|
2146
|
-
}
|
|
2147
|
-
}
|
|
2148
3448
|
|
|
2149
|
-
result_df, validation_msgs_df =
|
|
2150
|
-
input_df,
|
|
3449
|
+
result_df, validation_msgs_df = get_extended_metadata_from_df_and_yaml(
|
|
3450
|
+
input_df, None, self.TEST_STDS_FP)
|
|
2151
3451
|
|
|
2152
3452
|
expected_df = pandas.DataFrame({
|
|
2153
3453
|
SAMPLE_NAME_KEY: ["sample1", "sample2"],
|
|
3454
|
+
"body_product": ["UBERON:feces", "UBERON:feces"],
|
|
2154
3455
|
"body_site": ["gut", "gut"],
|
|
3456
|
+
"description": ["human sample", "human sample"],
|
|
2155
3457
|
"host_common_name": ["human", "human"],
|
|
2156
|
-
"input_sex": ["F", "Male"],
|
|
2157
3458
|
QIITA_SAMPLE_TYPE: ["stool", "stool"],
|
|
2158
3459
|
SAMPLE_TYPE_KEY: ["stool", "stool"],
|
|
2159
|
-
"sex": ["female", "male"],
|
|
2160
3460
|
HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
|
|
2161
3461
|
SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
|
|
2162
3462
|
QC_NOTE_KEY: ["", ""]
|
|
2163
3463
|
})
|
|
2164
3464
|
assert_frame_equal(expected_df, result_df)
|
|
3465
|
+
self.assertTrue(validation_msgs_df.empty)
|
|
2165
3466
|
|
|
2166
|
-
def
|
|
2167
|
-
"""Test
|
|
3467
|
+
def test_get_extended_metadata_from_df_and_yaml_invalid_host_type(self):
|
|
3468
|
+
"""Test that invalid host types are flagged with QC note."""
|
|
2168
3469
|
input_df = pandas.DataFrame({
|
|
2169
3470
|
SAMPLE_NAME_KEY: ["sample1", "sample2"],
|
|
2170
|
-
HOSTTYPE_SHORTHAND_KEY: ["
|
|
2171
|
-
SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"]
|
|
2172
|
-
"source_field": ["hello", "world"]
|
|
3471
|
+
HOSTTYPE_SHORTHAND_KEY: ["unknown_host", "human"],
|
|
3472
|
+
SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"]
|
|
2173
3473
|
})
|
|
2174
|
-
study_config = {
|
|
2175
|
-
DEFAULT_KEY: "not provided",
|
|
2176
|
-
LEAVE_REQUIREDS_BLANK_KEY: True,
|
|
2177
|
-
OVERWRITE_NON_NANS_KEY: False,
|
|
2178
|
-
METADATA_TRANSFORMERS_KEY: {
|
|
2179
|
-
PRE_TRANSFORMERS_KEY: {
|
|
2180
|
-
"upper_field": {
|
|
2181
|
-
SOURCES_KEY: ["source_field"],
|
|
2182
|
-
FUNCTION_KEY: "custom_upper"
|
|
2183
|
-
}
|
|
2184
|
-
}
|
|
2185
|
-
},
|
|
2186
|
-
STUDY_SPECIFIC_METADATA_KEY: {
|
|
2187
|
-
HOST_TYPE_SPECIFIC_METADATA_KEY: {
|
|
2188
|
-
"human": {
|
|
2189
|
-
METADATA_FIELDS_KEY: {},
|
|
2190
|
-
SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
|
|
2191
|
-
"stool": {
|
|
2192
|
-
METADATA_FIELDS_KEY: {}
|
|
2193
|
-
}
|
|
2194
|
-
}
|
|
2195
|
-
}
|
|
2196
|
-
}
|
|
2197
|
-
}
|
|
2198
|
-
}
|
|
2199
|
-
|
|
2200
|
-
def custom_upper(row, source_fields):
|
|
2201
|
-
return row[source_fields[0]].upper()
|
|
2202
|
-
|
|
2203
|
-
transformer_funcs_dict = {"custom_upper": custom_upper}
|
|
2204
3474
|
|
|
2205
|
-
result_df, validation_msgs_df =
|
|
2206
|
-
input_df,
|
|
3475
|
+
result_df, validation_msgs_df = get_extended_metadata_from_df_and_yaml(
|
|
3476
|
+
input_df, self.TEST_STUDY_CONFIG_FP, self.TEST_STDS_FP)
|
|
2207
3477
|
|
|
2208
3478
|
expected_df = pandas.DataFrame({
|
|
2209
3479
|
SAMPLE_NAME_KEY: ["sample1", "sample2"],
|
|
2210
|
-
"
|
|
2211
|
-
"
|
|
2212
|
-
|
|
2213
|
-
|
|
2214
|
-
|
|
2215
|
-
|
|
2216
|
-
|
|
3480
|
+
"body_product": ["not provided", "UBERON:feces"],
|
|
3481
|
+
"body_site": ["not provided", "gut"],
|
|
3482
|
+
"description": ["not provided", "human sample"],
|
|
3483
|
+
"host_common_name": ["not provided", "human"],
|
|
3484
|
+
QIITA_SAMPLE_TYPE: ["not provided", "stool"],
|
|
3485
|
+
SAMPLE_TYPE_KEY: ["not provided", "stool"],
|
|
3486
|
+
"study_custom_field": ["not provided", "custom_value"],
|
|
3487
|
+
"study_stool_field": ["not provided", "stool_custom"],
|
|
3488
|
+
HOSTTYPE_SHORTHAND_KEY: ["unknown_host", "human"],
|
|
2217
3489
|
SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
|
|
2218
|
-
QC_NOTE_KEY: ["", ""]
|
|
3490
|
+
QC_NOTE_KEY: ["invalid host_type", ""]
|
|
2219
3491
|
})
|
|
2220
3492
|
assert_frame_equal(expected_df, result_df)
|
|
3493
|
+
self.assertTrue(validation_msgs_df.empty)
|
|
2221
3494
|
|
|
2222
|
-
|
|
2223
|
-
"""Test that missing required columns raises ValueError."""
|
|
2224
|
-
input_df = pandas.DataFrame({
|
|
2225
|
-
SAMPLE_NAME_KEY: ["sample1", "sample2"]
|
|
2226
|
-
# Missing HOSTTYPE_SHORTHAND_KEY and SAMPLETYPE_SHORTHAND_KEY
|
|
2227
|
-
})
|
|
2228
|
-
study_config = {}
|
|
2229
|
-
|
|
2230
|
-
with self.assertRaisesRegex(ValueError, "metadata missing required columns"):
|
|
2231
|
-
extend_metadata_df(input_df, study_config, None, None, self.TEST_STDS_FP)
|
|
2232
|
-
|
|
2233
|
-
def test_extend_metadata_df_none_study_config(self):
|
|
2234
|
-
"""Test metadata extension with None study config uses standards only."""
|
|
2235
|
-
input_df = pandas.DataFrame({
|
|
2236
|
-
SAMPLE_NAME_KEY: ["sample1"],
|
|
2237
|
-
HOSTTYPE_SHORTHAND_KEY: ["human"],
|
|
2238
|
-
SAMPLETYPE_SHORTHAND_KEY: ["stool"]
|
|
2239
|
-
})
|
|
2240
|
-
|
|
2241
|
-
result_df, validation_msgs_df = extend_metadata_df(
|
|
2242
|
-
input_df, None, None, None, self.TEST_STDS_FP)
|
|
2243
|
-
|
|
2244
|
-
expected_df = pandas.DataFrame({
|
|
2245
|
-
SAMPLE_NAME_KEY: ["sample1"],
|
|
2246
|
-
"body_site": ["gut"],
|
|
2247
|
-
"host_common_name": ["human"],
|
|
2248
|
-
QIITA_SAMPLE_TYPE: ["stool"],
|
|
2249
|
-
SAMPLE_TYPE_KEY: ["stool"],
|
|
2250
|
-
HOSTTYPE_SHORTHAND_KEY: ["human"],
|
|
2251
|
-
SAMPLETYPE_SHORTHAND_KEY: ["stool"],
|
|
2252
|
-
QC_NOTE_KEY: [""]
|
|
2253
|
-
})
|
|
2254
|
-
assert_frame_equal(expected_df, result_df)
|
|
3495
|
+
# Tests for write_extended_metadata_from_df
|
|
2255
3496
|
|
|
2256
|
-
def
|
|
2257
|
-
"""Test
|
|
3497
|
+
def test_write_extended_metadata_from_df_basic(self):
|
|
3498
|
+
"""Test basic writing of extended metadata to files."""
|
|
2258
3499
|
input_df = pandas.DataFrame({
|
|
2259
|
-
SAMPLE_NAME_KEY: ["sample1"],
|
|
2260
|
-
HOSTTYPE_SHORTHAND_KEY: ["
|
|
2261
|
-
SAMPLETYPE_SHORTHAND_KEY: ["stool"]
|
|
3500
|
+
SAMPLE_NAME_KEY: ["sample1", "sample2"],
|
|
3501
|
+
HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
|
|
3502
|
+
SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"]
|
|
2262
3503
|
})
|
|
2263
3504
|
study_config = {
|
|
2264
3505
|
DEFAULT_KEY: "not provided",
|
|
@@ -2267,7 +3508,12 @@ class TestMetadataExtender(TestCase):
|
|
|
2267
3508
|
STUDY_SPECIFIC_METADATA_KEY: {
|
|
2268
3509
|
HOST_TYPE_SPECIFIC_METADATA_KEY: {
|
|
2269
3510
|
"human": {
|
|
2270
|
-
METADATA_FIELDS_KEY: {
|
|
3511
|
+
METADATA_FIELDS_KEY: {
|
|
3512
|
+
"custom_field": {
|
|
3513
|
+
DEFAULT_KEY: "custom_value",
|
|
3514
|
+
TYPE_KEY: "string"
|
|
3515
|
+
}
|
|
3516
|
+
},
|
|
2271
3517
|
SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
|
|
2272
3518
|
"stool": {
|
|
2273
3519
|
METADATA_FIELDS_KEY: {}
|
|
@@ -2278,23 +3524,60 @@ class TestMetadataExtender(TestCase):
|
|
|
2278
3524
|
}
|
|
2279
3525
|
}
|
|
2280
3526
|
|
|
2281
|
-
|
|
2282
|
-
|
|
3527
|
+
with tempfile.TemporaryDirectory() as tmpdir:
|
|
3528
|
+
result_df = write_extended_metadata_from_df(
|
|
3529
|
+
input_df, study_config, tmpdir, "test_output",
|
|
3530
|
+
stds_fp=self.TEST_STDS_FP)
|
|
2283
3531
|
|
|
2284
|
-
|
|
2285
|
-
|
|
2286
|
-
|
|
2287
|
-
|
|
2288
|
-
|
|
2289
|
-
|
|
2290
|
-
|
|
3532
|
+
# Verify returned DataFrame
|
|
3533
|
+
expected_df = pandas.DataFrame({
|
|
3534
|
+
SAMPLE_NAME_KEY: ["sample1", "sample2"],
|
|
3535
|
+
"body_product": ["UBERON:feces", "UBERON:feces"],
|
|
3536
|
+
"body_site": ["gut", "gut"],
|
|
3537
|
+
"custom_field": ["custom_value", "custom_value"],
|
|
3538
|
+
"description": ["human sample", "human sample"],
|
|
3539
|
+
"host_common_name": ["human", "human"],
|
|
3540
|
+
QIITA_SAMPLE_TYPE: ["stool", "stool"],
|
|
3541
|
+
SAMPLE_TYPE_KEY: ["stool", "stool"],
|
|
3542
|
+
HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
|
|
3543
|
+
SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
|
|
3544
|
+
QC_NOTE_KEY: ["", ""]
|
|
3545
|
+
})
|
|
3546
|
+
assert_frame_equal(expected_df, result_df)
|
|
2291
3547
|
|
|
2292
|
-
|
|
2293
|
-
|
|
3548
|
+
# Verify main output file was created (internal cols removed by default)
|
|
3549
|
+
output_files = glob.glob(os.path.join(tmpdir, "*_test_output.txt"))
|
|
3550
|
+
self.assertEqual(1, len(output_files))
|
|
3551
|
+
output_df = pandas.read_csv(output_files[0], sep="\t")
|
|
3552
|
+
expected_output_df = pandas.DataFrame({
|
|
3553
|
+
SAMPLE_NAME_KEY: ["sample1", "sample2"],
|
|
3554
|
+
"body_product": ["UBERON:feces", "UBERON:feces"],
|
|
3555
|
+
"body_site": ["gut", "gut"],
|
|
3556
|
+
"custom_field": ["custom_value", "custom_value"],
|
|
3557
|
+
"description": ["human sample", "human sample"],
|
|
3558
|
+
"host_common_name": ["human", "human"],
|
|
3559
|
+
QIITA_SAMPLE_TYPE: ["stool", "stool"],
|
|
3560
|
+
SAMPLE_TYPE_KEY: ["stool", "stool"]
|
|
3561
|
+
})
|
|
3562
|
+
assert_frame_equal(expected_output_df, output_df)
|
|
3563
|
+
|
|
3564
|
+
# Verify empty fails file was created
|
|
3565
|
+
fails_files = glob.glob(os.path.join(tmpdir, "*_test_output_fails.csv"))
|
|
3566
|
+
self.assertEqual(1, len(fails_files))
|
|
3567
|
+
self.assertEqual(0, os.path.getsize(fails_files[0]))
|
|
3568
|
+
|
|
3569
|
+
# Verify validation errors file was created (empty)
|
|
3570
|
+
validation_files = glob.glob(
|
|
3571
|
+
os.path.join(tmpdir, "*_test_output_validation_errors.csv"))
|
|
3572
|
+
self.assertEqual(1, len(validation_files))
|
|
3573
|
+
self.assertEqual(0, os.path.getsize(validation_files[0]))
|
|
3574
|
+
|
|
3575
|
+
def test_write_extended_metadata_from_df_with_qc_failures(self):
|
|
3576
|
+
"""Test writing extended metadata when some rows have QC failures."""
|
|
2294
3577
|
input_df = pandas.DataFrame({
|
|
2295
3578
|
SAMPLE_NAME_KEY: ["sample1", "sample2", "sample3"],
|
|
2296
|
-
HOSTTYPE_SHORTHAND_KEY: ["human", "
|
|
2297
|
-
SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool", "
|
|
3579
|
+
HOSTTYPE_SHORTHAND_KEY: ["human", "unknown_host", "human"],
|
|
3580
|
+
SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool", "stool"]
|
|
2298
3581
|
})
|
|
2299
3582
|
study_config = {
|
|
2300
3583
|
DEFAULT_KEY: "not provided",
|
|
@@ -2303,17 +3586,6 @@ class TestMetadataExtender(TestCase):
|
|
|
2303
3586
|
STUDY_SPECIFIC_METADATA_KEY: {
|
|
2304
3587
|
HOST_TYPE_SPECIFIC_METADATA_KEY: {
|
|
2305
3588
|
"human": {
|
|
2306
|
-
METADATA_FIELDS_KEY: {},
|
|
2307
|
-
SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
|
|
2308
|
-
"stool": {
|
|
2309
|
-
METADATA_FIELDS_KEY: {}
|
|
2310
|
-
},
|
|
2311
|
-
"blood": {
|
|
2312
|
-
METADATA_FIELDS_KEY: {}
|
|
2313
|
-
}
|
|
2314
|
-
}
|
|
2315
|
-
},
|
|
2316
|
-
"mouse": {
|
|
2317
3589
|
METADATA_FIELDS_KEY: {},
|
|
2318
3590
|
SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
|
|
2319
3591
|
"stool": {
|
|
@@ -2325,45 +3597,79 @@ class TestMetadataExtender(TestCase):
|
|
|
2325
3597
|
}
|
|
2326
3598
|
}
|
|
2327
3599
|
|
|
2328
|
-
|
|
2329
|
-
|
|
3600
|
+
with tempfile.TemporaryDirectory() as tmpdir:
|
|
3601
|
+
result_df = write_extended_metadata_from_df(
|
|
3602
|
+
input_df, study_config, tmpdir, "test_output",
|
|
3603
|
+
stds_fp=self.TEST_STDS_FP)
|
|
3604
|
+
|
|
3605
|
+
# Verify returned DataFrame includes all rows (including failures)
|
|
3606
|
+
# Note: rows are reordered by host type processing (valid hosts first)
|
|
3607
|
+
expected_result_df = pandas.DataFrame({
|
|
3608
|
+
SAMPLE_NAME_KEY: ["sample1", "sample3", "sample2"],
|
|
3609
|
+
"body_product": ["UBERON:feces", "UBERON:feces", "not provided"],
|
|
3610
|
+
"body_site": ["gut", "gut", "not provided"],
|
|
3611
|
+
"description": ["human sample", "human sample", "not provided"],
|
|
3612
|
+
"host_common_name": ["human", "human", "not provided"],
|
|
3613
|
+
QIITA_SAMPLE_TYPE: ["stool", "stool", "not provided"],
|
|
3614
|
+
SAMPLE_TYPE_KEY: ["stool", "stool", "not provided"],
|
|
3615
|
+
HOSTTYPE_SHORTHAND_KEY: ["human", "human", "unknown_host"],
|
|
3616
|
+
SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool", "stool"],
|
|
3617
|
+
QC_NOTE_KEY: ["", "", "invalid host_type"]
|
|
3618
|
+
})
|
|
3619
|
+
assert_frame_equal(expected_result_df, result_df)
|
|
2330
3620
|
|
|
2331
|
-
|
|
2332
|
-
|
|
2333
|
-
|
|
2334
|
-
|
|
2335
|
-
|
|
2336
|
-
|
|
2337
|
-
|
|
2338
|
-
|
|
2339
|
-
|
|
2340
|
-
|
|
2341
|
-
|
|
2342
|
-
|
|
2343
|
-
|
|
3621
|
+
# Verify main output file excludes failure rows
|
|
3622
|
+
output_files = glob.glob(os.path.join(tmpdir, "*_test_output.txt"))
|
|
3623
|
+
self.assertEqual(1, len(output_files))
|
|
3624
|
+
output_df = pandas.read_csv(output_files[0], sep="\t")
|
|
3625
|
+
expected_output_df = pandas.DataFrame({
|
|
3626
|
+
SAMPLE_NAME_KEY: ["sample1", "sample3"],
|
|
3627
|
+
"body_product": ["UBERON:feces", "UBERON:feces"],
|
|
3628
|
+
"body_site": ["gut", "gut"],
|
|
3629
|
+
"description": ["human sample", "human sample"],
|
|
3630
|
+
"host_common_name": ["human", "human"],
|
|
3631
|
+
QIITA_SAMPLE_TYPE: ["stool", "stool"],
|
|
3632
|
+
SAMPLE_TYPE_KEY: ["stool", "stool"]
|
|
3633
|
+
})
|
|
3634
|
+
assert_frame_equal(expected_output_df, output_df)
|
|
3635
|
+
|
|
3636
|
+
# Verify fails file contains the failed row
|
|
3637
|
+
fails_files = glob.glob(os.path.join(tmpdir, "*_test_output_fails.csv"))
|
|
3638
|
+
self.assertEqual(1, len(fails_files))
|
|
3639
|
+
fails_df = pandas.read_csv(fails_files[0], sep=",")
|
|
3640
|
+
expected_fails_df = pandas.DataFrame({
|
|
3641
|
+
SAMPLE_NAME_KEY: ["sample2"],
|
|
3642
|
+
"body_product": ["not provided"],
|
|
3643
|
+
"body_site": ["not provided"],
|
|
3644
|
+
"description": ["not provided"],
|
|
3645
|
+
"host_common_name": ["not provided"],
|
|
3646
|
+
QIITA_SAMPLE_TYPE: ["not provided"],
|
|
3647
|
+
SAMPLE_TYPE_KEY: ["not provided"],
|
|
3648
|
+
HOSTTYPE_SHORTHAND_KEY: ["unknown_host"],
|
|
3649
|
+
SAMPLETYPE_SHORTHAND_KEY: ["stool"],
|
|
3650
|
+
QC_NOTE_KEY: ["invalid host_type"]
|
|
3651
|
+
})
|
|
3652
|
+
assert_frame_equal(expected_fails_df, fails_df)
|
|
2344
3653
|
|
|
2345
|
-
def
|
|
2346
|
-
"""Test
|
|
3654
|
+
def test_write_extended_metadata_from_df_with_validation_errors(self):
|
|
3655
|
+
"""Test writing extended metadata when validation errors occur."""
|
|
2347
3656
|
input_df = pandas.DataFrame({
|
|
2348
3657
|
SAMPLE_NAME_KEY: ["sample1", "sample2"],
|
|
2349
3658
|
HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
|
|
2350
|
-
SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"]
|
|
3659
|
+
SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
|
|
3660
|
+
"restricted_field": ["invalid_value", "allowed_value"]
|
|
2351
3661
|
})
|
|
2352
|
-
# Software config with custom default value
|
|
2353
|
-
software_config = {
|
|
2354
|
-
DEFAULT_KEY: "custom_software_default",
|
|
2355
|
-
LEAVE_REQUIREDS_BLANK_KEY: True,
|
|
2356
|
-
OVERWRITE_NON_NANS_KEY: False
|
|
2357
|
-
}
|
|
2358
|
-
# Study config that doesn't override DEFAULT_KEY
|
|
2359
3662
|
study_config = {
|
|
3663
|
+
DEFAULT_KEY: "not provided",
|
|
3664
|
+
LEAVE_REQUIREDS_BLANK_KEY: True,
|
|
3665
|
+
OVERWRITE_NON_NANS_KEY: False,
|
|
2360
3666
|
STUDY_SPECIFIC_METADATA_KEY: {
|
|
2361
3667
|
HOST_TYPE_SPECIFIC_METADATA_KEY: {
|
|
2362
3668
|
"human": {
|
|
2363
3669
|
METADATA_FIELDS_KEY: {
|
|
2364
|
-
"
|
|
2365
|
-
|
|
2366
|
-
|
|
3670
|
+
"restricted_field": {
|
|
3671
|
+
TYPE_KEY: "string",
|
|
3672
|
+
ALLOWED_KEY: ["allowed_value"]
|
|
2367
3673
|
}
|
|
2368
3674
|
},
|
|
2369
3675
|
SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
|
|
@@ -2376,235 +3682,466 @@ class TestMetadataExtender(TestCase):
|
|
|
2376
3682
|
}
|
|
2377
3683
|
}
|
|
2378
3684
|
|
|
2379
|
-
|
|
2380
|
-
|
|
3685
|
+
with tempfile.TemporaryDirectory() as tmpdir:
|
|
3686
|
+
result_df = write_extended_metadata_from_df(
|
|
3687
|
+
input_df, study_config, tmpdir, "test_output",
|
|
3688
|
+
stds_fp=self.TEST_STDS_FP)
|
|
3689
|
+
|
|
3690
|
+
# Verify returned DataFrame
|
|
3691
|
+
expected_result_df = pandas.DataFrame({
|
|
3692
|
+
SAMPLE_NAME_KEY: ["sample1", "sample2"],
|
|
3693
|
+
"body_product": ["UBERON:feces", "UBERON:feces"],
|
|
3694
|
+
"body_site": ["gut", "gut"],
|
|
3695
|
+
"description": ["human sample", "human sample"],
|
|
3696
|
+
"host_common_name": ["human", "human"],
|
|
3697
|
+
QIITA_SAMPLE_TYPE: ["stool", "stool"],
|
|
3698
|
+
"restricted_field": ["invalid_value", "allowed_value"],
|
|
3699
|
+
SAMPLE_TYPE_KEY: ["stool", "stool"],
|
|
3700
|
+
HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
|
|
3701
|
+
SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
|
|
3702
|
+
QC_NOTE_KEY: ["", ""]
|
|
3703
|
+
})
|
|
3704
|
+
assert_frame_equal(expected_result_df, result_df)
|
|
3705
|
+
|
|
3706
|
+
# Verify validation errors file contains the error
|
|
3707
|
+
validation_files = glob.glob(
|
|
3708
|
+
os.path.join(tmpdir, "*_test_output_validation_errors.csv"))
|
|
3709
|
+
self.assertEqual(1, len(validation_files))
|
|
3710
|
+
validation_df = pandas.read_csv(validation_files[0], sep=",")
|
|
3711
|
+
expected_validation_df = pandas.DataFrame({
|
|
3712
|
+
"sample_name": ["sample1"],
|
|
3713
|
+
"field_name": ["restricted_field"],
|
|
3714
|
+
"error_message": ["['unallowed value invalid_value']"]
|
|
3715
|
+
})
|
|
3716
|
+
assert_frame_equal(expected_validation_df, validation_df)
|
|
2381
3717
|
|
|
2382
|
-
|
|
2383
|
-
|
|
2384
|
-
|
|
2385
|
-
|
|
2386
|
-
|
|
2387
|
-
|
|
2388
|
-
"study_field": ["study_value", "study_value"],
|
|
2389
|
-
HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
|
|
2390
|
-
SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
|
|
2391
|
-
QC_NOTE_KEY: ["", ""]
|
|
3718
|
+
def test_write_extended_metadata_from_df_remove_internals_false(self):
|
|
3719
|
+
"""Test writing extended metadata with remove_internals=False."""
|
|
3720
|
+
input_df = pandas.DataFrame({
|
|
3721
|
+
SAMPLE_NAME_KEY: ["sample1"],
|
|
3722
|
+
HOSTTYPE_SHORTHAND_KEY: ["human"],
|
|
3723
|
+
SAMPLETYPE_SHORTHAND_KEY: ["stool"]
|
|
2392
3724
|
})
|
|
2393
|
-
|
|
2394
|
-
|
|
2395
|
-
|
|
2396
|
-
|
|
2397
|
-
|
|
2398
|
-
|
|
2399
|
-
|
|
2400
|
-
|
|
2401
|
-
|
|
2402
|
-
|
|
2403
|
-
|
|
2404
|
-
|
|
2405
|
-
"base": {
|
|
2406
|
-
METADATA_FIELDS_KEY: {
|
|
2407
|
-
"sample_name": {
|
|
2408
|
-
TYPE_KEY: "string",
|
|
2409
|
-
"unique": True
|
|
2410
|
-
},
|
|
2411
|
-
"sample_type": {
|
|
2412
|
-
"empty": False,
|
|
2413
|
-
"is_phi": False
|
|
3725
|
+
study_config = {
|
|
3726
|
+
DEFAULT_KEY: "not provided",
|
|
3727
|
+
LEAVE_REQUIREDS_BLANK_KEY: True,
|
|
3728
|
+
OVERWRITE_NON_NANS_KEY: False,
|
|
3729
|
+
STUDY_SPECIFIC_METADATA_KEY: {
|
|
3730
|
+
HOST_TYPE_SPECIFIC_METADATA_KEY: {
|
|
3731
|
+
"human": {
|
|
3732
|
+
METADATA_FIELDS_KEY: {},
|
|
3733
|
+
SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
|
|
3734
|
+
"stool": {
|
|
3735
|
+
METADATA_FIELDS_KEY: {}
|
|
3736
|
+
}
|
|
2414
3737
|
}
|
|
2415
3738
|
}
|
|
2416
3739
|
}
|
|
2417
3740
|
}
|
|
2418
3741
|
}
|
|
2419
|
-
self.assertDictEqual(expected, result)
|
|
2420
|
-
|
|
2421
|
-
def test__get_study_specific_config_with_none(self):
|
|
2422
|
-
"""Test that None file path returns None."""
|
|
2423
|
-
result = _get_study_specific_config(None)
|
|
2424
|
-
|
|
2425
|
-
self.assertIsNone(result)
|
|
2426
|
-
|
|
2427
|
-
def test__get_study_specific_config_with_empty_string(self):
|
|
2428
|
-
"""Test that empty string file path returns None."""
|
|
2429
|
-
result = _get_study_specific_config("")
|
|
2430
|
-
|
|
2431
|
-
self.assertIsNone(result)
|
|
2432
3742
|
|
|
2433
|
-
|
|
2434
|
-
|
|
2435
|
-
|
|
2436
|
-
|
|
3743
|
+
with tempfile.TemporaryDirectory() as tmpdir:
|
|
3744
|
+
write_extended_metadata_from_df(
|
|
3745
|
+
input_df, study_config, tmpdir, "test_output",
|
|
3746
|
+
remove_internals=False, stds_fp=self.TEST_STDS_FP)
|
|
2437
3747
|
|
|
2438
|
-
|
|
2439
|
-
|
|
2440
|
-
|
|
3748
|
+
# Verify main output file includes internal columns
|
|
3749
|
+
output_files = glob.glob(os.path.join(tmpdir, "*_test_output.txt"))
|
|
3750
|
+
self.assertEqual(1, len(output_files))
|
|
3751
|
+
output_df = pandas.read_csv(output_files[0], sep="\t", keep_default_na=False)
|
|
3752
|
+
expected_output_df = pandas.DataFrame({
|
|
3753
|
+
SAMPLE_NAME_KEY: ["sample1"],
|
|
3754
|
+
"body_product": ["UBERON:feces"],
|
|
3755
|
+
"body_site": ["gut"],
|
|
3756
|
+
"description": ["human sample"],
|
|
3757
|
+
"host_common_name": ["human"],
|
|
3758
|
+
QIITA_SAMPLE_TYPE: ["stool"],
|
|
3759
|
+
SAMPLE_TYPE_KEY: ["stool"],
|
|
3760
|
+
HOSTTYPE_SHORTHAND_KEY: ["human"],
|
|
3761
|
+
SAMPLETYPE_SHORTHAND_KEY: ["stool"],
|
|
3762
|
+
QC_NOTE_KEY: [""]
|
|
3763
|
+
})
|
|
3764
|
+
assert_frame_equal(expected_output_df, output_df)
|
|
2441
3765
|
|
|
2442
|
-
|
|
2443
|
-
|
|
3766
|
+
# Verify no fails file was created (since remove_internals=False)
|
|
3767
|
+
fails_files = glob.glob(os.path.join(tmpdir, "*_test_output_fails.csv"))
|
|
3768
|
+
self.assertEqual(0, len(fails_files))
|
|
2444
3769
|
|
|
2445
|
-
# Tests for
|
|
3770
|
+
# Tests for write_extended_metadata
|
|
2446
3771
|
|
|
2447
|
-
|
|
2448
|
-
|
|
2449
|
-
|
|
2450
|
-
|
|
2451
|
-
|
|
2452
|
-
|
|
2453
|
-
SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
|
|
2454
|
-
QC_NOTE_KEY: ["", ""]
|
|
2455
|
-
})
|
|
3772
|
+
TEST_METADATA_CSV_FP = path.join(TEST_DIR, "data/test_metadata.csv")
|
|
3773
|
+
TEST_METADATA_TXT_FP = path.join(TEST_DIR, "data/test_metadata.txt")
|
|
3774
|
+
TEST_METADATA_WITH_ERRORS_FP = path.join(
|
|
3775
|
+
TEST_DIR, "data/test_metadata_with_errors.csv")
|
|
3776
|
+
TEST_STUDY_CONFIG_WITH_VALIDATION_FP = path.join(
|
|
3777
|
+
TEST_DIR, "data/test_study_config_with_validation.yml")
|
|
2456
3778
|
|
|
3779
|
+
def test_write_extended_metadata_csv_input(self):
|
|
3780
|
+
"""Test writing extended metadata from a CSV input file."""
|
|
2457
3781
|
with tempfile.TemporaryDirectory() as tmpdir:
|
|
2458
|
-
|
|
2459
|
-
|
|
2460
|
-
|
|
3782
|
+
result_df = write_extended_metadata(
|
|
3783
|
+
self.TEST_METADATA_CSV_FP, self.TEST_STUDY_CONFIG_FP,
|
|
3784
|
+
tmpdir, "test_output", stds_fp=self.TEST_STDS_FP)
|
|
3785
|
+
|
|
3786
|
+
# Verify returned DataFrame
|
|
3787
|
+
expected_result_df = pandas.DataFrame({
|
|
3788
|
+
SAMPLE_NAME_KEY: ["sample1", "sample2"],
|
|
3789
|
+
"body_product": ["UBERON:feces", "UBERON:feces"],
|
|
3790
|
+
"body_site": ["gut", "gut"],
|
|
3791
|
+
"description": ["human sample", "human sample"],
|
|
3792
|
+
"host_common_name": ["human", "human"],
|
|
3793
|
+
QIITA_SAMPLE_TYPE: ["stool", "stool"],
|
|
3794
|
+
SAMPLE_TYPE_KEY: ["stool", "stool"],
|
|
3795
|
+
"study_custom_field": ["custom_value", "custom_value"],
|
|
3796
|
+
"study_stool_field": ["stool_custom", "stool_custom"],
|
|
3797
|
+
HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
|
|
3798
|
+
SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
|
|
3799
|
+
QC_NOTE_KEY: ["", ""]
|
|
3800
|
+
})
|
|
3801
|
+
assert_frame_equal(expected_result_df, result_df)
|
|
2461
3802
|
|
|
2462
|
-
#
|
|
3803
|
+
# Verify main output file was created (internal cols removed by default)
|
|
2463
3804
|
output_files = glob.glob(os.path.join(tmpdir, "*_test_output.txt"))
|
|
2464
3805
|
self.assertEqual(1, len(output_files))
|
|
3806
|
+
output_df = pandas.read_csv(output_files[0], sep="\t")
|
|
3807
|
+
expected_output_df = pandas.DataFrame({
|
|
3808
|
+
SAMPLE_NAME_KEY: ["sample1", "sample2"],
|
|
3809
|
+
"body_product": ["UBERON:feces", "UBERON:feces"],
|
|
3810
|
+
"body_site": ["gut", "gut"],
|
|
3811
|
+
"description": ["human sample", "human sample"],
|
|
3812
|
+
"host_common_name": ["human", "human"],
|
|
3813
|
+
QIITA_SAMPLE_TYPE: ["stool", "stool"],
|
|
3814
|
+
SAMPLE_TYPE_KEY: ["stool", "stool"],
|
|
3815
|
+
"study_custom_field": ["custom_value", "custom_value"],
|
|
3816
|
+
"study_stool_field": ["stool_custom", "stool_custom"]
|
|
3817
|
+
})
|
|
3818
|
+
assert_frame_equal(expected_output_df, output_df)
|
|
2465
3819
|
|
|
2466
|
-
#
|
|
2467
|
-
|
|
2468
|
-
|
|
2469
|
-
|
|
3820
|
+
# Verify empty fails file was created
|
|
3821
|
+
fails_files = glob.glob(os.path.join(tmpdir, "*_test_output_fails.csv"))
|
|
3822
|
+
self.assertEqual(1, len(fails_files))
|
|
3823
|
+
self.assertEqual(0, os.path.getsize(fails_files[0]))
|
|
2470
3824
|
|
|
2471
|
-
|
|
2472
|
-
|
|
2473
|
-
|
|
2474
|
-
|
|
2475
|
-
|
|
2476
|
-
HOSTTYPE_SHORTHAND_KEY: ["human", "human", "human"],
|
|
2477
|
-
SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool", "stool"],
|
|
2478
|
-
QC_NOTE_KEY: ["", "invalid host_type", ""]
|
|
2479
|
-
})
|
|
3825
|
+
# Verify empty validation errors file was created
|
|
3826
|
+
validation_files = glob.glob(
|
|
3827
|
+
os.path.join(tmpdir, "*_test_output_validation_errors.csv"))
|
|
3828
|
+
self.assertEqual(1, len(validation_files))
|
|
3829
|
+
self.assertEqual(0, os.path.getsize(validation_files[0]))
|
|
2480
3830
|
|
|
3831
|
+
def test_write_extended_metadata_txt_input(self):
|
|
3832
|
+
"""Test writing extended metadata from a tab-delimited TXT input file."""
|
|
2481
3833
|
with tempfile.TemporaryDirectory() as tmpdir:
|
|
2482
|
-
|
|
2483
|
-
|
|
2484
|
-
|
|
3834
|
+
result_df = write_extended_metadata(
|
|
3835
|
+
self.TEST_METADATA_TXT_FP, self.TEST_STUDY_CONFIG_FP,
|
|
3836
|
+
tmpdir, "test_output", stds_fp=self.TEST_STDS_FP)
|
|
3837
|
+
|
|
3838
|
+
# Verify returned DataFrame
|
|
3839
|
+
expected_result_df = pandas.DataFrame({
|
|
3840
|
+
SAMPLE_NAME_KEY: ["sample1", "sample2"],
|
|
3841
|
+
"body_product": ["UBERON:feces", "UBERON:feces"],
|
|
3842
|
+
"body_site": ["gut", "gut"],
|
|
3843
|
+
"description": ["human sample", "human sample"],
|
|
3844
|
+
"host_common_name": ["human", "human"],
|
|
3845
|
+
QIITA_SAMPLE_TYPE: ["stool", "stool"],
|
|
3846
|
+
SAMPLE_TYPE_KEY: ["stool", "stool"],
|
|
3847
|
+
"study_custom_field": ["custom_value", "custom_value"],
|
|
3848
|
+
"study_stool_field": ["stool_custom", "stool_custom"],
|
|
3849
|
+
HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
|
|
3850
|
+
SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
|
|
3851
|
+
QC_NOTE_KEY: ["", ""]
|
|
3852
|
+
})
|
|
3853
|
+
assert_frame_equal(expected_result_df, result_df)
|
|
2485
3854
|
|
|
2486
|
-
#
|
|
3855
|
+
# Verify main output file was created
|
|
2487
3856
|
output_files = glob.glob(os.path.join(tmpdir, "*_test_output.txt"))
|
|
2488
3857
|
self.assertEqual(1, len(output_files))
|
|
3858
|
+
output_df = pandas.read_csv(output_files[0], sep="\t")
|
|
3859
|
+
expected_output_df = pandas.DataFrame({
|
|
3860
|
+
SAMPLE_NAME_KEY: ["sample1", "sample2"],
|
|
3861
|
+
"body_product": ["UBERON:feces", "UBERON:feces"],
|
|
3862
|
+
"body_site": ["gut", "gut"],
|
|
3863
|
+
"description": ["human sample", "human sample"],
|
|
3864
|
+
"host_common_name": ["human", "human"],
|
|
3865
|
+
QIITA_SAMPLE_TYPE: ["stool", "stool"],
|
|
3866
|
+
SAMPLE_TYPE_KEY: ["stool", "stool"],
|
|
3867
|
+
"study_custom_field": ["custom_value", "custom_value"],
|
|
3868
|
+
"study_stool_field": ["stool_custom", "stool_custom"]
|
|
3869
|
+
})
|
|
3870
|
+
assert_frame_equal(expected_output_df, output_df)
|
|
2489
3871
|
|
|
2490
|
-
|
|
2491
|
-
|
|
2492
|
-
|
|
2493
|
-
|
|
2494
|
-
|
|
3872
|
+
def test_write_extended_metadata_with_validation_errors(self):
|
|
3873
|
+
"""Test writing extended metadata when validation errors occur."""
|
|
3874
|
+
with tempfile.TemporaryDirectory() as tmpdir:
|
|
3875
|
+
result_df = write_extended_metadata(
|
|
3876
|
+
self.TEST_METADATA_WITH_ERRORS_FP,
|
|
3877
|
+
self.TEST_STUDY_CONFIG_WITH_VALIDATION_FP,
|
|
3878
|
+
tmpdir, "test_output", stds_fp=self.TEST_STDS_FP)
|
|
3879
|
+
|
|
3880
|
+
# Verify returned DataFrame
|
|
3881
|
+
expected_result_df = pandas.DataFrame({
|
|
3882
|
+
SAMPLE_NAME_KEY: ["sample1", "sample2"],
|
|
3883
|
+
"body_product": ["UBERON:feces", "UBERON:feces"],
|
|
3884
|
+
"body_site": ["gut", "gut"],
|
|
3885
|
+
"description": ["human sample", "human sample"],
|
|
3886
|
+
"host_common_name": ["human", "human"],
|
|
3887
|
+
QIITA_SAMPLE_TYPE: ["stool", "stool"],
|
|
3888
|
+
"restricted_field": ["invalid_value", "allowed_value"],
|
|
3889
|
+
SAMPLE_TYPE_KEY: ["stool", "stool"],
|
|
3890
|
+
HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
|
|
3891
|
+
SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
|
|
3892
|
+
QC_NOTE_KEY: ["", ""]
|
|
2495
3893
|
})
|
|
2496
|
-
assert_frame_equal(
|
|
3894
|
+
assert_frame_equal(expected_result_df, result_df)
|
|
2497
3895
|
|
|
2498
|
-
#
|
|
2499
|
-
|
|
2500
|
-
self.assertEqual(1, len(
|
|
3896
|
+
# Verify main output file was created
|
|
3897
|
+
output_files = glob.glob(os.path.join(tmpdir, "*_test_output.txt"))
|
|
3898
|
+
self.assertEqual(1, len(output_files))
|
|
3899
|
+
output_df = pandas.read_csv(output_files[0], sep="\t")
|
|
3900
|
+
expected_output_df = pandas.DataFrame({
|
|
3901
|
+
SAMPLE_NAME_KEY: ["sample1", "sample2"],
|
|
3902
|
+
"body_product": ["UBERON:feces", "UBERON:feces"],
|
|
3903
|
+
"body_site": ["gut", "gut"],
|
|
3904
|
+
"description": ["human sample", "human sample"],
|
|
3905
|
+
"host_common_name": ["human", "human"],
|
|
3906
|
+
QIITA_SAMPLE_TYPE: ["stool", "stool"],
|
|
3907
|
+
"restricted_field": ["invalid_value", "allowed_value"],
|
|
3908
|
+
SAMPLE_TYPE_KEY: ["stool", "stool"]
|
|
3909
|
+
})
|
|
3910
|
+
assert_frame_equal(expected_output_df, output_df)
|
|
3911
|
+
|
|
3912
|
+
# Verify validation errors file contains the error
|
|
3913
|
+
validation_files = glob.glob(
|
|
3914
|
+
os.path.join(tmpdir, "*_test_output_validation_errors.csv"))
|
|
3915
|
+
self.assertEqual(1, len(validation_files))
|
|
3916
|
+
validation_df = pandas.read_csv(validation_files[0], sep=",")
|
|
3917
|
+
expected_validation_df = pandas.DataFrame({
|
|
3918
|
+
"sample_name": ["sample1"],
|
|
3919
|
+
"field_name": ["restricted_field"],
|
|
3920
|
+
"error_message": ["['unallowed value invalid_value']"]
|
|
3921
|
+
})
|
|
3922
|
+
assert_frame_equal(expected_validation_df, validation_df)
|
|
2501
3923
|
|
|
2502
|
-
|
|
2503
|
-
|
|
2504
|
-
|
|
2505
|
-
|
|
2506
|
-
|
|
2507
|
-
|
|
2508
|
-
|
|
2509
|
-
|
|
3924
|
+
def test_write_extended_metadata_unrecognized_extension_raises(self):
|
|
3925
|
+
"""Test that unrecognized file extension raises ValueError."""
|
|
3926
|
+
with tempfile.TemporaryDirectory() as tmpdir:
|
|
3927
|
+
fake_fp = path.join(tmpdir, "test.json")
|
|
3928
|
+
# Create a dummy file so the path exists
|
|
3929
|
+
with open(fake_fp, "w") as f:
|
|
3930
|
+
f.write("{}")
|
|
3931
|
+
|
|
3932
|
+
with self.assertRaisesRegex(
|
|
3933
|
+
ValueError, "Unrecognized input file extension"):
|
|
3934
|
+
write_extended_metadata(
|
|
3935
|
+
fake_fp, self.TEST_STUDY_CONFIG_FP,
|
|
3936
|
+
tmpdir, "test_output", stds_fp=self.TEST_STDS_FP)
|
|
3937
|
+
|
|
3938
|
+
def test_write_extended_metadata_csv_separator_output(self):
|
|
3939
|
+
"""Test writing extended metadata with CSV separator for output."""
|
|
3940
|
+
with tempfile.TemporaryDirectory() as tmpdir:
|
|
3941
|
+
result_df = write_extended_metadata(
|
|
3942
|
+
self.TEST_METADATA_CSV_FP, self.TEST_STUDY_CONFIG_FP,
|
|
3943
|
+
tmpdir, "test_output", sep=",", stds_fp=self.TEST_STDS_FP)
|
|
3944
|
+
|
|
3945
|
+
# Verify returned DataFrame
|
|
3946
|
+
expected_result_df = pandas.DataFrame({
|
|
3947
|
+
SAMPLE_NAME_KEY: ["sample1", "sample2"],
|
|
3948
|
+
"body_product": ["UBERON:feces", "UBERON:feces"],
|
|
3949
|
+
"body_site": ["gut", "gut"],
|
|
3950
|
+
"description": ["human sample", "human sample"],
|
|
3951
|
+
"host_common_name": ["human", "human"],
|
|
3952
|
+
QIITA_SAMPLE_TYPE: ["stool", "stool"],
|
|
3953
|
+
SAMPLE_TYPE_KEY: ["stool", "stool"],
|
|
3954
|
+
"study_custom_field": ["custom_value", "custom_value"],
|
|
3955
|
+
"study_stool_field": ["stool_custom", "stool_custom"],
|
|
3956
|
+
HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
|
|
3957
|
+
SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
|
|
3958
|
+
QC_NOTE_KEY: ["", ""]
|
|
2510
3959
|
})
|
|
2511
|
-
assert_frame_equal(
|
|
3960
|
+
assert_frame_equal(expected_result_df, result_df)
|
|
2512
3961
|
|
|
2513
|
-
|
|
2514
|
-
|
|
2515
|
-
|
|
2516
|
-
|
|
2517
|
-
|
|
2518
|
-
|
|
2519
|
-
|
|
2520
|
-
|
|
2521
|
-
|
|
3962
|
+
# Verify output file has .csv extension
|
|
3963
|
+
output_files = glob.glob(os.path.join(tmpdir, "*_test_output.csv"))
|
|
3964
|
+
self.assertEqual(1, len(output_files))
|
|
3965
|
+
output_df = pandas.read_csv(output_files[0], sep=",")
|
|
3966
|
+
expected_output_df = pandas.DataFrame({
|
|
3967
|
+
SAMPLE_NAME_KEY: ["sample1", "sample2"],
|
|
3968
|
+
"body_product": ["UBERON:feces", "UBERON:feces"],
|
|
3969
|
+
"body_site": ["gut", "gut"],
|
|
3970
|
+
"description": ["human sample", "human sample"],
|
|
3971
|
+
"host_common_name": ["human", "human"],
|
|
3972
|
+
QIITA_SAMPLE_TYPE: ["stool", "stool"],
|
|
3973
|
+
SAMPLE_TYPE_KEY: ["stool", "stool"],
|
|
3974
|
+
"study_custom_field": ["custom_value", "custom_value"],
|
|
3975
|
+
"study_stool_field": ["stool_custom", "stool_custom"]
|
|
3976
|
+
})
|
|
3977
|
+
assert_frame_equal(expected_output_df, output_df)
|
|
2522
3978
|
|
|
3979
|
+
def test_write_extended_metadata_remove_internals_false(self):
|
|
3980
|
+
"""Test writing extended metadata with remove_internals=False."""
|
|
2523
3981
|
with tempfile.TemporaryDirectory() as tmpdir:
|
|
2524
|
-
|
|
2525
|
-
|
|
2526
|
-
|
|
2527
|
-
|
|
2528
|
-
|
|
2529
|
-
#
|
|
2530
|
-
|
|
2531
|
-
|
|
3982
|
+
result_df = write_extended_metadata(
|
|
3983
|
+
self.TEST_METADATA_CSV_FP, self.TEST_STUDY_CONFIG_FP,
|
|
3984
|
+
tmpdir, "test_output", remove_internals=False,
|
|
3985
|
+
stds_fp=self.TEST_STDS_FP)
|
|
3986
|
+
|
|
3987
|
+
# Verify returned DataFrame
|
|
3988
|
+
expected_result_df = pandas.DataFrame({
|
|
3989
|
+
SAMPLE_NAME_KEY: ["sample1", "sample2"],
|
|
3990
|
+
"body_product": ["UBERON:feces", "UBERON:feces"],
|
|
3991
|
+
"body_site": ["gut", "gut"],
|
|
3992
|
+
"description": ["human sample", "human sample"],
|
|
3993
|
+
"host_common_name": ["human", "human"],
|
|
3994
|
+
QIITA_SAMPLE_TYPE: ["stool", "stool"],
|
|
3995
|
+
SAMPLE_TYPE_KEY: ["stool", "stool"],
|
|
3996
|
+
"study_custom_field": ["custom_value", "custom_value"],
|
|
3997
|
+
"study_stool_field": ["stool_custom", "stool_custom"],
|
|
3998
|
+
HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
|
|
3999
|
+
SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
|
|
4000
|
+
QC_NOTE_KEY: ["", ""]
|
|
4001
|
+
})
|
|
4002
|
+
assert_frame_equal(expected_result_df, result_df)
|
|
2532
4003
|
|
|
2533
|
-
# Verify
|
|
2534
|
-
|
|
4004
|
+
# Verify main output file includes internal columns
|
|
4005
|
+
output_files = glob.glob(os.path.join(tmpdir, "*_test_output.txt"))
|
|
4006
|
+
self.assertEqual(1, len(output_files))
|
|
4007
|
+
output_df = pandas.read_csv(output_files[0], sep="\t", keep_default_na=False)
|
|
4008
|
+
expected_output_df = pandas.DataFrame({
|
|
4009
|
+
SAMPLE_NAME_KEY: ["sample1", "sample2"],
|
|
4010
|
+
"body_product": ["UBERON:feces", "UBERON:feces"],
|
|
4011
|
+
"body_site": ["gut", "gut"],
|
|
4012
|
+
"description": ["human sample", "human sample"],
|
|
4013
|
+
"host_common_name": ["human", "human"],
|
|
4014
|
+
QIITA_SAMPLE_TYPE: ["stool", "stool"],
|
|
4015
|
+
SAMPLE_TYPE_KEY: ["stool", "stool"],
|
|
4016
|
+
"study_custom_field": ["custom_value", "custom_value"],
|
|
4017
|
+
"study_stool_field": ["stool_custom", "stool_custom"],
|
|
4018
|
+
HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
|
|
4019
|
+
SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
|
|
4020
|
+
QC_NOTE_KEY: ["", ""]
|
|
4021
|
+
})
|
|
4022
|
+
assert_frame_equal(expected_output_df, output_df)
|
|
2535
4023
|
|
|
2536
|
-
|
|
2537
|
-
|
|
2538
|
-
|
|
2539
|
-
SAMPLE_NAME_KEY: ["sample1", "sample2"],
|
|
2540
|
-
"field_a": ["a1", "a2"],
|
|
2541
|
-
HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
|
|
2542
|
-
SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
|
|
2543
|
-
QC_NOTE_KEY: ["", ""]
|
|
2544
|
-
})
|
|
4024
|
+
# Verify no fails file was created (since remove_internals=False)
|
|
4025
|
+
fails_files = glob.glob(os.path.join(tmpdir, "*_test_output_fails.csv"))
|
|
4026
|
+
self.assertEqual(0, len(fails_files))
|
|
2545
4027
|
|
|
4028
|
+
def test_write_extended_metadata_suppress_empty_fails(self):
|
|
4029
|
+
"""Test writing extended metadata with suppress_empty_fails=True."""
|
|
2546
4030
|
with tempfile.TemporaryDirectory() as tmpdir:
|
|
2547
|
-
|
|
2548
|
-
|
|
2549
|
-
|
|
2550
|
-
|
|
4031
|
+
result_df = write_extended_metadata(
|
|
4032
|
+
self.TEST_METADATA_CSV_FP, self.TEST_STUDY_CONFIG_FP,
|
|
4033
|
+
tmpdir, "test_output", suppress_empty_fails=True,
|
|
4034
|
+
stds_fp=self.TEST_STDS_FP)
|
|
4035
|
+
|
|
4036
|
+
# Verify returned DataFrame
|
|
4037
|
+
expected_result_df = pandas.DataFrame({
|
|
4038
|
+
SAMPLE_NAME_KEY: ["sample1", "sample2"],
|
|
4039
|
+
"body_product": ["UBERON:feces", "UBERON:feces"],
|
|
4040
|
+
"body_site": ["gut", "gut"],
|
|
4041
|
+
"description": ["human sample", "human sample"],
|
|
4042
|
+
"host_common_name": ["human", "human"],
|
|
4043
|
+
QIITA_SAMPLE_TYPE: ["stool", "stool"],
|
|
4044
|
+
SAMPLE_TYPE_KEY: ["stool", "stool"],
|
|
4045
|
+
"study_custom_field": ["custom_value", "custom_value"],
|
|
4046
|
+
"study_stool_field": ["stool_custom", "stool_custom"],
|
|
4047
|
+
HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
|
|
4048
|
+
SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
|
|
4049
|
+
QC_NOTE_KEY: ["", ""]
|
|
4050
|
+
})
|
|
4051
|
+
assert_frame_equal(expected_result_df, result_df)
|
|
2551
4052
|
|
|
2552
|
-
#
|
|
4053
|
+
# Verify main output file was created
|
|
4054
|
+
output_files = glob.glob(os.path.join(tmpdir, "*_test_output.txt"))
|
|
4055
|
+
self.assertEqual(1, len(output_files))
|
|
4056
|
+
output_df = pandas.read_csv(output_files[0], sep="\t")
|
|
4057
|
+
expected_output_df = pandas.DataFrame({
|
|
4058
|
+
SAMPLE_NAME_KEY: ["sample1", "sample2"],
|
|
4059
|
+
"body_product": ["UBERON:feces", "UBERON:feces"],
|
|
4060
|
+
"body_site": ["gut", "gut"],
|
|
4061
|
+
"description": ["human sample", "human sample"],
|
|
4062
|
+
"host_common_name": ["human", "human"],
|
|
4063
|
+
QIITA_SAMPLE_TYPE: ["stool", "stool"],
|
|
4064
|
+
SAMPLE_TYPE_KEY: ["stool", "stool"],
|
|
4065
|
+
"study_custom_field": ["custom_value", "custom_value"],
|
|
4066
|
+
"study_stool_field": ["stool_custom", "stool_custom"]
|
|
4067
|
+
})
|
|
4068
|
+
assert_frame_equal(expected_output_df, output_df)
|
|
4069
|
+
|
|
4070
|
+
# Verify no empty fails file was created (since suppress_empty_fails=True)
|
|
2553
4071
|
fails_files = glob.glob(os.path.join(tmpdir, "*_test_output_fails.csv"))
|
|
2554
4072
|
self.assertEqual(0, len(fails_files))
|
|
2555
4073
|
|
|
2556
|
-
#
|
|
2557
|
-
|
|
2558
|
-
|
|
4074
|
+
# Verify no empty validation errors file was created
|
|
4075
|
+
validation_files = glob.glob(
|
|
4076
|
+
os.path.join(tmpdir, "*_test_output_validation_errors.csv"))
|
|
4077
|
+
self.assertEqual(0, len(validation_files))
|
|
2559
4078
|
|
|
2560
|
-
|
|
2561
|
-
"""Test output with comma separator creates .csv file."""
|
|
2562
|
-
input_df = pandas.DataFrame({
|
|
2563
|
-
SAMPLE_NAME_KEY: ["sample1", "sample2"],
|
|
2564
|
-
"field_a": ["a1", "a2"],
|
|
2565
|
-
HOSTTYPE_SHORTHAND_KEY: ["human", "human"],
|
|
2566
|
-
SAMPLETYPE_SHORTHAND_KEY: ["stool", "stool"],
|
|
2567
|
-
QC_NOTE_KEY: ["", ""]
|
|
2568
|
-
})
|
|
4079
|
+
# Integration tests
|
|
2569
4080
|
|
|
2570
|
-
|
|
2571
|
-
|
|
2572
|
-
|
|
2573
|
-
|
|
4081
|
+
TEST_PROJECT1_METADATA_FP = path.join(TEST_DIR, "data/test_project1_input_metadata.csv")
|
|
4082
|
+
TEST_PROJECT1_CONFIG_FP = path.join(TEST_DIR, "data/test_project1_config.yml")
|
|
4083
|
+
TEST_PROJECT1_EXPECTED_OUTPUT_FP = path.join(
|
|
4084
|
+
TEST_DIR, "data/test_project1_output_metadata.txt")
|
|
4085
|
+
TEST_PROJECT1_EXPECTED_FAILS_FP = path.join(
|
|
4086
|
+
TEST_DIR, "data/test_project1_output_fails.csv")
|
|
4087
|
+
def test_write_extended_metadata_from_df_project1_integration(self):
|
|
4088
|
+
"""Integration test using project1 test data files."""
|
|
2574
4089
|
|
|
2575
|
-
|
|
2576
|
-
|
|
2577
|
-
|
|
4090
|
+
def write_mismatched_debug_files(expected_content, actual_content, file_name):
|
|
4091
|
+
"""Write debug files to Desktop for unmatched content."""
|
|
4092
|
+
debug_dir = path.join(path.expanduser("~"), "Desktop")
|
|
4093
|
+
with open(path.join(debug_dir, f"UNMATCHED_1_{file_name}"), 'w') as debug_expected_file:
|
|
4094
|
+
debug_expected_file.write(expected_content)
|
|
4095
|
+
with open(path.join(debug_dir, f"UNMATCHED_2_{file_name}"), 'w') as debug_actual_file:
|
|
4096
|
+
debug_actual_file.write(actual_content)
|
|
2578
4097
|
|
|
2579
|
-
# Read and verify contents (keep_default_na=False preserves empty strings)
|
|
2580
|
-
result_df = pandas.read_csv(output_files[0], sep=",", keep_default_na=False)
|
|
2581
|
-
expected_df = input_df
|
|
2582
|
-
assert_frame_equal(expected_df, result_df)
|
|
2583
4098
|
|
|
2584
|
-
|
|
2585
|
-
|
|
2586
|
-
|
|
2587
|
-
|
|
2588
|
-
|
|
2589
|
-
|
|
2590
|
-
|
|
2591
|
-
|
|
2592
|
-
})
|
|
4099
|
+
# Load input metadata CSV
|
|
4100
|
+
input_df = pandas.read_csv(self.TEST_PROJECT1_METADATA_FP, dtype=str)
|
|
4101
|
+
# for the columns "plating_notes" and "notes", fill NaN with empty string
|
|
4102
|
+
input_df["plating_notes"] = input_df["plating_notes"].fillna("")
|
|
4103
|
+
input_df["notes"] = input_df["notes"].fillna("")
|
|
4104
|
+
|
|
4105
|
+
# Load study config
|
|
4106
|
+
study_config = _get_study_specific_config(self.TEST_PROJECT1_CONFIG_FP)
|
|
2593
4107
|
|
|
2594
4108
|
with tempfile.TemporaryDirectory() as tmpdir:
|
|
2595
|
-
|
|
2596
|
-
input_df, tmpdir, "test_output",
|
|
2597
|
-
|
|
4109
|
+
write_extended_metadata_from_df(
|
|
4110
|
+
input_df, study_config, tmpdir, "test_output",
|
|
4111
|
+
remove_internals=True)
|
|
2598
4112
|
|
|
2599
|
-
#
|
|
4113
|
+
# Compare main output file directly to expected file
|
|
2600
4114
|
output_files = glob.glob(os.path.join(tmpdir, "*_test_output.txt"))
|
|
2601
4115
|
self.assertEqual(1, len(output_files))
|
|
2602
|
-
|
|
2603
|
-
|
|
2604
|
-
self.
|
|
2605
|
-
|
|
2606
|
-
|
|
4116
|
+
with open(output_files[0], 'r') as actual_file:
|
|
4117
|
+
actual_content = actual_file.read()
|
|
4118
|
+
with open(self.TEST_PROJECT1_EXPECTED_OUTPUT_FP, 'r') as expected_file:
|
|
4119
|
+
expected_content = expected_file.read()
|
|
4120
|
+
try:
|
|
4121
|
+
self.assertEqual(expected_content, actual_content)
|
|
4122
|
+
except AssertionError:
|
|
4123
|
+
write_mismatched_debug_files(
|
|
4124
|
+
expected_content, actual_content,
|
|
4125
|
+
"project1_output.txt")
|
|
4126
|
+
raise
|
|
4127
|
+
|
|
4128
|
+
# Compare fails file directly to expected file
|
|
2607
4129
|
fails_files = glob.glob(os.path.join(tmpdir, "*_test_output_fails.csv"))
|
|
2608
4130
|
self.assertEqual(1, len(fails_files))
|
|
2609
|
-
|
|
2610
|
-
|
|
4131
|
+
with open(fails_files[0], 'r') as actual_file:
|
|
4132
|
+
actual_fails_content = actual_file.read()
|
|
4133
|
+
with open(self.TEST_PROJECT1_EXPECTED_FAILS_FP, 'r') as expected_file:
|
|
4134
|
+
expected_fails_content = expected_file.read()
|
|
4135
|
+
try:
|
|
4136
|
+
self.assertEqual(expected_fails_content, actual_fails_content)
|
|
4137
|
+
except AssertionError:
|
|
4138
|
+
write_mismatched_debug_files(
|
|
4139
|
+
expected_fails_content, actual_fails_content,
|
|
4140
|
+
"project1_fails.csv")
|
|
4141
|
+
raise
|
|
4142
|
+
|
|
4143
|
+
# Verify validation errors file is empty
|
|
4144
|
+
validation_files = glob.glob(
|
|
4145
|
+
os.path.join(tmpdir, "*_test_output_validation_errors.csv"))
|
|
4146
|
+
self.assertEqual(1, len(validation_files))
|
|
4147
|
+
self.assertEqual(0, os.path.getsize(validation_files[0]))
|