metameq 2026.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- metameq/__init__.py +42 -0
- metameq/_version.py +21 -0
- metameq/config/__init__.py +0 -0
- metameq/config/config.yml +3 -0
- metameq/config/standards.yml +1648 -0
- metameq/src/__init__.py +0 -0
- metameq/src/__main__.py +34 -0
- metameq/src/metadata_configurator.py +512 -0
- metameq/src/metadata_extender.py +1168 -0
- metameq/src/metadata_merger.py +362 -0
- metameq/src/metadata_transformers.py +335 -0
- metameq/src/metadata_validator.py +387 -0
- metameq/src/util.py +299 -0
- metameq/tests/__init__.py +0 -0
- metameq/tests/data/invalid.yml +1 -0
- metameq/tests/data/test_config.yml +9 -0
- metameq/tests/test_metadata_configurator.py +2334 -0
- metameq/tests/test_metadata_extender.py +2610 -0
- metameq/tests/test_metadata_merger.py +657 -0
- metameq/tests/test_metadata_transformers.py +277 -0
- metameq/tests/test_metadata_validator.py +1191 -0
- metameq/tests/test_util.py +436 -0
- metameq-2026.1.1.dist-info/METADATA +21 -0
- metameq-2026.1.1.dist-info/RECORD +27 -0
- metameq-2026.1.1.dist-info/WHEEL +5 -0
- metameq-2026.1.1.dist-info/entry_points.txt +2 -0
- metameq-2026.1.1.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,1191 @@
|
|
|
1
|
+
import glob
|
|
2
|
+
import os
|
|
3
|
+
import pandas as pd
|
|
4
|
+
import tempfile
|
|
5
|
+
from unittest import TestCase
|
|
6
|
+
from datetime import datetime
|
|
7
|
+
from datetime import timedelta
|
|
8
|
+
from metameq.src.metadata_validator import (
|
|
9
|
+
_cast_field_to_type,
|
|
10
|
+
_generate_validation_msg,
|
|
11
|
+
_get_allowed_pandas_types,
|
|
12
|
+
_make_cerberus_schema,
|
|
13
|
+
_remove_leaf_keys_from_dict,
|
|
14
|
+
_remove_leaf_keys_from_dict_in_list,
|
|
15
|
+
MetameqValidator,
|
|
16
|
+
output_validation_msgs,
|
|
17
|
+
validate_metadata_df
|
|
18
|
+
)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class TestRemoveLeafKeysFromDictInList(TestCase):
    """Unit tests covering _remove_leaf_keys_from_dict_in_list."""

    def test_remove_leaf_keys_from_dict_in_list_simple(self):
        """A key is stripped from every dict in a flat list."""
        source = [
            {"a": 1, "b": 2, "c": 3},
            {"a": 4, "b": 5, "c": 6}
        ]

        observed = _remove_leaf_keys_from_dict_in_list(source, ["b"])

        self.assertEqual([{"a": 1, "c": 3}, {"a": 4, "c": 6}], observed)

    def test_remove_leaf_keys_from_dict_in_list_nested_dicts(self):
        """Keys inside nested dicts of list items are also stripped."""
        source = [{
            "outer": "value",
            "nested": {"keep": "yes", "remove_me": "be gone"}
        }]

        observed = _remove_leaf_keys_from_dict_in_list(source, ["remove_me"])

        self.assertEqual(
            [{"outer": "value", "nested": {"keep": "yes"}}], observed)

    def test_remove_leaf_keys_from_dict_in_list_nested_lists(self):
        """Dicts held inside nested lists are processed too."""
        source = [
            [{"a": 1, "b": 2}, {"a": 3, "b": 4}],
            {"c": 5, "b": 6}
        ]

        observed = _remove_leaf_keys_from_dict_in_list(source, ["b"])

        self.assertEqual([[{"a": 1}, {"a": 3}], {"c": 5}], observed)

    def test_remove_leaf_keys_from_dict_in_list_non_dict_items(self):
        """Items that are not dicts pass through untouched."""
        source = [
            "string_item",
            "b",  # a plain string, not a dict key, so it must survive
            123,
            {"a": 1, "b": 2},
            None,
            True
        ]

        observed = _remove_leaf_keys_from_dict_in_list(source, ["b"])

        self.assertEqual(
            ["string_item", "b", 123, {"a": 1}, None, True], observed)

    def test_remove_leaf_keys_from_dict_in_list_empty_list(self):
        """An empty input list yields an empty output list."""
        observed = _remove_leaf_keys_from_dict_in_list([], ["a", "b"])

        self.assertEqual([], observed)

    def test_remove_leaf_keys_from_dict_in_list_no_matching_keys(self):
        """Nothing changes when none of the targeted keys are present."""
        source = [{"a": 1, "b": 2}, {"c": 3, "d": 4}]

        observed = _remove_leaf_keys_from_dict_in_list(
            source, ["x", "y", "z"])

        self.assertEqual([{"a": 1, "b": 2}, {"c": 3, "d": 4}], observed)

    def test_remove_leaf_keys_from_dict_in_list_multiple_keys(self):
        """Several keys can be stripped in a single call."""
        source = [
            {"a": 1, "b": 2, "c": 3, "d": 4},
            {"a": 5, "b": 6, "c": 7, "d": 8}
        ]

        observed = _remove_leaf_keys_from_dict_in_list(source, ["b", "d"])

        self.assertEqual([{"a": 1, "c": 3}, {"a": 5, "c": 7}], observed)

    def test_remove_leaf_keys_from_dict_in_list_deeply_nested(self):
        """Targeted keys are stripped at every nesting depth."""
        source = [{
            "level1": {
                "level2": {"keep": "value", "remove_me": "be gone"},
                "remove_me": "also be gone"
            }
        }]

        observed = _remove_leaf_keys_from_dict_in_list(source, ["remove_me"])

        self.assertEqual(
            [{"level1": {"level2": {"keep": "value"}}}], observed)
|
|
179
|
+
|
|
180
|
+
|
|
181
|
+
class TestRemoveLeafKeysFromDict(TestCase):
    """Unit tests covering _remove_leaf_keys_from_dict."""

    def test_remove_leaf_keys_from_dict_simple(self):
        """A key is stripped from a flat dict."""
        source = {"a": 1, "b": 2, "c": 3}

        observed = _remove_leaf_keys_from_dict(source, ["b"])

        self.assertEqual({"a": 1, "c": 3}, observed)

    def test_remove_leaf_keys_from_dict_nested(self):
        """Targeted keys are stripped from nested dicts too."""
        source = {
            "outer": "value",
            "nested": {"keep": "yes", "remove_me": "be gone"},
            "remove_me": "top-level be gone"
        }

        observed = _remove_leaf_keys_from_dict(source, ["remove_me"])

        self.assertEqual(
            {"outer": "value", "nested": {"keep": "yes"}}, observed)

    def test_remove_leaf_keys_from_dict_with_list(self):
        """Dicts held inside list values are processed as well."""
        source = {
            "items": [{"a": 1, "b": 2}, {"a": 3, "b": 4}],
            "b": "top level"
        }

        observed = _remove_leaf_keys_from_dict(source, ["b"])

        self.assertEqual({"items": [{"a": 1}, {"a": 3}]}, observed)

    def test_remove_leaf_keys_from_dict_no_matching_keys(self):
        """Nothing changes when none of the targeted keys exist."""
        source = {"a": 1, "b": 2, "c": 3}

        observed = _remove_leaf_keys_from_dict(source, ["x", "y", "z"])

        self.assertEqual({"a": 1, "b": 2, "c": 3}, observed)

    def test_remove_leaf_keys_from_dict_empty(self):
        """An empty dict converts to an empty dict."""
        observed = _remove_leaf_keys_from_dict({}, ["a", "b"])

        self.assertEqual({}, observed)

    def test_remove_leaf_keys_from_dict_multiple_keys(self):
        """Several keys can be stripped in a single call."""
        source = {"a": 1, "b": 2, "c": 3, "d": 4}

        observed = _remove_leaf_keys_from_dict(source, ["b", "d"])

        self.assertEqual({"a": 1, "c": 3}, observed)

    def test_remove_leaf_keys_from_dict_deeply_nested(self):
        """Targeted keys are stripped at every nesting depth."""
        source = {
            "level1": {
                "level2": {
                    "level3": {"keep": "value", "remove_me": "be gone"},
                    "remove_me": "level2 be gone"
                },
                "remove_me": "level1 be gone"
            }
        }

        observed = _remove_leaf_keys_from_dict(source, ["remove_me"])

        self.assertEqual(
            {"level1": {"level2": {"level3": {"keep": "value"}}}}, observed)

    def test_remove_leaf_keys_from_dict_key_with_dict_value_not_removed(self):
        """A targeted key whose value is a dict survives; only leaves go."""
        source = {
            "remove_me": {"nested_key": "value", "remove_me": "be gone"},
            "keep": "yes"
        }

        observed = _remove_leaf_keys_from_dict(source, ["remove_me"])

        # Only leaf (non-dict, non-list) values are ever deleted, so the
        # outer "remove_me" key keeps its (pruned) dict value.
        self.assertEqual(
            {"remove_me": {"nested_key": "value"}, "keep": "yes"}, observed)

    def test_remove_leaf_keys_from_dict_mixed_nested_structures(self):
        """Dicts and lists mixed at several depths are all pruned."""
        source = {
            "config": {
                "items": [
                    {"name": "item1", "secret": "hidden"},
                    {"name": "item2", "secret": "also hidden"}
                ],
                "secret": "hidden config"
            },
            "secret": "hidden secret"
        }

        observed = _remove_leaf_keys_from_dict(source, ["secret"])

        self.assertEqual(
            {"config": {"items": [{"name": "item1"}, {"name": "item2"}]}},
            observed)
|
|
342
|
+
|
|
343
|
+
|
|
344
|
+
class TestMakeCerberusSchema(TestCase):
    """Unit tests covering _make_cerberus_schema."""

    def _assert_cleaned(self, raw_schema, expected_schema):
        # Helper: convert raw_schema and compare against the expected output.
        self.assertEqual(expected_schema, _make_cerberus_schema(raw_schema))

    def test_make_cerberus_schema_removes_is_phi(self):
        """The non-cerberus is_phi annotation is dropped."""
        self._assert_cleaned(
            {"field1": {"type": "string", "is_phi": True}},
            {"field1": {"type": "string"}})

    def test_make_cerberus_schema_removes_field_desc(self):
        """The non-cerberus field_desc annotation is dropped."""
        self._assert_cleaned(
            {"field1": {"type": "string",
                        "field_desc": "A description of the field"}},
            {"field1": {"type": "string"}})

    def test_make_cerberus_schema_removes_units(self):
        """The non-cerberus units annotation is dropped."""
        self._assert_cleaned(
            {"field1": {"type": "float", "units": "meters"}},
            {"field1": {"type": "float"}})

    def test_make_cerberus_schema_removes_min_exclusive(self):
        """The non-cerberus min_exclusive annotation is dropped."""
        self._assert_cleaned(
            {"field1": {"type": "integer", "min_exclusive": 0}},
            {"field1": {"type": "integer"}})

    def test_make_cerberus_schema_removes_unique(self):
        """The non-cerberus unique annotation is dropped."""
        self._assert_cleaned(
            {"field1": {"type": "string", "unique": True}},
            {"field1": {"type": "string"}})

    def test_make_cerberus_schema_preserves_cerberus_keys(self):
        """Genuine cerberus rules survive the conversion intact."""
        self._assert_cleaned(
            {"field1": {
                "type": "string",
                "required": True,
                "allowed": ["a", "b", "c"],
                "default": "a"
            }},
            {"field1": {
                "type": "string",
                "required": True,
                "allowed": ["a", "b", "c"],
                "default": "a"
            }})

    def test_make_cerberus_schema_removes_multiple_unrecognized_keys(self):
        """All unrecognized annotations are dropped in one pass."""
        self._assert_cleaned(
            {"field1": {
                "type": "string",
                "is_phi": False,
                "field_desc": "description",
                "units": "none",
                "min_exclusive": 0,
                "unique": True,
                "required": True
            }},
            {"field1": {"type": "string", "required": True}})

    def test_make_cerberus_schema_nested_fields(self):
        """Unrecognized annotations are dropped inside nested rule lists."""
        self._assert_cleaned(
            {"field1": {
                "type": "string",
                "is_phi": True,
                "anyof": [
                    {"type": "string", "field_desc": "string option"},
                    {"type": "integer", "units": "count"}
                ]
            }},
            {"field1": {
                "type": "string",
                "anyof": [{"type": "string"}, {"type": "integer"}]
            }})

    def test_make_cerberus_schema_empty_dict(self):
        """An empty schema converts to an empty schema."""
        self._assert_cleaned({}, {})

    def test_make_cerberus_schema_does_not_modify_original(self):
        """The caller's input schema is left untouched by the conversion."""
        raw_schema = {"field1": {"type": "string", "is_phi": True}}

        _make_cerberus_schema(raw_schema)

        # The input must still carry the annotation that was stripped from
        # the returned copy.
        self.assertEqual(True, raw_schema["field1"]["is_phi"])
|
|
531
|
+
|
|
532
|
+
|
|
533
|
+
class TestOutputValidationMsgs(TestCase):
    """Unit tests covering output_validation_msgs."""

    def test_output_validation_msgs_non_empty_df_tab_separator(self):
        """A populated frame written with a tab separator lands in .txt."""
        msgs = pd.DataFrame({
            "sample_name": ["sample1", "sample2"],
            "field_name": ["field1", "field2"],
            "error_message": ["error1", "error2"]
        })

        with tempfile.TemporaryDirectory() as workdir:
            output_validation_msgs(msgs, workdir, "test", sep="\t")

            matches = glob.glob(
                os.path.join(workdir, "*_test_validation_errors.txt"))
            self.assertEqual(1, len(matches))

            round_tripped = pd.read_csv(matches[0], sep="\t")
            pd.testing.assert_frame_equal(msgs, round_tripped)

    def test_output_validation_msgs_non_empty_df_comma_separator(self):
        """A populated frame written with a comma separator lands in .csv."""
        msgs = pd.DataFrame({
            "sample_name": ["sample1", "sample2"],
            "field_name": ["field1", "field2"],
            "error_message": ["error1", "error2"]
        })

        with tempfile.TemporaryDirectory() as workdir:
            output_validation_msgs(msgs, workdir, "test", sep=",")

            matches = glob.glob(
                os.path.join(workdir, "*_test_validation_errors.csv"))
            self.assertEqual(1, len(matches))

            round_tripped = pd.read_csv(matches[0], sep=",")
            pd.testing.assert_frame_equal(msgs, round_tripped)

    def test_output_validation_msgs_empty_df_creates_empty_file(self):
        """With suppress_empty_fails=False an empty frame yields an empty file."""
        with tempfile.TemporaryDirectory() as workdir:
            output_validation_msgs(
                pd.DataFrame(), workdir, "test", sep="\t",
                suppress_empty_fails=False)

            matches = glob.glob(
                os.path.join(workdir, "*_test_validation_errors.txt"))
            self.assertEqual(1, len(matches))

            # The file exists but holds zero bytes.
            self.assertEqual(0, os.path.getsize(matches[0]))

    def test_output_validation_msgs_empty_df_suppressed_no_file(self):
        """With suppress_empty_fails=True an empty frame writes nothing."""
        with tempfile.TemporaryDirectory() as workdir:
            output_validation_msgs(
                pd.DataFrame(), workdir, "test", sep="\t",
                suppress_empty_fails=True)

            # No output file of any extension should exist.
            matches = glob.glob(
                os.path.join(workdir, "*_test_validation_errors.*"))
            self.assertEqual(0, len(matches))

    def test_output_validation_msgs_filename_contains_timestamp(self):
        """The output file name starts with date and time components."""
        msgs = pd.DataFrame({
            "sample_name": ["sample1"],
            "field_name": ["field1"],
            "error_message": ["error1"]
        })

        with tempfile.TemporaryDirectory() as workdir:
            output_validation_msgs(msgs, workdir, "mybase", sep="\t")

            matches = glob.glob(
                os.path.join(workdir, "*_mybase_validation_errors.txt"))
            self.assertEqual(1, len(matches))

            # Expected shape: YYYY-MM-DD_HH-MM-SS_mybase_validation_errors.txt
            pieces = os.path.basename(matches[0]).split("_")
            self.assertEqual(3, len(pieces[0].split("-")))  # YYYY-MM-DD
            self.assertEqual(3, len(pieces[1].split("-")))  # HH-MM-SS

    def test_output_validation_msgs_default_separator_is_tab(self):
        """Omitting sep defaults to tab and therefore a .txt extension."""
        msgs = pd.DataFrame({
            "sample_name": ["sample1"],
            "field_name": ["field1"],
            "error_message": ["error1"]
        })

        with tempfile.TemporaryDirectory() as workdir:
            # No sep argument: the tab default should apply.
            output_validation_msgs(msgs, workdir, "test")

            txt_matches = glob.glob(
                os.path.join(workdir, "*_test_validation_errors.txt"))
            csv_matches = glob.glob(
                os.path.join(workdir, "*_test_validation_errors.csv"))
            self.assertEqual(1, len(txt_matches))
            self.assertEqual(0, len(csv_matches))
|
|
637
|
+
|
|
638
|
+
|
|
639
|
+
class TestGetAllowedPandasTypes(TestCase):
    """Unit tests covering _get_allowed_pandas_types."""

    def _allowed_for(self, field_definition):
        # Helper: look up the pandas-compatible types for one definition.
        return _get_allowed_pandas_types("test_field", field_definition)

    def test_get_allowed_pandas_types_string(self):
        """Cerberus 'string' maps to the Python str type."""
        self.assertEqual([str], self._allowed_for({"type": "string"}))

    def test_get_allowed_pandas_types_integer(self):
        """Cerberus 'integer' maps to the Python int type."""
        self.assertEqual([int], self._allowed_for({"type": "integer"}))

    def test_get_allowed_pandas_types_float(self):
        """Cerberus 'float' maps to the Python float type."""
        self.assertEqual([float], self._allowed_for({"type": "float"}))

    def test_get_allowed_pandas_types_number(self):
        """Cerberus 'number' also maps to the Python float type."""
        self.assertEqual([float], self._allowed_for({"type": "number"}))

    def test_get_allowed_pandas_types_bool(self):
        """Cerberus 'bool' maps to the Python bool type."""
        self.assertEqual([bool], self._allowed_for({"type": "bool"}))

    def test_get_allowed_pandas_types_datetime(self):
        """Cerberus 'datetime' maps to datetime.date."""
        # NOTE(review): with `from datetime import datetime` in scope,
        # `datetime.date` is the *method* of the datetime class, not the
        # `date` class — confirm this matches the validator's own mapping.
        self.assertEqual(
            [datetime.date], self._allowed_for({"type": "datetime"}))

    def test_get_allowed_pandas_types_anyof_single(self):
        """anyof with one alternative yields that single type."""
        self.assertEqual(
            [str], self._allowed_for({"anyof": [{"type": "string"}]}))

    def test_get_allowed_pandas_types_anyof_multiple(self):
        """anyof with several alternatives yields them all, in order."""
        definition = {
            "anyof": [
                {"type": "string"},
                {"type": "integer"},
                {"type": "float"}
            ]
        }

        self.assertEqual([str, int, float], self._allowed_for(definition))

    def test_get_allowed_pandas_types_no_type_raises_error(self):
        """A definition without any type info raises ValueError."""
        self.assertRaisesRegex(
            ValueError,
            "Unable to find type definition for field 'my_field'",
            _get_allowed_pandas_types,
            "my_field",
            {"required": True})
|
|
726
|
+
|
|
727
|
+
|
|
728
|
+
class TestCastFieldToType(TestCase):
    """Unit tests covering _cast_field_to_type."""

    def test_cast_field_to_type_string(self):
        """An int input is converted to its string form."""
        observed = _cast_field_to_type(123, [str])

        self.assertEqual("123", observed)
        self.assertIsInstance(observed, str)

    def test_cast_field_to_type_integer(self):
        """A numeric string is converted to an int."""
        observed = _cast_field_to_type("42", [int])

        self.assertEqual(42, observed)
        self.assertIsInstance(observed, int)

    def test_cast_field_to_type_float(self):
        """A decimal string is converted to a float."""
        observed = _cast_field_to_type("3.14", [float])

        self.assertEqual(3.14, observed)
        self.assertIsInstance(observed, float)

    def test_cast_field_to_type_bool(self):
        """A truthy int is converted to a bool."""
        observed = _cast_field_to_type(1, [bool])

        self.assertEqual(True, observed)
        self.assertIsInstance(observed, bool)

    def test_cast_field_to_type_first_type_succeeds(self):
        """The first type in the list that accepts the value wins."""
        observed = _cast_field_to_type("42", [str, int])

        self.assertEqual("42", observed)
        self.assertIsInstance(observed, str)

    def test_cast_field_to_type_fallback_to_second_type(self):
        """Later types are tried when an earlier cast fails."""
        observed = _cast_field_to_type("hello", [int, str])

        self.assertEqual("hello", observed)
        self.assertIsInstance(observed, str)

    def test_cast_field_to_type_no_valid_type_raises_error(self):
        """A value no listed type can represent raises ValueError."""
        self.assertRaisesRegex(
            ValueError,
            "Unable to cast 'hello' to any of the allowed types",
            _cast_field_to_type,
            "hello",
            [int, float])
|
|
781
|
+
|
|
782
|
+
|
|
783
|
+
class TestMetameqValidatorCheckWithDateNotInFuture(TestCase):
|
|
784
|
+
"""Tests for MetameqValidator._check_with_date_not_in_future method."""
|
|
785
|
+
|
|
786
|
+
def test_check_with_date_not_in_future_valid_past_date(self):
|
|
787
|
+
"""Test that a past date passes validation."""
|
|
788
|
+
validator = MetameqValidator()
|
|
789
|
+
schema = {"date_field": {"type": "string", "check_with": "date_not_in_future"}}
|
|
790
|
+
|
|
791
|
+
result = validator.validate({"date_field": "2020-01-15"}, schema)
|
|
792
|
+
|
|
793
|
+
self.assertTrue(result)
|
|
794
|
+
self.assertEqual({}, validator.errors)
|
|
795
|
+
|
|
796
|
+
def test_check_with_date_not_in_future_valid_today(self):
|
|
797
|
+
"""Test that today's date passes validation."""
|
|
798
|
+
validator = MetameqValidator()
|
|
799
|
+
schema = {"date_field": {"type": "string", "check_with": "date_not_in_future"}}
|
|
800
|
+
today_str = datetime.now().strftime("%Y-%m-%d")
|
|
801
|
+
|
|
802
|
+
result = validator.validate({"date_field": today_str}, schema)
|
|
803
|
+
|
|
804
|
+
self.assertTrue(result)
|
|
805
|
+
self.assertEqual({}, validator.errors)
|
|
806
|
+
|
|
807
|
+
def test_check_with_date_not_in_future_future_date_fails(self):
|
|
808
|
+
"""Test that a future date fails validation."""
|
|
809
|
+
validator = MetameqValidator()
|
|
810
|
+
schema = {"date_field": {"type": "string", "check_with": "date_not_in_future"}}
|
|
811
|
+
future_date = (datetime.now() + timedelta(days=365)).strftime("%Y-%m-%d")
|
|
812
|
+
|
|
813
|
+
result = validator.validate({"date_field": future_date}, schema)
|
|
814
|
+
|
|
815
|
+
self.assertFalse(result)
|
|
816
|
+
self.assertIn("date_field", validator.errors)
|
|
817
|
+
self.assertIn("Date cannot be in the future", validator.errors["date_field"])
|
|
818
|
+
|
|
819
|
+
def test_check_with_date_not_in_future_invalid_date_string_fails(self):
|
|
820
|
+
"""Test that an invalid date string fails validation."""
|
|
821
|
+
validator = MetameqValidator()
|
|
822
|
+
schema = {"date_field": {"type": "string", "check_with": "date_not_in_future"}}
|
|
823
|
+
|
|
824
|
+
result = validator.validate({"date_field": "not a date"}, schema)
|
|
825
|
+
|
|
826
|
+
self.assertFalse(result)
|
|
827
|
+
self.assertIn("date_field", validator.errors)
|
|
828
|
+
self.assertIn("Must be a valid date", validator.errors["date_field"])
|
|
829
|
+
|
|
830
|
+
def test_check_with_date_not_in_future_various_date_formats(self):
    """Several common (past) date string formats should all validate."""
    checker = MetameqValidator()
    rules = {"date_field": {"type": "string", "check_with": "date_not_in_future"}}

    # ISO, US slash, long month name, and day-first abbreviated forms.
    for candidate in ("2020-01-15", "01/15/2020", "January 15, 2020", "15 Jan 2020"):
        with self.subTest(date=candidate):
            outcome = checker.validate({"date_field": candidate}, rules)
            self.assertTrue(outcome, f"Date format '{candidate}' should be valid")
|
|
845
|
+
|
|
846
|
+
|
|
847
|
+
class TestGenerateValidationMsg(TestCase):
    """Tests for _generate_validation_msg function."""

    def test_generate_validation_msg_all_valid(self):
        """A fully valid frame yields no validation messages."""
        df = pd.DataFrame({
            "sample_name": ["sample1", "sample2"],
            "field1": ["value1", "value2"],
        })
        schema_cfg = {
            "sample_name": {"type": "string"},
            "field1": {"type": "string"},
        }

        self.assertEqual([], _generate_validation_msg(df, schema_cfg))

    def test_generate_validation_msg_single_error(self):
        """One bad value produces exactly one message record."""
        df = pd.DataFrame({
            "sample_name": ["sample1"],
            "age": ["not_an_integer"],
        })
        schema_cfg = {
            "sample_name": {"type": "string"},
            "age": {"type": "integer"},
        }

        actual = pd.DataFrame(_generate_validation_msg(df, schema_cfg))

        expected = pd.DataFrame({
            "sample_name": ["sample1"],
            "field_name": ["age"],
            "error_message": [["must be of integer type"]],
        })
        pd.testing.assert_frame_equal(expected, actual)

    def test_generate_validation_msg_multiple_errors_single_row(self):
        """Two bad fields in one row produce two message records."""
        df = pd.DataFrame({
            "sample_name": ["sample1"],
            "age": ["not_an_integer"],
            "count": ["also_not_an_integer"],
        })
        schema_cfg = {
            "sample_name": {"type": "string"},
            "age": {"type": "integer"},
            "count": {"type": "integer"},
        }

        actual = pd.DataFrame(_generate_validation_msg(df, schema_cfg))

        expected = pd.DataFrame({
            "sample_name": ["sample1", "sample1"],
            "field_name": ["age", "count"],
            "error_message": [["must be of integer type"], ["must be of integer type"]],
        })
        pd.testing.assert_frame_equal(expected, actual)

    def test_generate_validation_msg_errors_across_multiple_rows(self):
        """The same bad field in two rows produces one record per row."""
        df = pd.DataFrame({
            "sample_name": ["sample1", "sample2"],
            "age": ["not_an_integer", "also_not_an_integer"],
        })
        schema_cfg = {
            "sample_name": {"type": "string"},
            "age": {"type": "integer"},
        }

        actual = pd.DataFrame(_generate_validation_msg(df, schema_cfg))

        expected = pd.DataFrame({
            "sample_name": ["sample1", "sample2"],
            "field_name": ["age", "age"],
            "error_message": [["must be of integer type"], ["must be of integer type"]],
        })
        pd.testing.assert_frame_equal(expected, actual)

    def test_generate_validation_msg_allows_unknown_fields(self):
        """Columns absent from the schema are tolerated silently."""
        df = pd.DataFrame({
            "sample_name": ["sample1"],
            "known_field": ["value"],
            "unknown_field": ["extra_value"],
        })
        schema_cfg = {
            "sample_name": {"type": "string"},
            "known_field": {"type": "string"},
        }

        self.assertEqual([], _generate_validation_msg(df, schema_cfg))

    def test_generate_validation_msg_required_field_missing(self):
        """A required schema field absent from the frame is reported."""
        df = pd.DataFrame({
            "sample_name": ["sample1"],
            "optional_field": ["value"],
        })
        schema_cfg = {
            "sample_name": {"type": "string"},
            "required_field": {"type": "string", "required": True},
        }

        actual = pd.DataFrame(_generate_validation_msg(df, schema_cfg))

        expected = pd.DataFrame({
            "sample_name": ["sample1"],
            "field_name": ["required_field"],
            "error_message": [["required field"]],
        })
        pd.testing.assert_frame_equal(expected, actual)

    def test_generate_validation_msg_multiple_errors_same_field(self):
        """A value violating two rules yields both messages in one list."""
        df = pd.DataFrame({
            "sample_name": ["sample1"],
            "date_field": ["not a date"],
        })
        schema_cfg = {
            "sample_name": {"type": "string"},
            "date_field": {
                "type": "string",
                "regex": "^[0-9]{4}-[0-9]{2}-[0-9]{2}$",
                "check_with": "date_not_in_future",
            },
        }

        actual = pd.DataFrame(_generate_validation_msg(df, schema_cfg))

        expected = pd.DataFrame({
            "sample_name": ["sample1"],
            "field_name": ["date_field"],
            "error_message": [[
                "Must be a valid date",
                "value does not match regex '^[0-9]{4}-[0-9]{2}-[0-9]{2}$'",
            ]],
        })
        pd.testing.assert_frame_equal(expected, actual)
|
|
994
|
+
|
|
995
|
+
|
|
996
|
+
class TestValidateMetadataDf(TestCase):
    """Tests for validate_metadata_df function."""

    def test_validate_metadata_df_all_valid(self):
        """A frame that satisfies its schema produces no messages."""
        df = pd.DataFrame({
            "sample_name": ["sample1", "sample2"],
            "age": [25, 30],
        })
        field_defs = {
            "sample_name": {"type": "string"},
            "age": {"type": "integer"},
        }

        self.assertEqual([], validate_metadata_df(df, field_defs))

    def test_validate_metadata_df_uncastable_value_raises_error(self):
        """A value that cannot be cast to its declared type raises ValueError."""
        df = pd.DataFrame({
            "sample_name": ["sample1"],
            "age": ["not_an_integer"],
        })
        field_defs = {
            "sample_name": {"type": "string"},
            "age": {"type": "integer"},
        }

        with self.assertRaisesRegex(
                ValueError,
                "Unable to cast 'not_an_integer' to any of the allowed types"):
            validate_metadata_df(df, field_defs)

    def test_validate_metadata_df_strips_metameq_keys(self):
        """Metameq-only schema keys are removed before cerberus validation."""
        df = pd.DataFrame({
            "sample_name": ["sample1"],
            "field1": ["12"],
        })
        # These metameq-specific keys are not cerberus rules and must be
        # stripped rather than triggering unknown-rule errors.
        field_defs = {
            "sample_name": {"type": "string", "unique": True},
            "field1": {
                "type": "integer",
                "is_phi": True,
                "field_desc": "A test field",
                "units": "none",
                "min_exclusive": 0,
            },
        }

        self.assertEqual([], validate_metadata_df(df, field_defs))

    def test_validate_metadata_df_missing_field_in_df_skipped(self):
        """Schema entries with no matching column are quietly skipped."""
        df = pd.DataFrame({
            "sample_name": ["sample1"],
            "existing_field": ["value"],
        })
        field_defs = {
            "sample_name": {"type": "string"},
            "existing_field": {"type": "string"},
            "missing_field": {"type": "integer"},
        }

        self.assertEqual([], validate_metadata_df(df, field_defs))

    def test_validate_metadata_df_casts_to_expected_type(self):
        """String values castable to the declared type validate cleanly."""
        df = pd.DataFrame({
            "sample_name": ["sample1"],
            "count": ["42"],  # castable string; should become an int
        })
        field_defs = {
            "sample_name": {"type": "string"},
            "count": {"type": "integer"},
        }

        self.assertEqual([], validate_metadata_df(df, field_defs))

    def test_validate_metadata_df_anyof_type_validation(self):
        """anyof-style type alternatives are honored during validation."""
        df = pd.DataFrame({
            "sample_name": ["sample1", "sample2"],
            "flexible_field": ["text", "123"],
        })
        field_defs = {
            "sample_name": {"type": "string"},
            "flexible_field": {
                "anyof": [
                    {"type": "string"},
                    {"type": "integer"},
                ],
            },
        }

        self.assertEqual([], validate_metadata_df(df, field_defs))

    def test_validate_metadata_df_multiple_rows(self):
        """Only the offending row is reported when one of two rows is bad."""
        df = pd.DataFrame({
            "sample_name": ["sample1", "sample2"],
            "status": ["invalid_status", "active"],
        })
        field_defs = {
            "sample_name": {"type": "string"},
            "status": {"type": "string", "allowed": ["active", "inactive"]},
        }

        actual = pd.DataFrame(validate_metadata_df(df, field_defs))

        expected = pd.DataFrame({
            "sample_name": ["sample1"],
            "field_name": ["status"],
            "error_message": [["unallowed value invalid_status"]],
        })
        pd.testing.assert_frame_equal(expected, actual)

    def test_validate_metadata_df_allowed_values_validation(self):
        """A value outside the allowed set yields an unallowed-value message."""
        df = pd.DataFrame({
            "sample_name": ["sample1"],
            "status": ["invalid_status"],
        })
        field_defs = {
            "sample_name": {"type": "string"},
            "status": {"type": "string", "allowed": ["active", "inactive"]},
        }

        actual = pd.DataFrame(validate_metadata_df(df, field_defs))

        expected = pd.DataFrame({
            "sample_name": ["sample1"],
            "field_name": ["status"],
            "error_message": [["unallowed value invalid_status"]],
        })
        pd.testing.assert_frame_equal(expected, actual)

    def test_validate_metadata_df_regex_validation(self):
        """A value failing its regex constraint is reported with the pattern."""
        df = pd.DataFrame({
            "sample_name": ["sample1"],
            "code": ["abc"],
        })
        field_defs = {
            "sample_name": {"type": "string"},
            "code": {"type": "string", "regex": "^[0-9]+$"},
        }

        actual = pd.DataFrame(validate_metadata_df(df, field_defs))

        expected = pd.DataFrame({
            "sample_name": ["sample1"],
            "field_name": ["code"],
            "error_message": [["value does not match regex '^[0-9]+$'"]],
        })
        pd.testing.assert_frame_equal(expected, actual)

    def test_validate_metadata_df_custom_check_with_validation(self):
        """Custom check_with rules are applied end-to-end through validation."""
        next_year = (datetime.now() + timedelta(days=365)).strftime("%Y-%m-%d")
        df = pd.DataFrame({
            "sample_name": ["sample1"],
            "collection_date": [next_year],
        })
        field_defs = {
            "sample_name": {"type": "string"},
            "collection_date": {"type": "string", "check_with": "date_not_in_future"},
        }

        actual = pd.DataFrame(validate_metadata_df(df, field_defs))

        expected = pd.DataFrame({
            "sample_name": ["sample1"],
            "field_name": ["collection_date"],
            "error_message": [["Date cannot be in the future"]],
        })
        pd.testing.assert_frame_equal(expected, actual)
|