metameq-2026.1.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1191 @@
+ import glob
+ import os
+ import pandas as pd
+ import tempfile
+ from unittest import TestCase
+ from datetime import datetime
+ from datetime import timedelta
+ from metameq.src.metadata_validator import (
+     _cast_field_to_type,
+     _generate_validation_msg,
+     _get_allowed_pandas_types,
+     _make_cerberus_schema,
+     _remove_leaf_keys_from_dict,
+     _remove_leaf_keys_from_dict_in_list,
+     MetameqValidator,
+     output_validation_msgs,
+     validate_metadata_df
+ )
+
+
+ class TestRemoveLeafKeysFromDictInList(TestCase):
+     """Tests for _remove_leaf_keys_from_dict_in_list function."""
+
+     def test_remove_leaf_keys_from_dict_in_list_simple(self):
+         """Test removing keys from dicts in a flat list."""
+         input_list = [
+             {"a": 1, "b": 2, "c": 3},
+             {"a": 4, "b": 5, "c": 6}
+         ]
+         keys_to_remove = ["b"]
+
+         result = _remove_leaf_keys_from_dict_in_list(input_list, keys_to_remove)
+
+         expected = [
+             {"a": 1, "c": 3},
+             {"a": 4, "c": 6}
+         ]
+         self.assertEqual(expected, result)
+
+     def test_remove_leaf_keys_from_dict_in_list_nested_dicts(self):
+         """Test removing keys from nested dicts within list items."""
+         input_list = [
+             {
+                 "outer": "value",
+                 "nested": {
+                     "keep": "yes",
+                     "remove_me": "be gone"
+                 }
+             }
+         ]
+         keys_to_remove = ["remove_me"]
+
+         result = _remove_leaf_keys_from_dict_in_list(input_list, keys_to_remove)
+
+         expected = [
+             {
+                 "outer": "value",
+                 "nested": {
+                     "keep": "yes"
+                 }
+             }
+         ]
+         self.assertEqual(expected, result)
+
+     def test_remove_leaf_keys_from_dict_in_list_nested_lists(self):
+         """Test handling nested lists containing dicts."""
+         input_list = [
+             [
+                 {"a": 1, "b": 2},
+                 {"a": 3, "b": 4}
+             ],
+             {"c": 5, "b": 6}
+         ]
+         keys_to_remove = ["b"]
+
+         result = _remove_leaf_keys_from_dict_in_list(input_list, keys_to_remove)
+
+         expected = [
+             [
+                 {"a": 1},
+                 {"a": 3}
+             ],
+             {"c": 5}
+         ]
+         self.assertEqual(expected, result)
+
+     def test_remove_leaf_keys_from_dict_in_list_non_dict_items(self):
+         """Test that non-dict items in the list are preserved unchanged."""
+         input_list = [
+             "string_item",
+             "b", # Note this is a string, not a dict, so should remain
+             123,
+             {"a": 1, "b": 2},
+             None,
+             True
+         ]
+         keys_to_remove = ["b"]
+
+         result = _remove_leaf_keys_from_dict_in_list(input_list, keys_to_remove)
+
+         expected = [
+             "string_item",
+             "b", # remains unchanged
+             123,
+             {"a": 1},
+             None,
+             True
+         ]
+         self.assertEqual(expected, result)
+
+     def test_remove_leaf_keys_from_dict_in_list_empty_list(self):
+         """Test that empty list returns empty list."""
+         input_list = []
+         keys_to_remove = ["a", "b"]
+
+         result = _remove_leaf_keys_from_dict_in_list(input_list, keys_to_remove)
+
+         self.assertEqual([], result)
+
+     def test_remove_leaf_keys_from_dict_in_list_no_matching_keys(self):
+         """Test when no keys match those to be removed."""
+         input_list = [
+             {"a": 1, "b": 2},
+             {"c": 3, "d": 4}
+         ]
+         keys_to_remove = ["x", "y", "z"]
+
+         result = _remove_leaf_keys_from_dict_in_list(input_list, keys_to_remove)
+
+         expected = [
+             {"a": 1, "b": 2},
+             {"c": 3, "d": 4}
+         ]
+         self.assertEqual(expected, result)
+
+     def test_remove_leaf_keys_from_dict_in_list_multiple_keys(self):
+         """Test removing multiple keys at once."""
+         input_list = [
+             {"a": 1, "b": 2, "c": 3, "d": 4},
+             {"a": 5, "b": 6, "c": 7, "d": 8}
+         ]
+         keys_to_remove = ["b", "d"]
+
+         result = _remove_leaf_keys_from_dict_in_list(input_list, keys_to_remove)
+
+         expected = [
+             {"a": 1, "c": 3},
+             {"a": 5, "c": 7}
+         ]
+         self.assertEqual(expected, result)
+
+     def test_remove_leaf_keys_from_dict_in_list_deeply_nested(self):
+         """Test removing keys from deeply nested structures."""
+         input_list = [
+             {
+                 "level1": {
+                     "level2": {
+                         "keep": "value",
+                         "remove_me": "be gone"
+                     },
+                     "remove_me": "also be gone"
+                 }
+             }
+         ]
+         keys_to_remove = ["remove_me"]
+
+         result = _remove_leaf_keys_from_dict_in_list(input_list, keys_to_remove)
+
+         expected = [
+             {
+                 "level1": {
+                     "level2": {
+                         "keep": "value"
+                     }
+                 }
+             }
+         ]
+         self.assertEqual(expected, result)
+
+
+ class TestRemoveLeafKeysFromDict(TestCase):
+     """Tests for _remove_leaf_keys_from_dict function."""
+
+     def test_remove_leaf_keys_from_dict_simple(self):
+         """Test removing specified keys from a flat dict."""
+         input_dict = {"a": 1, "b": 2, "c": 3}
+         keys_to_remove = ["b"]
+
+         result = _remove_leaf_keys_from_dict(input_dict, keys_to_remove)
+
+         expected = {"a": 1, "c": 3}
+         self.assertEqual(expected, result)
+
+     def test_remove_leaf_keys_from_dict_nested(self):
+         """Test removing specified keys from nested dicts."""
+         input_dict = {
+             "outer": "value",
+             "nested": {
+                 "keep": "yes",
+                 "remove_me": "be gone"
+             },
+             "remove_me": "top-level be gone"
+         }
+         keys_to_remove = ["remove_me"]
+
+         result = _remove_leaf_keys_from_dict(input_dict, keys_to_remove)
+
+         expected = {
+             "outer": "value",
+             "nested": {
+                 "keep": "yes"
+             }
+         }
+         self.assertEqual(expected, result)
+
+     def test_remove_leaf_keys_from_dict_with_list(self):
+         """Test removing keys from dicts within lists."""
+         input_dict = {
+             "items": [
+                 {"a": 1, "b": 2},
+                 {"a": 3, "b": 4}
+             ],
+             "b": "top level"
+         }
+         keys_to_remove = ["b"]
+
+         result = _remove_leaf_keys_from_dict(input_dict, keys_to_remove)
+
+         expected = {
+             "items": [
+                 {"a": 1},
+                 {"a": 3}
+             ]
+         }
+         self.assertEqual(expected, result)
+
+     def test_remove_leaf_keys_from_dict_no_matching_keys(self):
+         """Test when no keys match those to be removed."""
+         input_dict = {"a": 1, "b": 2, "c": 3}
+         keys_to_remove = ["x", "y", "z"]
+
+         result = _remove_leaf_keys_from_dict(input_dict, keys_to_remove)
+
+         expected = {"a": 1, "b": 2, "c": 3}
+         self.assertEqual(expected, result)
+
+     def test_remove_leaf_keys_from_dict_empty(self):
+         """Test that empty dict returns empty dict."""
+         input_dict = {}
+         keys_to_remove = ["a", "b"]
+
+         result = _remove_leaf_keys_from_dict(input_dict, keys_to_remove)
+
+         self.assertEqual({}, result)
+
+     def test_remove_leaf_keys_from_dict_multiple_keys(self):
+         """Test removing multiple keys at once."""
+         input_dict = {"a": 1, "b": 2, "c": 3, "d": 4}
+         keys_to_remove = ["b", "d"]
+
+         result = _remove_leaf_keys_from_dict(input_dict, keys_to_remove)
+
+         expected = {"a": 1, "c": 3}
+         self.assertEqual(expected, result)
+
+     def test_remove_leaf_keys_from_dict_deeply_nested(self):
+         """Test removing keys from deeply nested structures."""
+         input_dict = {
+             "level1": {
+                 "level2": {
+                     "level3": {
+                         "keep": "value",
+                         "remove_me": "be gone"
+                     },
+                     "remove_me": "level2 be gone"
+                 },
+                 "remove_me": "level1 be gone"
+             }
+         }
+         keys_to_remove = ["remove_me"]
+
+         result = _remove_leaf_keys_from_dict(input_dict, keys_to_remove)
+
+         expected = {
+             "level1": {
+                 "level2": {
+                     "level3": {
+                         "keep": "value"
+                     }
+                 }
+             }
+         }
+         self.assertEqual(expected, result)
+
+     def test_remove_leaf_keys_from_dict_key_with_dict_value_not_removed(self):
+         """Test that keys with dict values are preserved, only their contents processed."""
+         input_dict = {
+             "remove_me": {
+                 "nested_key": "value",
+                 "remove_me": "be gone"
+             },
+             "keep": "yes"
+         }
+         keys_to_remove = ["remove_me"]
+
+         result = _remove_leaf_keys_from_dict(input_dict, keys_to_remove)
+
+         # Keys with dict values are NOT removed; only non-dict, non-list-valued keys are removed
+         expected = {
+             "remove_me": {
+                 "nested_key": "value"
+             },
+             "keep": "yes"
+         }
+         self.assertEqual(expected, result)
+
+     def test_remove_leaf_keys_from_dict_mixed_nested_structures(self):
+         """Test with mixed nested dicts and lists."""
+         input_dict = {
+             "config": {
+                 "items": [
+                     {"name": "item1", "secret": "hidden"},
+                     {"name": "item2", "secret": "also hidden"}
+                 ],
+                 "secret": "hidden config"
+             },
+             "secret": "hidden secret"
+         }
+         keys_to_remove = ["secret"]
+
+         result = _remove_leaf_keys_from_dict(input_dict, keys_to_remove)
+
+         expected = {
+             "config": {
+                 "items": [
+                     {"name": "item1"},
+                     {"name": "item2"}
+                 ]
+             }
+         }
+         self.assertEqual(expected, result)
+
+
+ class TestMakeCerberusSchema(TestCase):
+     """Tests for _make_cerberus_schema function."""
+
+     def test_make_cerberus_schema_removes_is_phi(self):
+         """Test that is_phi key is removed from schema."""
+         input_dict = {
+             "field1": {
+                 "type": "string",
+                 "is_phi": True
+             }
+         }
+
+         result = _make_cerberus_schema(input_dict)
+
+         expected = {
+             "field1": {
+                 "type": "string"
+             }
+         }
+         self.assertEqual(expected, result)
+
+     def test_make_cerberus_schema_removes_field_desc(self):
+         """Test that field_desc key is removed from schema."""
+         input_dict = {
+             "field1": {
+                 "type": "string",
+                 "field_desc": "A description of the field"
+             }
+         }
+
+         result = _make_cerberus_schema(input_dict)
+
+         expected = {
+             "field1": {
+                 "type": "string"
+             }
+         }
+         self.assertEqual(expected, result)
+
+     def test_make_cerberus_schema_removes_units(self):
+         """Test that units key is removed from schema."""
+         input_dict = {
+             "field1": {
+                 "type": "float",
+                 "units": "meters"
+             }
+         }
+
+         result = _make_cerberus_schema(input_dict)
+
+         expected = {
+             "field1": {
+                 "type": "float"
+             }
+         }
+         self.assertEqual(expected, result)
+
+     def test_make_cerberus_schema_removes_min_exclusive(self):
+         """Test that min_exclusive key is removed from schema."""
+         input_dict = {
+             "field1": {
+                 "type": "integer",
+                 "min_exclusive": 0
+             }
+         }
+
+         result = _make_cerberus_schema(input_dict)
+
+         expected = {
+             "field1": {
+                 "type": "integer"
+             }
+         }
+         self.assertEqual(expected, result)
+
+     def test_make_cerberus_schema_removes_unique(self):
+         """Test that unique key is removed from schema."""
+         input_dict = {
+             "field1": {
+                 "type": "string",
+                 "unique": True
+             }
+         }
+
+         result = _make_cerberus_schema(input_dict)
+
+         expected = {
+             "field1": {
+                 "type": "string"
+             }
+         }
+         self.assertEqual(expected, result)
+
+     def test_make_cerberus_schema_preserves_cerberus_keys(self):
+         """Test that valid cerberus keys are preserved."""
+         input_dict = {
+             "field1": {
+                 "type": "string",
+                 "required": True,
+                 "allowed": ["a", "b", "c"],
+                 "default": "a"
+             }
+         }
+
+         result = _make_cerberus_schema(input_dict)
+
+         expected = {
+             "field1": {
+                 "type": "string",
+                 "required": True,
+                 "allowed": ["a", "b", "c"],
+                 "default": "a"
+             }
+         }
+         self.assertEqual(expected, result)
+
+     def test_make_cerberus_schema_removes_multiple_unrecognized_keys(self):
+         """Test removing multiple unrecognized keys at once."""
+         input_dict = {
+             "field1": {
+                 "type": "string",
+                 "is_phi": False,
+                 "field_desc": "description",
+                 "units": "none",
+                 "min_exclusive": 0,
+                 "unique": True,
+                 "required": True
+             }
+         }
+
+         result = _make_cerberus_schema(input_dict)
+
+         expected = {
+             "field1": {
+                 "type": "string",
+                 "required": True
+             }
+         }
+         self.assertEqual(expected, result)
+
+     def test_make_cerberus_schema_nested_fields(self):
+         """Test that unrecognized keys are removed from nested structures."""
+         input_dict = {
+             "field1": {
+                 "type": "string",
+                 "is_phi": True,
+                 "anyof": [
+                     {"type": "string", "field_desc": "string option"},
+                     {"type": "integer", "units": "count"}
+                 ]
+             }
+         }
+
+         result = _make_cerberus_schema(input_dict)
+
+         expected = {
+             "field1": {
+                 "type": "string",
+                 "anyof": [
+                     {"type": "string"},
+                     {"type": "integer"}
+                 ]
+             }
+         }
+         self.assertEqual(expected, result)
+
+     def test_make_cerberus_schema_empty_dict(self):
+         """Test that empty dict returns empty dict."""
+         input_dict = {}
+
+         result = _make_cerberus_schema(input_dict)
+
+         self.assertEqual({}, result)
+
+     def test_make_cerberus_schema_does_not_modify_original(self):
+         """Test that the original dictionary is not modified."""
+         input_dict = {
+             "field1": {
+                 "type": "string",
+                 "is_phi": True
+             }
+         }
+
+         _make_cerberus_schema(input_dict)
+
+         # Original should still have is_phi
+         self.assertEqual(True, input_dict["field1"]["is_phi"])
+
+
+ class TestOutputValidationMsgs(TestCase):
+     """Tests for output_validation_msgs function."""
+
+     def test_output_validation_msgs_non_empty_df_tab_separator(self):
+         """Test writing non-empty DataFrame with tab separator creates .txt file."""
+         validation_msgs_df = pd.DataFrame({
+             "sample_name": ["sample1", "sample2"],
+             "field_name": ["field1", "field2"],
+             "error_message": ["error1", "error2"]
+         })
+
+         with tempfile.TemporaryDirectory() as tmp_dir:
+             output_validation_msgs(validation_msgs_df, tmp_dir, "test", sep="\t")
+
+             output_files = glob.glob(os.path.join(tmp_dir, "*_test_validation_errors.txt"))
+             self.assertEqual(1, len(output_files))
+
+             result_df = pd.read_csv(output_files[0], sep="\t")
+             pd.testing.assert_frame_equal(validation_msgs_df, result_df)
+
+     def test_output_validation_msgs_non_empty_df_comma_separator(self):
+         """Test writing non-empty DataFrame with comma separator creates .csv file."""
+         validation_msgs_df = pd.DataFrame({
+             "sample_name": ["sample1", "sample2"],
+             "field_name": ["field1", "field2"],
+             "error_message": ["error1", "error2"]
+         })
+
+         with tempfile.TemporaryDirectory() as tmp_dir:
+             output_validation_msgs(validation_msgs_df, tmp_dir, "test", sep=",")
+
+             output_files = glob.glob(os.path.join(tmp_dir, "*_test_validation_errors.csv"))
+             self.assertEqual(1, len(output_files))
+
+             result_df = pd.read_csv(output_files[0], sep=",")
+             pd.testing.assert_frame_equal(validation_msgs_df, result_df)
+
+     def test_output_validation_msgs_empty_df_creates_empty_file(self):
+         """Test that empty DataFrame creates empty file when suppress_empty_fails=False."""
+         validation_msgs_df = pd.DataFrame()
+
+         with tempfile.TemporaryDirectory() as tmp_dir:
+             output_validation_msgs(
+                 validation_msgs_df, tmp_dir, "test", sep="\t",
+                 suppress_empty_fails=False)
+
+             output_files = glob.glob(os.path.join(tmp_dir, "*_test_validation_errors.txt"))
+             self.assertEqual(1, len(output_files))
+
+             # Verify file is empty
+             self.assertEqual(0, os.path.getsize(output_files[0]))
+
+     def test_output_validation_msgs_empty_df_suppressed_no_file(self):
+         """Test that empty DataFrame creates no file when suppress_empty_fails=True."""
+         validation_msgs_df = pd.DataFrame()
+
+         with tempfile.TemporaryDirectory() as tmp_dir:
+             output_validation_msgs(
+                 validation_msgs_df, tmp_dir, "test", sep="\t",
+                 suppress_empty_fails=True)
+
+             # Verify no file was created
+             output_files = glob.glob(os.path.join(tmp_dir, "*_test_validation_errors.*"))
+             self.assertEqual(0, len(output_files))
+
+     def test_output_validation_msgs_filename_contains_timestamp(self):
+         """Test that output filename contains a timestamp prefix."""
+         validation_msgs_df = pd.DataFrame({
+             "sample_name": ["sample1"],
+             "field_name": ["field1"],
+             "error_message": ["error1"]
+         })
+
+         with tempfile.TemporaryDirectory() as tmp_dir:
+             output_validation_msgs(validation_msgs_df, tmp_dir, "mybase", sep="\t")
+
+             output_files = glob.glob(os.path.join(tmp_dir, "*_mybase_validation_errors.txt"))
+             self.assertEqual(1, len(output_files))
+
+             # Verify filename has timestamp pattern (YYYY-MM-DD_HH-MM-SS)
+             filename = os.path.basename(output_files[0])
+             # Format: YYYY-MM-DD_HH-MM-SS_mybase_validation_errors.txt
+             parts = filename.split("_")
+             # Should have date part (YYYY-MM-DD) and time part (HH-MM-SS)
+             self.assertEqual(3, len(parts[0].split("-"))) # date has 3 parts
+             self.assertEqual(3, len(parts[1].split("-"))) # time has 3 parts
+
+     def test_output_validation_msgs_default_separator_is_tab(self):
+         """Test that default separator is tab, producing .txt file."""
+         validation_msgs_df = pd.DataFrame({
+             "sample_name": ["sample1"],
+             "field_name": ["field1"],
+             "error_message": ["error1"]
+         })
+
+         with tempfile.TemporaryDirectory() as tmp_dir:
+             # Call without specifying sep parameter
+             output_validation_msgs(validation_msgs_df, tmp_dir, "test")
+
+             # Should create .txt file (tab separator default)
+             txt_files = glob.glob(os.path.join(tmp_dir, "*_test_validation_errors.txt"))
+             csv_files = glob.glob(os.path.join(tmp_dir, "*_test_validation_errors.csv"))
+             self.assertEqual(1, len(txt_files))
+             self.assertEqual(0, len(csv_files))
+
+
+ class TestGetAllowedPandasTypes(TestCase):
+     """Tests for _get_allowed_pandas_types function."""
+
+     def test_get_allowed_pandas_types_string(self):
+         """Test that cerberus 'string' type maps to Python str."""
+         field_definition = {"type": "string"}
+
+         result = _get_allowed_pandas_types("test_field", field_definition)
+
+         self.assertEqual([str], result)
+
+     def test_get_allowed_pandas_types_integer(self):
+         """Test that cerberus 'integer' type maps to Python int."""
+         field_definition = {"type": "integer"}
+
+         result = _get_allowed_pandas_types("test_field", field_definition)
+
+         self.assertEqual([int], result)
+
+     def test_get_allowed_pandas_types_float(self):
+         """Test that cerberus 'float' type maps to Python float."""
+         field_definition = {"type": "float"}
+
+         result = _get_allowed_pandas_types("test_field", field_definition)
+
+         self.assertEqual([float], result)
+
+     def test_get_allowed_pandas_types_number(self):
+         """Test that cerberus 'number' type maps to Python float."""
+         field_definition = {"type": "number"}
+
+         result = _get_allowed_pandas_types("test_field", field_definition)
+
+         self.assertEqual([float], result)
+
+     def test_get_allowed_pandas_types_bool(self):
+         """Test that cerberus 'bool' type maps to Python bool."""
+         field_definition = {"type": "bool"}
+
+         result = _get_allowed_pandas_types("test_field", field_definition)
+
+         self.assertEqual([bool], result)
+
+     def test_get_allowed_pandas_types_datetime(self):
+         """Test that cerberus 'datetime' type maps to datetime.date."""
+         field_definition = {"type": "datetime"}
+
+         result = _get_allowed_pandas_types("test_field", field_definition)
+
+         self.assertEqual([datetime.date], result)
+
+     def test_get_allowed_pandas_types_anyof_single(self):
+         """Test anyof with single type option."""
+         field_definition = {
+             "anyof": [
+                 {"type": "string"}
+             ]
+         }
+
+         result = _get_allowed_pandas_types("test_field", field_definition)
+
+         self.assertEqual([str], result)
+
+     def test_get_allowed_pandas_types_anyof_multiple(self):
+         """Test anyof with multiple type options."""
+         field_definition = {
+             "anyof": [
+                 {"type": "string"},
+                 {"type": "integer"},
+                 {"type": "float"}
+             ]
+         }
+
+         result = _get_allowed_pandas_types("test_field", field_definition)
+
+         self.assertEqual([str, int, float], result)
+
+     def test_get_allowed_pandas_types_no_type_raises_error(self):
+         """Test that missing type definition raises ValueError."""
+         field_definition = {"required": True}
+
+         self.assertRaisesRegex(
+             ValueError,
+             "Unable to find type definition for field 'my_field'",
+             _get_allowed_pandas_types,
+             "my_field",
+             field_definition)
+
+
+ class TestCastFieldToType(TestCase):
+     """Tests for _cast_field_to_type function."""
+
+     def test_cast_field_to_type_string(self):
+         """Test casting a value to string."""
+         result = _cast_field_to_type(123, [str])
+
+         self.assertEqual("123", result)
+         self.assertIsInstance(result, str)
+
+     def test_cast_field_to_type_integer(self):
+         """Test casting a value to integer."""
+         result = _cast_field_to_type("42", [int])
+
+         self.assertEqual(42, result)
+         self.assertIsInstance(result, int)
+
+     def test_cast_field_to_type_float(self):
+         """Test casting a value to float."""
+         result = _cast_field_to_type("3.14", [float])
+
+         self.assertEqual(3.14, result)
+         self.assertIsInstance(result, float)
+
+     def test_cast_field_to_type_bool(self):
+         """Test casting a value to bool."""
+         result = _cast_field_to_type(1, [bool])
+
+         self.assertEqual(True, result)
+         self.assertIsInstance(result, bool)
+
+     def test_cast_field_to_type_first_type_succeeds(self):
+         """Test that first matching type in list is used."""
+         result = _cast_field_to_type("42", [str, int])
+
+         self.assertEqual("42", result)
+         self.assertIsInstance(result, str)
+
+     def test_cast_field_to_type_fallback_to_second_type(self):
+         """Test fallback to second type when first fails."""
+         result = _cast_field_to_type("hello", [int, str])
+
+         self.assertEqual("hello", result)
+         self.assertIsInstance(result, str)
+
+     def test_cast_field_to_type_no_valid_type_raises_error(self):
+         """Test that ValueError is raised when no type matches."""
+         self.assertRaisesRegex(
+             ValueError,
+             "Unable to cast 'hello' to any of the allowed types",
+             _cast_field_to_type,
+             "hello",
+             [int, float])
+
+
+ class TestMetameqValidatorCheckWithDateNotInFuture(TestCase):
+     """Tests for MetameqValidator._check_with_date_not_in_future method."""
+
+     def test_check_with_date_not_in_future_valid_past_date(self):
+         """Test that a past date passes validation."""
+         validator = MetameqValidator()
+         schema = {"date_field": {"type": "string", "check_with": "date_not_in_future"}}
+
+         result = validator.validate({"date_field": "2020-01-15"}, schema)
+
+         self.assertTrue(result)
+         self.assertEqual({}, validator.errors)
+
+     def test_check_with_date_not_in_future_valid_today(self):
+         """Test that today's date passes validation."""
+         validator = MetameqValidator()
+         schema = {"date_field": {"type": "string", "check_with": "date_not_in_future"}}
+         today_str = datetime.now().strftime("%Y-%m-%d")
+
+         result = validator.validate({"date_field": today_str}, schema)
+
+         self.assertTrue(result)
+         self.assertEqual({}, validator.errors)
+
+     def test_check_with_date_not_in_future_future_date_fails(self):
+         """Test that a future date fails validation."""
+         validator = MetameqValidator()
+         schema = {"date_field": {"type": "string", "check_with": "date_not_in_future"}}
+         future_date = (datetime.now() + timedelta(days=365)).strftime("%Y-%m-%d")
+
+         result = validator.validate({"date_field": future_date}, schema)
+
+         self.assertFalse(result)
+         self.assertIn("date_field", validator.errors)
+         self.assertIn("Date cannot be in the future", validator.errors["date_field"])
+
+     def test_check_with_date_not_in_future_invalid_date_string_fails(self):
+         """Test that an invalid date string fails validation."""
+         validator = MetameqValidator()
+         schema = {"date_field": {"type": "string", "check_with": "date_not_in_future"}}
+
+         result = validator.validate({"date_field": "not a date"}, schema)
+
+         self.assertFalse(result)
+         self.assertIn("date_field", validator.errors)
+         self.assertIn("Must be a valid date", validator.errors["date_field"])
+
+     def test_check_with_date_not_in_future_various_date_formats(self):
+         """Test that various date formats are accepted."""
+         validator = MetameqValidator()
+         schema = {"date_field": {"type": "string", "check_with": "date_not_in_future"}}
+
+         date_formats = [
+             "2020-01-15",
+             "01/15/2020",
+             "January 15, 2020",
+             "15 Jan 2020"
+         ]
+
+         for date_str in date_formats:
+             result = validator.validate({"date_field": date_str}, schema)
+             self.assertTrue(result, f"Date format '{date_str}' should be valid")
+
+
+ class TestGenerateValidationMsg(TestCase):
+     """Tests for _generate_validation_msg function."""
+
+     def test_generate_validation_msg_all_valid(self):
+         """Test that valid rows return empty list."""
+         metadata_df = pd.DataFrame({
+             "sample_name": ["sample1", "sample2"],
+             "field1": ["value1", "value2"]
+         })
+         config = {
+             "sample_name": {"type": "string"},
+             "field1": {"type": "string"}
+         }
+
+         result = _generate_validation_msg(metadata_df, config)
+
+         self.assertEqual([], result)
+
+     def test_generate_validation_msg_single_error(self):
+         """Test that a single validation error is captured."""
+         metadata_df = pd.DataFrame({
+             "sample_name": ["sample1"],
+             "age": ["not_an_integer"]
+         })
+         config = {
+             "sample_name": {"type": "string"},
+             "age": {"type": "integer"}
+         }
+
+         result = _generate_validation_msg(metadata_df, config)
+         result_df = pd.DataFrame(result)
+
+         expected_df = pd.DataFrame({
+             "sample_name": ["sample1"],
+             "field_name": ["age"],
+             "error_message": [["must be of integer type"]]
+         })
+         pd.testing.assert_frame_equal(expected_df, result_df)
+
+     def test_generate_validation_msg_multiple_errors_single_row(self):
+         """Test that multiple errors in one row are all captured."""
+         metadata_df = pd.DataFrame({
+             "sample_name": ["sample1"],
+             "age": ["not_an_integer"],
+             "count": ["also_not_an_integer"]
+         })
+         config = {
+             "sample_name": {"type": "string"},
+             "age": {"type": "integer"},
+             "count": {"type": "integer"}
+         }
+
+         result = _generate_validation_msg(metadata_df, config)
+         result_df = pd.DataFrame(result)
+
+         expected_df = pd.DataFrame({
+             "sample_name": ["sample1", "sample1"],
+             "field_name": ["age", "count"],
+             "error_message": [["must be of integer type"], ["must be of integer type"]]
+         })
+         pd.testing.assert_frame_equal(expected_df, result_df)
+
+     def test_generate_validation_msg_errors_across_multiple_rows(self):
+         """Test that errors across multiple rows are all captured."""
+         metadata_df = pd.DataFrame({
+             "sample_name": ["sample1", "sample2"],
+             "age": ["not_an_integer", "also_not_an_integer"]
+         })
+         config = {
+             "sample_name": {"type": "string"},
+             "age": {"type": "integer"}
+         }
+
+         result = _generate_validation_msg(metadata_df, config)
+         result_df = pd.DataFrame(result)
+
+         expected_df = pd.DataFrame({
+             "sample_name": ["sample1", "sample2"],
+             "field_name": ["age", "age"],
+             "error_message": [["must be of integer type"], ["must be of integer type"]]
+         })
+         pd.testing.assert_frame_equal(expected_df, result_df)
+
+     def test_generate_validation_msg_allows_unknown_fields(self):
+         """Test that unknown fields are allowed and don't cause errors."""
+         metadata_df = pd.DataFrame({
+             "sample_name": ["sample1"],
+             "known_field": ["value"],
+             "unknown_field": ["extra_value"]
+         })
+         config = {
+             "sample_name": {"type": "string"},
+             "known_field": {"type": "string"}
+         }
+
+         result = _generate_validation_msg(metadata_df, config)
+
+         self.assertEqual([], result)
+
+     def test_generate_validation_msg_required_field_missing(self):
+         """Test that missing required fields are caught."""
+         metadata_df = pd.DataFrame({
+             "sample_name": ["sample1"],
+             "optional_field": ["value"]
+         })
+         config = {
+             "sample_name": {"type": "string"},
+             "required_field": {"type": "string", "required": True}
+         }
+
+         result = _generate_validation_msg(metadata_df, config)
+         result_df = pd.DataFrame(result)
+
+         expected_df = pd.DataFrame({
+             "sample_name": ["sample1"],
+             "field_name": ["required_field"],
+             "error_message": [["required field"]]
+         })
+         pd.testing.assert_frame_equal(expected_df, result_df)
+
+     def test_generate_validation_msg_multiple_errors_same_field(self):
+         """Test that multiple errors for the same field are returned as a list."""
+         metadata_df = pd.DataFrame({
+             "sample_name": ["sample1"],
+             "date_field": ["not a date"]
+         })
+         config = {
+             "sample_name": {"type": "string"},
+             "date_field": {
+                 "type": "string",
+                 "regex": "^[0-9]{4}-[0-9]{2}-[0-9]{2}$",
+                 "check_with": "date_not_in_future"
+             }
+         }
+
+         result = _generate_validation_msg(metadata_df, config)
+         result_df = pd.DataFrame(result)
+
+         expected_df = pd.DataFrame({
+             "sample_name": ["sample1"],
+             "field_name": ["date_field"],
+             "error_message": [[
+                 "Must be a valid date",
+                 "value does not match regex '^[0-9]{4}-[0-9]{2}-[0-9]{2}$'"
+             ]]
+         })
+         pd.testing.assert_frame_equal(expected_df, result_df)
+
+
+ class TestValidateMetadataDf(TestCase):
+     """Tests for validate_metadata_df function."""
+
+     def test_validate_metadata_df_all_valid(self):
+         """Test that valid metadata returns empty list."""
+         metadata_df = pd.DataFrame({
+             "sample_name": ["sample1", "sample2"],
+             "age": [25, 30]
+         })
+         fields_dict = {
+             "sample_name": {"type": "string"},
+             "age": {"type": "integer"}
+         }
+
+         result = validate_metadata_df(metadata_df, fields_dict)
+
+         self.assertEqual([], result)
+
+     def test_validate_metadata_df_uncastable_value_raises_error(self):
+         """Test that values that cannot be cast to expected type raise ValueError."""
+         metadata_df = pd.DataFrame({
+             "sample_name": ["sample1"],
+             "age": ["not_an_integer"]
+         })
+         fields_dict = {
+             "sample_name": {"type": "string"},
+             "age": {"type": "integer"}
+         }
+
+         self.assertRaisesRegex(
+             ValueError,
+             "Unable to cast 'not_an_integer' to any of the allowed types",
+             validate_metadata_df,
+             metadata_df,
+             fields_dict)
+
+     def test_validate_metadata_df_strips_metameq_keys(self):
+         """Test that metameq-specific keys are stripped before validation."""
+         metadata_df = pd.DataFrame({
+             "sample_name": ["sample1"],
+             "field1": ["12"]
+         })
+         # Include metameq-specific keys that should be stripped
+         fields_dict = {
+             "sample_name": {"type": "string", "unique": True},
+             "field1": {
+                 "type": "integer",
+                 "is_phi": True,
+                 "field_desc": "A test field",
+                 "units": "none",
+                 "min_exclusive": 0
+             }
+         }
+
+         # Should not raise an error about unknown schema keys
+         result = validate_metadata_df(metadata_df, fields_dict)
+
+         self.assertEqual([], result)
+
+     def test_validate_metadata_df_missing_field_in_df_skipped(self):
+         """Test that fields defined in schema but missing from DataFrame are skipped."""
+         metadata_df = pd.DataFrame({
+             "sample_name": ["sample1"],
+             "existing_field": ["value"]
+         })
+         fields_dict = {
+             "sample_name": {"type": "string"},
+             "existing_field": {"type": "string"},
+             "missing_field": {"type": "integer"}
+         }
+
+         # Should not raise an error; missing_field is simply skipped
+         result = validate_metadata_df(metadata_df, fields_dict)
+
+         self.assertEqual([], result)
+
+     def test_validate_metadata_df_casts_to_expected_type(self):
+         """Test that fields are cast to their expected types before validation."""
+         metadata_df = pd.DataFrame({
+             "sample_name": ["sample1"],
+             "count": ["42"] # String that can be cast to int
+         })
+         fields_dict = {
+             "sample_name": {"type": "string"},
+             "count": {"type": "integer"}
+         }
+
+         result = validate_metadata_df(metadata_df, fields_dict)
+
+         # After casting "42" to int, it should be valid
+         self.assertEqual([], result)
+
+     def test_validate_metadata_df_anyof_type_validation(self):
+         """Test validation with anyof type definitions."""
+         metadata_df = pd.DataFrame({
+             "sample_name": ["sample1", "sample2"],
+             "flexible_field": ["text", "123"]
+         })
+         fields_dict = {
+             "sample_name": {"type": "string"},
+             "flexible_field": {
+                 "anyof": [
+                     {"type": "string"},
+                     {"type": "integer"}
+                 ]
+             }
+         }
+
+         result = validate_metadata_df(metadata_df, fields_dict)
+
+         self.assertEqual([], result)
+
+     def test_validate_metadata_df_multiple_rows(self):
+         """Test validation across multiple rows."""
+         metadata_df = pd.DataFrame({
+             "sample_name": ["sample1", "sample2"],
+             "status": ["invalid_status", "active"]
+         })
+         fields_dict = {
+             "sample_name": {"type": "string"},
+             "status": {"type": "string", "allowed": ["active", "inactive"]}
+         }
+
+         result = validate_metadata_df(metadata_df, fields_dict)
+         result_df = pd.DataFrame(result)
+
+         expected_df = pd.DataFrame({
+             "sample_name": ["sample1"],
+             "field_name": ["status"],
+             "error_message": [["unallowed value invalid_status"]]
+         })
+         pd.testing.assert_frame_equal(expected_df, result_df)
+
+     def test_validate_metadata_df_allowed_values_validation(self):
+         """Test validation of allowed values constraint."""
+         metadata_df = pd.DataFrame({
+             "sample_name": ["sample1"],
+             "status": ["invalid_status"]
+         })
+         fields_dict = {
+             "sample_name": {"type": "string"},
+             "status": {"type": "string", "allowed": ["active", "inactive"]}
+         }
+
+         result = validate_metadata_df(metadata_df, fields_dict)
+         result_df = pd.DataFrame(result)
+
+         expected_df = pd.DataFrame({
+             "sample_name": ["sample1"],
+             "field_name": ["status"],
+             "error_message": [["unallowed value invalid_status"]]
+         })
+         pd.testing.assert_frame_equal(expected_df, result_df)
+
+     def test_validate_metadata_df_regex_validation(self):
+         """Test validation of regex constraint."""
+         metadata_df = pd.DataFrame({
+             "sample_name": ["sample1"],
+             "code": ["abc"]
+         })
+         fields_dict = {
+             "sample_name": {"type": "string"},
+             "code": {"type": "string", "regex": "^[0-9]+$"}
+         }
+
+         result = validate_metadata_df(metadata_df, fields_dict)
+         result_df = pd.DataFrame(result)
+
+         expected_df = pd.DataFrame({
+             "sample_name": ["sample1"],
+             "field_name": ["code"],
+             "error_message": [["value does not match regex '^[0-9]+$'"]]
+         })
+         pd.testing.assert_frame_equal(expected_df, result_df)
+
+     def test_validate_metadata_df_custom_check_with_validation(self):
+         """Test validation with custom check_with rule."""
+         future_date = (datetime.now() + timedelta(days=365)).strftime("%Y-%m-%d")
+         metadata_df = pd.DataFrame({
+             "sample_name": ["sample1"],
+             "collection_date": [future_date]
+         })
+         fields_dict = {
+             "sample_name": {"type": "string"},
+             "collection_date": {"type": "string", "check_with": "date_not_in_future"}
+         }
+
+         result = validate_metadata_df(metadata_df, fields_dict)
+         result_df = pd.DataFrame(result)
+
+         expected_df = pd.DataFrame({
+             "sample_name": ["sample1"],
+             "field_name": ["collection_date"],
+             "error_message": [["Date cannot be in the future"]]
+         })
+         pd.testing.assert_frame_equal(expected_df, result_df)
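End of diff. Editor's note: the sketch below is not part of the packaged file above; it shows one plausible way the public entry points exercised by these tests could be combined. The call signatures are inferred from the test code, while the sample data, the output directory ".", and the base name "example_run" are illustrative assumptions rather than metameq documentation.

import pandas as pd

from metameq.src.metadata_validator import (
    output_validation_msgs,
    validate_metadata_df
)

# Illustrative metadata and field definitions, shaped like the test fixtures above.
metadata_df = pd.DataFrame({
    "sample_name": ["sample1", "sample2"],
    "collection_date": ["2020-01-15", "not a date"]
})
fields_dict = {
    "sample_name": {"type": "string"},
    "collection_date": {"type": "string", "check_with": "date_not_in_future"}
}

# Per the tests, validate_metadata_df returns a list of error records (empty when
# everything passes), each carrying sample_name, field_name, and error_message.
validation_msgs = validate_metadata_df(metadata_df, fields_dict)

# Per the tests, output_validation_msgs writes a timestamped
# *_validation_errors.txt (tab separator) or .csv (comma separator) file, and
# suppress_empty_fails=True skips writing when the frame is empty.
output_validation_msgs(
    pd.DataFrame(validation_msgs), ".", "example_run", sep="\t",
    suppress_empty_fails=True)

Whether the messages land in a .txt or a .csv file follows the separator choice shown in TestOutputValidationMsgs; anything beyond what the tests exercise should be checked against the package's own documentation.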