valediction 1.1.0__py3-none-any.whl → 1.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- valediction/data_types/data_type_helpers.py +2 -2
- valediction/data_types/data_types.py +6 -6
- valediction/data_types/type_inference.py +25 -13
- valediction/datasets/datasets.py +12 -12
- valediction/demo/DEMO - Data Dictionary.xlsx +0 -0
- valediction/demo/demo_dictionary.py +1 -1
- valediction/dictionary/generation.py +6 -6
- valediction/dictionary/helpers.py +1 -8
- valediction/dictionary/importing.py +44 -21
- valediction/dictionary/model.py +108 -36
- valediction/dictionary/template/PROJECT - Data Dictionary.xltx +0 -0
- valediction/integrity.py +80 -24
- valediction/io/csv_readers.py +3 -3
- valediction/support.py +5 -1
- valediction/validation/helpers.py +91 -35
- valediction/validation/issues.py +38 -25
- valediction/validation/validation.py +151 -110
- {valediction-1.1.0.dist-info → valediction-1.5.0.dist-info}/METADATA +1 -1
- valediction-1.5.0.dist-info/RECORD +38 -0
- valediction-1.1.0.dist-info/RECORD +0 -38
- {valediction-1.1.0.dist-info → valediction-1.5.0.dist-info}/WHEEL +0 -0
valediction/dictionary/model.py
CHANGED
|
@@ -9,10 +9,9 @@ from valediction.dictionary.helpers import (
|
|
|
9
9
|
_check_name,
|
|
10
10
|
_check_order,
|
|
11
11
|
_check_primary_key,
|
|
12
|
-
_normalise_name,
|
|
13
12
|
)
|
|
14
13
|
from valediction.exceptions import DataDictionaryError
|
|
15
|
-
from valediction.support import list_as_bullets
|
|
14
|
+
from valediction.support import _normalise, _strip, list_as_bullets
|
|
16
15
|
|
|
17
16
|
|
|
18
17
|
class Column:
|
|
@@ -44,7 +43,7 @@ class Column:
|
|
|
44
43
|
description: str | None = None,
|
|
45
44
|
datetime_format: str | None = None,
|
|
46
45
|
):
|
|
47
|
-
self.name =
|
|
46
|
+
self.name = _strip(name)
|
|
48
47
|
self.order = int(order) if order is not None else None
|
|
49
48
|
self.data_type: DataType = None
|
|
50
49
|
self.length = int(length) if length is not None else None
|
|
@@ -127,7 +126,7 @@ class Table(list[Column]):
|
|
|
127
126
|
columns: list[Column] | None = None,
|
|
128
127
|
):
|
|
129
128
|
super().__init__()
|
|
130
|
-
self.name =
|
|
129
|
+
self.name = _strip(name)
|
|
131
130
|
self.description = description
|
|
132
131
|
for column in columns or []:
|
|
133
132
|
self.add_column(column)
|
|
@@ -139,24 +138,28 @@ class Table(list[Column]):
|
|
|
139
138
|
)
|
|
140
139
|
return f"Table(name={self.name!r}, description={self.description!r}{cols_str})"
|
|
141
140
|
|
|
141
|
+
def __key(self, name: str) -> str:
|
|
142
|
+
return _normalise(name)
|
|
143
|
+
|
|
142
144
|
def __getitem__(self, key: int | str) -> Column:
|
|
143
145
|
if isinstance(key, int):
|
|
144
146
|
return super().__getitem__(key)
|
|
145
|
-
|
|
146
|
-
|
|
147
|
+
|
|
148
|
+
target_key = self.__key(key)
|
|
149
|
+
found = next((c for c in self if self.__key(c.name) == target_key), None)
|
|
147
150
|
if not found:
|
|
148
151
|
raise KeyError(f"Column {key!r} not found in table {self.name!r}.")
|
|
149
152
|
return found
|
|
150
153
|
|
|
151
154
|
def __get(self, name: str, default: Column | None = None) -> Column | None:
|
|
152
|
-
|
|
153
|
-
return next((c for c in self if c.name ==
|
|
155
|
+
target_key = self.__key(name)
|
|
156
|
+
return next((c for c in self if self.__key(c.name) == target_key), default)
|
|
154
157
|
|
|
155
158
|
# Getters
|
|
156
159
|
def index_of(self, name: str) -> int | None:
|
|
157
|
-
|
|
160
|
+
target_key = self.__key(name)
|
|
158
161
|
for i, c in enumerate(self):
|
|
159
|
-
if c.name ==
|
|
162
|
+
if self.__key(c.name) == target_key:
|
|
160
163
|
return i
|
|
161
164
|
return None
|
|
162
165
|
|
|
@@ -303,16 +306,17 @@ class Table(list[Column]):
|
|
|
303
306
|
if not isinstance(column, Column):
|
|
304
307
|
raise DataDictionaryError("Only Column objects can be added to a Table.")
|
|
305
308
|
|
|
306
|
-
|
|
307
|
-
|
|
309
|
+
incoming_key = self.__key(column.name)
|
|
310
|
+
conflict = next((c for c in self if self.__key(c.name) == incoming_key), None)
|
|
311
|
+
if conflict is not None:
|
|
308
312
|
raise DataDictionaryError(
|
|
309
|
-
f"Column {column.name!r} already exists (order={conflict.order!r})"
|
|
313
|
+
f"Column {column.name!r} already exists (order={conflict.order!r}, as {conflict.name!r})."
|
|
310
314
|
)
|
|
311
315
|
|
|
312
316
|
if column.order in self.get_column_orders():
|
|
313
|
-
|
|
317
|
+
conflict_by_order = self.get_column(column.order)
|
|
314
318
|
raise DataDictionaryError(
|
|
315
|
-
f"Order {column.order!r} already exists (name={
|
|
319
|
+
f"Order {column.order!r} already exists (name={conflict_by_order.name!r})"
|
|
316
320
|
)
|
|
317
321
|
|
|
318
322
|
if column.primary_key is not None:
|
|
@@ -339,10 +343,7 @@ class Table(list[Column]):
|
|
|
339
343
|
Raises:
|
|
340
344
|
DataDictionaryError: if the column does not exist
|
|
341
345
|
"""
|
|
342
|
-
|
|
343
|
-
name = self.get_column(column).name
|
|
344
|
-
else:
|
|
345
|
-
name = self.get_column(column).name # by order
|
|
346
|
+
name = self.get_column(column).name
|
|
346
347
|
remaining = [c for c in self if c.name != name]
|
|
347
348
|
self.clear()
|
|
348
349
|
super().extend(remaining)
|
|
@@ -367,16 +368,17 @@ class Table(list[Column]):
|
|
|
367
368
|
for col in self:
|
|
368
369
|
col.primary_key = None
|
|
369
370
|
|
|
370
|
-
# Resolve and
|
|
371
|
+
# Resolve and deduplicate
|
|
371
372
|
resolved: list[Column] = []
|
|
372
373
|
seen: set[str] = set()
|
|
373
374
|
for key in primary_keys:
|
|
374
375
|
col = self.get_column(key)
|
|
375
|
-
|
|
376
|
+
col_key = self.__key(col.name)
|
|
377
|
+
if col_key in seen:
|
|
376
378
|
raise DataDictionaryError(
|
|
377
379
|
f"Duplicate column {col.name!r} provided for table {self.name!r}."
|
|
378
380
|
)
|
|
379
|
-
seen.add(
|
|
381
|
+
seen.add(col_key)
|
|
380
382
|
resolved.append(col)
|
|
381
383
|
|
|
382
384
|
# Assign ordinals 1..N
|
|
@@ -416,14 +418,20 @@ class Dictionary(list[Table]):
|
|
|
416
418
|
):
|
|
417
419
|
super().__init__()
|
|
418
420
|
self.name = name
|
|
421
|
+
|
|
422
|
+
if isinstance(tables, Table):
|
|
423
|
+
tables = [tables]
|
|
424
|
+
|
|
419
425
|
for t in tables or []:
|
|
420
426
|
self.add_table(t)
|
|
427
|
+
|
|
421
428
|
self.organisations = organisations
|
|
422
429
|
self.version = version
|
|
423
430
|
self.version_notes = version_notes
|
|
424
431
|
self.inclusion_criteria = inclusion_criteria
|
|
425
432
|
self.exclusion_criteria = exclusion_criteria
|
|
426
433
|
self.imported = imported
|
|
434
|
+
self.__check_variables()
|
|
427
435
|
|
|
428
436
|
# Properties
|
|
429
437
|
@property
|
|
@@ -439,24 +447,85 @@ class Dictionary(list[Table]):
|
|
|
439
447
|
tables = list_as_bullets(elements=[str(t) for t in self], bullet="\n- ")
|
|
440
448
|
return f"Dictionary(name={self.name!r}, imported={self.imported!r}, {tables})"
|
|
441
449
|
|
|
450
|
+
def __key(self, name: str) -> str:
|
|
451
|
+
return _normalise(name)
|
|
452
|
+
|
|
442
453
|
def __getitem__(self, key: int | str) -> Table:
|
|
443
454
|
if isinstance(key, int):
|
|
444
455
|
return super().__getitem__(key)
|
|
445
|
-
|
|
446
|
-
|
|
456
|
+
|
|
457
|
+
target_key = self.__key(key)
|
|
458
|
+
found = next((t for t in self if self.__key(t.name) == target_key), None)
|
|
447
459
|
if not found:
|
|
448
460
|
raise KeyError(f"Table {key!r} not found in Dictionary.")
|
|
449
461
|
return found
|
|
450
462
|
|
|
451
|
-
# Getters
|
|
452
463
|
def __get(self, name: str, default: Table | None = None) -> Table | None:
|
|
453
|
-
|
|
454
|
-
return next((t for t in self if t.name ==
|
|
464
|
+
target_key = self.__key(name)
|
|
465
|
+
return next((t for t in self if self.__key(t.name) == target_key), default)
|
|
466
|
+
|
|
467
|
+
# Checkers
|
|
468
|
+
def __check_variables(self) -> None:
|
|
469
|
+
self.__check_name()
|
|
470
|
+
self.__check_organisations()
|
|
471
|
+
self.__check_version()
|
|
472
|
+
self.__check_version_notes()
|
|
473
|
+
self.__check_criteria()
|
|
474
|
+
|
|
475
|
+
def __check_name(self) -> None:
|
|
476
|
+
# Check name
|
|
477
|
+
if self.name is not None:
|
|
478
|
+
if not isinstance(self.name, str):
|
|
479
|
+
raise DataDictionaryError("Dictionary `name` must be a string.")
|
|
480
|
+
|
|
481
|
+
def __check_organisations(self) -> None:
|
|
482
|
+
# Check organisations
|
|
483
|
+
if self.organisations is not None:
|
|
484
|
+
if not isinstance(self.organisations, str):
|
|
485
|
+
raise DataDictionaryError(
|
|
486
|
+
"Dictionary `organisations` must be a string."
|
|
487
|
+
)
|
|
488
|
+
|
|
489
|
+
def __check_version(self) -> None:
|
|
490
|
+
# Check version
|
|
491
|
+
if self.version is not None:
|
|
492
|
+
if not isinstance(self.version, (str, int, float)):
|
|
493
|
+
raise DataDictionaryError(
|
|
494
|
+
"Dictionary `version` must be a string, int, or float."
|
|
495
|
+
)
|
|
496
|
+
|
|
497
|
+
if isinstance(self.version, (int, float)):
|
|
498
|
+
self.version = str(self.version)
|
|
499
|
+
|
|
500
|
+
# Check version_notes
|
|
455
501
|
|
|
502
|
+
def __check_version_notes(self) -> None:
|
|
503
|
+
if self.version_notes is not None:
|
|
504
|
+
if not isinstance(self.version_notes, str):
|
|
505
|
+
raise DataDictionaryError(
|
|
506
|
+
"Dictionary `version_notes` must be a string."
|
|
507
|
+
)
|
|
508
|
+
|
|
509
|
+
def __check_criteria(self) -> None:
|
|
510
|
+
# Check inclusion_criteria
|
|
511
|
+
if self.inclusion_criteria is not None:
|
|
512
|
+
if not isinstance(self.inclusion_criteria, str):
|
|
513
|
+
raise DataDictionaryError(
|
|
514
|
+
"Dictionary `inclusion_criteria` must be a string."
|
|
515
|
+
)
|
|
516
|
+
|
|
517
|
+
# Check exclusion_criteria
|
|
518
|
+
if self.exclusion_criteria is not None:
|
|
519
|
+
if not isinstance(self.exclusion_criteria, str):
|
|
520
|
+
raise DataDictionaryError(
|
|
521
|
+
"Dictionary exclusion_criteria must be a string."
|
|
522
|
+
)
|
|
523
|
+
|
|
524
|
+
# Getters
|
|
456
525
|
def index_of(self, name: str) -> int | None:
|
|
457
|
-
|
|
526
|
+
target_key = self.__key(name)
|
|
458
527
|
for i, t in enumerate(self):
|
|
459
|
-
if t.name ==
|
|
528
|
+
if self.__key(t.name) == target_key:
|
|
460
529
|
return i
|
|
461
530
|
return None
|
|
462
531
|
|
|
@@ -484,12 +553,9 @@ class Dictionary(list[Table]):
|
|
|
484
553
|
Raises:
|
|
485
554
|
KeyError: If the table is not found in the dictionary.
|
|
486
555
|
"""
|
|
487
|
-
|
|
488
|
-
|
|
489
|
-
|
|
490
|
-
if not found:
|
|
556
|
+
found = self.__get(table)
|
|
557
|
+
if found is None:
|
|
491
558
|
raise KeyError(f"Table {table!r} not found in Dictionary.")
|
|
492
|
-
|
|
493
559
|
return found
|
|
494
560
|
|
|
495
561
|
# Manipulation
|
|
@@ -508,8 +574,14 @@ class Dictionary(list[Table]):
|
|
|
508
574
|
raise DataDictionaryError(
|
|
509
575
|
"Only Table objects can be added to a Dictionary."
|
|
510
576
|
)
|
|
511
|
-
|
|
512
|
-
|
|
577
|
+
|
|
578
|
+
incoming_key = self.__key(table.name)
|
|
579
|
+
conflict = next((t for t in self if self.__key(t.name) == incoming_key), None)
|
|
580
|
+
if conflict is not None:
|
|
581
|
+
raise DataDictionaryError(
|
|
582
|
+
f"Table {table.name!r} already exists (as {conflict.name!r})."
|
|
583
|
+
)
|
|
584
|
+
|
|
513
585
|
super().append(table)
|
|
514
586
|
|
|
515
587
|
def remove_table(self, table: str) -> None:
|
|
Binary file
|
valediction/integrity.py
CHANGED
|
@@ -1,6 +1,10 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
1
3
|
import re
|
|
4
|
+
from copy import deepcopy
|
|
2
5
|
from pathlib import Path
|
|
3
6
|
from re import Pattern
|
|
7
|
+
from typing import Any
|
|
4
8
|
|
|
5
9
|
from valediction.data_types.data_types import DataType
|
|
6
10
|
from valediction.support import list_as_bullets
|
|
@@ -12,13 +16,58 @@ TEMPLATE_DATA_DICTIONARY_PATH = (
|
|
|
12
16
|
)
|
|
13
17
|
|
|
14
18
|
|
|
19
|
+
externally_injected_variables: dict[
|
|
20
|
+
str, Any
|
|
21
|
+
] = {} # External injection store for package wrapping (any keys, always included)
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def reset_injected_config_variables() -> None:
|
|
25
|
+
global externally_injected_variables
|
|
26
|
+
externally_injected_variables = {}
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def inject_config_variables(variables: dict[str, Any]) -> None:
|
|
30
|
+
"""Injects variables into the Valediction Config, which will always be incorporated
|
|
31
|
+
as overrides, regardless of Config calling method (default, session-scoped, or
|
|
32
|
+
contextual).
|
|
33
|
+
|
|
34
|
+
Args:
|
|
35
|
+
variables (dict[str, Any]): Dictionary of config variables.
|
|
36
|
+
"""
|
|
37
|
+
global externally_injected_variables, session_config
|
|
38
|
+
|
|
39
|
+
# check type allows
|
|
40
|
+
if not isinstance(variables, dict):
|
|
41
|
+
raise TypeError(
|
|
42
|
+
f"Config injection variables must be a dictionary, not {type(variables)}"
|
|
43
|
+
)
|
|
44
|
+
problematic_keys = []
|
|
45
|
+
for variable_name in variables.keys():
|
|
46
|
+
if not isinstance(variable_name, str):
|
|
47
|
+
problematic_keys.append(variable_name)
|
|
48
|
+
|
|
49
|
+
if problematic_keys:
|
|
50
|
+
raise TypeError("Config injection variables accepts only string keys.")
|
|
51
|
+
|
|
52
|
+
externally_injected_variables = dict(variables or {})
|
|
53
|
+
|
|
54
|
+
# Apply immediately to the current session config (if it exists)
|
|
55
|
+
if session_config is not None:
|
|
56
|
+
_apply_external_injections(session_config)
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def _apply_external_injections(config: Config) -> None:
|
|
60
|
+
for variable_name, variable_value in externally_injected_variables.items():
|
|
61
|
+
setattr(config, variable_name, deepcopy(variable_value))
|
|
62
|
+
|
|
63
|
+
|
|
15
64
|
class Config:
|
|
16
65
|
def __init__(self):
|
|
17
66
|
self.template_data_dictionary_path: Path = TEMPLATE_DATA_DICTIONARY_PATH
|
|
18
67
|
self.max_table_name_length: int = 63
|
|
19
68
|
self.max_column_name_length: int = 30
|
|
20
69
|
self.max_primary_keys: int = 7
|
|
21
|
-
self.invalid_name_pattern: str | Pattern = re.compile(r"[^A-
|
|
70
|
+
self.invalid_name_pattern: str | Pattern = re.compile(r"[^A-Za-z0-9_]")
|
|
22
71
|
self.null_values: list[str] = ["", "null", "none"]
|
|
23
72
|
self.forbidden_characters: list[str] = []
|
|
24
73
|
self.date_formats: dict[str, DataType] = {
|
|
@@ -28,20 +77,22 @@ class Config:
|
|
|
28
77
|
"%d-%m-%Y": DataType.DATE,
|
|
29
78
|
"%m/%d/%Y": DataType.DATE,
|
|
30
79
|
"%m-%d-%Y": DataType.DATE,
|
|
31
|
-
"%Y-%m-%d %H:%M:%S": DataType.
|
|
32
|
-
"%Y-%m-%d %H:%M": DataType.
|
|
33
|
-
"%d/%m/%Y %H:%M:%S": DataType.
|
|
34
|
-
"%d/%m/%Y %H:%M": DataType.
|
|
35
|
-
"%m/%d/%Y %H:%M:%S": DataType.
|
|
36
|
-
"%Y-%m-%dT%H:%M:%S": DataType.
|
|
37
|
-
"%Y-%m-%dT%H:%M:%S.%f": DataType.
|
|
38
|
-
"%Y-%m-%dT%H:%M:%S%z": DataType.
|
|
39
|
-
"%Y-%m-%dT%H:%M:%S.%f%z": DataType.
|
|
40
|
-
"%Y-%m-%dT%H:%M:%SZ": DataType.
|
|
41
|
-
"%Y-%m-%dT%H:%M:%S.%fZ": DataType.
|
|
80
|
+
"%Y-%m-%d %H:%M:%S": DataType.TIMESTAMP,
|
|
81
|
+
"%Y-%m-%d %H:%M": DataType.TIMESTAMP,
|
|
82
|
+
"%d/%m/%Y %H:%M:%S": DataType.TIMESTAMP,
|
|
83
|
+
"%d/%m/%Y %H:%M": DataType.TIMESTAMP,
|
|
84
|
+
"%m/%d/%Y %H:%M:%S": DataType.TIMESTAMP,
|
|
85
|
+
"%Y-%m-%dT%H:%M:%S": DataType.TIMESTAMP,
|
|
86
|
+
"%Y-%m-%dT%H:%M:%S.%f": DataType.TIMESTAMP,
|
|
87
|
+
"%Y-%m-%dT%H:%M:%S%z": DataType.TIMESTAMP,
|
|
88
|
+
"%Y-%m-%dT%H:%M:%S.%f%z": DataType.TIMESTAMP,
|
|
89
|
+
"%Y-%m-%dT%H:%M:%SZ": DataType.TIMESTAMP,
|
|
90
|
+
"%Y-%m-%dT%H:%M:%S.%fZ": DataType.TIMESTAMP,
|
|
42
91
|
}
|
|
43
92
|
self.enforce_no_null_columns: bool = True
|
|
44
93
|
self.enforce_primary_keys: bool = True
|
|
94
|
+
self.allow_bigint: bool = True
|
|
95
|
+
_apply_external_injections(self)
|
|
45
96
|
|
|
46
97
|
def __repr__(self):
|
|
47
98
|
date_list = list_as_bullets(
|
|
@@ -60,38 +111,43 @@ class Config:
|
|
|
60
111
|
f" - default_null_values={self.null_values}\n"
|
|
61
112
|
f" - forbidden_characters={self.forbidden_characters}\n"
|
|
62
113
|
f" - date_formats=[{date_list}\n ]\n"
|
|
114
|
+
f" - allow_bigint={self.allow_bigint}\n"
|
|
63
115
|
")"
|
|
64
116
|
)
|
|
65
117
|
|
|
66
118
|
# Context Wrapper With Reset
|
|
67
119
|
def __enter__(self):
|
|
68
|
-
global
|
|
69
|
-
|
|
120
|
+
global session_config
|
|
121
|
+
|
|
122
|
+
_apply_external_injections(self)
|
|
123
|
+
|
|
124
|
+
session_config = self
|
|
70
125
|
return self
|
|
71
126
|
|
|
72
127
|
def __exit__(self, exc_type, exc_value, traceback):
|
|
73
|
-
global
|
|
74
|
-
|
|
128
|
+
global session_config
|
|
129
|
+
session_config = Config()
|
|
75
130
|
|
|
76
131
|
|
|
77
|
-
|
|
132
|
+
session_config: Config = None
|
|
78
133
|
|
|
79
134
|
|
|
80
135
|
def get_config() -> Config:
|
|
81
|
-
"""Gets the current `
|
|
82
|
-
globally.
|
|
136
|
+
"""Gets the current `session_config` instance. Changing attributes will set them
|
|
137
|
+
globally for the python session. Use `reset_default_config()` to reset to original
|
|
138
|
+
defaults.
|
|
83
139
|
|
|
84
140
|
Returns:
|
|
85
|
-
Config: The current
|
|
141
|
+
Config: The current session configuration.
|
|
86
142
|
"""
|
|
87
|
-
global
|
|
88
|
-
return
|
|
143
|
+
global session_config
|
|
144
|
+
return session_config
|
|
89
145
|
|
|
90
146
|
|
|
91
147
|
def reset_default_config() -> None:
|
|
92
148
|
"""Resets `default_config` settings globally to original defaults."""
|
|
93
|
-
global
|
|
94
|
-
|
|
149
|
+
global session_config
|
|
150
|
+
session_config = Config()
|
|
95
151
|
|
|
96
152
|
|
|
97
153
|
reset_default_config()
|
valediction/io/csv_readers.py
CHANGED
|
@@ -11,7 +11,7 @@ import pandas as pd
|
|
|
11
11
|
from pandas import DataFrame
|
|
12
12
|
from pandas.errors import ParserError
|
|
13
13
|
|
|
14
|
-
from valediction.support import
|
|
14
|
+
from valediction.support import _strip
|
|
15
15
|
|
|
16
16
|
|
|
17
17
|
class FrameChunk(NamedTuple):
|
|
@@ -34,7 +34,7 @@ class FrameChunk(NamedTuple):
|
|
|
34
34
|
total_chunks_seen: int | None
|
|
35
35
|
|
|
36
36
|
def estimate_chunk_count(self) -> int:
|
|
37
|
-
# Buffers (accounting for CSV tails/bytes
|
|
37
|
+
# Buffers (accounting for CSV tails/bytes inaccuracy)
|
|
38
38
|
EPS_ABS = 4096 # Fixed
|
|
39
39
|
EPS_REL = 0.05 # 5% tail buffer
|
|
40
40
|
|
|
@@ -93,7 +93,7 @@ def _post_read_processing(df: DataFrame, cfg: CsvReadConfig) -> DataFrame:
|
|
|
93
93
|
"""Apply header normalisation and vectorised value stripping after reading."""
|
|
94
94
|
cfg = cfg or CsvReadConfig()
|
|
95
95
|
if cfg.normalise_headers:
|
|
96
|
-
df = df.rename(columns={c:
|
|
96
|
+
df = df.rename(columns={c: _strip(c) for c in df.columns})
|
|
97
97
|
if cfg.strip_values:
|
|
98
98
|
str_cols = df.select_dtypes(include=["string"]).columns
|
|
99
99
|
if len(str_cols) > 0:
|
valediction/support.py
CHANGED
|
@@ -35,10 +35,14 @@ def list_as_bullets(elements: list, bullet: str = "\n - ") -> str:
|
|
|
35
35
|
return bullet + bullet.join(elements)
|
|
36
36
|
|
|
37
37
|
|
|
38
|
-
def
|
|
38
|
+
def _normalise(name: str) -> str:
|
|
39
39
|
return name.strip().upper()
|
|
40
40
|
|
|
41
41
|
|
|
42
|
+
def _strip(name: str) -> str:
|
|
43
|
+
return name.strip()
|
|
44
|
+
|
|
45
|
+
|
|
42
46
|
def _get_runtime_string(runtime: timedelta) -> str:
|
|
43
47
|
total_seconds = runtime.total_seconds()
|
|
44
48
|
hours = trunc(total_seconds / 3600)
|
|
@@ -10,6 +10,7 @@ from pandas.util import hash_pandas_object
|
|
|
10
10
|
from valediction.data_types.data_types import DataType
|
|
11
11
|
from valediction.dictionary.model import Table
|
|
12
12
|
from valediction.integrity import get_config
|
|
13
|
+
from valediction.support import _normalise
|
|
13
14
|
from valediction.validation.issues import Range
|
|
14
15
|
|
|
15
16
|
|
|
@@ -17,11 +18,14 @@ from valediction.validation.issues import Range
|
|
|
17
18
|
def _set_nulls(df: DataFrame) -> DataFrame:
|
|
18
19
|
null_values = get_config().null_values
|
|
19
20
|
token_set = {str(t).strip().casefold() for t in null_values}
|
|
20
|
-
columns = df.select_dtypes(include=["string", "object"]).columns
|
|
21
|
+
columns = df.select_dtypes(include=["string", "object", "category"]).columns
|
|
21
22
|
for column in columns:
|
|
22
23
|
series = df[column]
|
|
23
|
-
|
|
24
|
-
|
|
24
|
+
|
|
25
|
+
s_txt = series.astype("string", copy=False) # dtype safe
|
|
26
|
+
mask = s_txt.notna() & s_txt.str.strip().str.casefold().isin(token_set)
|
|
27
|
+
if mask.any():
|
|
28
|
+
df[column] = series.mask(mask, NA)
|
|
25
29
|
|
|
26
30
|
return df
|
|
27
31
|
|
|
@@ -68,37 +72,24 @@ def create_pk_hashes(
|
|
|
68
72
|
Returns:
|
|
69
73
|
Series: Pandas Series with hashes or Nulls.
|
|
70
74
|
"""
|
|
71
|
-
|
|
75
|
+
HASH_COL_NAME = "PK_HASH"
|
|
72
76
|
if df_primaries.empty or df_primaries.shape[1] == 0:
|
|
73
|
-
return Series([], dtype=object, name=
|
|
77
|
+
return Series([], dtype=object, name=HASH_COL_NAME)
|
|
74
78
|
|
|
75
|
-
#
|
|
79
|
+
# Check Nulls
|
|
76
80
|
null_rows = df_primaries.isna().any(axis=1)
|
|
77
81
|
|
|
78
|
-
#
|
|
79
|
-
hash_1 = hash_pandas_object(df_primaries, index=False)
|
|
80
|
-
|
|
81
|
-
# Second Hash (rows backwards if single row, else salt)
|
|
82
|
-
if df_primaries.shape[1] > 1:
|
|
83
|
-
df_primaries_backwards = df_primaries.iloc[:, ::-1]
|
|
84
|
-
else:
|
|
85
|
-
s = df_primaries.iloc[:, 0]
|
|
86
|
-
salt = Series(["§"] * len(s), index=s.index, dtype="string")
|
|
87
|
-
df_primaries_backwards = DataFrame(
|
|
88
|
-
{
|
|
89
|
-
"_a": s,
|
|
90
|
-
"_b": s.str.cat(salt),
|
|
91
|
-
}
|
|
92
|
-
)
|
|
93
|
-
|
|
94
|
-
hash_2 = hash_pandas_object(df_primaries_backwards, index=False) # uint64
|
|
82
|
+
# Two independent 64-bit hashes with 16 byte keys
|
|
83
|
+
hash_1 = hash_pandas_object(df_primaries, index=False, hash_key="valediction_pk1!")
|
|
84
|
+
hash_2 = hash_pandas_object(df_primaries, index=False, hash_key="valediction_pk2!")
|
|
95
85
|
|
|
86
|
+
# Combine into 128-bit integer keys
|
|
96
87
|
a1 = hash_1.to_numpy(dtype="uint64", copy=False).astype(object)
|
|
97
88
|
a2 = hash_2.to_numpy(dtype="uint64", copy=False).astype(object)
|
|
98
|
-
|
|
99
89
|
combined = (a1 << 64) | a2
|
|
90
|
+
|
|
100
91
|
hashes = Series(
|
|
101
|
-
combined, index=df_primaries.index, name=
|
|
92
|
+
combined, index=df_primaries.index, name=HASH_COL_NAME, dtype=object
|
|
102
93
|
)
|
|
103
94
|
hashes[null_rows] = None
|
|
104
95
|
return hashes
|
|
@@ -167,8 +158,9 @@ def pk_contains_whitespace_mask(df_primaries: DataFrame) -> Series:
|
|
|
167
158
|
if df_primaries.empty or df_primaries.shape[1] == 0:
|
|
168
159
|
return Series(False, index=df_primaries.index)
|
|
169
160
|
|
|
170
|
-
col_masks = df_primaries.apply(
|
|
171
|
-
|
|
161
|
+
col_masks = df_primaries.apply(
|
|
162
|
+
lambda s: s.astype("string", copy=False).str.contains(r"\s", na=False)
|
|
163
|
+
)
|
|
172
164
|
return col_masks.any(axis=1)
|
|
173
165
|
|
|
174
166
|
|
|
@@ -249,7 +241,7 @@ def invalid_mask_datetime(column: Series, fmt: str | None) -> Series:
|
|
|
249
241
|
ok = parsed.notna()
|
|
250
242
|
return notnull & (~ok)
|
|
251
243
|
|
|
252
|
-
allowed = _allowed_formats_for(DataType.
|
|
244
|
+
allowed = _allowed_formats_for(DataType.TIMESTAMP)
|
|
253
245
|
ok_any = _parse_ok_any(column, allowed)
|
|
254
246
|
return notnull & (~ok_any)
|
|
255
247
|
|
|
@@ -261,7 +253,9 @@ def invalid_mask_text_too_long(column: Series, max_len: int) -> Series:
|
|
|
261
253
|
return Series(False, index=column.index)
|
|
262
254
|
|
|
263
255
|
notnull = column.notna()
|
|
264
|
-
|
|
256
|
+
s_txt = column.astype("string", copy=False)
|
|
257
|
+
lens = s_txt.str.len()
|
|
258
|
+
|
|
265
259
|
return notnull & (lens > max_len)
|
|
266
260
|
|
|
267
261
|
|
|
@@ -270,20 +264,23 @@ def invalid_mask_text_forbidden_characters(column: Series) -> Series:
|
|
|
270
264
|
if not forbidden:
|
|
271
265
|
return column.notna() & False
|
|
272
266
|
|
|
273
|
-
pattern = "[" + re.escape("".join(forbidden)) + "]"
|
|
267
|
+
pattern = "[" + re.escape("".join([str(s) for s in forbidden])) + "]"
|
|
274
268
|
notnull = column.notna()
|
|
275
|
-
|
|
269
|
+
|
|
270
|
+
s_txt = column.astype("string", copy=False)
|
|
271
|
+
has_forbidden = s_txt.str.contains(pattern, regex=True, na=False)
|
|
272
|
+
|
|
276
273
|
return notnull & has_forbidden
|
|
277
274
|
|
|
278
275
|
|
|
279
276
|
# Apply Data Types #
|
|
280
277
|
def apply_data_types(df: DataFrame, table_dictionary: Table) -> DataFrame:
|
|
281
278
|
# name -> column object
|
|
282
|
-
column_dictionary = {column.name: column for column in table_dictionary}
|
|
279
|
+
column_dictionary = {_normalise(column.name): column for column in table_dictionary}
|
|
283
280
|
|
|
284
281
|
for col in df.columns:
|
|
285
|
-
data_type = column_dictionary.get(col).data_type
|
|
286
|
-
datetime_format = column_dictionary.get(col).datetime_format
|
|
282
|
+
data_type = column_dictionary.get(_normalise(col)).data_type
|
|
283
|
+
datetime_format = column_dictionary.get(_normalise(col)).datetime_format
|
|
287
284
|
|
|
288
285
|
if data_type in (DataType.TEXT, DataType.FILE):
|
|
289
286
|
df[col] = df[col].astype("string")
|
|
@@ -303,7 +300,7 @@ def apply_data_types(df: DataFrame, table_dictionary: Table) -> DataFrame:
|
|
|
303
300
|
)
|
|
304
301
|
df[col] = dtv.dt.normalize() # midnight
|
|
305
302
|
|
|
306
|
-
elif data_type == DataType.
|
|
303
|
+
elif data_type == DataType.TIMESTAMP:
|
|
307
304
|
df[col] = to_datetime(
|
|
308
305
|
df[col], format=datetime_format, errors="raise", utc=False
|
|
309
306
|
)
|
|
@@ -313,3 +310,62 @@ def apply_data_types(df: DataFrame, table_dictionary: Table) -> DataFrame:
|
|
|
313
310
|
df[col] = df[col].astype("string")
|
|
314
311
|
|
|
315
312
|
return df
|
|
313
|
+
|
|
314
|
+
|
|
315
|
+
# Bigint Checks
|
|
316
|
+
_PG_INT4_MIN_STR_ABS = "2147483648" # abs(-2147483648)
|
|
317
|
+
_PG_INT4_MAX_STR_ABS = "2147483647"
|
|
318
|
+
_PG_INT4_MIN_LEN = len(_PG_INT4_MIN_STR_ABS)
|
|
319
|
+
_PG_INT4_MAX_LEN = len(_PG_INT4_MAX_STR_ABS)
|
|
320
|
+
|
|
321
|
+
|
|
322
|
+
def invalid_mask_integer_out_of_range(
|
|
323
|
+
series: Series,
|
|
324
|
+
invalid_integer_mask: Series | None = None,
|
|
325
|
+
) -> Series:
|
|
326
|
+
"""
|
|
327
|
+
Returns a boolean mask for values that:
|
|
328
|
+
- are integer-like under Valediction's integer rules, AND
|
|
329
|
+
- fall outside PostgreSQL INTEGER (int4) range.
|
|
330
|
+
"""
|
|
331
|
+
|
|
332
|
+
# Start with all-False mask
|
|
333
|
+
out = series.isna() & False
|
|
334
|
+
|
|
335
|
+
# Use caller-provided invalid mask to avoid recomputing if available
|
|
336
|
+
if invalid_integer_mask is None:
|
|
337
|
+
from valediction.validation.helpers import invalid_mask_integer # avoid cycles
|
|
338
|
+
|
|
339
|
+
invalid_integer_mask = invalid_mask_integer(series)
|
|
340
|
+
|
|
341
|
+
# We only check range for values that already pass integer validation
|
|
342
|
+
valid = (~invalid_integer_mask) & series.notna()
|
|
343
|
+
if not valid.any():
|
|
344
|
+
return out
|
|
345
|
+
|
|
346
|
+
# String-normalise for safe compare (works for object/int dtype)
|
|
347
|
+
s = series[valid].astype("string", copy=False).str.strip()
|
|
348
|
+
|
|
349
|
+
# Sign handling
|
|
350
|
+
neg = s.str.startswith("-")
|
|
351
|
+
abs_str = s.str.lstrip("+-")
|
|
352
|
+
|
|
353
|
+
# Lengths
|
|
354
|
+
abs_len = abs_str.str.len()
|
|
355
|
+
|
|
356
|
+
# Positive overflow:
|
|
357
|
+
# abs_len > 10 OR (abs_len == 10 AND abs_str > 2147483647)
|
|
358
|
+
pos = ~neg
|
|
359
|
+
pos_over = (abs_len > _PG_INT4_MAX_LEN) | (
|
|
360
|
+
(abs_len == _PG_INT4_MAX_LEN) & (abs_str > _PG_INT4_MAX_STR_ABS)
|
|
361
|
+
)
|
|
362
|
+
|
|
363
|
+
# Negative overflow (too small):
|
|
364
|
+
# abs_len > 10 OR (abs_len == 10 AND abs_str > 2147483648)
|
|
365
|
+
neg_over = (abs_len > _PG_INT4_MIN_LEN) | (
|
|
366
|
+
(abs_len == _PG_INT4_MIN_LEN) & (abs_str > _PG_INT4_MIN_STR_ABS)
|
|
367
|
+
)
|
|
368
|
+
|
|
369
|
+
# Combine back into the full index
|
|
370
|
+
out.loc[valid] = (pos & pos_over) | (neg & neg_over)
|
|
371
|
+
return out
|