valediction-1.0.3-py3-none-any.whl → valediction-1.2.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- valediction/convenience.py +7 -12
- valediction/datasets/datasets.py +17 -17
- valediction/dictionary/generation.py +5 -5
- valediction/dictionary/helpers.py +0 -7
- valediction/dictionary/importing.py +43 -20
- valediction/dictionary/model.py +108 -36
- valediction/integrity.py +67 -13
- valediction/io/csv_readers.py +3 -3
- valediction/support.py +5 -1
- valediction/validation/helpers.py +30 -33
- valediction/validation/issues.py +37 -25
- valediction/validation/validation.py +102 -53
- {valediction-1.0.3.dist-info → valediction-1.2.0.dist-info}/METADATA +1 -1
- {valediction-1.0.3.dist-info → valediction-1.2.0.dist-info}/RECORD +15 -15
- {valediction-1.0.3.dist-info → valediction-1.2.0.dist-info}/WHEEL +0 -0
valediction/integrity.py
CHANGED
@@ -1,6 +1,10 @@
+from __future__ import annotations
+
 import re
+from copy import deepcopy
 from pathlib import Path
 from re import Pattern
+from typing import Any
 
 from valediction.data_types.data_types import DataType
 from valediction.support import list_as_bullets
@@ -12,13 +16,58 @@ TEMPLATE_DATA_DICTIONARY_PATH = (
 )
 
 
+externally_injected_variables: dict[
+    str, Any
+] = {}  # External injection store for package wrapping (any keys, always included)
+
+
+def reset_injected_config_variables() -> None:
+    global externally_injected_variables
+    externally_injected_variables = {}
+
+
+def inject_config_variables(variables: dict[str, Any]) -> None:
+    """Injects variables into the Valediction Config, which will always be incorporated
+    as overrides, regardless of Config calling method (default, session-scoped, or
+    contextual).
+
+    Args:
+        variables (dict[str, Any]): Dictionary of config variables.
+    """
+    global externally_injected_variables, session_config
+
+    # check type allows
+    if not isinstance(variables, dict):
+        raise TypeError(
+            f"Config injection variables must be a dictionary, not {type(variables)}"
+        )
+    problematic_keys = []
+    for variable_name in variables.keys():
+        if not isinstance(variable_name, str):
+            problematic_keys.append(variable_name)
+
+    if problematic_keys:
+        raise TypeError("Config injection variables accepts only string keys.")
+
+    externally_injected_variables = dict(variables or {})
+
+    # Apply immediately to the current session config (if it exists)
+    if session_config is not None:
+        _apply_external_injections(session_config)
+
+
+def _apply_external_injections(config: Config) -> None:
+    for variable_name, variable_value in externally_injected_variables.items():
+        setattr(config, variable_name, deepcopy(variable_value))
+
+
 class Config:
     def __init__(self):
         self.template_data_dictionary_path: Path = TEMPLATE_DATA_DICTIONARY_PATH
         self.max_table_name_length: int = 63
         self.max_column_name_length: int = 30
         self.max_primary_keys: int = 7
-        self.invalid_name_pattern: str | Pattern = re.compile(r"[^A-
+        self.invalid_name_pattern: str | Pattern = re.compile(r"[^A-Za-z0-9_]")
         self.null_values: list[str] = ["", "null", "none"]
         self.forbidden_characters: list[str] = []
         self.date_formats: dict[str, DataType] = {
@@ -42,6 +91,7 @@ class Config:
         }
         self.enforce_no_null_columns: bool = True
         self.enforce_primary_keys: bool = True
+        _apply_external_injections(self)
 
     def __repr__(self):
         date_list = list_as_bullets(
@@ -65,33 +115,37 @@ class Config:
 
     # Context Wrapper With Reset
     def __enter__(self):
-        global
-
+        global session_config
+
+        _apply_external_injections(self)
+
+        session_config = self
         return self
 
     def __exit__(self, exc_type, exc_value, traceback):
-        global
-
+        global session_config
+        session_config = Config()
 
 
-
+session_config: Config = None
 
 
 def get_config() -> Config:
-    """Gets the current `
-    globally.
+    """Gets the current `session_config` instance. Changing attributes will set them
+    globally for the python session. Use `reset_default_config()` to reset to original
+    defaults.
 
     Returns:
-        Config: The current
+        Config: The current session configuration.
     """
-    global
-    return
+    global session_config
+    return session_config
 
 
 def reset_default_config() -> None:
     """Resets `default_config` settings globally to original defaults."""
-    global
-
+    global session_config
+    session_config = Config()
 
 
 reset_default_config()
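
A minimal usage sketch of the new injection API, based only on the definitions visible in the diff above (not on package documentation):

from valediction.integrity import (
    Config,
    get_config,
    inject_config_variables,
    reset_injected_config_variables,
)

# Injected overrides are re-applied whenever a Config is built: in __init__,
# on __enter__, and immediately to the live session config.
inject_config_variables({"max_primary_keys": 3})
assert get_config().max_primary_keys == 3

with Config() as config:  # context-scoped config receives the same override
    assert config.max_primary_keys == 3

reset_injected_config_variables()  # empties the store; already-built configs keep their values

Since __exit__ replaces the session config with a fresh Config(), and Config.__init__ now ends with _apply_external_injections(self), injected values survive context resets until the store itself is cleared.
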
valediction/io/csv_readers.py
CHANGED
@@ -11,7 +11,7 @@ import pandas as pd
 from pandas import DataFrame
 from pandas.errors import ParserError
 
-from valediction.support import
+from valediction.support import _strip
 
 
 class FrameChunk(NamedTuple):
@@ -34,7 +34,7 @@ class FrameChunk(NamedTuple):
     total_chunks_seen: int | None
 
     def estimate_chunk_count(self) -> int:
-        # Buffers (accounting for CSV tails/bytes
+        # Buffers (accounting for CSV tails/bytes inaccuracy)
        EPS_ABS = 4096  # Fixed
        EPS_REL = 0.05  # 5% tail buffer
 
@@ -93,7 +93,7 @@ def _post_read_processing(df: DataFrame, cfg: CsvReadConfig) -> DataFrame:
     """Apply header normalisation and vectorised value stripping after reading."""
     cfg = cfg or CsvReadConfig()
     if cfg.normalise_headers:
-        df = df.rename(columns={c:
+        df = df.rename(columns={c: _strip(c) for c in df.columns})
     if cfg.strip_values:
         str_cols = df.select_dtypes(include=["string"]).columns
         if len(str_cols) > 0:
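
The substantive change here is that header normalisation now maps each label through the new _strip helper. A self-contained sketch of the same post-read pattern in plain pandas, using plain str.strip() in place of the helper and toy data:

import pandas as pd

df = pd.DataFrame({" id ": [1, 2], "name ": pd.array(["a ", " b"], dtype="string")})

# Header normalisation: strip whitespace from labels, as in _post_read_processing
df = df.rename(columns={c: str(c).strip() for c in df.columns})

# Vectorised value stripping for string-dtype columns (the cfg.strip_values path)
for col in df.select_dtypes(include=["string"]).columns:
    df[col] = df[col].str.strip()

print(list(df.columns))     # ['id', 'name']
print(df["name"].tolist())  # ['a', 'b']
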
valediction/support.py
CHANGED
@@ -35,10 +35,14 @@ def list_as_bullets(elements: list, bullet: str = "\n - ") -> str:
     return bullet + bullet.join(elements)
 
 
-def
+def _normalise(name: str) -> str:
     return name.strip().upper()
 
 
+def _strip(name: str) -> str:
+    return name.strip()
+
+
 def _get_runtime_string(runtime: timedelta) -> str:
     total_seconds = runtime.total_seconds()
     hours = trunc(total_seconds / 3600)
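
These two tiny helpers underpin most of this release: _strip for whitespace-insensitive keys in issues.py, _normalise wherever dictionary names are compared to file headers. Their behaviour, restated with the definitions shown above:

assert "  Patient_ID ".strip().upper() == "PATIENT_ID"  # _normalise
assert "  Patient_ID ".strip() == "Patient_ID"          # _strip

# Two labels count as "the same column" when their normalised forms match:
assert " patient_id".strip().upper() == "Patient_ID".strip().upper()
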
valediction/validation/helpers.py
CHANGED
@@ -10,6 +10,7 @@ from pandas.util import hash_pandas_object
 from valediction.data_types.data_types import DataType
 from valediction.dictionary.model import Table
 from valediction.integrity import get_config
+from valediction.support import _normalise
 from valediction.validation.issues import Range
 
 
@@ -17,11 +18,14 @@ from valediction.validation.issues import Range
 def _set_nulls(df: DataFrame) -> DataFrame:
     null_values = get_config().null_values
     token_set = {str(t).strip().casefold() for t in null_values}
-    columns = df.select_dtypes(include=["string", "object"]).columns
+    columns = df.select_dtypes(include=["string", "object", "category"]).columns
     for column in columns:
         series = df[column]
-
-
+
+        s_txt = series.astype("string", copy=False)  # dtype safe
+        mask = s_txt.notna() & s_txt.str.strip().str.casefold().isin(token_set)
+        if mask.any():
+            df[column] = series.mask(mask, NA)
 
     return df
 
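
_set_nulls now also scans category columns and casts each series to the pandas string dtype before comparing, so mixed object columns cannot break the token match. A self-contained sketch of the masking step, using the default null tokens from integrity.py and toy data:

import pandas as pd
from pandas import NA

token_set = {t.strip().casefold() for t in ["", "null", "none"]}

df = pd.DataFrame({"col": ["A", " NULL ", "None", "b"]})

series = df["col"]
s_txt = series.astype("string", copy=False)  # dtype-safe view for the comparison
mask = s_txt.notna() & s_txt.str.strip().str.casefold().isin(token_set)
if mask.any():
    df["col"] = series.mask(mask, NA)

print(df["col"].isna().tolist())  # [False, True, True, False]
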
@@ -68,37 +72,24 @@ def create_pk_hashes(
     Returns:
         Series: Pandas Series with hashes or Nulls.
     """
-
+    HASH_COL_NAME = "PK_HASH"
     if df_primaries.empty or df_primaries.shape[1] == 0:
-        return Series([], dtype=object, name=
+        return Series([], dtype=object, name=HASH_COL_NAME)
 
-    #
+    # Check Nulls
     null_rows = df_primaries.isna().any(axis=1)
 
-    #
-    hash_1 = hash_pandas_object(df_primaries, index=False)
-
-    # Second Hash (rows backwards if single row, else salt)
-    if df_primaries.shape[1] > 1:
-        df_primaries_backwards = df_primaries.iloc[:, ::-1]
-    else:
-        s = df_primaries.iloc[:, 0]
-        salt = Series(["§"] * len(s), index=s.index, dtype="string")
-        df_primaries_backwards = DataFrame(
-            {
-                "_a": s,
-                "_b": s.str.cat(salt),
-            }
-        )
-
-    hash_2 = hash_pandas_object(df_primaries_backwards, index=False)  # uint64
+    # Two independent 64-bit hashes with 16 byte keys
+    hash_1 = hash_pandas_object(df_primaries, index=False, hash_key="valediction_pk1!")
+    hash_2 = hash_pandas_object(df_primaries, index=False, hash_key="valediction_pk2!")
 
+    # Combine into 128-bit integer keys
     a1 = hash_1.to_numpy(dtype="uint64", copy=False).astype(object)
     a2 = hash_2.to_numpy(dtype="uint64", copy=False).astype(object)
-
     combined = (a1 << 64) | a2
+
     hashes = Series(
-        combined, index=df_primaries.index, name=
+        combined, index=df_primaries.index, name=HASH_COL_NAME, dtype=object
     )
     hashes[null_rows] = None
     return hashes
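
The replaced implementation derived its second hash from a reversed (or salted) copy of the key frame; the new one simply keys hash_pandas_object twice. The hash_key argument must be exactly 16 characters, which both "valediction_pk1!" and "valediction_pk2!" are, and packing two independent 64-bit hashes into one 128-bit integer makes accidental collisions across chunks far less likely. The combination step on toy data:

import pandas as pd
from pandas.util import hash_pandas_object

df_keys = pd.DataFrame({"mrn": ["001", "002"], "visit": ["A", "B"]})

# Two independent 64-bit row hashes under different 16-character keys
h1 = hash_pandas_object(df_keys, index=False, hash_key="valediction_pk1!")
h2 = hash_pandas_object(df_keys, index=False, hash_key="valediction_pk2!")

# Widen to Python ints (object dtype) so the shift cannot wrap at 64 bits,
# then pack both hashes into a single 128-bit integer per row
a1 = h1.to_numpy(dtype="uint64", copy=False).astype(object)
a2 = h2.to_numpy(dtype="uint64", copy=False).astype(object)
combined = (a1 << 64) | a2

print([hex(v) for v in combined])  # one 128-bit value per row
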
@@ -167,8 +158,9 @@ def pk_contains_whitespace_mask(df_primaries: DataFrame) -> Series:
     if df_primaries.empty or df_primaries.shape[1] == 0:
         return Series(False, index=df_primaries.index)
 
-    col_masks = df_primaries.apply(
-
+    col_masks = df_primaries.apply(
+        lambda s: s.astype("string", copy=False).str.contains(r"\s", na=False)
+    )
     return col_masks.any(axis=1)
 
 
@@ -261,7 +253,9 @@ def invalid_mask_text_too_long(column: Series, max_len: int) -> Series:
         return Series(False, index=column.index)
 
     notnull = column.notna()
-
+    s_txt = column.astype("string", copy=False)
+    lens = s_txt.str.len()
+
     return notnull & (lens > max_len)
 
 
@@ -270,20 +264,23 @@ def invalid_mask_text_forbidden_characters(column: Series) -> Series:
     if not forbidden:
         return column.notna() & False
 
-    pattern = "[" + re.escape("".join(forbidden)) + "]"
+    pattern = "[" + re.escape("".join([str(s) for s in forbidden])) + "]"
     notnull = column.notna()
-
+
+    s_txt = column.astype("string", copy=False)
+    has_forbidden = s_txt.str.contains(pattern, regex=True, na=False)
+
     return notnull & has_forbidden
 
 
 # Apply Data Types #
 def apply_data_types(df: DataFrame, table_dictionary: Table) -> DataFrame:
     # name -> column object
-    column_dictionary = {column.name: column for column in table_dictionary}
+    column_dictionary = {_normalise(column.name): column for column in table_dictionary}
 
     for col in df.columns:
-        data_type = column_dictionary.get(col).data_type
-        datetime_format = column_dictionary.get(col).datetime_format
+        data_type = column_dictionary.get(_normalise(col)).data_type
+        datetime_format = column_dictionary.get(_normalise(col)).datetime_format
 
         if data_type in (DataType.TEXT, DataType.FILE):
            df[col] = df[col].astype("string")
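
apply_data_types now keys its lookup by _normalise(column.name), so a dictionary entry matches a CSV header regardless of case or surrounding whitespace. The lookup pattern, with a hypothetical stand-in for the dictionary's column model:

from dataclasses import dataclass

def _normalise(name: str) -> str:  # as defined in valediction/support.py
    return name.strip().upper()

@dataclass
class Column:  # hypothetical stand-in for the dictionary column object
    name: str
    data_type: str

columns = [Column("Patient_ID", "TEXT"), Column("Admit_Date", "DATE")]
column_dictionary = {_normalise(c.name): c for c in columns}

for header in ["PATIENT_ID ", "admit_date"]:
    match = column_dictionary.get(_normalise(header))
    print(header, "->", match.data_type if match else "unknown")

Note that .get() still returns None for headers absent from the dictionary, so the unguarded .data_type access in apply_data_types presumably relies on the extra-column check having already run.
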
valediction/validation/issues.py
CHANGED
@@ -8,7 +8,7 @@ from pandas import DataFrame, concat
 
 from valediction.datasets.datasets_helpers import DatasetItemLike
 from valediction.io.csv_readers import CsvReadConfig, read_csv_ranges
-from valediction.support import
+from valediction.support import _strip, list_as_bullets
 
 
 class IssueType(Enum):
@@ -107,6 +107,7 @@ class Issue:
             merged.append(cur)
         self.ranges = merged
 
+    # Inspect
     def inspect(
         self,
         additional_columns: bool | str | list[str] | None = None,
@@ -132,9 +133,9 @@ class Issue:
             ValueError: if the issue has no parent DatasetItem
         """
         # Guard
-
-            raise ValueError("Issue has no parent DatasetItem")
+        self.__guard_parent()
         header = self.__repr__() if print_header else ""
+
         # Not applicable
         if self.type in APPLIES_WHOLE_COLUMN:
             print(f"{header}: applies to whole column")
@@ -143,22 +144,8 @@ class Issue:
         # Column Inclusion
         if print_header:
             print(f"{header}:")
-
-
-        else:
-            additional_columns = (
-                [additional_columns]
-                if isinstance(additional_columns, str)
-                else additional_columns
-            )
-            base = (
-                set(self.parent.primary_keys)
-                if self.type in PRIMARY_KEY_ISSUES
-                else {self.column}
-            )
-            base |= set(additional_columns or [])
-            base.discard(None)
-            columns = list(base) if base else None
+
+        columns = self.__select_columns(additional_columns)
 
         if not self.ranges:
             return DataFrame(columns=columns) if columns else DataFrame()
@@ -194,6 +181,31 @@ class Issue:
 
         return out if columns is None else out.loc[:, columns]
 
+    # Inspect Helpers
+    def __guard_parent(self):
+        if not self.parent:
+            raise ValueError("Issue has no parent DatasetItem")
+
+    def __select_columns(self, additional_columns: bool | str | list[str]) -> list:
+        if additional_columns is True:
+            columns = None
+        else:
+            additional_columns = (
+                [additional_columns]
+                if isinstance(additional_columns, str)
+                else additional_columns
+            )
+            base = (
+                set(self.parent.primary_keys)
+                if self.type in PRIMARY_KEY_ISSUES
+                else {self.column}
+            )
+            base |= set(additional_columns or [])
+            base.discard(None)
+            columns = list(base) if base else None
+
+        return columns
+
 
 @dataclass
 class Issues:
@@ -235,8 +247,8 @@ class Issues:
         parent: DatasetItemLike | None = None,
     ) -> Issue:
         key = (
-
-
+            _strip(table),
+            _strip(column) if column is not None else None,
             issue_type,
         )
         issue = self._index.get(key)
@@ -255,8 +267,8 @@ class Issues:
         issue_type: IssueType | None = None,
     ) -> list[Issue]:
         """Case-insensitive filter; any arg can be None to act as a wildcard."""
-        table =
-        column =
+        table = _strip(table)
+        column = _strip(column) if column is not None else None
         output: list[Issue] = []
         if issue_type is not None:
             # direct index lookup where possible
@@ -268,9 +280,9 @@ class Issues:
 
         # otherwise scan (still cheap; we maintain a compact list)
         for item in self._items:
-            if
+            if _strip(item.table) != table:
                 continue
-            if column is not None and (
+            if column is not None and (_strip(item.column) or "") != column:
                 continue
             output.append(item)
         return output
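
The inspect refactor moves the column-selection rules into __select_columns: additional_columns=True keeps every column (None signals "all"), primary-key issues start from the parent's primary keys, and anything else starts from the issue's own column. The same rules in standalone form, with hypothetical inputs:

def select_columns(issue_column, primary_keys, is_pk_issue, additional_columns):
    # Standalone restatement of Issue.__select_columns from the diff above
    if additional_columns is True:
        return None  # None means "keep all columns"
    if isinstance(additional_columns, str):
        additional_columns = [additional_columns]
    base = set(primary_keys) if is_pk_issue else {issue_column}
    base |= set(additional_columns or [])
    base.discard(None)
    return list(base) if base else None

print(sorted(select_columns("dob", ["mrn"], False, "name")))       # ['dob', 'name']
print(sorted(select_columns(None, ["mrn", "visit"], True, None)))  # ['mrn', 'visit']
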
valediction/validation/validation.py
CHANGED
@@ -20,7 +20,7 @@ from valediction.io.csv_readers import (
     iter_csv_chunks,
 )
 from valediction.progress import Progress
-from valediction.support import _get_runtime_string, calculate_runtime
+from valediction.support import _get_runtime_string, _normalise, calculate_runtime
 from valediction.validation.helpers import (
     _column_has_values,
     _set_nulls,
@@ -62,7 +62,7 @@ class Validator:
         dataset_item: DatasetItemLike,
         table_dictionary: Table,
         feedback: bool = True,
-        chunk_size: int = 10_000_000,
+        chunk_size: int | None = 10_000_000,
         _padding: int = 0,
     ):
         # User Variables
@@ -86,7 +86,9 @@ class Validator:
         self._dt_needs_infer: set[str] = set()
 
         # Helpers
-        self._column_names: set =
+        self._column_names: set[str] = {
+            _normalise(n) for n in self.table_dictionary.get_column_names()
+        }
 
         # Progress Tracking
         self.progress: Progress | None = None
@@ -155,6 +157,20 @@ class Validator:
         if not datetime_format:
             self._dt_needs_infer.add(name)
 
+    # Column Scanning
+    def _resolve_df_col(self, df: DataFrame, name: str) -> str | None:
+        """Return the actual df column label matching name case-insensitively."""
+        target = _normalise(name)
+        return next((c for c in df.columns if _normalise(str(c)) == target), None)
+
+    def _resolve_df_cols(self, df: DataFrame, names: list[str]) -> list[str]:
+        resolved: list[str] = []
+        for n in names:
+            c = self._resolve_df_col(df, n)
+            if c is not None:
+                resolved.append(c)
+        return resolved
+
     # Validate
     def validate(self):
         """
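
_resolve_df_col gives the Validator one place to translate a dictionary name into the DataFrame's actual label, so downstream indexing always uses the spelling the file really contains. The pattern in standalone form, on toy columns:

import pandas as pd

def _normalise(name: str) -> str:  # as defined in valediction/support.py
    return name.strip().upper()

def resolve_df_col(df: pd.DataFrame, name: str) -> str | None:
    # Same pattern as Validator._resolve_df_col above
    target = _normalise(name)
    return next((c for c in df.columns if _normalise(str(c)) == target), None)

df = pd.DataFrame(columns=["Patient_ID ", "ADMIT_DATE"])
print(resolve_df_col(df, "patient_id"))  # 'Patient_ID ' (the label as it appears)
print(resolve_df_col(df, "missing"))     # None
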
@@ -272,28 +288,45 @@ class Validator:
     # Validation: Start Helpers
     def _check_for_missing_columns(self, df: DataFrame):
         self.__begin_step(step="Checking for missing columns")
-
-
-
-
-
-
-
-
-
+
+        dict_names = self.table_dictionary.get_column_names()
+        dict_keys = {_normalise(name) for name in dict_names}
+
+        df_keys = {_normalise(str(column)) for column in df.columns}
+
+        missing_keys = dict_keys - df_keys
+        if missing_keys:
+            for name in dict_names:
+                if _normalise(name) in missing_keys:
+                    self.issues.add(
+                        issue_type=IssueType.MISSING_COLUMN,
+                        table=self.table_name,
+                        column=name,
+                        parent=self.dataset_item,
+                    )
+
         self.__complete_step()
 
     def _check_for_extra_columns(self, df: DataFrame):
         self.__begin_step(step="Checking for extra columns")
-
-
-        for
-
-
-
-
-
-
+
+        dict_keys = {
+            _normalise(name) for name in self.table_dictionary.get_column_names()
+        }
+        df_cols = [str(column) for column in df.columns]
+        df_keys = {_normalise(column) for column in df_cols}
+
+        extra_keys = df_keys - dict_keys
+        if extra_keys:
+            for col in df_cols:
+                if _normalise(col) in extra_keys:
+                    self.issues.add(
+                        issue_type=IssueType.EXTRA_COLUMN,
+                        table=self.table_name,
+                        column=col,  # report the actual df label
+                        parent=self.dataset_item,
+                    )
+
         self.__complete_step()
 
     # Validation: Chunk Helpers
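
Both checks now reduce to set differences over normalised names, then map back to original spellings for reporting: missing columns are reported with the dictionary's spelling, extra columns with the file's actual label. The core of it, stripped of the issue bookkeeping and run on toy names:

def _normalise(name: str) -> str:  # as defined in valediction/support.py
    return name.strip().upper()

dict_names = ["Patient_ID", "Admit_Date", "Ward"]
df_cols = ["PATIENT_ID", "ward ", "Extra_Col"]

dict_keys = {_normalise(n) for n in dict_names}
df_keys = {_normalise(c) for c in df_cols}

missing = [n for n in dict_names if _normalise(n) in dict_keys - df_keys]
extra = [c for c in df_cols if _normalise(c) in df_keys - dict_keys]

print(missing)  # ['Admit_Date']  (dictionary spelling)
print(extra)    # ['Extra_Col']   (the file's actual label)
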
@@ -319,13 +352,16 @@ class Validator:
 
         # Check for whitespace (text cols only)
         self.__begin_step(step="Checking for primary key whitespace")
-
-
-
-
+        pk_keys = {_normalise(p) for p in pk_cols}
+        pk_cols_text = [
+            column.name
+            for column in self.table_dictionary
+            if _normalise(column.name) in pk_keys and column.data_type is DataType.TEXT
+        ]
 
         if pk_cols_text:
-
+            pk_cols_text_df = self._resolve_df_cols(df, pk_cols_text)
+            space_mask = pk_contains_whitespace_mask(df[pk_cols_text_df])
             if space_mask.any():
                 self.issues.add(
                     issue_type=IssueType.PK_WHITESPACE,
@@ -343,7 +379,9 @@ class Validator:
 
         # Create primary key hashes
         self.__begin_step(step="Creating primary key hashes")
-
+        pk_cols_df = self._resolve_df_cols(df, pk_cols)
+        pk_hashes = create_pk_hashes(df[pk_cols_df])
+
         self.__complete_step()
 
         # Primary Key Nulls
@@ -437,44 +475,51 @@ class Validator:
             self.__complete_step()
             return
 
-
-
+        cols = [
+            (dict_col, df_col)
+            for dict_col in self._dt_needs_infer
+            if (df_col := self._resolve_df_col(df, dict_col)) is not None
+        ]
+        if not cols:
             self.__complete_step()
             return
 
-
-
-
+        from valediction.validation.helpers import _allowed_formats_for
+
+        for dict_col, df_col in cols:
+            unique = (
+                df[df_col].astype("string", copy=False).str.strip().dropna().unique()
+            )
             if len(unique) == 0:
                 continue
 
             try:
-
+                fmt = infer_datetime_format(Series(unique, dtype="string"))
             except ValueError:
-                # ambiguous - try again in later chunk
                 continue
 
-            if
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-                pass
+            if not fmt or fmt is False:
+                continue
+
+            col_dtype = self._find_data_type(dict_col)  # case-insensitive getter
+            if fmt not in _allowed_formats_for(col_dtype):
+                continue
+
+            self._dt_format_cache[dict_col] = fmt
+            self._dt_needs_infer.discard(dict_col)
+
+            try:
+                self.table_dictionary.get_column(dict_col).datetime_format = fmt
+            except Exception:
+                pass
+
         self.__complete_step()
 
     def _check_column_types(self, df: DataFrame, start_row: int) -> None:
         self.__begin_step(step="Checking column types")
-        present = [
+        present = [
+            col for col in df.columns if _normalise(str(col)) in self._column_names
+        ]
         for col in present:
             dtype = self._find_data_type(col)
             if dtype == DataType.TEXT:
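
The datetime inference is incremental across chunks: a column stays in _dt_needs_infer until some chunk yields a format that is both unambiguous and allowed for the column's declared type, at which point it is cached and written back to the dictionary. A toy sketch of that control flow; infer_format and the two-candidate list are illustrative stand-ins, not the package's infer_datetime_format or _allowed_formats_for:

from datetime import datetime

CANDIDATES = ["%d/%m/%Y", "%m/%d/%Y"]  # illustrative candidate formats (assumption)

def _parses(value: str, fmt: str) -> bool:
    try:
        datetime.strptime(value, fmt)
        return True
    except ValueError:
        return False

def infer_format(values: list[str]) -> str | None:
    """Toy inference: the single candidate that parses every value."""
    matches = [f for f in CANDIDATES if all(_parses(v, f) for v in values)]
    if len(matches) > 1:
        raise ValueError("ambiguous")  # mirrors the except ValueError: continue path
    return matches[0] if matches else None

needs_infer = {"admit_date"}
format_cache: dict[str, str] = {}

for chunk in (["01/02/2024"], ["13/02/2024", "14/02/2024"]):
    for col in list(needs_infer):
        try:
            fmt = infer_format(chunk)
        except ValueError:
            continue  # ambiguous in this chunk - try again with later data
        if fmt:
            format_cache[col] = fmt
            needs_infer.discard(col)  # settled; later chunks skip this column

print(format_cache)  # {'admit_date': '%d/%m/%Y'}

The first chunk parses under both candidates, so inference defers; the second chunk's day 13 rules one out, and the format is cached for all remaining chunks.
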
@@ -506,7 +551,9 @@ class Validator:
 
     def _check_text_lengths(self, df: DataFrame, start_row: int) -> None:
         self.__begin_step(step="Checking text lengths")
-        present = [
+        present = [
+            col for col in df.columns if _normalise(str(col)) in self._column_names
+        ]
         for col in present:
             if self._find_data_type(col) != DataType.TEXT:
                 continue
@@ -524,7 +571,9 @@ class Validator:
 
     def _check_text_forbidden_chars(self, df: DataFrame, start_row: int) -> None:
         self.__begin_step(step="Checking for forbidden characters")
-        present = [
+        present = [
+            col for col in df.columns if _normalise(str(col)) in self._column_names
+        ]
         for col in present:
             if self._find_data_type(col) != DataType.TEXT:
                 continue

{valediction-1.0.3.dist-info → valediction-1.2.0.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: valediction
-Version: 1.0
+Version: 1.2.0
 Summary: Valediction is a convenience data validation package that allows generation, import, and constraint enforcement of user-defined data dictionaries against datasets.
 Author-email: Cai Davis <Cai.Davis@uhs.nhs.uk>
 Requires-Python: <4.0,>=3.11