valediction 1.1.0__py3-none-any.whl → 1.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- valediction/datasets/datasets.py +12 -12
- valediction/dictionary/generation.py +5 -5
- valediction/dictionary/helpers.py +0 -7
- valediction/dictionary/importing.py +43 -20
- valediction/dictionary/model.py +108 -36
- valediction/integrity.py +67 -13
- valediction/io/csv_readers.py +3 -3
- valediction/support.py +5 -1
- valediction/validation/helpers.py +30 -33
- valediction/validation/issues.py +37 -25
- valediction/validation/validation.py +101 -52
- {valediction-1.1.0.dist-info → valediction-1.2.0.dist-info}/METADATA +1 -1
- {valediction-1.1.0.dist-info → valediction-1.2.0.dist-info}/RECORD +14 -14
- {valediction-1.1.0.dist-info → valediction-1.2.0.dist-info}/WHEEL +0 -0
valediction/datasets/datasets.py
CHANGED

@@ -20,7 +20,8 @@ from valediction.io.csv_readers import (
 )
 from valediction.support import (
     _get_runtime_string,
-    _normalise_name,
+    _normalise,
+    _strip,
     list_as_bullets,
     print_bold_red,
     print_red,

@@ -437,16 +438,16 @@ class Dataset(list[DatasetItem]):

     # Getters
     def get(self, name: str, default: DatasetItem | None = None) -> DatasetItem | None:
-        name_key = name
+        name_key = _normalise(name)
         for item in self:
-            if item.name == name_key:
+            if _normalise(item.name) == name_key:
                 return item
         return default

     def index_of(self, name: str) -> int | None:
-        name_key = name
+        name_key = _normalise(name)
         for i, item in enumerate(self):
-            if item.name == name_key:
+            if _normalise(item.name) == name_key:
                 return i
         return None

@@ -796,20 +797,21 @@ class Dataset(list[DatasetItem]):
         name: str | None,
         data: DataLike,
     ) -> DatasetItem:
-        """Normalise a (…
+        """Normalise a (name, data) double into a DatasetItem."""
         if isinstance(data, (str, Path)):
             path = Path(data)
             if not path.exists():
                 raise FileNotFoundError(f"File not found: {path}")
             if path.suffix.lower() != ".csv":
                 raise ValueError(f"Only .csv supported right now, got: {path}")
-            resolved_name = …
+            resolved_name = _strip(name or path.stem)
             return DatasetItem(name=resolved_name, data=path.resolve())

         if isinstance(data, DataFrame):
             if not name:
                 raise ValueError("When providing a DataFrame, 'name' is required.")
-            resolved_name = …
+            resolved_name = _strip(name)
+            data.columns = [_strip(column) for column in data.columns]
             return DatasetItem(name=resolved_name, data=data)

         raise TypeError("data must be a Path/str to .csv or a pandas DataFrame.")

@@ -823,13 +825,11 @@ class Dataset(list[DatasetItem]):
         if p.is_file():
             if p.suffix.lower() != ".csv":
                 raise ValueError(f"Expected a .csv file, got: {p.suffix} ({p})")
-            return [DatasetItem(name=…
+            return [DatasetItem(name=_strip(p.stem), data=p.resolve())]

         if p.is_dir():
             return [
-                DatasetItem(
-                    name=_normalise_name(csv_path.stem), data=csv_path.resolve()
-                )
+                DatasetItem(name=_strip(csv_path.stem), data=csv_path.resolve())
                 for csv_path in p.glob("*.csv")
             ]
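
Net effect of these hunks: DatasetItem names are stored whitespace-stripped with their casing preserved, while Dataset.get and Dataset.index_of compare names case- and whitespace-insensitively. A minimal sketch of the new lookup rule; this is a standalone re-implementation for illustration, not the package API:

    def _normalise(name: str) -> str:
        return name.strip().upper()

    item_names = ["Demographics", "Admissions"]  # DatasetItem names, stripped on entry

    def index_of(name: str) -> int | None:
        key = _normalise(name)
        return next((i for i, n in enumerate(item_names) if _normalise(n) == key), None)

    assert index_of(" demographics ") == 0  # 1.1.0 required an exact match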
valediction/dictionary/generation.py
CHANGED

@@ -24,7 +24,7 @@ from valediction.io.csv_readers import (
     read_csv_sample,
 )
 from valediction.progress import Progress
-from valediction.support import …
+from valediction.support import _strip, calculate_runtime

 IMPORTING_DATA = "Importing data"
 CHUNK_STEPS = 1

@@ -124,7 +124,7 @@ class Generator:
         self.__say(f"Generating dictionary for {len(items)} tables")
         for item in items:
             self.__progress_init(item)
-            table = Table(name=…
+            table = Table(name=_strip(item.name))
             dictionary.add_table(table)

             if item.is_path:

@@ -192,7 +192,7 @@ class Generator:
             col_state = inferer.states[col_name]
             data_type, length = col_state.final_data_type_and_length()
             col = Column(
-                name=…
+                name=_strip(col_name),
                 order=idx,
                 data_type=data_type,
                 length=length if data_type == DataType.TEXT else None,

@@ -242,7 +242,7 @@ class Generator:
             col_state = inferer.states[col_name]
             data_type, length = col_state.final_data_type_and_length()
             col = Column(
-                name=…
+                name=_strip(col_name),
                 order=idx,
                 data_type=data_type,
                 length=length if data_type == DataType.TEXT else None,

@@ -277,7 +277,7 @@ class Generator:
             next_order = max((c.order or 0 for c in table), default=0) + 1
             data_type, length = col_state.final_data_type_and_length()
             new_col = Column(
-                name=…
+                name=_strip(col_name),
                 order=next_order,
                 data_type=data_type,
                 length=length if data_type == DataType.TEXT else None,
valediction/dictionary/helpers.py
CHANGED

@@ -26,9 +26,6 @@ def _check_name(name: str, entity: Literal["table", "column"]) -> list[str]:
         else config.max_column_name_length
     )

-    if name != name.upper():  # name must be uppercase
-        errors.append("must be uppercase")
-
     if invalid_chars.search(name):  # check invalid characters
         bad = set(invalid_chars.findall(name))
         errors.append(

@@ -115,10 +112,6 @@ def _check_primary_key(primary_key: int | None, data_type: DataType) -> list[str]:
     return errors


-def _normalise_name(name: str) -> str:
-    return name.upper().strip()
-
-
 def _norm_header_map(columns: list) -> dict:
     mapping, _ = {}, set()
     for c in columns:
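
Together with the relaxed invalid_name_pattern in integrity.py (below), dropping the uppercase check means lowercase and mixed-case table/column names now pass _check_name. A hypothetical before/after, assuming only that _check_name returns a list of error strings as its signature indicates:

    from valediction.dictionary.helpers import _check_name

    errors = _check_name("patient_id", entity="column")
    # 1.1.0: errors would include "must be uppercase"
    # 1.2.0: no case error; only the length and invalid-character checks remain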
valediction/dictionary/importing.py
CHANGED

@@ -11,7 +11,6 @@ from valediction.dictionary.helpers import (
     _get_required_header,
     _is_missing,
     _norm_header_map,
-    _normalise_name,
     _parse_int,
     _parse_truthy,
     _row_is_blank,

@@ -19,7 +18,7 @@ from valediction.dictionary.helpers import (
 from valediction.dictionary.integrity import REQUIRED_SHEETS
 from valediction.dictionary.model import Column, Dictionary, Table
 from valediction.exceptions import DataDictionaryError, DataDictionaryImportError
-from valediction.support import list_as_bullets
+from valediction.support import _normalise, _strip, list_as_bullets


 @dataclass

@@ -80,6 +79,13 @@ class ExcelDataDictionary:
             raise error

     # Import & Helpers
+    def _resolve_table_name(self, name: str) -> str | None:
+        """Return the canonical table name as it appears in Tables sheet (or None)."""
+        target = _normalise(name)
+        return next(
+            (t for t in self.table_metadata.keys() if _normalise(t) == target), None
+        )
+
     def _open_workbook(self) -> None:
         if not self.path.exists():
             raise DataDictionaryImportError(f"File not found: {self.path}")

@@ -140,20 +146,27 @@ class ExcelDataDictionary:
         description_col_header = _get_required_header(header_map, "description")

         meta: dict[str, str | None] = {}
+        seen: set[str] = set()
+
         for _, row in tables_df.iterrows():
             if _is_missing(row[table_col_header]):
                 continue
-            …
+
+            table_name = _strip(str(row[table_col_header]))
             table_description = (
                 None
                 if _is_missing(row[description_col_header])
                 else str(row[description_col_header])
             )
-            …
+
+            key = _normalise(table_name)
+            if key in seen:
                 raise DataDictionaryImportError(
                     f"Duplicate table '{table_name}' in Tables sheet."
                 )
+            seen.add(key)
             meta[table_name] = table_description
+
         if not meta:
             raise DataDictionaryImportError(
                 "Data Dictionary sheet 'Tables' contains no table rows."

@@ -177,12 +190,13 @@ class ExcelDataDictionary:
                 or _is_missing(row[code_col_header])
             ):
                 continue
-            table_name = …
-            column_name = …
-            …
-            …
-            …
-            ]
+            table_name = _strip(str(row[table_col_header]))
+            column_name = _strip(str(row[column_col_header]))
+            resolved_table = self._resolve_table_name(table_name) or table_name
+            enum_key = (_normalise(resolved_table), _normalise(column_name))
+            enum_map.setdefault(enum_key, {})
+            enum_map[enum_key][row[code_col_header]] = row[name_col_header]
+
         self.enumerations = enum_map

         # Parse Columns

@@ -234,7 +248,12 @@ class ExcelDataDictionary:

             self.table_columns[inputs.table_name].append(column_obj)
             if inputs.has_enumerations:
-                self.enum_flags.add(…
+                self.enum_flags.add(
+                    (
+                        _normalise(inputs.table_name),
+                        _normalise(inputs.column_name),
+                    )
+                )

         if errors:
             raise DataDictionaryImportError(

@@ -279,7 +298,7 @@ class ExcelDataDictionary:

     # Validate Foreign Keys
     def _validate_foreign_keys(self) -> None:
-        name_to_table = {t.name: t for t in self.tables}
+        name_to_table = {_normalise(t.name): t for t in self.tables}
         errors: list[str] = []
         for table in self.tables:
             for column in table:

@@ -292,9 +311,9 @@ class ExcelDataDictionary:
                 )
                 continue
             target_table_raw, target_column_raw = target.split(".", 1)
-            target_table_name = …
-            target_column_name = …
-            referenced_table = name_to_table.get(target_table_name)
+            target_table_name = _strip(target_table_raw)
+            target_column_name = _strip(target_column_raw)
+            referenced_table = name_to_table.get(_normalise(target_table_name))
             if not referenced_table:
                 errors.append(
                     f"{table.name}.{column.name} references unknown table {target_table_name!r}."

@@ -392,13 +411,17 @@ class ExcelDataDictionary:
                 f"{row_context}: missing required field(s): {', '.join(missing_fields)}."
             )

-        …
-        column_name = …
-        …
+        table_name_raw = _strip(str(row[table_col_header]))
+        column_name = _strip(str(row[column_col_header]))
+
+        resolved_table_name = self._resolve_table_name(table_name_raw)
+        if resolved_table_name is None:
             raise DataDictionaryImportError(
-                f"{row_context}: Table '{…
+                f"{row_context}: Table '{table_name_raw}' not present in Tables sheet."
             )

+        table_name = resolved_table_name
+
         order_int = _parse_int(row[order_col_header], "Order", row_context)
         length_int = (
             _parse_int(row[length_col_header], "Length", row_context, required=False)

@@ -461,7 +484,7 @@ class ExcelDataDictionary:

     def _make_column(self, inputs: _ColumnInputs) -> Column:
         enums_for_column = self.enumerations.get(
-            (inputs.table_name, inputs.column_name), {}
+            (_normalise(inputs.table_name), _normalise(inputs.column_name)), {}
         )
         return Column(
             name=inputs.column_name,
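
The new _resolve_table_name makes the Tables sheet the source of truth for spelling: Columns and Enumerations rows may reference a table in any casing, and duplicates are detected case-insensitively. A sketch of the resolution rule, as a standalone re-implementation rather than the package API:

    def _normalise(name: str) -> str:
        return name.strip().upper()

    table_metadata = {"Demographics": "Core patient table"}  # as typed in the Tables sheet

    def resolve_table_name(name: str) -> str | None:
        target = _normalise(name)
        return next((t for t in table_metadata if _normalise(t) == target), None)

    assert resolve_table_name(" demographics") == "Demographics"  # canonical spelling wins
    assert resolve_table_name("UNKNOWN") is None  # -> "not present in Tables sheet" error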
valediction/dictionary/model.py
CHANGED

@@ -9,10 +9,9 @@ from valediction.dictionary.helpers import (
     _check_name,
     _check_order,
     _check_primary_key,
-    _normalise_name,
 )
 from valediction.exceptions import DataDictionaryError
-from valediction.support import list_as_bullets
+from valediction.support import _normalise, _strip, list_as_bullets


 class Column:

@@ -44,7 +43,7 @@ class Column:
         description: str | None = None,
         datetime_format: str | None = None,
     ):
-        self.name = _normalise_name(name)
+        self.name = _strip(name)
         self.order = int(order) if order is not None else None
         self.data_type: DataType = None
         self.length = int(length) if length is not None else None

@@ -127,7 +126,7 @@ class Table(list[Column]):
         columns: list[Column] | None = None,
     ):
         super().__init__()
-        self.name = _normalise_name(name)
+        self.name = _strip(name)
         self.description = description
         for column in columns or []:
             self.add_column(column)

@@ -139,24 +138,28 @@
         )
         return f"Table(name={self.name!r}, description={self.description!r}{cols_str})"

+    def __key(self, name: str) -> str:
+        return _normalise(name)
+
     def __getitem__(self, key: int | str) -> Column:
         if isinstance(key, int):
             return super().__getitem__(key)
-        …
-        …
+
+        target_key = self.__key(key)
+        found = next((c for c in self if self.__key(c.name) == target_key), None)
         if not found:
             raise KeyError(f"Column {key!r} not found in table {self.name!r}.")
         return found

     def __get(self, name: str, default: Column | None = None) -> Column | None:
-        …
-        return next((c for c in self if c.name == …
+        target_key = self.__key(name)
+        return next((c for c in self if self.__key(c.name) == target_key), default)

     # Getters
     def index_of(self, name: str) -> int | None:
-        …
+        target_key = self.__key(name)
         for i, c in enumerate(self):
-            if c.name == …
+            if self.__key(c.name) == target_key:
                 return i
         return None

@@ -303,16 +306,17 @@ class Table(list[Column]):
         if not isinstance(column, Column):
             raise DataDictionaryError("Only Column objects can be added to a Table.")

-        …
-        …
+        incoming_key = self.__key(column.name)
+        conflict = next((c for c in self if self.__key(c.name) == incoming_key), None)
+        if conflict is not None:
             raise DataDictionaryError(
-                f"Column {column.name!r} already exists (order={conflict.order!r})"
+                f"Column {column.name!r} already exists (order={conflict.order!r}, as {conflict.name!r})."
             )

         if column.order in self.get_column_orders():
-            …
+            conflict_by_order = self.get_column(column.order)
             raise DataDictionaryError(
-                f"Order {column.order!r} already exists (name={…
+                f"Order {column.order!r} already exists (name={conflict_by_order.name!r})"
             )

         if column.primary_key is not None:

@@ -339,10 +343,7 @@ class Table(list[Column]):
         Raises:
             DataDictionaryError: if the column does not exist
         """
-        …
-            name = self.get_column(column).name
-        else:
-            name = self.get_column(column).name  # by order
+        name = self.get_column(column).name
         remaining = [c for c in self if c.name != name]
         self.clear()
         super().extend(remaining)

@@ -367,16 +368,17 @@ class Table(list[Column]):
         for col in self:
             col.primary_key = None

-        # Resolve and …
+        # Resolve and deduplicate
         resolved: list[Column] = []
         seen: set[str] = set()
         for key in primary_keys:
             col = self.get_column(key)
-            …
+            col_key = self.__key(col.name)
+            if col_key in seen:
                 raise DataDictionaryError(
                     f"Duplicate column {col.name!r} provided for table {self.name!r}."
                 )
-            seen.add(…
+            seen.add(col_key)
             resolved.append(col)

         # Assign ordinals 1..N

@@ -416,14 +418,20 @@ class Dictionary(list[Table]):
     ):
         super().__init__()
         self.name = name
+
+        if isinstance(tables, Table):
+            tables = [tables]
+
         for t in tables or []:
             self.add_table(t)
+
         self.organisations = organisations
         self.version = version
         self.version_notes = version_notes
         self.inclusion_criteria = inclusion_criteria
         self.exclusion_criteria = exclusion_criteria
         self.imported = imported
+        self.__check_variables()

     # Properties
     @property

@@ -439,24 +447,85 @@ class Dictionary(list[Table]):
         tables = list_as_bullets(elements=[str(t) for t in self], bullet="\n- ")
         return f"Dictionary(name={self.name!r}, imported={self.imported!r}, {tables})"

+    def __key(self, name: str) -> str:
+        return _normalise(name)
+
     def __getitem__(self, key: int | str) -> Table:
         if isinstance(key, int):
             return super().__getitem__(key)
-        …
-        …
+
+        target_key = self.__key(key)
+        found = next((t for t in self if self.__key(t.name) == target_key), None)
         if not found:
             raise KeyError(f"Table {key!r} not found in Dictionary.")
         return found

-    # Getters
     def __get(self, name: str, default: Table | None = None) -> Table | None:
-        …
-        return next((t for t in self if t.name == …
+        target_key = self.__key(name)
+        return next((t for t in self if self.__key(t.name) == target_key), default)
+
+    # Checkers
+    def __check_variables(self) -> None:
+        self.__check_name()
+        self.__check_organisations()
+        self.__check_version()
+        self.__check_version_notes()
+        self.__check_criteria()
+
+    def __check_name(self) -> None:
+        # Check name
+        if self.name is not None:
+            if not isinstance(self.name, str):
+                raise DataDictionaryError("Dictionary `name` must be a string.")
+
+    def __check_organisations(self) -> None:
+        # Check organisations
+        if self.organisations is not None:
+            if not isinstance(self.organisations, str):
+                raise DataDictionaryError(
+                    "Dictionary `organisations` must be a string."
+                )
+
+    def __check_version(self) -> None:
+        # Check version
+        if self.version is not None:
+            if not isinstance(self.version, (str, int, float)):
+                raise DataDictionaryError(
+                    "Dictionary `version` must be a string, int, or float."
+                )
+
+            if isinstance(self.version, (int, float)):
+                self.version = str(self.version)
+
+        # Check version_notes

+    def __check_version_notes(self) -> None:
+        if self.version_notes is not None:
+            if not isinstance(self.version_notes, str):
+                raise DataDictionaryError(
+                    "Dictionary `version_notes` must be a string."
+                )
+
+    def __check_criteria(self) -> None:
+        # Check inclusion_criteria
+        if self.inclusion_criteria is not None:
+            if not isinstance(self.inclusion_criteria, str):
+                raise DataDictionaryError(
+                    "Dictionary `inclusion_criteria` must be a string."
+                )
+
+        # Check exclusion_criteria
+        if self.exclusion_criteria is not None:
+            if not isinstance(self.exclusion_criteria, str):
+                raise DataDictionaryError(
+                    "Dictionary exclusion_criteria must be a string."
+                )
+
+    # Getters
     def index_of(self, name: str) -> int | None:
-        …
+        target_key = self.__key(name)
         for i, t in enumerate(self):
-            if t.name == …
+            if self.__key(t.name) == target_key:
                 return i
         return None

@@ -484,12 +553,9 @@ class Dictionary(list[Table]):
         Raises:
             KeyError: If the table is not found in the dictionary.
         """
-        …
-        …
-        …
-        if not found:
+        found = self.__get(table)
+        if found is None:
             raise KeyError(f"Table {table!r} not found in Dictionary.")
-
         return found

     # Manipulation

@@ -508,8 +574,14 @@ class Dictionary(list[Table]):
             raise DataDictionaryError(
                 "Only Table objects can be added to a Dictionary."
            )
-        …
-        …
+
+        incoming_key = self.__key(table.name)
+        conflict = next((t for t in self if self.__key(t.name) == incoming_key), None)
+        if conflict is not None:
+            raise DataDictionaryError(
+                f"Table {table.name!r} already exists (as {conflict.name!r})."
+            )
+
         super().append(table)

     def remove_table(self, table: str) -> None:
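
Beyond the case-insensitive keying, Dictionary.__init__ now accepts a bare Table and validates its metadata fields eagerly via __check_variables. A small usage sketch grounded in the constructor shown above; argument names are as in the diff, everything else is illustrative:

    from valediction.dictionary.model import Dictionary, Table

    d = Dictionary(name="demo", tables=Table(name="PATIENTS"), version=1.2)
    assert d.version == "1.2"  # numeric versions are coerced to strings on construction
    # A non-string value for, e.g., version_notes now raises DataDictionaryError immediately.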
valediction/integrity.py
CHANGED

@@ -1,6 +1,10 @@
+from __future__ import annotations
+
 import re
+from copy import deepcopy
 from pathlib import Path
 from re import Pattern
+from typing import Any

 from valediction.data_types.data_types import DataType
 from valediction.support import list_as_bullets

@@ -12,13 +16,58 @@ TEMPLATE_DATA_DICTIONARY_PATH = (
 )


+externally_injected_variables: dict[
+    str, Any
+] = {}  # External injection store for package wrapping (any keys, always included)
+
+
+def reset_injected_config_variables() -> None:
+    global externally_injected_variables
+    externally_injected_variables = {}
+
+
+def inject_config_variables(variables: dict[str, Any]) -> None:
+    """Injects variables into the Valediction Config, which will always be incorporated
+    as overrides, regardless of Config calling method (default, session-scoped, or
+    contextual).
+
+    Args:
+        variables (dict[str, Any]): Dictionary of config variables.
+    """
+    global externally_injected_variables, session_config
+
+    # check type allows
+    if not isinstance(variables, dict):
+        raise TypeError(
+            f"Config injection variables must be a dictionary, not {type(variables)}"
+        )
+    problematic_keys = []
+    for variable_name in variables.keys():
+        if not isinstance(variable_name, str):
+            problematic_keys.append(variable_name)
+
+    if problematic_keys:
+        raise TypeError("Config injection variables accepts only string keys.")
+
+    externally_injected_variables = dict(variables or {})
+
+    # Apply immediately to the current session config (if it exists)
+    if session_config is not None:
+        _apply_external_injections(session_config)
+
+
+def _apply_external_injections(config: Config) -> None:
+    for variable_name, variable_value in externally_injected_variables.items():
+        setattr(config, variable_name, deepcopy(variable_value))
+
+
 class Config:
     def __init__(self):
         self.template_data_dictionary_path: Path = TEMPLATE_DATA_DICTIONARY_PATH
         self.max_table_name_length: int = 63
         self.max_column_name_length: int = 30
         self.max_primary_keys: int = 7
-        self.invalid_name_pattern: str | Pattern = re.compile(r"[^A-Z0-9_]")
+        self.invalid_name_pattern: str | Pattern = re.compile(r"[^A-Za-z0-9_]")
         self.null_values: list[str] = ["", "null", "none"]
         self.forbidden_characters: list[str] = []
         self.date_formats: dict[str, DataType] = {

@@ -42,6 +91,7 @@ class Config:
         }
         self.enforce_no_null_columns: bool = True
         self.enforce_primary_keys: bool = True
+        _apply_external_injections(self)

     def __repr__(self):
         date_list = list_as_bullets(

@@ -65,33 +115,37 @@ class Config:

     # Context Wrapper With Reset
     def __enter__(self):
-        global default_config
-        default_config = self
+        global session_config
+
+        _apply_external_injections(self)
+
+        session_config = self
         return self

     def __exit__(self, exc_type, exc_value, traceback):
-        global default_config
-        default_config = Config()
+        global session_config
+        session_config = Config()


-default_config: Config = None
+session_config: Config = None


 def get_config() -> Config:
-    """Gets the current `default_config` instance. Changing attributes will set them
-    globally.
+    """Gets the current `session_config` instance. Changing attributes will set them
+    globally for the python session. Use `reset_default_config()` to reset to original
+    defaults.

     Returns:
-        Config: The current default configuration.
+        Config: The current session configuration.
     """
-    global default_config
-    return default_config
+    global session_config
+    return session_config


 def reset_default_config() -> None:
     """Resets `default_config` settings globally to original defaults."""
-    global default_config
-    default_config = Config()
+    global session_config
+    session_config = Config()


 reset_default_config()
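
Usage sketch for the new injection hooks, following the functions above: injected overrides are re-applied whenever a Config is constructed, so they survive reset_default_config() until the store is explicitly cleared.

    from valediction import integrity

    integrity.inject_config_variables({"max_primary_keys": 10})
    assert integrity.get_config().max_primary_keys == 10

    integrity.reset_default_config()  # rebuilds Config() ...
    assert integrity.get_config().max_primary_keys == 10  # ... but injections re-apply

    integrity.reset_injected_config_variables()  # clear the store ...
    integrity.reset_default_config()
    assert integrity.get_config().max_primary_keys == 7  # ... back to the default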
valediction/io/csv_readers.py
CHANGED

@@ -11,7 +11,7 @@ import pandas as pd
 from pandas import DataFrame
 from pandas.errors import ParserError

-from valediction.support import …
+from valediction.support import _strip


 class FrameChunk(NamedTuple):

@@ -34,7 +34,7 @@ class FrameChunk(NamedTuple):
     total_chunks_seen: int | None

     def estimate_chunk_count(self) -> int:
-        # Buffers (accounting for CSV tails/bytes …
+        # Buffers (accounting for CSV tails/bytes inaccuracy)
         EPS_ABS = 4096  # Fixed
         EPS_REL = 0.05  # 5% tail buffer

@@ -93,7 +93,7 @@ def _post_read_processing(df: DataFrame, cfg: CsvReadConfig) -> DataFrame:
     """Apply header normalisation and vectorised value stripping after reading."""
     cfg = cfg or CsvReadConfig()
     if cfg.normalise_headers:
-        df = df.rename(columns={c: …
+        df = df.rename(columns={c: _strip(c) for c in df.columns})
     if cfg.strip_values:
         str_cols = df.select_dtypes(include=["string"]).columns
         if len(str_cols) > 0:
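
With normalise_headers enabled, headers are now only whitespace-stripped on read; an equivalent pandas sketch:

    import pandas as pd

    df = pd.DataFrame({" NHS_Number ": ["a1"], "Name": ["b"]})
    df = df.rename(columns={c: c.strip() for c in df.columns})  # what _strip does
    assert list(df.columns) == ["NHS_Number", "Name"]  # casing left untouched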
valediction/support.py
CHANGED

@@ -35,10 +35,14 @@ def list_as_bullets(elements: list, bullet: str = "\n - ") -> str:
     return bullet + bullet.join(elements)


-def _normalise_name(name: str) -> str:
+def _normalise(name: str) -> str:
     return name.strip().upper()


+def _strip(name: str) -> str:
+    return name.strip()
+
+
 def _get_runtime_string(runtime: timedelta) -> str:
     total_seconds = runtime.total_seconds()
     hours = trunc(total_seconds / 3600)
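
These two helpers define the convention used throughout the release: _strip produces the stored form (whitespace removed, casing kept) and _normalise produces the comparison key.

    assert _strip("  Patient_ID ") == "Patient_ID"      # stored form
    assert _normalise("  Patient_ID ") == "PATIENT_ID"  # comparison key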
valediction/validation/helpers.py
CHANGED

@@ -10,6 +10,7 @@ from pandas.util import hash_pandas_object
 from valediction.data_types.data_types import DataType
 from valediction.dictionary.model import Table
 from valediction.integrity import get_config
+from valediction.support import _normalise
 from valediction.validation.issues import Range


@@ -17,11 +18,14 @@ from valediction.validation.issues import Range
 def _set_nulls(df: DataFrame) -> DataFrame:
     null_values = get_config().null_values
     token_set = {str(t).strip().casefold() for t in null_values}
-    columns = df.select_dtypes(include=["string", "object"]).columns
+    columns = df.select_dtypes(include=["string", "object", "category"]).columns
     for column in columns:
         series = df[column]
-        …
-        …
+
+        s_txt = series.astype("string", copy=False)  # dtype safe
+        mask = s_txt.notna() & s_txt.str.strip().str.casefold().isin(token_set)
+        if mask.any():
+            df[column] = series.mask(mask, NA)

     return df

@@ -68,37 +72,24 @@ def create_pk_hashes(
     Returns:
         Series: Pandas Series with hashes or Nulls.
     """
-    …
+    HASH_COL_NAME = "PK_HASH"
     if df_primaries.empty or df_primaries.shape[1] == 0:
-        return Series([], dtype=object, name=…
+        return Series([], dtype=object, name=HASH_COL_NAME)

-    # …
+    # Check Nulls
     null_rows = df_primaries.isna().any(axis=1)

-    # …
-    hash_1 = hash_pandas_object(df_primaries, index=False)
-
-    # Second Hash (rows backwards if single row, else salt)
-    if df_primaries.shape[1] > 1:
-        df_primaries_backwards = df_primaries.iloc[:, ::-1]
-    else:
-        s = df_primaries.iloc[:, 0]
-        salt = Series(["§"] * len(s), index=s.index, dtype="string")
-        df_primaries_backwards = DataFrame(
-            {
-                "_a": s,
-                "_b": s.str.cat(salt),
-            }
-        )
-
-    hash_2 = hash_pandas_object(df_primaries_backwards, index=False)  # uint64
+    # Two independent 64-bit hashes with 16 byte keys
+    hash_1 = hash_pandas_object(df_primaries, index=False, hash_key="valediction_pk1!")
+    hash_2 = hash_pandas_object(df_primaries, index=False, hash_key="valediction_pk2!")

+    # Combine into 128-bit integer keys
     a1 = hash_1.to_numpy(dtype="uint64", copy=False).astype(object)
     a2 = hash_2.to_numpy(dtype="uint64", copy=False).astype(object)
-
     combined = (a1 << 64) | a2
+
     hashes = Series(
-        combined, index=df_primaries.index, name=…
+        combined, index=df_primaries.index, name=HASH_COL_NAME, dtype=object
     )
     hashes[null_rows] = None
     return hashes

@@ -167,8 +158,9 @@ def pk_contains_whitespace_mask(df_primaries: DataFrame) -> Series:
     if df_primaries.empty or df_primaries.shape[1] == 0:
         return Series(False, index=df_primaries.index)

-    col_masks = df_primaries.apply(…
+    col_masks = df_primaries.apply(
+        lambda s: s.astype("string", copy=False).str.contains(r"\s", na=False)
+    )
     return col_masks.any(axis=1)


@@ -261,7 +253,9 @@ def invalid_mask_text_too_long(column: Series, max_len: int) -> Series:
         return Series(False, index=column.index)

     notnull = column.notna()
-    …
+    s_txt = column.astype("string", copy=False)
+    lens = s_txt.str.len()
+
     return notnull & (lens > max_len)


@@ -270,20 +264,23 @@ def invalid_mask_text_forbidden_characters(column: Series) -> Series:
     if not forbidden:
         return column.notna() & False

-    pattern = "[" + re.escape("".join(forbidden)) + "]"
+    pattern = "[" + re.escape("".join([str(s) for s in forbidden])) + "]"
     notnull = column.notna()
-    …
+
+    s_txt = column.astype("string", copy=False)
+    has_forbidden = s_txt.str.contains(pattern, regex=True, na=False)
+
     return notnull & has_forbidden


 # Apply Data Types #
 def apply_data_types(df: DataFrame, table_dictionary: Table) -> DataFrame:
     # name -> column object
-    column_dictionary = {column.name: column for column in table_dictionary}
+    column_dictionary = {_normalise(column.name): column for column in table_dictionary}

     for col in df.columns:
-        data_type = column_dictionary.get(col).data_type
-        datetime_format = column_dictionary.get(col).datetime_format
+        data_type = column_dictionary.get(_normalise(col)).data_type
+        datetime_format = column_dictionary.get(_normalise(col)).datetime_format

         if data_type in (DataType.TEXT, DataType.FILE):
             df[col] = df[col].astype("string")
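
The reversed-frame/salt construction for the second hash is replaced by two keyed runs of hash_pandas_object; pandas requires hash_key to be a 16-character string, which both literals above are. A standalone sketch of the 128-bit combination:

    import pandas as pd
    from pandas.util import hash_pandas_object

    df = pd.DataFrame({"A": ["x", "y"], "B": ["1", "2"]})

    h1 = hash_pandas_object(df, index=False, hash_key="valediction_pk1!")  # uint64 per row
    h2 = hash_pandas_object(df, index=False, hash_key="valediction_pk2!")

    a1 = h1.to_numpy(dtype="uint64", copy=False).astype(object)  # object -> Python ints
    a2 = h2.to_numpy(dtype="uint64", copy=False).astype(object)
    combined = (a1 << 64) | a2  # 128-bit keys, so pairwise collisions are ~2**-128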
valediction/validation/issues.py
CHANGED

@@ -8,7 +8,7 @@ from pandas import DataFrame, concat

 from valediction.datasets.datasets_helpers import DatasetItemLike
 from valediction.io.csv_readers import CsvReadConfig, read_csv_ranges
-from valediction.support import …
+from valediction.support import _strip, list_as_bullets


 class IssueType(Enum):

@@ -107,6 +107,7 @@ class Issue:
             merged.append(cur)
         self.ranges = merged

+    # Inspect
     def inspect(
         self,
         additional_columns: bool | str | list[str] | None = None,

@@ -132,9 +133,9 @@ class Issue:
             ValueError: if the issue has no parent DatasetItem
         """
         # Guard
-        …
-            raise ValueError("Issue has no parent DatasetItem")
+        self.__guard_parent()
         header = self.__repr__() if print_header else ""
+
         # Not applicable
         if self.type in APPLIES_WHOLE_COLUMN:
             print(f"{header}: applies to whole column")

@@ -143,22 +144,8 @@ class Issue:
         # Column Inclusion
         if print_header:
             print(f"{header}:")
-        …
-        else:
-            additional_columns = (
-                [additional_columns]
-                if isinstance(additional_columns, str)
-                else additional_columns
-            )
-            base = (
-                set(self.parent.primary_keys)
-                if self.type in PRIMARY_KEY_ISSUES
-                else {self.column}
-            )
-            base |= set(additional_columns or [])
-            base.discard(None)
-            columns = list(base) if base else None
+
+        columns = self.__select_columns(additional_columns)

         if not self.ranges:
             return DataFrame(columns=columns) if columns else DataFrame()

@@ -194,6 +181,31 @@ class Issue:

         return out if columns is None else out.loc[:, columns]

+    # Inspect Helpers
+    def __guard_parent(self):
+        if not self.parent:
+            raise ValueError("Issue has no parent DatasetItem")
+
+    def __select_columns(self, additional_columns: bool | str | list[str]) -> list:
+        if additional_columns is True:
+            columns = None
+        else:
+            additional_columns = (
+                [additional_columns]
+                if isinstance(additional_columns, str)
+                else additional_columns
+            )
+            base = (
+                set(self.parent.primary_keys)
+                if self.type in PRIMARY_KEY_ISSUES
+                else {self.column}
+            )
+            base |= set(additional_columns or [])
+            base.discard(None)
+            columns = list(base) if base else None
+
+        return columns
+

 @dataclass
 class Issues:

@@ -235,8 +247,8 @@ class Issues:
         parent: DatasetItemLike | None = None,
     ) -> Issue:
         key = (
-            …
-            …
+            _strip(table),
+            _strip(column) if column is not None else None,
             issue_type,
         )
         issue = self._index.get(key)

@@ -255,8 +267,8 @@ class Issues:
         issue_type: IssueType | None = None,
     ) -> list[Issue]:
         """Case-insensitive filter; any arg can be None to act as a wildcard."""
-        table = …
-        column = …
+        table = _strip(table)
+        column = _strip(column) if column is not None else None
         output: list[Issue] = []
         if issue_type is not None:
             # direct index lookup where possible

@@ -268,9 +280,9 @@ class Issues:

         # otherwise scan (still cheap; we maintain a compact list)
         for item in self._items:
-            if …
+            if _strip(item.table) != table:
                 continue
-            if column is not None and (…
+            if column is not None and (_strip(item.column) or "") != column:
                 continue
             output.append(item)
         return output
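
inspect's column-selection logic moved verbatim into __select_columns, so behaviour is unchanged. A standalone sketch of the rule it encodes (names are illustrative):

    def select_columns(issue_column, primary_keys, additional, is_pk_issue):
        if additional is True:
            return None  # None means "no projection": keep every column
        extra = [additional] if isinstance(additional, str) else (additional or [])
        base = set(primary_keys) if is_pk_issue else {issue_column}
        base |= set(extra)
        base.discard(None)
        return list(base) if base else None

    assert select_columns("AGE", ["ID"], True, False) is None
    assert sorted(select_columns("AGE", ["ID"], "NAME", False)) == ["AGE", "NAME"]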
valediction/validation/validation.py
CHANGED

@@ -20,7 +20,7 @@ from valediction.io.csv_readers import (
     iter_csv_chunks,
 )
 from valediction.progress import Progress
-from valediction.support import _get_runtime_string, calculate_runtime
+from valediction.support import _get_runtime_string, _normalise, calculate_runtime
 from valediction.validation.helpers import (
     _column_has_values,
     _set_nulls,

@@ -86,7 +86,9 @@ class Validator:
         self._dt_needs_infer: set[str] = set()

         # Helpers
-        self._column_names: set = …
+        self._column_names: set[str] = {
+            _normalise(n) for n in self.table_dictionary.get_column_names()
+        }

         # Progress Tracking
         self.progress: Progress | None = None

@@ -155,6 +157,20 @@ class Validator:
             if not datetime_format:
                 self._dt_needs_infer.add(name)

+    # Column Scanning
+    def _resolve_df_col(self, df: DataFrame, name: str) -> str | None:
+        """Return the actual df column label matching name case-insensitively."""
+        target = _normalise(name)
+        return next((c for c in df.columns if _normalise(str(c)) == target), None)
+
+    def _resolve_df_cols(self, df: DataFrame, names: list[str]) -> list[str]:
+        resolved: list[str] = []
+        for n in names:
+            c = self._resolve_df_col(df, n)
+            if c is not None:
+                resolved.append(c)
+        return resolved
+
     # Validate
     def validate(self):
         """

@@ -272,28 +288,45 @@ class Validator:
     # Validation: Start Helpers
     def _check_for_missing_columns(self, df: DataFrame):
         self.__begin_step(step="Checking for missing columns")
-        …
+
+        dict_names = self.table_dictionary.get_column_names()
+        dict_keys = {_normalise(name) for name in dict_names}
+
+        df_keys = {_normalise(str(column)) for column in df.columns}
+
+        missing_keys = dict_keys - df_keys
+        if missing_keys:
+            for name in dict_names:
+                if _normalise(name) in missing_keys:
+                    self.issues.add(
+                        issue_type=IssueType.MISSING_COLUMN,
+                        table=self.table_name,
+                        column=name,
+                        parent=self.dataset_item,
+                    )
+
         self.__complete_step()

     def _check_for_extra_columns(self, df: DataFrame):
         self.__begin_step(step="Checking for extra columns")
-        …
-        for …
+
+        dict_keys = {
+            _normalise(name) for name in self.table_dictionary.get_column_names()
+        }
+        df_cols = [str(column) for column in df.columns]
+        df_keys = {_normalise(column) for column in df_cols}
+
+        extra_keys = df_keys - dict_keys
+        if extra_keys:
+            for col in df_cols:
+                if _normalise(col) in extra_keys:
+                    self.issues.add(
+                        issue_type=IssueType.EXTRA_COLUMN,
+                        table=self.table_name,
+                        column=col,  # report the actual df label
+                        parent=self.dataset_item,
+                    )
+
         self.__complete_step()

     # Validation: Chunk Helpers

@@ -319,13 +352,16 @@ class Validator:

         # Check for whitespace (text cols only)
         self.__begin_step(step="Checking for primary key whitespace")
-        …
+        pk_keys = {_normalise(p) for p in pk_cols}
+        pk_cols_text = [
+            column.name
+            for column in self.table_dictionary
+            if _normalise(column.name) in pk_keys and column.data_type is DataType.TEXT
+        ]

         if pk_cols_text:
-            …
+            pk_cols_text_df = self._resolve_df_cols(df, pk_cols_text)
+            space_mask = pk_contains_whitespace_mask(df[pk_cols_text_df])
             if space_mask.any():
                 self.issues.add(
                     issue_type=IssueType.PK_WHITESPACE,

@@ -343,7 +379,9 @@ class Validator:

         # Create primary key hashes
         self.__begin_step(step="Creating primary key hashes")
-        …
+        pk_cols_df = self._resolve_df_cols(df, pk_cols)
+        pk_hashes = create_pk_hashes(df[pk_cols_df])
+
         self.__complete_step()

         # Primary Key Nulls

@@ -437,44 +475,51 @@ class Validator:
             self.__complete_step()
             return

-        …
+        cols = [
+            (dict_col, df_col)
+            for dict_col in self._dt_needs_infer
+            if (df_col := self._resolve_df_col(df, dict_col)) is not None
+        ]
+        if not cols:
             self.__complete_step()
             return

-        …
+        from valediction.validation.helpers import _allowed_formats_for
+
+        for dict_col, df_col in cols:
+            unique = (
+                df[df_col].astype("string", copy=False).str.strip().dropna().unique()
+            )
             if len(unique) == 0:
                 continue

             try:
-                …
+                fmt = infer_datetime_format(Series(unique, dtype="string"))
             except ValueError:
-                # ambiguous - try again in later chunk
                 continue

-            if …
-                pass
+            if not fmt or fmt is False:
+                continue
+
+            col_dtype = self._find_data_type(dict_col)  # case-insensitive getter
+            if fmt not in _allowed_formats_for(col_dtype):
+                continue
+
+            self._dt_format_cache[dict_col] = fmt
+            self._dt_needs_infer.discard(dict_col)
+
+            try:
+                self.table_dictionary.get_column(dict_col).datetime_format = fmt
+            except Exception:
+                pass

         self.__complete_step()

     def _check_column_types(self, df: DataFrame, start_row: int) -> None:
         self.__begin_step(step="Checking column types")
-        present = […
+        present = [
+            col for col in df.columns if _normalise(str(col)) in self._column_names
+        ]
         for col in present:
             dtype = self._find_data_type(col)
             if dtype == DataType.TEXT:

@@ -506,7 +551,9 @@ class Validator:

     def _check_text_lengths(self, df: DataFrame, start_row: int) -> None:
         self.__begin_step(step="Checking text lengths")
-        present = […
+        present = [
+            col for col in df.columns if _normalise(str(col)) in self._column_names
+        ]
         for col in present:
             if self._find_data_type(col) != DataType.TEXT:
                 continue

@@ -524,7 +571,9 @@ class Validator:

     def _check_text_forbidden_chars(self, df: DataFrame, start_row: int) -> None:
         self.__begin_step(step="Checking for forbidden characters")
-        present = […
+        present = [
+            col for col in df.columns if _normalise(str(col)) in self._column_names
+        ]
         for col in present:
             if self._find_data_type(col) != DataType.TEXT:
                 continue
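
A sketch of the resolver the validator now leans on: dictionary column names are matched to actual DataFrame labels case-insensitively, with the real label used for indexing and the dictionary spelling used for reporting. Standalone re-implementation for illustration:

    import pandas as pd

    def _normalise(name: str) -> str:
        return name.strip().upper()

    df = pd.DataFrame(columns=["nhs_number", "Date_Of_Birth"])

    def resolve_df_col(df: pd.DataFrame, name: str) -> str | None:
        target = _normalise(name)
        return next((c for c in df.columns if _normalise(str(c)) == target), None)

    assert resolve_df_col(df, "NHS_NUMBER") == "nhs_number"  # dictionary name -> df label
    assert resolve_df_col(df, "MISSING") is None  # would be reported as MISSING_COLUMN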
{valediction-1.1.0.dist-info → valediction-1.2.0.dist-info}/METADATA
CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: valediction
-Version: 1.1.0
+Version: 1.2.0
 Summary: Valediction is a convenience data validation package that allows generation, import, and constraint enforcement of user-defined data dictionaries against datasets.
 Author-email: Cai Davis <Cai.Davis@uhs.nhs.uk>
 Requires-Python: <4.0,>=3.11
{valediction-1.1.0.dist-info → valediction-1.2.0.dist-info}/RECORD
CHANGED

@@ -1,15 +1,15 @@
 valediction/__init__.py,sha256=HJy57qHyaeENZ0xGf5-jkkal-P92n242UU6vIqtsnaw,511
 valediction/convenience.py,sha256=gDSNcI_T9VKO3Lk1Van4YQCt6hp_fqPyJnUJD8QNP_o,1438
 valediction/exceptions.py,sha256=OtAq_ShVCZeoNx0hWCzJVlVdl3Gm55l72IP1KrKUMR0,748
-valediction/integrity.py,sha256=…
+valediction/integrity.py,sha256=2x1xpz1J3dmXCPRSGHPpnbLEvdlJDUuQ9B1y0baZ-mk,5151
 valediction/progress.py,sha256=fXld7BRhp8kk7xPCG50PbRPXvF8RV7Br2hENHuOUlbo,5974
-valediction/support.py,sha256=…
+valediction/support.py,sha256=dhKwhtL6dgG709T6fkGaLZDvjYtnxIO9cMmgz477m-I,2207
 valediction/data_types/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 valediction/data_types/data_type_helpers.py,sha256=iqcpSPBoFZybkTMHBmxrlv56ZRg8PbqSLtVsuJXC2G0,2282
 valediction/data_types/data_types.py,sha256=MJv_io_hvbLo0G0N38kwj71goXlAo0isPFyS3TU05II,1605
 valediction/data_types/type_inference.py,sha256=11SGYgpvfAfwrDwyOewVIwvfA6pQtDo6i94_xfebYM8,19952
 valediction/datasets/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-valediction/datasets/datasets.py,sha256=…
+valediction/datasets/datasets.py,sha256=uweSdVkk5-zDBzL8M-cHnC6LETZXnubakajFi0J6L_c,30089
 valediction/datasets/datasets_helpers.py,sha256=AdB3ws5MYFpiXTmHXmSsdm2wZVwDXkXDOtYLvSYhs4I,1159
 valediction/demo/DEMO - Data Dictionary.xlsx,sha256=wj1JG8dHgdALVwV0zSSYnyWMomMTzrHxGFRm491wM_A,45308
 valediction/demo/DEMOGRAPHICS.csv,sha256=ochj8tiHdRByvK2YbZTw5UR3UxbjYxA2xVbA1BfUlbU,3437

@@ -21,18 +21,18 @@ valediction/demo/demo_dictionary.py,sha256=OQcmKpKuRmLQuidYr2KIVF3_78crki5HU8E6R…
 valediction/dictionary/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 valediction/dictionary/exporting.py,sha256=uRcnVzY5Uj0Yfju4y8sWsjTSP7ywATV9CQDYZMmnws0,18617
 valediction/dictionary/exporting_helpers.py,sha256=O2pOMAHsRIyhLzxrSFrHIHJEf0wEteJguSI2JK9Rxcw,12434
-valediction/dictionary/generation.py,sha256=…
-valediction/dictionary/helpers.py,sha256=…
-valediction/dictionary/importing.py,sha256=…
+valediction/dictionary/generation.py,sha256=CsmQcW6joFGQe-PkhzOXj0uv97hiq7rN3kZKliqox9A,12305
+valediction/dictionary/helpers.py,sha256=DtEbtn__CSs9LWi0oVZD3DtZonRjHeoQA9de-xQ1z3I,5288
+valediction/dictionary/importing.py,sha256=PbE5gLv-y29MAl540ClBCzoTGT8U4Ss0xbzG_GJOpzo,19557
 valediction/dictionary/integrity.py,sha256=k0RLRyNs8dsHyOivl2WCS6jxlhPsW9wfXB48owyokfs,787
-valediction/dictionary/model.py,sha256=…
+valediction/dictionary/model.py,sha256=WtTGb5gZAtg7JiurvaWuD1K4DnNkygU-PoEVTZIgBCc,21617
 valediction/dictionary/template/PROJECT - Data Dictionary.xltx,sha256=ZsWmJsSBHvh3ADfrntmeVMWI9Vp_q7zqrTgp7rGd-AI,41721
 valediction/io/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-valediction/io/csv_readers.py,sha256=…
+valediction/io/csv_readers.py,sha256=sKYP_xtOuxwm6ce2eDrphQ_wagxP0RYMXiMlEtkybBg,9812
 valediction/validation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-valediction/validation/helpers.py,sha256=…
-valediction/validation/issues.py,sha256=…
-valediction/validation/validation.py,sha256=…
-valediction-1.1.0.dist-info/METADATA,sha256=…
-valediction-1.1.0.dist-info/WHEEL,sha256=…
-valediction-1.1.0.dist-info/RECORD,,
+valediction/validation/helpers.py,sha256=rJl_t0XCwt_X3LZAEz5pMihsAKoAIn-qkANNcm4lLf0,10195
+valediction/validation/issues.py,sha256=fBeGjbGGmwGg5XfENU4FtsYhvFfwdqAFSYGNF3UBEI8,9327
+valediction/validation/validation.py,sha256=XgYnRslQTCZCpAHi_AYUkZw4mXM7yjwBw6-iYJXhcao,22961
+valediction-1.2.0.dist-info/METADATA,sha256=vDocxg062EAic20YXBd3rQmfk95hkybDdnpLDQvV0W0,612
+valediction-1.2.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+valediction-1.2.0.dist-info/RECORD,,
{valediction-1.1.0.dist-info → valediction-1.2.0.dist-info}/WHEEL
File without changes