valediction 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- valediction/__init__.py +8 -0
- valediction/convenience.py +50 -0
- valediction/data_types/__init__.py +0 -0
- valediction/data_types/data_type_helpers.py +75 -0
- valediction/data_types/data_types.py +58 -0
- valediction/data_types/type_inference.py +541 -0
- valediction/datasets/__init__.py +0 -0
- valediction/datasets/datasets.py +870 -0
- valediction/datasets/datasets_helpers.py +46 -0
- valediction/demo/DEMO - Data Dictionary.xlsx +0 -0
- valediction/demo/DEMOGRAPHICS.csv +101 -0
- valediction/demo/DIAGNOSES.csv +650 -0
- valediction/demo/LAB_TESTS.csv +1001 -0
- valediction/demo/VITALS.csv +1001 -0
- valediction/demo/__init__.py +6 -0
- valediction/demo/demo_dictionary.py +129 -0
- valediction/dictionary/__init__.py +0 -0
- valediction/dictionary/exporting.py +501 -0
- valediction/dictionary/exporting_helpers.py +371 -0
- valediction/dictionary/generation.py +357 -0
- valediction/dictionary/helpers.py +174 -0
- valediction/dictionary/importing.py +494 -0
- valediction/dictionary/integrity.py +37 -0
- valediction/dictionary/model.py +582 -0
- valediction/dictionary/template/PROJECT - Data Dictionary.xltx +0 -0
- valediction/exceptions.py +22 -0
- valediction/integrity.py +97 -0
- valediction/io/__init__.py +0 -0
- valediction/io/csv_readers.py +307 -0
- valediction/progress.py +206 -0
- valediction/support.py +72 -0
- valediction/validation/__init__.py +0 -0
- valediction/validation/helpers.py +315 -0
- valediction/validation/issues.py +280 -0
- valediction/validation/validation.py +598 -0
- valediction-1.0.0.dist-info/METADATA +15 -0
- valediction-1.0.0.dist-info/RECORD +38 -0
- valediction-1.0.0.dist-info/WHEEL +4 -0

@@ -0,0 +1,598 @@
from __future__ import annotations

from datetime import timedelta
from pathlib import Path
from typing import Iterator

import numpy as np
from pandas import DataFrame, Series

from valediction.data_types.data_type_helpers import (
    infer_datetime_format,
)
from valediction.data_types.data_types import DataType
from valediction.datasets.datasets_helpers import DataLike, DatasetItemLike
from valediction.dictionary.model import Table
from valediction.exceptions import DataDictionaryImportError, DataIntegrityError
from valediction.io.csv_readers import (
    CsvReadConfig,
    FrameChunk,
    iter_csv_chunks,
)
from valediction.progress import Progress
from valediction.support import _get_runtime_string, calculate_runtime
from valediction.validation.helpers import (
    _column_has_values,
    _set_nulls,
    create_pk_hashes,
    invalid_mask_date,
    invalid_mask_datetime,
    invalid_mask_float,
    invalid_mask_integer,
    invalid_mask_text_forbidden_characters,
    invalid_mask_text_too_long,
    mask_to_ranges,
    pk_contains_whitespace_mask,
)
from valediction.validation.issues import Issues, IssueType, Range

IMPORTING_DATA = "Importing data"
SINGLE_STEPS: int = 3  # tweak if adding/amending step tracking
CHUNK_STEPS: int = 12  # tweak if adding/amending step tracking


class Validator:
    """
    Summary:
        Validates a dataset against a dictionary.

    Arguments:
        dataset_item (DatasetItemLike): dataset item to validate
        table_dictionary (Table): table dictionary to validate against
        feedback (bool): whether to provide feedback on validation (default: True)
        chunk_size (int): size of chunks to validate (default: 10_000_000)

    Raises:
        DataDictionaryImportError: if the provided dictionary is invalid
        DataIntegrityError: if the validated dataset contains invalid values
    """

    def __init__(
        self,
        dataset_item: DatasetItemLike,
        table_dictionary: Table,
        feedback: bool = True,
        chunk_size: int = 10_000_000,
        _padding: int = 0,
    ):
        # User Variables
        self.dataset_item = dataset_item
        self.data: DataLike = dataset_item.data
        self.table_dictionary: Table = table_dictionary
        self.chunk_size: int = chunk_size
        self.feedback: bool = feedback
        self._padding: int = _padding

        # Config
        self.table_name: str | None = None
        self.issues: Issues | None = None
        self.csv_cfg: CsvReadConfig | None = None

        # Validation Tracking
        self.tracker_seen_non_nulls: dict[str, bool] = {}
        self.tracker_pk_hashes: dict[int, int] = {}
        self.tracker_pk_reported_first: set[int] = set()
        self._dt_format_cache: dict[str, str | None] = {}
        self._dt_needs_infer: set[str] = set()

        # Helpers
        self._column_names: set[str] = set(self.table_dictionary.get_column_names())

        # Progress Tracking
        self.progress: Progress | None = None
        self.est_chunk_count: int | None = None
        self._runtimes: dict[str, timedelta] | None = None

        # Setup
        self.__check_dictionary()
        self.__init_issues()
        self.__init_csv_cfg()
        self.__reset_pk_trackers()
        self.__import_datetime_format_cache()

    # Properties
    @property
    def data_is_path(self) -> bool:
        return isinstance(self.data, (Path, str))

    @property
    def data_is_dataframe(self) -> bool:
        return isinstance(self.data, DataFrame)

    @property
    def runtimes(self) -> dict[str, str]:
        return {
            step: _get_runtime_string(time_delta)
            for step, time_delta in self._runtimes.items()
        }

    # Initialisation
    def __check_dictionary(self):
        if self.table_dictionary is None or not isinstance(
            self.table_dictionary, Table
        ):
            raise DataDictionaryImportError(
                "Data Dictionary not yet imported or generated. "
                + "Validation must first have a Data Dictionary. "
                + "Please first run DataSet.import_dictionary(), including `primary_keys`."
            )

        self.table_dictionary.check()
        self.table_name = self.table_dictionary.name

    def __init_issues(self):
        self.issues = Issues()

    def __init_csv_cfg(self):
        self.csv_cfg = CsvReadConfig()

    def __reset_pk_trackers(self):
        self.tracker_pk_hashes: dict[int, int] = {}
        self.tracker_pk_reported_first: set[int] = set()

    def __import_datetime_format_cache(self) -> None:
        self._dt_format_cache.clear()
        self._dt_needs_infer.clear()

        for column in self.table_dictionary:
            name = column.name
            datetime_format = column.datetime_format
            data_type = column.data_type

            if data_type in (DataType.DATE, DataType.DATETIME):
                self._dt_format_cache[name] = datetime_format

                if not datetime_format:
                    self._dt_needs_infer.add(name)

    # Validate
    def validate(self):
        """
        Summary:
            Validate the dataset against the data dictionary

        Raises:
            DataDictionaryImportError: if the data dictionary has not yet been imported or generated
            DataDictionaryError: if the data dictionary contains invalid data
            DataIntegrityError: if the dataset contains invalid data
        """
        self.__progress_init()
        first_chunk = True

        for chunk in self.__iterate_data_chunks(
            self.data, self.chunk_size, self.csv_cfg
        ):
            df = chunk.df
            start = chunk.start

            # First Chunk Only
            if first_chunk:
                self._check_for_missing_columns(df)
                self._check_for_extra_columns(df)
                first_chunk = False

            # Remove Nulls
            df = self._set_nulls(df)

            # Structural Checks
            self._check_for_column_nulls(df)
            self._check_primary_key_whitespace(df, start_row=start)
            self._check_primary_key_integrity(df, start_row=start)

            # Data Type Checks
            self._infer_datetime_formats(df)
            self._check_column_types(df, start_row=start)
            self._check_text_lengths(df, start_row=start)
            self._check_text_forbidden_chars(df, start_row=start)

        # Final Checks
        self._check_for_fully_null_column()

        # Finish
        self.__reset_pk_trackers()
        self.__finish_validation()
        if self.issues:
            raise DataIntegrityError(self.issues)

    # Global Helpers
    def __iterate_data_chunks(
        self,
        data: Path | str | DataFrame,
        chunk_size: int | None,
        csv_config: CsvReadConfig | None = None,
    ) -> Iterator[FrameChunk]:
        """Yield FrameChunk and keep a running estimate of total chunks for the
        progress bar."""
        csv_config = csv_config or CsvReadConfig()

        # In-memory DataFrame: single chunk
        if isinstance(data, DataFrame):
            self.__begin_step(step=IMPORTING_DATA)
            n = len(data)
            if n == 0:
                self.__complete_step()
                return

            # One chunk only
            self.__progress_retarget_total(est_chunk_count=1)

            self.__complete_step()
            yield FrameChunk(
                df=data,
                start=0,
                end=n - 1,
                total_size=None,
                file_pos=None,
                bytes_read=None,
                chunk_index=1,
                total_bytes_read=None,
                total_chunks_seen=None,
            )
            return

        # Path/str: chunking
        iterator = iter_csv_chunks(
            path=Path(data), chunk_size=chunk_size, cfg=csv_config
        )
        while True:
            self.__begin_step(step=IMPORTING_DATA)
            try:
                chunk = next(iterator)
            except StopIteration:
                break

            est_chunk_count = chunk.estimate_chunk_count()
            self.__progress_retarget_total(est_chunk_count=est_chunk_count)

            # Bookkeeping & yield
            self.__complete_step()
            yield chunk

    # Finder Helpers
    def _find_data_type(self, column_name: str) -> DataType:
        return self.table_dictionary.get_column(column_name).data_type

    def _find_datetime_format(self, column_name: str) -> str:
        return self.table_dictionary.get_column(column_name).datetime_format

    def _find_max_length(self, column_name: str) -> int | None:
        return self.table_dictionary.get_column(column_name).length

    # Validation: Start Helpers
    def _check_for_missing_columns(self, df: DataFrame):
        self.__begin_step(step="Checking for missing columns")
        missing = self._column_names - set(df.columns)
        if missing:
            for column in missing:
                self.issues.add(
                    issue_type=IssueType.MISSING_COLUMN,
                    table=self.table_name,
                    column=column,
                    parent=self.dataset_item,
                )
        self.__complete_step()

    def _check_for_extra_columns(self, df: DataFrame):
        self.__begin_step(step="Checking for extra columns")
        extra = set(df.columns) - self._column_names
        if extra:
            for column in extra:
                self.issues.add(
                    issue_type=IssueType.EXTRA_COLUMN,
                    table=self.table_name,
                    column=column,
                    parent=self.dataset_item,
                )
        self.__complete_step()

    # Validation: Chunk Helpers
    def _set_nulls(self, df: DataFrame) -> DataFrame:
        self.__begin_step(step="Setting nulls")
        df = _set_nulls(df)
        self.__complete_step()
        return df

    def _check_for_column_nulls(self, df: DataFrame) -> None:
        self.__begin_step(step="Checking for column nulls")
        for column in df.columns:
            # Only re-check columns not yet seen to contain values
            seen_or_found = self.tracker_seen_non_nulls.get(column, False)
            if not seen_or_found:
                self.tracker_seen_non_nulls[column] = _column_has_values(df[column])
        self.__complete_step()

    def _check_primary_key_whitespace(self, df: DataFrame, start_row: int) -> None:
        pk_cols = self.table_dictionary.get_primary_keys()
        if not pk_cols:
            return

        # Check for whitespace (text cols only)
        self.__begin_step(step="Checking for primary key whitespace")
        pk_cols_text = []
        for column in self.table_dictionary:
            if column.name in pk_cols and column.data_type in [DataType.TEXT]:
                pk_cols_text.append(column.name)

        if pk_cols_text:
            space_mask = pk_contains_whitespace_mask(df[pk_cols_text])
            if space_mask.any():
                self.issues.add(
                    issue_type=IssueType.PK_WHITESPACE,
                    table=self.table_name,
                    column=None,
                    ranges=mask_to_ranges(space_mask, start_row),
                    parent=self.dataset_item,
                )
        self.__complete_step()

    def _check_primary_key_integrity(self, df, start_row: int) -> None:
        pk_cols = self.table_dictionary.get_primary_keys()
        if not pk_cols:
            return

        # Create primary key hashes
        self.__begin_step(step="Creating primary key hashes")
        pk_hashes = create_pk_hashes(df[pk_cols])
        self.__complete_step()

        # 1) Primary key nulls
        self.__begin_step(step="Checking for primary key nulls")
        null = pk_hashes.isna()
        non_null = ~null
        pk_hashes_non_null = pk_hashes[non_null]

        if null.any():
            self.issues.add(
                IssueType.PK_NULL,
                table=self.table_name,
                column=None,
                ranges=mask_to_ranges(null, start_row),
                parent=self.dataset_item,
            )
        self.__complete_step()

        # 2) In-chunk collisions
        self.__begin_step(step="Checking for primary key collision")

        codes, uniques = pk_hashes_non_null.factorize(sort=False)
        counts = np.bincount(codes, minlength=len(uniques))
        in_chunk_local = counts[codes] > 1
        in_chunk_collision = non_null.copy()
        in_chunk_collision.loc[non_null] = in_chunk_local
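        # How the mask above works, as a worked example: `factorize` maps each
        # hash to an integer code, `bincount` tallies each code, and indexing
        # the tallies with `codes` broadcasts them back onto the rows. For
        # hashes [a, b, a, c] the codes are [0, 1, 0, 2], the counts [2, 1, 1],
        # and `counts[codes] > 1` gives [True, False, True, False] - every
        # occurrence of a duplicated key is flagged, not just the repeats.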

        # 3) Cross-chunk collisions
        seen_before = set(self.tracker_pk_hashes)
        unique_in_seen = np.fromiter(
            (unique in seen_before for unique in uniques),
            dtype=bool,
            count=len(uniques),
        )
        seen_before_local = unique_in_seen[codes]
        cross_chunk_local = seen_before_local & ~in_chunk_local
        cross_chunk_collision = non_null.copy()
        cross_chunk_collision.loc[non_null] = cross_chunk_local

        # 4) Valid PKs: first appearances not already seen in earlier chunks
        first_in_chunk_local = ~pk_hashes_non_null.duplicated(keep="first")
        first_appearance_local = first_in_chunk_local & ~seen_before_local

        # 5) Emit in-chunk collision Issues
        if in_chunk_collision.any():
            self.issues.add(
                IssueType.PK_COLLISION,
                table=self.table_name,
                column=None,
                ranges=mask_to_ranges(in_chunk_collision, start_row),
                parent=self.dataset_item,
            )

        # 6) Emit cross-chunk collision Issues
        if cross_chunk_collision.any():
            self.issues.add(
                IssueType.PK_COLLISION,
                table=self.table_name,
                column=None,
                ranges=mask_to_ranges(cross_chunk_collision, start_row),
                parent=self.dataset_item,
            )

        # Add the original PK row as a collision
        for h in pk_hashes_non_null[cross_chunk_local].unique():
            if h not in self.tracker_pk_reported_first:
                first_row = self.tracker_pk_hashes[int(h)]
                self.issues.add(
                    IssueType.PK_COLLISION,
                    table=self.table_name,
                    column=None,
                    ranges=[Range(first_row, first_row)],
                    parent=self.dataset_item,
                )
                self.tracker_pk_reported_first.add(int(h))
        self.__complete_step()

        # 7) Record valid PKs
        self.__begin_step(step="Caching primary keys")
        if first_appearance_local.any():
            pos = np.flatnonzero(first_appearance_local.to_numpy())
            vals = pk_hashes_non_null.to_numpy()[pos]
            start_rows = start_row + pos
            for h, r in zip(vals, start_rows, strict=False):
                self.tracker_pk_hashes.setdefault(int(h), int(r))
        self.__complete_step()

    def _infer_datetime_formats(self, df: DataFrame) -> None:
        self.__begin_step(step="Inferring datetime formats")
        if not self._dt_needs_infer:
            self.__complete_step()
            return

        columns = [col for col in self._dt_needs_infer if col in df.columns]
        if not columns:
            self.__complete_step()
            return

        for column in columns:
            series = df[column].astype("string", copy=False).str.strip()
            unique = series.dropna().unique()
            if len(unique) == 0:
                continue

            try:
                fmt_or_false = infer_datetime_format(Series(unique, dtype="string"))
            except ValueError:
                # Ambiguous - try again in a later chunk
                continue

            if fmt_or_false:
                col_dtype = self._find_data_type(column)
                from valediction.validation.helpers import _allowed_formats_for

                allowed = _allowed_formats_for(col_dtype)
                if fmt_or_false in allowed:
                    self._dt_format_cache[column] = fmt_or_false
                    self._dt_needs_infer.discard(column)

                    # Persist in the dictionary
                    try:
                        self.table_dictionary.get_column(
                            column
                        ).datetime_format = fmt_or_false
                    except Exception:
                        pass
        self.__complete_step()
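
    # Note on the inference above: a DATE/DATETIME column whose dictionary entry
    # lacks a datetime_format stays in `_dt_needs_infer` until some chunk yields
    # an unambiguous, type-appropriate format; the result is then cached and
    # written back to the dictionary so later chunks (and the type checks below)
    # reuse it instead of re-inferring.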

    def _check_column_types(self, df: DataFrame, start_row: int) -> None:
        self.__begin_step(step="Checking column types")
        present = [col for col in df.columns if col in self._column_names]
        for col in present:
            dtype = self._find_data_type(col)
            if dtype == DataType.TEXT:
                continue

            series = df[col]
            if dtype == DataType.INTEGER:
                invalid = invalid_mask_integer(series)
            elif dtype == DataType.FLOAT:
                invalid = invalid_mask_float(series)
            elif dtype == DataType.DATE:
                fmt = self._dt_format_cache.get(col) or self._find_datetime_format(col)
                invalid = invalid_mask_date(series, fmt)
            elif dtype == DataType.DATETIME:
                fmt = self._dt_format_cache.get(col) or self._find_datetime_format(col)
                invalid = invalid_mask_datetime(series, fmt)
            else:
                continue

            if invalid.any():
                self.issues.add(
                    IssueType.TYPE_MISMATCH,
                    table=self.table_name,
                    column=col,
                    ranges=mask_to_ranges(invalid, start_row),
                    parent=self.dataset_item,
                )
        self.__complete_step()

    def _check_text_lengths(self, df: DataFrame, start_row: int) -> None:
        self.__begin_step(step="Checking text lengths")
        present = [col for col in df.columns if col in self._column_names]
        for col in present:
            if self._find_data_type(col) != DataType.TEXT:
                continue
            max_len = self._find_max_length(col)
            invalid = invalid_mask_text_too_long(df[col], max_len)
            if invalid.any():
                self.issues.add(
                    IssueType.TEXT_TOO_LONG,
                    table=self.table_name,
                    column=col,
                    ranges=mask_to_ranges(invalid, start_row),
                    parent=self.dataset_item,
                )
        self.__complete_step()

    def _check_text_forbidden_chars(self, df: DataFrame, start_row: int) -> None:
        self.__begin_step(step="Checking for forbidden characters")
        present = [col for col in df.columns if col in self._column_names]
        for col in present:
            if self._find_data_type(col) != DataType.TEXT:
                continue
            mask = invalid_mask_text_forbidden_characters(df[col])
            if mask.any():
                self.issues.add(
                    IssueType.FORBIDDEN_CHARACTER,
                    table=self.table_name,
                    column=col,
                    ranges=mask_to_ranges(mask, start_row),
                    parent=self.dataset_item,
                )
        self.__complete_step()

    # Validation: Final Helpers
    def _check_for_fully_null_column(self):
        self.__begin_step(step="Checking for fully null columns")
        for column, seen in self.tracker_seen_non_nulls.items():
            if not seen:
                self.issues.add(
                    issue_type=IssueType.FULLY_NULL_COLUMN,
                    table=self.table_name,
                    column=column,
                    parent=self.dataset_item,
                )
        self.__complete_step()

    # Progress Helpers
    def __progress_init(self) -> None:
        if not self.feedback:
            self.progress = Progress(enabled=False)
            return

        total_steps = (
            (SINGLE_STEPS + CHUNK_STEPS)
            if (isinstance(self.data, DataFrame) or not self.chunk_size)
            else None
        )
        self.est_chunk_count = None
        pad = " " * self._padding if self._padding else ""

        self.progress = Progress(
            desc=f"Validating {self.table_name}: {pad}",
            starting_step=IMPORTING_DATA,
            est_total=total_steps,
            smoothing_steps=CHUNK_STEPS,
        )

    def __progress_retarget_total(self, est_chunk_count: int) -> None:
        """Once est_chunk_count is known, resize the bar without losing progress."""
        if est_chunk_count != self.est_chunk_count:
            self.est_chunk_count = est_chunk_count
            new_total = SINGLE_STEPS + (CHUNK_STEPS * self.est_chunk_count)
            self.progress.retarget_total(new_total=new_total)

    def __finish_validation(self) -> None:
        completed = "Completed with issues" if self.issues else "Completed"
        step = (
            f"{completed} ({calculate_runtime(start=self.progress.full_start).message})"
        )
        save_as = "Total"
        good = not self.issues
        self.progress.finish(postfix=step, save_as=save_as, good=good)
        self._runtimes = self.progress.runtimes
        self.progress.close()

    def __begin_step(self, step: str | None = None) -> None:
        self.progress.begin_step(step=step)

    def __complete_step(self) -> None:
        self.progress.complete_step()
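
For orientation, a minimal usage sketch of the Validator defined above. The
variable names and the way the dataset item and table dictionary are obtained
are illustrative assumptions, not APIs confirmed by this diff:

    from valediction.exceptions import DataIntegrityError
    from valediction.validation.validation import Validator

    # `item` is any DatasetItemLike exposing a `.data` attribute (a pandas
    # DataFrame or a CSV path); `table` is an imported or generated Table
    # dictionary. Both are hypothetical placeholders here.
    validator = Validator(
        dataset_item=item,
        table_dictionary=table,
        chunk_size=1_000_000,
    )
    try:
        validator.validate()  # raises once validation finishes with issues
    except DataIntegrityError as err:
        print(err)  # aggregated Issues for the table
    print(validator.runtimes)  # per-step runtime strings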

@@ -0,0 +1,15 @@
Metadata-Version: 2.4
Name: valediction
Version: 1.0.0
Summary: Valediction is a convenience data validation package that allows generation, import, and constraint enforcement of user-defined data dictionaries against datasets.
Author-email: Cai Davis <Cai.Davis@uhs.nhs.uk>
Requires-Python: <4.0,>=3.11
Requires-Dist: certifi<2025,>=2024.2.2
Requires-Dist: mohawk<2,>=1.1.0
Requires-Dist: openpyxl<4,>=3.1.5
Requires-Dist: pandas<3,>=2.2.1
Requires-Dist: pydantic<3,>=2.11.4
Requires-Dist: requests<3,>=2.31.0
Requires-Dist: tabulate<0.10,>=0.9.0
Requires-Dist: tqdm>=4.67.1
Requires-Dist: xlsxwriter<4,>=3.2.3
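
Per the metadata above, the package targets Python >=3.11,<4.0. Assuming the
wheel is published to a standard index such as PyPI, `pip install
valediction==1.0.0` would resolve the pinned dependency ranges listed
(pandas, pydantic, openpyxl, and the rest).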

@@ -0,0 +1,38 @@
valediction/__init__.py,sha256=Bv2Nd-6bolTfToLT2s4NvRsj42ZH2Ls4rLApet5ai1E,519
valediction/convenience.py,sha256=_N7LFaCRQC8xENpu12vxyPIrpFpR0XA-m1wUa1m7dTo,1670
valediction/exceptions.py,sha256=YVgf4nOS26lnsImXmROvLhZW-YS8srrfMC7ugsQQKkI,770
valediction/integrity.py,sha256=8TPJ2mMmb4Qm-XwPj4Fwoivt7dRJdYlWtiBw8kdjXYA,3449
valediction/progress.py,sha256=ZSigmzRiLwxhqvgUdhW70I1WANEmtCpocwkmeystZDw,6180
valediction/support.py,sha256=syAxhRpFSzSk2ZmzdO43EI-SK3QNb_45p_oDp_5TMHk,2228
valediction/data_types/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
valediction/data_types/data_type_helpers.py,sha256=_VqJLCfOszjftv5OYabMLaWqGXwk6qdXoIOcqVKIYq8,2357
valediction/data_types/data_types.py,sha256=WiSdBALYeU8d_SeYJ4cxPLQWAE07L8kYwf-JwBX4yKg,1663
valediction/data_types/type_inference.py,sha256=QIdFtR5niG90jUW0UYi0ymffDgT7KMAxaHmZpkI1jlY,20493
valediction/datasets/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
valediction/datasets/datasets.py,sha256=-LvecqxkudnV83SBxd2HAgCJzRe5DXJQGoTCN4KvzNk,30890
valediction/datasets/datasets_helpers.py,sha256=uZUhXA8G0FA4YrItk8JAJzZp0j84bfWBOBaWJHMroRY,1205
valediction/demo/DEMO - Data Dictionary.xlsx,sha256=wj1JG8dHgdALVwV0zSSYnyWMomMTzrHxGFRm491wM_A,45308
valediction/demo/DEMOGRAPHICS.csv,sha256=VY3JwQeNotDxFi47MPVp2s0MxfvP9c8g7Wgh2Lmx8dY,3538
valediction/demo/DIAGNOSES.csv,sha256=21Eb4RQCiCuLNYMqYx_Kq1ov61yDq94WJGzyGnJpMMs,20434
valediction/demo/LAB_TESTS.csv,sha256=cVuq64DAhdulIy7L3eGXc3PMllkGR4cedhcQgOGgC08,81402
valediction/demo/VITALS.csv,sha256=q32tIvEB3a6I1vZPKzDZBbYNTld_SUZddX4vse0oX24,47644
valediction/demo/__init__.py,sha256=y5gM8K1zmFyZgEIQSw4mTGd4F7CsX99Ekio8kp9lBmw,206
valediction/demo/demo_dictionary.py,sha256=XK-6OVp0vBkWz6TSfpbE_3jZ9cGB1Dgxli2EpsHr5Iw,4539
valediction/dictionary/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
valediction/dictionary/exporting.py,sha256=Kg0boxNBYV4w0uEH7TXN2YumQVIZKMStY158QI_6yFs,19118
valediction/dictionary/exporting_helpers.py,sha256=FP-4Hn90FwvdrjJjKUvvPDC-pIJMmZQANyPQ4eZLN4k,12805
valediction/dictionary/generation.py,sha256=SMD_h02WAzHDTF-J5HGLHOYPNVzi3Km3jEm_6jSjg14,12707
valediction/dictionary/helpers.py,sha256=mN_y1tK8Zo6gKqWGqrKMO1zGTjxG16VP2QMeheucxO4,5634
valediction/dictionary/importing.py,sha256=boXNyMMqHPj1HUzlu7otVh45dmBqbmsm35Y9NyH3ZvU,19335
valediction/dictionary/integrity.py,sha256=8vgaRYBm5zmOPwQijqID4_Emt-IfL1eTWzFoDI6MExk,824
valediction/dictionary/model.py,sha256=fFsIh-L7vB1qcVB3aoB76GJBOsmoDOaAhNSRz3ari_I,19689
valediction/dictionary/template/PROJECT - Data Dictionary.xltx,sha256=ZsWmJsSBHvh3ADfrntmeVMWI9Vp_q7zqrTgp7rGd-AI,41721
valediction/io/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
valediction/io/csv_readers.py,sha256=oxrnU8s-HsXLu-wvDZbwY-UJ6X8RFhTiUdFutxhhXLQ,10137
valediction/validation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
valediction/validation/helpers.py,sha256=RXJwPsZcwzUQqOYB1LlW2wIluHUgILyVRJNqBf-21ig,10495
valediction/validation/issues.py,sha256=cMBt2iWIzqXrsdlhHIp1UTX_uYh54wwQdhUuhtEIT8U,9402
valediction/validation/validation.py,sha256=xZvXo0fkP5kTvuR02ZkLRf7WBjZRc1apiLSJ5oJhVhc,22146
valediction-1.0.0.dist-info/METADATA,sha256=ABQUlOE2SyNLRPTS8emHZQQFAcmAAHz3Cm7OTb5VNck,612
valediction-1.0.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
valediction-1.0.0.dist-info/RECORD,,