valediction 1.1.0__py3-none-any.whl → 1.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- valediction/data_types/data_type_helpers.py +2 -2
- valediction/data_types/data_types.py +6 -6
- valediction/data_types/type_inference.py +25 -13
- valediction/datasets/datasets.py +12 -12
- valediction/demo/DEMO - Data Dictionary.xlsx +0 -0
- valediction/demo/demo_dictionary.py +1 -1
- valediction/dictionary/generation.py +6 -6
- valediction/dictionary/helpers.py +1 -8
- valediction/dictionary/importing.py +44 -21
- valediction/dictionary/model.py +108 -36
- valediction/dictionary/template/PROJECT - Data Dictionary.xltx +0 -0
- valediction/integrity.py +80 -24
- valediction/io/csv_readers.py +3 -3
- valediction/support.py +5 -1
- valediction/validation/helpers.py +91 -35
- valediction/validation/issues.py +38 -25
- valediction/validation/validation.py +151 -110
- {valediction-1.1.0.dist-info → valediction-1.5.0.dist-info}/METADATA +1 -1
- valediction-1.5.0.dist-info/RECORD +38 -0
- valediction-1.1.0.dist-info/RECORD +0 -38
- {valediction-1.1.0.dist-info → valediction-1.5.0.dist-info}/WHEEL +0 -0
valediction/validation/issues.py
CHANGED
|
@@ -8,7 +8,7 @@ from pandas import DataFrame, concat
|
|
|
8
8
|
|
|
9
9
|
from valediction.datasets.datasets_helpers import DatasetItemLike
|
|
10
10
|
from valediction.io.csv_readers import CsvReadConfig, read_csv_ranges
|
|
11
|
-
from valediction.support import
|
|
11
|
+
from valediction.support import _strip, list_as_bullets
|
|
12
12
|
|
|
13
13
|
|
|
14
14
|
class IssueType(Enum):
|
|
@@ -26,6 +26,7 @@ class IssueType(Enum):
|
|
|
26
26
|
TYPE_MISMATCH = "TypeMismatch"
|
|
27
27
|
TEXT_TOO_LONG = "TextTooLong"
|
|
28
28
|
FORBIDDEN_CHARACTER = "ForbiddenCharacter"
|
|
29
|
+
INTEGER_OUT_OF_RANGE = "IntegerOutOfRange"
|
|
29
30
|
|
|
30
31
|
|
|
31
32
|
# Settings
|
|
@@ -107,6 +108,7 @@ class Issue:
|
|
|
107
108
|
merged.append(cur)
|
|
108
109
|
self.ranges = merged
|
|
109
110
|
|
|
111
|
+
# Inspect
|
|
110
112
|
def inspect(
|
|
111
113
|
self,
|
|
112
114
|
additional_columns: bool | str | list[str] | None = None,
|
|
@@ -132,9 +134,9 @@ class Issue:
|
|
|
132
134
|
ValueError: if the issue has no parent DatasetItem
|
|
133
135
|
"""
|
|
134
136
|
# Guard
|
|
135
|
-
|
|
136
|
-
raise ValueError("Issue has no parent DatasetItem")
|
|
137
|
+
self.__guard_parent()
|
|
137
138
|
header = self.__repr__() if print_header else ""
|
|
139
|
+
|
|
138
140
|
# Not applicable
|
|
139
141
|
if self.type in APPLIES_WHOLE_COLUMN:
|
|
140
142
|
print(f"{header}: applies to whole column")
|
|
@@ -143,22 +145,8 @@ class Issue:
|
|
|
143
145
|
# Column Inclusion
|
|
144
146
|
if print_header:
|
|
145
147
|
print(f"{header}:")
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
else:
|
|
149
|
-
additional_columns = (
|
|
150
|
-
[additional_columns]
|
|
151
|
-
if isinstance(additional_columns, str)
|
|
152
|
-
else additional_columns
|
|
153
|
-
)
|
|
154
|
-
base = (
|
|
155
|
-
set(self.parent.primary_keys)
|
|
156
|
-
if self.type in PRIMARY_KEY_ISSUES
|
|
157
|
-
else {self.column}
|
|
158
|
-
)
|
|
159
|
-
base |= set(additional_columns or [])
|
|
160
|
-
base.discard(None)
|
|
161
|
-
columns = list(base) if base else None
|
|
148
|
+
|
|
149
|
+
columns = self.__select_columns(additional_columns)
|
|
162
150
|
|
|
163
151
|
if not self.ranges:
|
|
164
152
|
return DataFrame(columns=columns) if columns else DataFrame()
|
|
@@ -194,6 +182,31 @@ class Issue:
|
|
|
194
182
|
|
|
195
183
|
return out if columns is None else out.loc[:, columns]
|
|
196
184
|
|
|
185
|
+
# Inspect Helpers
|
|
186
|
+
def __guard_parent(self):
|
|
187
|
+
if not self.parent:
|
|
188
|
+
raise ValueError("Issue has no parent DatasetItem")
|
|
189
|
+
|
|
190
|
+
def __select_columns(self, additional_columns: bool | str | list[str]) -> list:
|
|
191
|
+
if additional_columns is True:
|
|
192
|
+
columns = None
|
|
193
|
+
else:
|
|
194
|
+
additional_columns = (
|
|
195
|
+
[additional_columns]
|
|
196
|
+
if isinstance(additional_columns, str)
|
|
197
|
+
else additional_columns
|
|
198
|
+
)
|
|
199
|
+
base = (
|
|
200
|
+
set(self.parent.primary_keys)
|
|
201
|
+
if self.type in PRIMARY_KEY_ISSUES
|
|
202
|
+
else {self.column}
|
|
203
|
+
)
|
|
204
|
+
base |= set(additional_columns or [])
|
|
205
|
+
base.discard(None)
|
|
206
|
+
columns = list(base) if base else None
|
|
207
|
+
|
|
208
|
+
return columns
|
|
209
|
+
|
|
197
210
|
|
|
198
211
|
@dataclass
|
|
199
212
|
class Issues:
|
|
@@ -235,8 +248,8 @@ class Issues:
|
|
|
235
248
|
parent: DatasetItemLike | None = None,
|
|
236
249
|
) -> Issue:
|
|
237
250
|
key = (
|
|
238
|
-
|
|
239
|
-
|
|
251
|
+
_strip(table),
|
|
252
|
+
_strip(column) if column is not None else None,
|
|
240
253
|
issue_type,
|
|
241
254
|
)
|
|
242
255
|
issue = self._index.get(key)
|
|
@@ -255,8 +268,8 @@ class Issues:
|
|
|
255
268
|
issue_type: IssueType | None = None,
|
|
256
269
|
) -> list[Issue]:
|
|
257
270
|
"""Case-insensitive filter; any arg can be None to act as a wildcard."""
|
|
258
|
-
table =
|
|
259
|
-
column =
|
|
271
|
+
table = _strip(table)
|
|
272
|
+
column = _strip(column) if column is not None else None
|
|
260
273
|
output: list[Issue] = []
|
|
261
274
|
if issue_type is not None:
|
|
262
275
|
# direct index lookup where possible
|
|
@@ -268,9 +281,9 @@ class Issues:
|
|
|
268
281
|
|
|
269
282
|
# otherwise scan (still cheap; we maintain a compact list)
|
|
270
283
|
for item in self._items:
|
|
271
|
-
if
|
|
284
|
+
if _strip(item.table) != table:
|
|
272
285
|
continue
|
|
273
|
-
if column is not None and (
|
|
286
|
+
if column is not None and (_strip(item.column) or "") != column:
|
|
274
287
|
continue
|
|
275
288
|
output.append(item)
|
|
276
289
|
return output
|
|
@@ -14,13 +14,14 @@ from valediction.data_types.data_types import DataType
|
|
|
14
14
|
from valediction.datasets.datasets_helpers import DataLike, DatasetItemLike
|
|
15
15
|
from valediction.dictionary.model import Table
|
|
16
16
|
from valediction.exceptions import DataDictionaryImportError, DataIntegrityError
|
|
17
|
+
from valediction.integrity import get_config
|
|
17
18
|
from valediction.io.csv_readers import (
|
|
18
19
|
CsvReadConfig,
|
|
19
20
|
FrameChunk,
|
|
20
21
|
iter_csv_chunks,
|
|
21
22
|
)
|
|
22
23
|
from valediction.progress import Progress
|
|
23
|
-
from valediction.support import _get_runtime_string, calculate_runtime
|
|
24
|
+
from valediction.support import _get_runtime_string, _normalise, calculate_runtime
|
|
24
25
|
from valediction.validation.helpers import (
|
|
25
26
|
_column_has_values,
|
|
26
27
|
_set_nulls,
|
|
@@ -29,6 +30,7 @@ from valediction.validation.helpers import (
|
|
|
29
30
|
invalid_mask_datetime,
|
|
30
31
|
invalid_mask_float,
|
|
31
32
|
invalid_mask_integer,
|
|
33
|
+
invalid_mask_integer_out_of_range,
|
|
32
34
|
invalid_mask_text_forbidden_characters,
|
|
33
35
|
invalid_mask_text_too_long,
|
|
34
36
|
mask_to_ranges,
|
|
@@ -86,7 +88,9 @@ class Validator:
|
|
|
86
88
|
self._dt_needs_infer: set[str] = set()
|
|
87
89
|
|
|
88
90
|
# Helpers
|
|
89
|
-
self._column_names: set =
|
|
91
|
+
self._column_names: set[str] = {
|
|
92
|
+
_normalise(n) for n in self.table_dictionary.get_column_names()
|
|
93
|
+
}
|
|
90
94
|
|
|
91
95
|
# Progress Tracking
|
|
92
96
|
self.progress: Progress | None = None
|
|
@@ -149,12 +153,26 @@ class Validator:
|
|
|
149
153
|
datetime_format = column.datetime_format
|
|
150
154
|
data_type = column.data_type
|
|
151
155
|
|
|
152
|
-
if data_type in (DataType.DATE, DataType.
|
|
156
|
+
if data_type in (DataType.DATE, DataType.TIMESTAMP):
|
|
153
157
|
self._dt_format_cache[name] = datetime_format
|
|
154
158
|
|
|
155
159
|
if not datetime_format:
|
|
156
160
|
self._dt_needs_infer.add(name)
|
|
157
161
|
|
|
162
|
+
# Column Scanning
|
|
163
|
+
def _resolve_df_col(self, df: DataFrame, name: str) -> str | None:
|
|
164
|
+
"""Return the actual df column label matching name case-insensitively."""
|
|
165
|
+
target = _normalise(name)
|
|
166
|
+
return next((c for c in df.columns if _normalise(str(c)) == target), None)
|
|
167
|
+
|
|
168
|
+
def _resolve_df_cols(self, df: DataFrame, names: list[str]) -> list[str]:
|
|
169
|
+
resolved: list[str] = []
|
|
170
|
+
for n in names:
|
|
171
|
+
c = self._resolve_df_col(df, n)
|
|
172
|
+
if c is not None:
|
|
173
|
+
resolved.append(c)
|
|
174
|
+
return resolved
|
|
175
|
+
|
|
158
176
|
# Validate
|
|
159
177
|
def validate(self):
|
|
160
178
|
"""
|
|
@@ -272,28 +290,35 @@ class Validator:
|
|
|
272
290
|
# Validation: Start Helpers
|
|
273
291
|
def _check_for_missing_columns(self, df: DataFrame):
|
|
274
292
|
self.__begin_step(step="Checking for missing columns")
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
|
|
283
|
-
|
|
293
|
+
|
|
294
|
+
dict_names = self.table_dictionary.get_column_names()
|
|
295
|
+
dict_keys = {_normalise(name) for name in dict_names}
|
|
296
|
+
|
|
297
|
+
df_keys = {_normalise(str(column)) for column in df.columns}
|
|
298
|
+
|
|
299
|
+
missing_keys = dict_keys - df_keys
|
|
300
|
+
if missing_keys:
|
|
301
|
+
for name in dict_names:
|
|
302
|
+
if _normalise(name) in missing_keys:
|
|
303
|
+
self._save_issues(IssueType.MISSING_COLUMN, name)
|
|
304
|
+
|
|
284
305
|
self.__complete_step()
|
|
285
306
|
|
|
286
307
|
def _check_for_extra_columns(self, df: DataFrame):
|
|
287
308
|
self.__begin_step(step="Checking for extra columns")
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
for
|
|
291
|
-
|
|
292
|
-
|
|
293
|
-
|
|
294
|
-
|
|
295
|
-
|
|
296
|
-
|
|
309
|
+
|
|
310
|
+
dict_keys = {
|
|
311
|
+
_normalise(name) for name in self.table_dictionary.get_column_names()
|
|
312
|
+
}
|
|
313
|
+
df_cols = [str(column) for column in df.columns]
|
|
314
|
+
df_keys = {_normalise(column) for column in df_cols}
|
|
315
|
+
|
|
316
|
+
extra_keys = df_keys - dict_keys
|
|
317
|
+
if extra_keys:
|
|
318
|
+
for col in df_cols:
|
|
319
|
+
if _normalise(col) in extra_keys:
|
|
320
|
+
self._save_issues(IssueType.EXTRA_COLUMN, col)
|
|
321
|
+
|
|
297
322
|
self.__complete_step()
|
|
298
323
|
|
|
299
324
|
# Validation: Chunk Helpers
|
|
@@ -319,21 +344,18 @@ class Validator:
|
|
|
319
344
|
|
|
320
345
|
# Check for whitespace (text cols only)
|
|
321
346
|
self.__begin_step(step="Checking for primary key whitespace")
|
|
322
|
-
|
|
323
|
-
|
|
324
|
-
|
|
325
|
-
|
|
347
|
+
pk_keys = {_normalise(p) for p in pk_cols}
|
|
348
|
+
pk_cols_text = [
|
|
349
|
+
column.name
|
|
350
|
+
for column in self.table_dictionary
|
|
351
|
+
if _normalise(column.name) in pk_keys and column.data_type is DataType.TEXT
|
|
352
|
+
]
|
|
326
353
|
|
|
327
354
|
if pk_cols_text:
|
|
328
|
-
|
|
355
|
+
pk_cols_text_df = self._resolve_df_cols(df, pk_cols_text)
|
|
356
|
+
space_mask = pk_contains_whitespace_mask(df[pk_cols_text_df])
|
|
329
357
|
if space_mask.any():
|
|
330
|
-
self.
|
|
331
|
-
issue_type=IssueType.PK_WHITESPACE,
|
|
332
|
-
table=self.table_name,
|
|
333
|
-
column=None,
|
|
334
|
-
ranges=mask_to_ranges(space_mask, start_row),
|
|
335
|
-
parent=self.dataset_item,
|
|
336
|
-
)
|
|
358
|
+
self._save_issues(IssueType.PK_WHITESPACE, None, space_mask, start_row)
|
|
337
359
|
self.__complete_step()
|
|
338
360
|
|
|
339
361
|
def _check_primary_key_integrity(self, df, start_row: int) -> None:
|
|
@@ -343,7 +365,9 @@ class Validator:
|
|
|
343
365
|
|
|
344
366
|
# Create primary key hashes
|
|
345
367
|
self.__begin_step(step="Creating primary key hashes")
|
|
346
|
-
|
|
368
|
+
pk_cols_df = self._resolve_df_cols(df, pk_cols)
|
|
369
|
+
pk_hashes = create_pk_hashes(df[pk_cols_df])
|
|
370
|
+
|
|
347
371
|
self.__complete_step()
|
|
348
372
|
|
|
349
373
|
# Primary Key Nulls
|
|
@@ -353,13 +377,7 @@ class Validator:
|
|
|
353
377
|
pk_hashes_non_null = pk_hashes[non_null]
|
|
354
378
|
|
|
355
379
|
if null.any():
|
|
356
|
-
self.
|
|
357
|
-
IssueType.PK_NULL,
|
|
358
|
-
table=self.table_name,
|
|
359
|
-
column=None,
|
|
360
|
-
ranges=mask_to_ranges(null, start_row),
|
|
361
|
-
parent=self.dataset_item,
|
|
362
|
-
)
|
|
380
|
+
self._save_issues(IssueType.PK_NULL, None, null, start_row)
|
|
363
381
|
self.__complete_step()
|
|
364
382
|
|
|
365
383
|
# 2) In-chunk collisions
|
|
@@ -389,22 +407,14 @@ class Validator:
|
|
|
389
407
|
|
|
390
408
|
# 7) Emit in-chunk collisions Issues
|
|
391
409
|
if in_chunk_collision.any():
|
|
392
|
-
self.
|
|
393
|
-
IssueType.PK_COLLISION,
|
|
394
|
-
table=self.table_name,
|
|
395
|
-
column=None,
|
|
396
|
-
ranges=mask_to_ranges(in_chunk_collision, start_row),
|
|
397
|
-
parent=self.dataset_item,
|
|
410
|
+
self._save_issues(
|
|
411
|
+
IssueType.PK_COLLISION, None, in_chunk_collision, start_row
|
|
398
412
|
)
|
|
399
413
|
|
|
400
414
|
# 7) Emit cross-chunk collisions Issues
|
|
401
415
|
if cross_chunk_collision.any():
|
|
402
|
-
self.
|
|
403
|
-
IssueType.PK_COLLISION,
|
|
404
|
-
table=self.table_name,
|
|
405
|
-
column=None,
|
|
406
|
-
ranges=mask_to_ranges(cross_chunk_collision, start_row),
|
|
407
|
-
parent=self.dataset_item,
|
|
416
|
+
self._save_issues(
|
|
417
|
+
IssueType.PK_COLLISION, None, cross_chunk_collision, start_row
|
|
408
418
|
)
|
|
409
419
|
|
|
410
420
|
# Add the original PK row as a collision
|
|
@@ -415,7 +425,7 @@ class Validator:
|
|
|
415
425
|
IssueType.PK_COLLISION,
|
|
416
426
|
table=self.table_name,
|
|
417
427
|
column=None,
|
|
418
|
-
ranges=[Range(first_row, first_row)],
|
|
428
|
+
ranges=[Range(first_row, first_row)], # add directly
|
|
419
429
|
parent=self.dataset_item,
|
|
420
430
|
)
|
|
421
431
|
self.tracker_pk_reported_first.add(int(h))
|
|
@@ -437,44 +447,90 @@ class Validator:
|
|
|
437
447
|
self.__complete_step()
|
|
438
448
|
return
|
|
439
449
|
|
|
440
|
-
|
|
441
|
-
|
|
450
|
+
cols = [
|
|
451
|
+
(dict_col, df_col)
|
|
452
|
+
for dict_col in self._dt_needs_infer
|
|
453
|
+
if (df_col := self._resolve_df_col(df, dict_col)) is not None
|
|
454
|
+
]
|
|
455
|
+
if not cols:
|
|
442
456
|
self.__complete_step()
|
|
443
457
|
return
|
|
444
458
|
|
|
445
|
-
|
|
446
|
-
|
|
447
|
-
|
|
459
|
+
from valediction.validation.helpers import _allowed_formats_for
|
|
460
|
+
|
|
461
|
+
for dict_col, df_col in cols:
|
|
462
|
+
unique = (
|
|
463
|
+
df[df_col].astype("string", copy=False).str.strip().dropna().unique()
|
|
464
|
+
)
|
|
448
465
|
if len(unique) == 0:
|
|
449
466
|
continue
|
|
450
467
|
|
|
451
468
|
try:
|
|
452
|
-
|
|
469
|
+
fmt = infer_datetime_format(Series(unique, dtype="string"))
|
|
453
470
|
except ValueError:
|
|
454
|
-
# ambiguous - try again in later chunk
|
|
455
471
|
continue
|
|
456
472
|
|
|
457
|
-
if
|
|
458
|
-
|
|
459
|
-
|
|
460
|
-
|
|
461
|
-
|
|
462
|
-
|
|
463
|
-
|
|
464
|
-
|
|
465
|
-
|
|
466
|
-
|
|
467
|
-
|
|
468
|
-
|
|
469
|
-
|
|
470
|
-
|
|
471
|
-
|
|
472
|
-
pass
|
|
473
|
+
if not fmt or fmt is False:
|
|
474
|
+
continue
|
|
475
|
+
|
|
476
|
+
col_dtype = self._find_data_type(dict_col) # case-insensitive getter
|
|
477
|
+
if fmt not in _allowed_formats_for(col_dtype):
|
|
478
|
+
continue
|
|
479
|
+
|
|
480
|
+
self._dt_format_cache[dict_col] = fmt
|
|
481
|
+
self._dt_needs_infer.discard(dict_col)
|
|
482
|
+
|
|
483
|
+
try:
|
|
484
|
+
self.table_dictionary.get_column(dict_col).datetime_format = fmt
|
|
485
|
+
except Exception:
|
|
486
|
+
pass
|
|
487
|
+
|
|
473
488
|
self.__complete_step()
|
|
474
489
|
|
|
490
|
+
def _save_issues(
|
|
491
|
+
self,
|
|
492
|
+
issue_type: IssueType,
|
|
493
|
+
column: str | None = None,
|
|
494
|
+
invalid: Series | None = None,
|
|
495
|
+
start_row: int | None = None,
|
|
496
|
+
) -> None:
|
|
497
|
+
if invalid is not None and start_row is None:
|
|
498
|
+
raise ValueError(
|
|
499
|
+
"'start_row' must be provided when 'invalid' mask is provided"
|
|
500
|
+
)
|
|
501
|
+
|
|
502
|
+
ranges = None if invalid is None else mask_to_ranges(invalid, start_row)
|
|
503
|
+
self.issues.add(
|
|
504
|
+
issue_type=issue_type,
|
|
505
|
+
table=self.table_name,
|
|
506
|
+
column=column,
|
|
507
|
+
ranges=ranges,
|
|
508
|
+
parent=self.dataset_item,
|
|
509
|
+
)
|
|
510
|
+
|
|
511
|
+
def _check_column_types_integer(
|
|
512
|
+
self, col: str, series: Series, start_row: int, allow_bigint: bool
|
|
513
|
+
) -> None:
|
|
514
|
+
invalid = invalid_mask_integer(series)
|
|
515
|
+
if invalid.any():
|
|
516
|
+
self._save_issues(IssueType.TYPE_MISMATCH, col, invalid, start_row)
|
|
517
|
+
|
|
518
|
+
# Check for out of range integers
|
|
519
|
+
if allow_bigint is False:
|
|
520
|
+
out_of_range = invalid_mask_integer_out_of_range(
|
|
521
|
+
series, invalid_integer_mask=invalid
|
|
522
|
+
)
|
|
523
|
+
if out_of_range.any():
|
|
524
|
+
self._save_issues(
|
|
525
|
+
IssueType.INTEGER_OUT_OF_RANGE, col, out_of_range, start_row
|
|
526
|
+
)
|
|
527
|
+
|
|
475
528
|
def _check_column_types(self, df: DataFrame, start_row: int) -> None:
|
|
476
529
|
self.__begin_step(step="Checking column types")
|
|
477
|
-
present = [
|
|
530
|
+
present = [
|
|
531
|
+
col for col in df.columns if _normalise(str(col)) in self._column_names
|
|
532
|
+
]
|
|
533
|
+
config = get_config()
|
|
478
534
|
for col in present:
|
|
479
535
|
dtype = self._find_data_type(col)
|
|
480
536
|
if dtype == DataType.TEXT:
|
|
@@ -482,61 +538,51 @@ class Validator:
|
|
|
482
538
|
|
|
483
539
|
series = df[col]
|
|
484
540
|
if dtype == DataType.INTEGER:
|
|
485
|
-
|
|
486
|
-
|
|
541
|
+
self._check_column_types_integer(
|
|
542
|
+
col, series, start_row, config.allow_bigint
|
|
543
|
+
)
|
|
544
|
+
continue
|
|
545
|
+
|
|
546
|
+
if dtype == DataType.FLOAT:
|
|
487
547
|
invalid = invalid_mask_float(series)
|
|
488
548
|
elif dtype == DataType.DATE:
|
|
489
549
|
fmt = self._dt_format_cache.get(col) or self._find_datetime_format(col)
|
|
490
550
|
invalid = invalid_mask_date(series, fmt)
|
|
491
|
-
elif dtype == DataType.
|
|
551
|
+
elif dtype == DataType.TIMESTAMP:
|
|
492
552
|
fmt = self._dt_format_cache.get(col) or self._find_datetime_format(col)
|
|
493
553
|
invalid = invalid_mask_datetime(series, fmt)
|
|
494
554
|
else:
|
|
495
555
|
continue
|
|
496
556
|
|
|
497
557
|
if invalid.any():
|
|
498
|
-
self.
|
|
499
|
-
IssueType.TYPE_MISMATCH,
|
|
500
|
-
table=self.table_name,
|
|
501
|
-
column=col,
|
|
502
|
-
ranges=mask_to_ranges(invalid, start_row),
|
|
503
|
-
parent=self.dataset_item,
|
|
504
|
-
)
|
|
558
|
+
self._save_issues(IssueType.TYPE_MISMATCH, col, invalid, start_row)
|
|
505
559
|
self.__complete_step()
|
|
506
560
|
|
|
507
561
|
def _check_text_lengths(self, df: DataFrame, start_row: int) -> None:
|
|
508
562
|
self.__begin_step(step="Checking text lengths")
|
|
509
|
-
present = [
|
|
563
|
+
present = [
|
|
564
|
+
col for col in df.columns if _normalise(str(col)) in self._column_names
|
|
565
|
+
]
|
|
510
566
|
for col in present:
|
|
511
567
|
if self._find_data_type(col) != DataType.TEXT:
|
|
512
568
|
continue
|
|
513
569
|
max_len = self._find_max_length(col)
|
|
514
570
|
invalid = invalid_mask_text_too_long(df[col], max_len)
|
|
515
571
|
if invalid.any():
|
|
516
|
-
self.
|
|
517
|
-
IssueType.TEXT_TOO_LONG,
|
|
518
|
-
table=self.table_name,
|
|
519
|
-
column=col,
|
|
520
|
-
ranges=mask_to_ranges(invalid, start_row),
|
|
521
|
-
parent=self.dataset_item,
|
|
522
|
-
)
|
|
572
|
+
self._save_issues(IssueType.TEXT_TOO_LONG, col, invalid, start_row)
|
|
523
573
|
self.__complete_step()
|
|
524
574
|
|
|
525
575
|
def _check_text_forbidden_chars(self, df: DataFrame, start_row: int) -> None:
|
|
526
576
|
self.__begin_step(step="Checking for forbidden characters")
|
|
527
|
-
present = [
|
|
577
|
+
present = [
|
|
578
|
+
col for col in df.columns if _normalise(str(col)) in self._column_names
|
|
579
|
+
]
|
|
528
580
|
for col in present:
|
|
529
581
|
if self._find_data_type(col) != DataType.TEXT:
|
|
530
582
|
continue
|
|
531
583
|
mask = invalid_mask_text_forbidden_characters(df[col])
|
|
532
584
|
if mask.any():
|
|
533
|
-
self.
|
|
534
|
-
IssueType.FORBIDDEN_CHARACTER,
|
|
535
|
-
table=self.table_name,
|
|
536
|
-
column=col,
|
|
537
|
-
ranges=mask_to_ranges(mask, start_row),
|
|
538
|
-
parent=self.dataset_item,
|
|
539
|
-
)
|
|
585
|
+
self._save_issues(IssueType.FORBIDDEN_CHARACTER, col, mask, start_row)
|
|
540
586
|
self.__complete_step()
|
|
541
587
|
|
|
542
588
|
# Validation: Final Helpers
|
|
@@ -544,12 +590,7 @@ class Validator:
|
|
|
544
590
|
self.__begin_step(step="Checking for fully null columns")
|
|
545
591
|
for column, seen in self.tracker_seen_non_nulls.items():
|
|
546
592
|
if not seen:
|
|
547
|
-
self.
|
|
548
|
-
issue_type=IssueType.FULLY_NULL_COLUMN,
|
|
549
|
-
table=self.table_name,
|
|
550
|
-
column=column,
|
|
551
|
-
parent=self.dataset_item,
|
|
552
|
-
)
|
|
593
|
+
self._save_issues(IssueType.FULLY_NULL_COLUMN, column)
|
|
553
594
|
self.__complete_step()
|
|
554
595
|
|
|
555
596
|
# Progress Helpers
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: valediction
|
|
3
|
-
Version: 1.
|
|
3
|
+
Version: 1.5.0
|
|
4
4
|
Summary: Valediction is a convenience data validation package that allows generation, import, and constraint enforcement of user-defined data dictionaries against datasets.
|
|
5
5
|
Author-email: Cai Davis <Cai.Davis@uhs.nhs.uk>
|
|
6
6
|
Requires-Python: <4.0,>=3.11
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
valediction/__init__.py,sha256=HJy57qHyaeENZ0xGf5-jkkal-P92n242UU6vIqtsnaw,511
|
|
2
|
+
valediction/convenience.py,sha256=gDSNcI_T9VKO3Lk1Van4YQCt6hp_fqPyJnUJD8QNP_o,1438
|
|
3
|
+
valediction/exceptions.py,sha256=OtAq_ShVCZeoNx0hWCzJVlVdl3Gm55l72IP1KrKUMR0,748
|
|
4
|
+
valediction/integrity.py,sha256=P18v_5BaUNZTTDfp0OLB9N-eM0IoqzxSVjhmEhNOQKY,5254
|
|
5
|
+
valediction/progress.py,sha256=fXld7BRhp8kk7xPCG50PbRPXvF8RV7Br2hENHuOUlbo,5974
|
|
6
|
+
valediction/support.py,sha256=dhKwhtL6dgG709T6fkGaLZDvjYtnxIO9cMmgz477m-I,2207
|
|
7
|
+
valediction/data_types/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
8
|
+
valediction/data_types/data_type_helpers.py,sha256=EyADhEHaLwKL7JaLsp2EgIsqo9O5r34WqiawWZKWVHI,2284
|
|
9
|
+
valediction/data_types/data_types.py,sha256=kluHvBdSwAjB5eBEM1xYH-SjiruhDaXiRMI2Bl0Lw7Y,1612
|
|
10
|
+
valediction/data_types/type_inference.py,sha256=a6er1jFZrhwAU0z2h1FsrmooPXUmwu6qkI2cHaciOb4,20315
|
|
11
|
+
valediction/datasets/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
12
|
+
valediction/datasets/datasets.py,sha256=uweSdVkk5-zDBzL8M-cHnC6LETZXnubakajFi0J6L_c,30089
|
|
13
|
+
valediction/datasets/datasets_helpers.py,sha256=AdB3ws5MYFpiXTmHXmSsdm2wZVwDXkXDOtYLvSYhs4I,1159
|
|
14
|
+
valediction/demo/DEMO - Data Dictionary.xlsx,sha256=upbwe14U5haXNarJndSW7ur6rd7RAssyEV2x2D7avao,45271
|
|
15
|
+
valediction/demo/DEMOGRAPHICS.csv,sha256=ochj8tiHdRByvK2YbZTw5UR3UxbjYxA2xVbA1BfUlbU,3437
|
|
16
|
+
valediction/demo/DIAGNOSES.csv,sha256=tJYtjeyLEbLvgulsbLA2MQg07AN_vGDSRGUn33xdiwM,19784
|
|
17
|
+
valediction/demo/LAB_TESTS.csv,sha256=ii1tdQWmm2JiG6MdywB8in_R8vXBJzalD1TEvxG3abw,80401
|
|
18
|
+
valediction/demo/VITALS.csv,sha256=npyaJAP5pLNndz3P3gQ4VbwI5_KwvCrXjSXjNLVJ1g0,46643
|
|
19
|
+
valediction/demo/__init__.py,sha256=trPaz5YRP_HCUJCVZhHdU8uQaaNjAISpFWCRx89ewBg,200
|
|
20
|
+
valediction/demo/demo_dictionary.py,sha256=etTvXk3r9OkPOL9C8Ws9RGZ3DgPsGT-JWt4gRmDgLfg,4411
|
|
21
|
+
valediction/dictionary/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
22
|
+
valediction/dictionary/exporting.py,sha256=uRcnVzY5Uj0Yfju4y8sWsjTSP7ywATV9CQDYZMmnws0,18617
|
|
23
|
+
valediction/dictionary/exporting_helpers.py,sha256=O2pOMAHsRIyhLzxrSFrHIHJEf0wEteJguSI2JK9Rxcw,12434
|
|
24
|
+
valediction/dictionary/generation.py,sha256=XVGabsZgoFFC6nyr8vRrypdXnyfajAUTWWLs3miamm8,12306
|
|
25
|
+
valediction/dictionary/helpers.py,sha256=C8LM5FjQmWiOzLLDck08ihGbuVSuXQwCgkVWPKaOoOE,5289
|
|
26
|
+
valediction/dictionary/importing.py,sha256=e_jfVzW2tyfD1eR78_FL0MsFkA17RUDP6n9taX2Bxqk,19564
|
|
27
|
+
valediction/dictionary/integrity.py,sha256=k0RLRyNs8dsHyOivl2WCS6jxlhPsW9wfXB48owyokfs,787
|
|
28
|
+
valediction/dictionary/model.py,sha256=WtTGb5gZAtg7JiurvaWuD1K4DnNkygU-PoEVTZIgBCc,21617
|
|
29
|
+
valediction/dictionary/template/PROJECT - Data Dictionary.xltx,sha256=1boUdgNE1zYUu3pko9QREEi-hn29gp85PQqVwISyS2U,41691
|
|
30
|
+
valediction/io/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
31
|
+
valediction/io/csv_readers.py,sha256=sKYP_xtOuxwm6ce2eDrphQ_wagxP0RYMXiMlEtkybBg,9812
|
|
32
|
+
valediction/validation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
33
|
+
valediction/validation/helpers.py,sha256=RZsFeRGJqHJlxtSbXItV093-4SuXoI74X9539ZyqswY,12034
|
|
34
|
+
valediction/validation/issues.py,sha256=UO5bFxCOfT76rwi3SWi7wdn6Ux2wxxPC02RGvl0dL9Y,9374
|
|
35
|
+
valediction/validation/validation.py,sha256=ezWr4-RRq3w5P-IT3nC9g-KIz0FWXkFdNY-_ouOJxzo,22688
|
|
36
|
+
valediction-1.5.0.dist-info/METADATA,sha256=7OF5xMdNJnOVhEaz1ffd1lJqz61qJqje_T_r9PkKa7o,612
|
|
37
|
+
valediction-1.5.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
|
|
38
|
+
valediction-1.5.0.dist-info/RECORD,,
|
|
@@ -1,38 +0,0 @@
|
|
|
1
|
-
valediction/__init__.py,sha256=HJy57qHyaeENZ0xGf5-jkkal-P92n242UU6vIqtsnaw,511
|
|
2
|
-
valediction/convenience.py,sha256=gDSNcI_T9VKO3Lk1Van4YQCt6hp_fqPyJnUJD8QNP_o,1438
|
|
3
|
-
valediction/exceptions.py,sha256=OtAq_ShVCZeoNx0hWCzJVlVdl3Gm55l72IP1KrKUMR0,748
|
|
4
|
-
valediction/integrity.py,sha256=_4kTywJFSoQIkQ3tFbxMJH1---N8ML9SY0C7j2ZXA-c,3352
|
|
5
|
-
valediction/progress.py,sha256=fXld7BRhp8kk7xPCG50PbRPXvF8RV7Br2hENHuOUlbo,5974
|
|
6
|
-
valediction/support.py,sha256=Kmjopx62dGUnLcIcq-kMAk-YRFx6ip2pCHaCwBFxBIk,2156
|
|
7
|
-
valediction/data_types/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
8
|
-
valediction/data_types/data_type_helpers.py,sha256=iqcpSPBoFZybkTMHBmxrlv56ZRg8PbqSLtVsuJXC2G0,2282
|
|
9
|
-
valediction/data_types/data_types.py,sha256=MJv_io_hvbLo0G0N38kwj71goXlAo0isPFyS3TU05II,1605
|
|
10
|
-
valediction/data_types/type_inference.py,sha256=11SGYgpvfAfwrDwyOewVIwvfA6pQtDo6i94_xfebYM8,19952
|
|
11
|
-
valediction/datasets/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
12
|
-
valediction/datasets/datasets.py,sha256=uDDzmLLAWbzKwXhT_RtLaVH-2f_QEhxAFxNFDotTLdY,30069
|
|
13
|
-
valediction/datasets/datasets_helpers.py,sha256=AdB3ws5MYFpiXTmHXmSsdm2wZVwDXkXDOtYLvSYhs4I,1159
|
|
14
|
-
valediction/demo/DEMO - Data Dictionary.xlsx,sha256=wj1JG8dHgdALVwV0zSSYnyWMomMTzrHxGFRm491wM_A,45308
|
|
15
|
-
valediction/demo/DEMOGRAPHICS.csv,sha256=ochj8tiHdRByvK2YbZTw5UR3UxbjYxA2xVbA1BfUlbU,3437
|
|
16
|
-
valediction/demo/DIAGNOSES.csv,sha256=tJYtjeyLEbLvgulsbLA2MQg07AN_vGDSRGUn33xdiwM,19784
|
|
17
|
-
valediction/demo/LAB_TESTS.csv,sha256=ii1tdQWmm2JiG6MdywB8in_R8vXBJzalD1TEvxG3abw,80401
|
|
18
|
-
valediction/demo/VITALS.csv,sha256=npyaJAP5pLNndz3P3gQ4VbwI5_KwvCrXjSXjNLVJ1g0,46643
|
|
19
|
-
valediction/demo/__init__.py,sha256=trPaz5YRP_HCUJCVZhHdU8uQaaNjAISpFWCRx89ewBg,200
|
|
20
|
-
valediction/demo/demo_dictionary.py,sha256=OQcmKpKuRmLQuidYr2KIVF3_78crki5HU8E6RuFm01s,4410
|
|
21
|
-
valediction/dictionary/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
22
|
-
valediction/dictionary/exporting.py,sha256=uRcnVzY5Uj0Yfju4y8sWsjTSP7ywATV9CQDYZMmnws0,18617
|
|
23
|
-
valediction/dictionary/exporting_helpers.py,sha256=O2pOMAHsRIyhLzxrSFrHIHJEf0wEteJguSI2JK9Rxcw,12434
|
|
24
|
-
valediction/dictionary/generation.py,sha256=eN8zBkvppp3fz7winWRx8wW0rhG5Ti88Z4QpXp7wJvE,12350
|
|
25
|
-
valediction/dictionary/helpers.py,sha256=5QLt0isARdkoegVopWmHvTju9JOtz3yLBx7ID67QyHw,5460
|
|
26
|
-
valediction/dictionary/importing.py,sha256=ucpTxAD1rwNdnUAFDeYkyLMppgNsM-LIByCmtAcAvjQ,18841
|
|
27
|
-
valediction/dictionary/integrity.py,sha256=k0RLRyNs8dsHyOivl2WCS6jxlhPsW9wfXB48owyokfs,787
|
|
28
|
-
valediction/dictionary/model.py,sha256=7p7FhTicudq16oXzCc7CgK1iF24AoKIjqgpBcGUJrtU,19107
|
|
29
|
-
valediction/dictionary/template/PROJECT - Data Dictionary.xltx,sha256=ZsWmJsSBHvh3ADfrntmeVMWI9Vp_q7zqrTgp7rGd-AI,41721
|
|
30
|
-
valediction/io/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
31
|
-
valediction/io/csv_readers.py,sha256=6MAOZFZexvNwqiZz-8Er8Me8L4n9NVjLIj6oZOxzZ64,9830
|
|
32
|
-
valediction/validation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
33
|
-
valediction/validation/helpers.py,sha256=qSs_rlJzZ29TPO0pHHFFg9XqVdl1XRxu048NWDl78I0,10180
|
|
34
|
-
valediction/validation/issues.py,sha256=bvHawjehs_e_2xQGoSvPdOt4t8dfASES3KBKHO3zFZo,9122
|
|
35
|
-
valediction/validation/validation.py,sha256=ahYhWlY6wjwL7blYOfNhNIoRCyXCWnNrYn10G0AQGjU,21555
|
|
36
|
-
valediction-1.1.0.dist-info/METADATA,sha256=tg9HnCfKQhYplzno5mOrw_hSICZZL3FyXPhNedYglOE,612
|
|
37
|
-
valediction-1.1.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
|
|
38
|
-
valediction-1.1.0.dist-info/RECORD,,
|
|
File without changes
|