valediction 1.2.0-py3-none-any.whl → 1.5.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- valediction/data_types/data_type_helpers.py +2 -2
- valediction/data_types/data_types.py +6 -6
- valediction/data_types/type_inference.py +25 -13
- valediction/demo/DEMO - Data Dictionary.xlsx +0 -0
- valediction/demo/demo_dictionary.py +1 -1
- valediction/dictionary/generation.py +1 -1
- valediction/dictionary/helpers.py +1 -1
- valediction/dictionary/importing.py +1 -1
- valediction/dictionary/template/PROJECT - Data Dictionary.xltx +0 -0
- valediction/integrity.py +13 -11
- valediction/validation/helpers.py +61 -2
- valediction/validation/issues.py +1 -0
- valediction/validation/validation.py +62 -70
- {valediction-1.2.0.dist-info → valediction-1.5.0.dist-info}/METADATA +1 -1
- {valediction-1.2.0.dist-info → valediction-1.5.0.dist-info}/RECORD +16 -16
- {valediction-1.2.0.dist-info → valediction-1.5.0.dist-info}/WHEEL +0 -0
valediction/data_types/data_type_helpers.py
CHANGED
@@ -62,14 +62,14 @@ def infer_datetime_format(


 def get_date_type(datetime_format: str) -> DataType | None:
-    """Identifies if a datetime format string corresponds to a Date or
+    """Identifies if a datetime format string corresponds to a Date or Timestamp data
     type.

     Args:
         datetime_format (str): datetime format string

     Returns:
-        DataType | None: DataType of Date,
+        DataType | None: DataType of Date, Timestamp, or None if not found.
     """
     config = get_config()
     return config.date_formats.get(datetime_format)
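The lookup above is driven by Config.date_formats (see the valediction/integrity.py hunk further down). A rough usage sketch, assuming the default format table shipped in 1.5.0:

from valediction.data_types.data_type_helpers import get_date_type

print(get_date_type("%m/%d/%Y"))           # DataType.DATE
print(get_date_type("%Y-%m-%d %H:%M:%S"))  # DataType.TIMESTAMP
print(get_date_type("not-a-format"))       # None - format not present in config.date_formats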
valediction/data_types/data_types.py
CHANGED
@@ -8,7 +8,7 @@ class DataType(Enum):
     INTEGER = "Integer"
     FLOAT = "Float"
     DATE = "Date"
-
+    TIMESTAMP = "Timestamp"
     FILE = "File"

     def __str__(self) -> str:
@@ -32,9 +32,9 @@ class DataType(Enum):
             "number": cls.FLOAT,
             "numeric": cls.FLOAT,
             "date": cls.DATE,
-            "datetime": cls.
-            "datetime64": cls.
-            "timestamp": cls.
+            "datetime": cls.TIMESTAMP,
+            "datetime64": cls.TIMESTAMP,
+            "timestamp": cls.TIMESTAMP,
             "file": cls.FILE,
             "blob": cls.FILE,
             "binary": cls.FILE,
@@ -49,10 +49,10 @@ class DataType(Enum):
         return self in {DataType.TEXT}

     def valid_for_primary_key(self) -> bool:
-        """PKs can only be Text, Integer, Date,
+        """PKs can only be Text, Integer, Date, Timestamp."""
         return self in {
             DataType.TEXT,
             DataType.INTEGER,
             DataType.DATE,
-            DataType.
+            DataType.TIMESTAMP,
         }
valediction/data_types/type_inference.py
CHANGED
@@ -4,6 +4,7 @@ import re
 import warnings

 import pandas as pd
+from pandas.api.types import is_object_dtype, is_string_dtype

 from valediction.data_types.data_type_helpers import infer_datetime_format
 from valediction.data_types.data_types import DataType
@@ -53,8 +54,8 @@ class ColumnState:
         return DataType.FLOAT, None
     if self.data_type == DataType.DATE:
         return DataType.DATE, None
-    if self.data_type == DataType.
-        return DataType.
+    if self.data_type == DataType.TIMESTAMP:
+        return DataType.TIMESTAMP, None

     return DataType.TEXT, _len1()

@@ -123,7 +124,7 @@ class TypeInferer:
         _handling_function: callable = {
             DataType.TEXT: self._handle_state_text,
             DataType.DATE: self._handle_state_date,
-            DataType.
+            DataType.TIMESTAMP: self._handle_state_datetime,
             DataType.INTEGER: self._handle_state_integer,
             DataType.FLOAT: self._handle_state_float,
         }.get(state.data_type, self._handle_state_text)
@@ -141,20 +142,31 @@ class TypeInferer:
         self, s: pd.Series
     ) -> tuple[pd.Series, pd.Series, pd.Series, int | None]:
         self.__begin_step(step="Trimming whitespace")
-
+        is_text = is_string_dtype(s) or is_object_dtype(s)
+
+        if is_text:
+            trimmed = s.astype("string").str.strip()
+        else:
+            trimmed = s
         self.__complete_step()

         self.__begin_step(step="Checking nulls")
-
+        if is_text:
+            nulls = trimmed.isna() | trimmed.str.lower().isin(self.null_tokens)
+        else:
+            nulls = trimmed.isna()
         self.__complete_step()

         self.__begin_step(step="Checking max length")
-
-
+        if is_text:
+            lengths = trimmed.str.len()
+            max_len = int(lengths.max(skipna=True)) if lengths.notna().any() else None
+        else:
+            max_len = None
         self.__complete_step()

         self.__begin_step(step="Setting non-null mask")
-        nonnull_mask = (~nulls) &
+        nonnull_mask = (~nulls) & trimmed.notna()
         self.__complete_step()

         return trimmed, nulls, nonnull_mask, max_len
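The is_text guard above keeps non-text chunks away from the .str accessor. A standalone pandas sketch of the same idea (illustrative; null_tokens here is a stand-in for the inferer's configured null tokens):

import pandas as pd
from pandas.api.types import is_object_dtype, is_string_dtype

null_tokens = {"", "null", "na"}                     # stand-in for self.null_tokens
s = pd.Series(["  42 ", "NULL", None, "7"])

is_text = is_string_dtype(s) or is_object_dtype(s)   # object dtype -> treated as text
trimmed = s.astype("string").str.strip() if is_text else s
if is_text:
    nulls = trimmed.isna() | trimmed.str.lower().isin(null_tokens)
else:
    nulls = trimmed.isna()
print(trimmed.tolist())  # ['42', 'NULL', <NA>, '7']
print(nulls.tolist())    # [False, True, True, False]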
@@ -193,7 +205,7 @@ class TypeInferer:
         if ok.all():
             self._transition(
                 st,
-                DataType.
+                DataType.TIMESTAMP if has_time.any() else DataType.DATE,
                 f"cached datetime format={st.cached_datetime_format!r}",
             )
         self.__complete_step()
@@ -210,7 +222,7 @@ class TypeInferer:
             st.cached_datetime_format = fmt
             self._transition(
                 st,
-                DataType.
+                DataType.TIMESTAMP if has_time.any() else DataType.DATE,
                 f"explicit datetime format={fmt!r}",
             )
         self.__complete_step()
@@ -276,7 +288,7 @@ class TypeInferer:
             st.lock_text_permanent = True
             self._transition(st, DataType.TEXT, "datetime parse failures")
         elif has_time.any():
-            self._transition(st, DataType.
+            self._transition(st, DataType.TIMESTAMP, "time component detected")

         self.__complete_step()

@@ -334,7 +346,7 @@ class TypeInferer:
         if ok.all():
             self._transition(
                 st,
-                DataType.
+                DataType.TIMESTAMP if has_time.any() else DataType.DATE,
                 f"cached datetime format={st.cached_datetime_format!r}",
             )
             return True
@@ -377,7 +389,7 @@ class TypeInferer:
         if ok.all():
             self._transition(
                 st,
-                DataType.
+                DataType.TIMESTAMP if has_time.any() else DataType.DATE,
                 f"explicit datetime format={st.cached_datetime_format!r}",
             )
             return True
valediction/demo/DEMO - Data Dictionary.xlsx
Binary file
valediction/demo/demo_dictionary.py
CHANGED
@@ -103,7 +103,7 @@ def demo_dictionary() -> Dictionary:
             foreign_key="DEMOGRAPHICS.PATIENT_HASH",
         ),
         Column(
-            name="OBSERVATION_TIME", order=2, data_type="
+            name="OBSERVATION_TIME", order=2, data_type="timestamp", primary_key=2
         ),
         Column(
             name="OBSERVATION_TYPE",
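The demo dictionary now models OBSERVATION_TIME as a timestamp primary-key component. A minimal sketch of declaring such a column (illustrative; the import path for Column is an assumption from the package layout, not shown in this diff):

from valediction.dictionary.model import Column  # assumed location of Column

# "timestamp" resolves to DataType.TIMESTAMP via the string mapping shown in data_types.py
event_time = Column(name="EVENT_TIME", order=1, data_type="timestamp", primary_key=1)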
valediction/dictionary/generation.py
CHANGED
@@ -257,7 +257,7 @@ class Generator:
         table.add_column(col)

     def _set_datetime_format(self, column_state: ColumnState, column: Column) -> None:
-        if column.data_type in (DataType.DATE, DataType.
+        if column.data_type in (DataType.DATE, DataType.TIMESTAMP):
             datetime_format = getattr(column_state, "cached_datetime_format", None)
             if datetime_format and hasattr(column, "datetime_format"):
                 column.datetime_format = datetime_format
valediction/dictionary/helpers.py
CHANGED
@@ -106,7 +106,7 @@ def _check_primary_key(primary_key: int | None, data_type: DataType) -> list[str]:
     ):
         errors.append(
             f"invalid data type '{data_type.value}' for primary key column; "
-            "primary keys must be Text, Integer, Date, or
+            "primary keys must be Text, Integer, Date, or Timestamp"
         )

     return errors
valediction/dictionary/importing.py
CHANGED
@@ -364,7 +364,7 @@ class ExcelDataDictionary:
         enumeration_flag_col_header = header_map.get("enumerations")
         primary_key_col_header = header_map.get("primary_key")
         foreign_key_col_header = header_map.get("foreign_key_target")
-        description_col_header = header_map.get("
+        description_col_header = header_map.get("column_description")
         return (
             table_col_header,
             column_col_header,

valediction/dictionary/template/PROJECT - Data Dictionary.xltx
Binary file
valediction/integrity.py
CHANGED
@@ -77,20 +77,21 @@ class Config:
             "%d-%m-%Y": DataType.DATE,
             "%m/%d/%Y": DataType.DATE,
             "%m-%d-%Y": DataType.DATE,
-            "%Y-%m-%d %H:%M:%S": DataType.
-            "%Y-%m-%d %H:%M": DataType.
-            "%d/%m/%Y %H:%M:%S": DataType.
-            "%d/%m/%Y %H:%M": DataType.
-            "%m/%d/%Y %H:%M:%S": DataType.
-            "%Y-%m-%dT%H:%M:%S": DataType.
-            "%Y-%m-%dT%H:%M:%S.%f": DataType.
-            "%Y-%m-%dT%H:%M:%S%z": DataType.
-            "%Y-%m-%dT%H:%M:%S.%f%z": DataType.
-            "%Y-%m-%dT%H:%M:%SZ": DataType.
-            "%Y-%m-%dT%H:%M:%S.%fZ": DataType.
+            "%Y-%m-%d %H:%M:%S": DataType.TIMESTAMP,
+            "%Y-%m-%d %H:%M": DataType.TIMESTAMP,
+            "%d/%m/%Y %H:%M:%S": DataType.TIMESTAMP,
+            "%d/%m/%Y %H:%M": DataType.TIMESTAMP,
+            "%m/%d/%Y %H:%M:%S": DataType.TIMESTAMP,
+            "%Y-%m-%dT%H:%M:%S": DataType.TIMESTAMP,
+            "%Y-%m-%dT%H:%M:%S.%f": DataType.TIMESTAMP,
+            "%Y-%m-%dT%H:%M:%S%z": DataType.TIMESTAMP,
+            "%Y-%m-%dT%H:%M:%S.%f%z": DataType.TIMESTAMP,
+            "%Y-%m-%dT%H:%M:%SZ": DataType.TIMESTAMP,
+            "%Y-%m-%dT%H:%M:%S.%fZ": DataType.TIMESTAMP,
         }
         self.enforce_no_null_columns: bool = True
         self.enforce_primary_keys: bool = True
+        self.allow_bigint: bool = True
         _apply_external_injections(self)

     def __repr__(self):
@@ -110,6 +111,7 @@ class Config:
             f" - default_null_values={self.null_values}\n"
             f" - forbidden_characters={self.forbidden_characters}\n"
             f" - date_formats=[{date_list}\n ]\n"
+            f" - allow_bigint={self.allow_bigint}\n"
             ")"
         )

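A rough sketch of opting in to the new integer range check (illustrative; assumes get_config() returns the shared Config instance consulted during validation):

from valediction.integrity import get_config

config = get_config()
config.allow_bigint = False  # integers outside the PostgreSQL int4 range get reported as INTEGER_OUT_OF_RANGE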
valediction/validation/helpers.py
CHANGED
@@ -241,7 +241,7 @@ def invalid_mask_datetime(column: Series, fmt: str | None) -> Series:
         ok = parsed.notna()
         return notnull & (~ok)

-    allowed = _allowed_formats_for(DataType.
+    allowed = _allowed_formats_for(DataType.TIMESTAMP)
     ok_any = _parse_ok_any(column, allowed)
     return notnull & (~ok_any)

@@ -300,7 +300,7 @@ def apply_data_types(df: DataFrame, table_dictionary: Table) -> DataFrame:
             )
             df[col] = dtv.dt.normalize()  # midnight

-        elif data_type == DataType.
+        elif data_type == DataType.TIMESTAMP:
             df[col] = to_datetime(
                 df[col], format=datetime_format, errors="raise", utc=False
             )
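The DATE branch normalises parsed values to midnight while the new TIMESTAMP branch keeps the time component; in plain pandas terms (illustrative sketch, independent of valediction):

import pandas as pd

raw = pd.Series(["2024-03-01 14:30:00"])
parsed = pd.to_datetime(raw, format="%Y-%m-%d %H:%M:%S", errors="raise", utc=False)

print(parsed.dt.normalize()[0])  # 2024-03-01 00:00:00 - what the DATE branch stores
print(parsed[0])                 # 2024-03-01 14:30:00 - what the TIMESTAMP branch keeps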
@@ -310,3 +310,62 @@ def apply_data_types(df: DataFrame, table_dictionary: Table) -> DataFrame:
             df[col] = df[col].astype("string")

     return df
+
+
+# Bigint Checks
+_PG_INT4_MIN_STR_ABS = "2147483648"  # abs(-2147483648)
+_PG_INT4_MAX_STR_ABS = "2147483647"
+_PG_INT4_MIN_LEN = len(_PG_INT4_MIN_STR_ABS)
+_PG_INT4_MAX_LEN = len(_PG_INT4_MAX_STR_ABS)
+
+
+def invalid_mask_integer_out_of_range(
+    series: Series,
+    invalid_integer_mask: Series | None = None,
+) -> Series:
+    """
+    Returns a boolean mask for values that:
+    - are integer-like under Valediction's integer rules, AND
+    - fall outside PostgreSQL INTEGER (int4) range.
+    """
+
+    # Start with all-False mask
+    out = series.isna() & False
+
+    # Use caller-provided invalid mask to avoid recomputing if available
+    if invalid_integer_mask is None:
+        from valediction.validation.helpers import invalid_mask_integer  # avoid cycles
+
+        invalid_integer_mask = invalid_mask_integer(series)
+
+    # We only check range for values that already pass integer validation
+    valid = (~invalid_integer_mask) & series.notna()
+    if not valid.any():
+        return out
+
+    # String-normalise for safe compare (works for object/int dtype)
+    s = series[valid].astype("string", copy=False).str.strip()
+
+    # Sign handling
+    neg = s.str.startswith("-")
+    abs_str = s.str.lstrip("+-")
+
+    # Lengths
+    abs_len = abs_str.str.len()
+
+    # Positive overflow:
+    #   abs_len > 10 OR (abs_len == 10 AND abs_str > 2147483647)
+    pos = ~neg
+    pos_over = (abs_len > _PG_INT4_MAX_LEN) | (
+        (abs_len == _PG_INT4_MAX_LEN) & (abs_str > _PG_INT4_MAX_STR_ABS)
+    )
+
+    # Negative overflow (too small):
+    #   abs_len > 10 OR (abs_len == 10 AND abs_str > 2147483648)
+    neg_over = (abs_len > _PG_INT4_MIN_LEN) | (
+        (abs_len == _PG_INT4_MIN_LEN) & (abs_str > _PG_INT4_MIN_STR_ABS)
+    )
+
+    # Combine back into the full index
+    out.loc[valid] = (pos & pos_over) | (neg & neg_over)
+    return out
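A quick usage sketch of the new range mask (illustrative; an all-False invalid_integer_mask is passed explicitly so that only the int4 range comparison runs):

import pandas as pd
from valediction.validation.helpers import invalid_mask_integer_out_of_range

s = pd.Series(["123", "2147483647", "2147483648", "-2147483649", None])
mask = invalid_mask_integer_out_of_range(
    s, invalid_integer_mask=pd.Series(False, index=s.index)
)
print(mask.tolist())  # [False, False, True, True, False] - only values beyond int4 range are flagged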
valediction/validation/issues.py
CHANGED

valediction/validation/validation.py
CHANGED
@@ -14,6 +14,7 @@ from valediction.data_types.data_types import DataType
 from valediction.datasets.datasets_helpers import DataLike, DatasetItemLike
 from valediction.dictionary.model import Table
 from valediction.exceptions import DataDictionaryImportError, DataIntegrityError
+from valediction.integrity import get_config
 from valediction.io.csv_readers import (
     CsvReadConfig,
     FrameChunk,
@@ -29,6 +30,7 @@ from valediction.validation.helpers import (
     invalid_mask_datetime,
     invalid_mask_float,
     invalid_mask_integer,
+    invalid_mask_integer_out_of_range,
     invalid_mask_text_forbidden_characters,
     invalid_mask_text_too_long,
     mask_to_ranges,
@@ -151,7 +153,7 @@ class Validator:
         datetime_format = column.datetime_format
         data_type = column.data_type

-        if data_type in (DataType.DATE, DataType.
+        if data_type in (DataType.DATE, DataType.TIMESTAMP):
             self._dt_format_cache[name] = datetime_format

         if not datetime_format:
@@ -298,12 +300,7 @@ class Validator:
         if missing_keys:
             for name in dict_names:
                 if _normalise(name) in missing_keys:
-                    self.
-                        issue_type=IssueType.MISSING_COLUMN,
-                        table=self.table_name,
-                        column=name,
-                        parent=self.dataset_item,
-                    )
+                    self._save_issues(IssueType.MISSING_COLUMN, name)

         self.__complete_step()

@@ -320,12 +317,7 @@ class Validator:
         if extra_keys:
             for col in df_cols:
                 if _normalise(col) in extra_keys:
-                    self.
-                        issue_type=IssueType.EXTRA_COLUMN,
-                        table=self.table_name,
-                        column=col,  # report the actual df label
-                        parent=self.dataset_item,
-                    )
+                    self._save_issues(IssueType.EXTRA_COLUMN, col)

         self.__complete_step()

@@ -363,13 +355,7 @@ class Validator:
         pk_cols_text_df = self._resolve_df_cols(df, pk_cols_text)
         space_mask = pk_contains_whitespace_mask(df[pk_cols_text_df])
         if space_mask.any():
-            self.
-                issue_type=IssueType.PK_WHITESPACE,
-                table=self.table_name,
-                column=None,
-                ranges=mask_to_ranges(space_mask, start_row),
-                parent=self.dataset_item,
-            )
+            self._save_issues(IssueType.PK_WHITESPACE, None, space_mask, start_row)
         self.__complete_step()

     def _check_primary_key_integrity(self, df, start_row: int) -> None:
@@ -391,13 +377,7 @@ class Validator:
         pk_hashes_non_null = pk_hashes[non_null]

         if null.any():
-            self.
-                IssueType.PK_NULL,
-                table=self.table_name,
-                column=None,
-                ranges=mask_to_ranges(null, start_row),
-                parent=self.dataset_item,
-            )
+            self._save_issues(IssueType.PK_NULL, None, null, start_row)
         self.__complete_step()

         # 2) In-chunk collisions
@@ -427,22 +407,14 @@ class Validator:

         # 7) Emit in-chunk collisions Issues
         if in_chunk_collision.any():
-            self.
-                IssueType.PK_COLLISION,
-                table=self.table_name,
-                column=None,
-                ranges=mask_to_ranges(in_chunk_collision, start_row),
-                parent=self.dataset_item,
+            self._save_issues(
+                IssueType.PK_COLLISION, None, in_chunk_collision, start_row
             )

         # 7) Emit cross-chunk collisions Issues
         if cross_chunk_collision.any():
-            self.
-                IssueType.PK_COLLISION,
-                table=self.table_name,
-                column=None,
-                ranges=mask_to_ranges(cross_chunk_collision, start_row),
-                parent=self.dataset_item,
+            self._save_issues(
+                IssueType.PK_COLLISION, None, cross_chunk_collision, start_row
             )

             # Add the original PK row as a collision
@@ -453,7 +425,7 @@ class Validator:
                 IssueType.PK_COLLISION,
                 table=self.table_name,
                 column=None,
-                ranges=[Range(first_row, first_row)],
+                ranges=[Range(first_row, first_row)],  # add directly
                 parent=self.dataset_item,
             )
             self.tracker_pk_reported_first.add(int(h))
@@ -515,11 +487,50 @@ class Validator:

         self.__complete_step()

+    def _save_issues(
+        self,
+        issue_type: IssueType,
+        column: str | None = None,
+        invalid: Series | None = None,
+        start_row: int | None = None,
+    ) -> None:
+        if invalid is not None and start_row is None:
+            raise ValueError(
+                "'start_row' must be provided when 'invalid' mask is provided"
+            )
+
+        ranges = None if invalid is None else mask_to_ranges(invalid, start_row)
+        self.issues.add(
+            issue_type=issue_type,
+            table=self.table_name,
+            column=column,
+            ranges=ranges,
+            parent=self.dataset_item,
+        )
+
+    def _check_column_types_integer(
+        self, col: str, series: Series, start_row: int, allow_bigint: bool
+    ) -> None:
+        invalid = invalid_mask_integer(series)
+        if invalid.any():
+            self._save_issues(IssueType.TYPE_MISMATCH, col, invalid, start_row)
+
+        # Check for out of range integers
+        if allow_bigint is False:
+            out_of_range = invalid_mask_integer_out_of_range(
+                series, invalid_integer_mask=invalid
+            )
+            if out_of_range.any():
+                self._save_issues(
+                    IssueType.INTEGER_OUT_OF_RANGE, col, out_of_range, start_row
+                )
+
     def _check_column_types(self, df: DataFrame, start_row: int) -> None:
         self.__begin_step(step="Checking column types")
         present = [
             col for col in df.columns if _normalise(str(col)) in self._column_names
         ]
+        config = get_config()
         for col in present:
             dtype = self._find_data_type(col)
             if dtype == DataType.TEXT:
@@ -527,26 +538,24 @@ class Validator:

             series = df[col]
             if dtype == DataType.INTEGER:
-
-
+                self._check_column_types_integer(
+                    col, series, start_row, config.allow_bigint
+                )
+                continue
+
+            if dtype == DataType.FLOAT:
                 invalid = invalid_mask_float(series)
             elif dtype == DataType.DATE:
                 fmt = self._dt_format_cache.get(col) or self._find_datetime_format(col)
                 invalid = invalid_mask_date(series, fmt)
-            elif dtype == DataType.
+            elif dtype == DataType.TIMESTAMP:
                 fmt = self._dt_format_cache.get(col) or self._find_datetime_format(col)
                 invalid = invalid_mask_datetime(series, fmt)
             else:
                 continue

             if invalid.any():
-                self.
-                    IssueType.TYPE_MISMATCH,
-                    table=self.table_name,
-                    column=col,
-                    ranges=mask_to_ranges(invalid, start_row),
-                    parent=self.dataset_item,
-                )
+                self._save_issues(IssueType.TYPE_MISMATCH, col, invalid, start_row)
         self.__complete_step()

     def _check_text_lengths(self, df: DataFrame, start_row: int) -> None:
@@ -560,13 +569,7 @@ class Validator:
             max_len = self._find_max_length(col)
             invalid = invalid_mask_text_too_long(df[col], max_len)
             if invalid.any():
-                self.
-                    IssueType.TEXT_TOO_LONG,
-                    table=self.table_name,
-                    column=col,
-                    ranges=mask_to_ranges(invalid, start_row),
-                    parent=self.dataset_item,
-                )
+                self._save_issues(IssueType.TEXT_TOO_LONG, col, invalid, start_row)
         self.__complete_step()

     def _check_text_forbidden_chars(self, df: DataFrame, start_row: int) -> None:
@@ -579,13 +582,7 @@ class Validator:
                 continue
             mask = invalid_mask_text_forbidden_characters(df[col])
             if mask.any():
-                self.
-                    IssueType.FORBIDDEN_CHARACTER,
-                    table=self.table_name,
-                    column=col,
-                    ranges=mask_to_ranges(mask, start_row),
-                    parent=self.dataset_item,
-                )
+                self._save_issues(IssueType.FORBIDDEN_CHARACTER, col, mask, start_row)
         self.__complete_step()

     # Validation: Final Helpers
@@ -593,12 +590,7 @@ class Validator:
         self.__begin_step(step="Checking for fully null columns")
         for column, seen in self.tracker_seen_non_nulls.items():
             if not seen:
-                self.
-                    issue_type=IssueType.FULLY_NULL_COLUMN,
-                    table=self.table_name,
-                    column=column,
-                    parent=self.dataset_item,
-                )
+                self._save_issues(IssueType.FULLY_NULL_COLUMN, column)
         self.__complete_step()

     # Progress Helpers
{valediction-1.2.0.dist-info → valediction-1.5.0.dist-info}/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: valediction
-Version: 1.
+Version: 1.5.0
 Summary: Valediction is a convenience data validation package that allows generation, import, and constraint enforcement of user-defined data dictionaries against datasets.
 Author-email: Cai Davis <Cai.Davis@uhs.nhs.uk>
 Requires-Python: <4.0,>=3.11
{valediction-1.2.0.dist-info → valediction-1.5.0.dist-info}/RECORD
@@ -1,38 +1,38 @@
 valediction/__init__.py,sha256=HJy57qHyaeENZ0xGf5-jkkal-P92n242UU6vIqtsnaw,511
 valediction/convenience.py,sha256=gDSNcI_T9VKO3Lk1Van4YQCt6hp_fqPyJnUJD8QNP_o,1438
 valediction/exceptions.py,sha256=OtAq_ShVCZeoNx0hWCzJVlVdl3Gm55l72IP1KrKUMR0,748
-valediction/integrity.py,sha256=
+valediction/integrity.py,sha256=P18v_5BaUNZTTDfp0OLB9N-eM0IoqzxSVjhmEhNOQKY,5254
 valediction/progress.py,sha256=fXld7BRhp8kk7xPCG50PbRPXvF8RV7Br2hENHuOUlbo,5974
 valediction/support.py,sha256=dhKwhtL6dgG709T6fkGaLZDvjYtnxIO9cMmgz477m-I,2207
 valediction/data_types/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-valediction/data_types/data_type_helpers.py,sha256=
-valediction/data_types/data_types.py,sha256=
-valediction/data_types/type_inference.py,sha256=
+valediction/data_types/data_type_helpers.py,sha256=EyADhEHaLwKL7JaLsp2EgIsqo9O5r34WqiawWZKWVHI,2284
+valediction/data_types/data_types.py,sha256=kluHvBdSwAjB5eBEM1xYH-SjiruhDaXiRMI2Bl0Lw7Y,1612
+valediction/data_types/type_inference.py,sha256=a6er1jFZrhwAU0z2h1FsrmooPXUmwu6qkI2cHaciOb4,20315
 valediction/datasets/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 valediction/datasets/datasets.py,sha256=uweSdVkk5-zDBzL8M-cHnC6LETZXnubakajFi0J6L_c,30089
 valediction/datasets/datasets_helpers.py,sha256=AdB3ws5MYFpiXTmHXmSsdm2wZVwDXkXDOtYLvSYhs4I,1159
-valediction/demo/DEMO - Data Dictionary.xlsx,sha256=
+valediction/demo/DEMO - Data Dictionary.xlsx,sha256=upbwe14U5haXNarJndSW7ur6rd7RAssyEV2x2D7avao,45271
 valediction/demo/DEMOGRAPHICS.csv,sha256=ochj8tiHdRByvK2YbZTw5UR3UxbjYxA2xVbA1BfUlbU,3437
 valediction/demo/DIAGNOSES.csv,sha256=tJYtjeyLEbLvgulsbLA2MQg07AN_vGDSRGUn33xdiwM,19784
 valediction/demo/LAB_TESTS.csv,sha256=ii1tdQWmm2JiG6MdywB8in_R8vXBJzalD1TEvxG3abw,80401
 valediction/demo/VITALS.csv,sha256=npyaJAP5pLNndz3P3gQ4VbwI5_KwvCrXjSXjNLVJ1g0,46643
 valediction/demo/__init__.py,sha256=trPaz5YRP_HCUJCVZhHdU8uQaaNjAISpFWCRx89ewBg,200
-valediction/demo/demo_dictionary.py,sha256=
+valediction/demo/demo_dictionary.py,sha256=etTvXk3r9OkPOL9C8Ws9RGZ3DgPsGT-JWt4gRmDgLfg,4411
 valediction/dictionary/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 valediction/dictionary/exporting.py,sha256=uRcnVzY5Uj0Yfju4y8sWsjTSP7ywATV9CQDYZMmnws0,18617
 valediction/dictionary/exporting_helpers.py,sha256=O2pOMAHsRIyhLzxrSFrHIHJEf0wEteJguSI2JK9Rxcw,12434
-valediction/dictionary/generation.py,sha256=
-valediction/dictionary/helpers.py,sha256=
-valediction/dictionary/importing.py,sha256=
+valediction/dictionary/generation.py,sha256=XVGabsZgoFFC6nyr8vRrypdXnyfajAUTWWLs3miamm8,12306
+valediction/dictionary/helpers.py,sha256=C8LM5FjQmWiOzLLDck08ihGbuVSuXQwCgkVWPKaOoOE,5289
+valediction/dictionary/importing.py,sha256=e_jfVzW2tyfD1eR78_FL0MsFkA17RUDP6n9taX2Bxqk,19564
 valediction/dictionary/integrity.py,sha256=k0RLRyNs8dsHyOivl2WCS6jxlhPsW9wfXB48owyokfs,787
 valediction/dictionary/model.py,sha256=WtTGb5gZAtg7JiurvaWuD1K4DnNkygU-PoEVTZIgBCc,21617
-valediction/dictionary/template/PROJECT - Data Dictionary.xltx,sha256=
+valediction/dictionary/template/PROJECT - Data Dictionary.xltx,sha256=1boUdgNE1zYUu3pko9QREEi-hn29gp85PQqVwISyS2U,41691
 valediction/io/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 valediction/io/csv_readers.py,sha256=sKYP_xtOuxwm6ce2eDrphQ_wagxP0RYMXiMlEtkybBg,9812
 valediction/validation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-valediction/validation/helpers.py,sha256=
-valediction/validation/issues.py,sha256=
-valediction/validation/validation.py,sha256=
-valediction-1.
-valediction-1.
-valediction-1.
+valediction/validation/helpers.py,sha256=RZsFeRGJqHJlxtSbXItV093-4SuXoI74X9539ZyqswY,12034
+valediction/validation/issues.py,sha256=UO5bFxCOfT76rwi3SWi7wdn6Ux2wxxPC02RGvl0dL9Y,9374
+valediction/validation/validation.py,sha256=ezWr4-RRq3w5P-IT3nC9g-KIz0FWXkFdNY-_ouOJxzo,22688
+valediction-1.5.0.dist-info/METADATA,sha256=7OF5xMdNJnOVhEaz1ffd1lJqz61qJqje_T_r9PkKa7o,612
+valediction-1.5.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+valediction-1.5.0.dist-info/RECORD,,

{valediction-1.2.0.dist-info → valediction-1.5.0.dist-info}/WHEEL
File without changes